Posted to commits@hive.apache.org by om...@apache.org on 2017/07/19 16:58:24 UTC
[01/37] hive git commit: HIVE-17118. Move the hive-orc source files to make the package names unique.
Repository: hive
Updated Branches:
refs/heads/branch-2.2 3e426721c -> df8921d85
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestRunLengthIntegerReader.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestRunLengthIntegerReader.java b/orc/src/test/org/apache/orc/impl/TestRunLengthIntegerReader.java
deleted file mode 100644
index 28239ba..0000000
--- a/orc/src/test/org/apache/orc/impl/TestRunLengthIntegerReader.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.nio.ByteBuffer;
-import java.util.Random;
-
-import org.apache.orc.CompressionCodec;
-import org.junit.Test;
-
-public class TestRunLengthIntegerReader {
-
- public void runSeekTest(CompressionCodec codec) throws Exception {
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- RunLengthIntegerWriter out = new RunLengthIntegerWriter(
- new OutStream("test", 1000, codec, collect), true);
- TestInStream.PositionCollector[] positions =
- new TestInStream.PositionCollector[4096];
- Random random = new Random(99);
- int[] junk = new int[2048];
- for(int i=0; i < junk.length; ++i) {
- junk[i] = random.nextInt();
- }
- for(int i=0; i < 4096; ++i) {
- positions[i] = new TestInStream.PositionCollector();
- out.getPosition(positions[i]);
- // test runs, incrementing runs, non-runs
- if (i < 1024) {
- out.write(i/4);
- } else if (i < 2048) {
- out.write(2*i);
- } else {
- out.write(junk[i-2048]);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- RunLengthIntegerReader in = new RunLengthIntegerReader(InStream.create
- ("test", new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(),
- codec, 1000), true);
- for(int i=0; i < 2048; ++i) {
- int x = (int) in.next();
- if (i < 1024) {
- assertEquals(i/4, x);
- } else if (i < 2048) {
- assertEquals(2*i, x);
- } else {
- assertEquals(junk[i-2048], x);
- }
- }
- for(int i=2047; i >= 0; --i) {
- in.seek(positions[i]);
- int x = (int) in.next();
- if (i < 1024) {
- assertEquals(i/4, x);
- } else if (i < 2048) {
- assertEquals(2*i, x);
- } else {
- assertEquals(junk[i-2048], x);
- }
- }
- }
-
- @Test
- public void testUncompressedSeek() throws Exception {
- runSeekTest(null);
- }
-
- @Test
- public void testCompressedSeek() throws Exception {
- runSeekTest(new ZlibCodec());
- }
-
- @Test
- public void testSkips() throws Exception {
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- RunLengthIntegerWriter out = new RunLengthIntegerWriter(
- new OutStream("test", 100, null, collect), true);
- for(int i=0; i < 2048; ++i) {
- if (i < 1024) {
- out.write(i);
- } else {
- out.write(256 * i);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- RunLengthIntegerReader in = new RunLengthIntegerReader(InStream.create
- ("test", new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(),
- null, 100), true);
- for(int i=0; i < 2048; i += 10) {
- int x = (int) in.next();
- if (i < 1024) {
- assertEquals(i, x);
- } else {
- assertEquals(256 * i, x);
- }
- if (i < 2038) {
- in.skip(9);
- }
- in.skip(0);
- }
- }
-}
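
The removed test above feeds the writer three regions, as its inline comment notes: constant runs (i/4), a fixed-delta run (2*i), and random literals. A minimal sketch of the greedy fixed-delta grouping that makes the first two regions compress well — a simplified model (class name RunSketch is hypothetical), not the real RunLengthIntegerWriter, which also has a literal mode, minimum run lengths, and varint payloads:

    import java.util.ArrayList;
    import java.util.List;

    public class RunSketch {
      // Group consecutive values sharing one delta into {base, delta, length}
      // triples; ORC's RLE v1 encodes such runs in a handful of bytes.
      static List<long[]> toRuns(long[] values) {
        List<long[]> runs = new ArrayList<>();
        int i = 0;
        while (i < values.length) {
          int j = i;                                 // run covers [i, j]
          long delta = 0;
          if (j + 1 < values.length) {
            delta = values[j + 1] - values[j];
            while (j + 1 < values.length && values[j + 1] - values[j] == delta) {
              ++j;
            }
          }
          runs.add(new long[]{values[i], delta, j - i + 1});
          i = j + 1;
        }
        return runs;
      }
    }

On the test's data this collapses i/4 into runs of four with delta 0 and 2*i into a single delta-2 run, while the random tail degenerates into short runs — exactly the split the seek assertions walk back through.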
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestSchemaEvolution.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestSchemaEvolution.java b/orc/src/test/org/apache/orc/impl/TestSchemaEvolution.java
deleted file mode 100644
index e0ed2ad..0000000
--- a/orc/src/test/org/apache/orc/impl/TestSchemaEvolution.java
+++ /dev/null
@@ -1,480 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
-import org.apache.orc.RecordReader;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-public class TestSchemaEvolution {
-
- @Rule
- public TestName testCaseName = new TestName();
-
- Configuration conf;
- Path testFilePath;
- FileSystem fs;
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
-
- @Before
- public void setup() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testDataTypeConversion1() throws IOException {
- TypeDescription fileStruct1 = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
- SchemaEvolution same1 = new SchemaEvolution(fileStruct1, null);
- assertFalse(same1.hasConversion());
- TypeDescription readerStruct1 = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
- SchemaEvolution both1 = new SchemaEvolution(fileStruct1, readerStruct1, null);
- assertFalse(both1.hasConversion());
- TypeDescription readerStruct1diff = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createLong())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
- SchemaEvolution both1diff = new SchemaEvolution(fileStruct1, readerStruct1diff, null);
- assertTrue(both1diff.hasConversion());
- TypeDescription readerStruct1diffPrecision = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal().withPrecision(12).withScale(10));
- SchemaEvolution both1diffPrecision = new SchemaEvolution(fileStruct1, readerStruct1diffPrecision, null);
- assertTrue(both1diffPrecision.hasConversion());
- }
-
- @Test
- public void testDataTypeConversion2() throws IOException {
- TypeDescription fileStruct2 = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createUnion()
- .addUnionChild(TypeDescription.createByte())
- .addUnionChild(TypeDescription.createDecimal()
- .withPrecision(20).withScale(10)))
- .addField("f2", TypeDescription.createStruct()
- .addField("f3", TypeDescription.createDate())
- .addField("f4", TypeDescription.createDouble())
- .addField("f5", TypeDescription.createBoolean()))
- .addField("f6", TypeDescription.createChar().withMaxLength(100));
- SchemaEvolution same2 = new SchemaEvolution(fileStruct2, null);
- assertFalse(same2.hasConversion());
- TypeDescription readerStruct2 = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createUnion()
- .addUnionChild(TypeDescription.createByte())
- .addUnionChild(TypeDescription.createDecimal()
- .withPrecision(20).withScale(10)))
- .addField("f2", TypeDescription.createStruct()
- .addField("f3", TypeDescription.createDate())
- .addField("f4", TypeDescription.createDouble())
- .addField("f5", TypeDescription.createBoolean()))
- .addField("f6", TypeDescription.createChar().withMaxLength(100));
- SchemaEvolution both2 = new SchemaEvolution(fileStruct2, readerStruct2, null);
- assertFalse(both2.hasConversion());
- TypeDescription readerStruct2diff = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createUnion()
- .addUnionChild(TypeDescription.createByte())
- .addUnionChild(TypeDescription.createDecimal()
- .withPrecision(20).withScale(10)))
- .addField("f2", TypeDescription.createStruct()
- .addField("f3", TypeDescription.createDate())
- .addField("f4", TypeDescription.createDouble())
- .addField("f5", TypeDescription.createByte()))
- .addField("f6", TypeDescription.createChar().withMaxLength(100));
- SchemaEvolution both2diff = new SchemaEvolution(fileStruct2, readerStruct2diff, null);
- assertTrue(both2diff.hasConversion());
- TypeDescription readerStruct2diffChar = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createUnion()
- .addUnionChild(TypeDescription.createByte())
- .addUnionChild(TypeDescription.createDecimal()
- .withPrecision(20).withScale(10)))
- .addField("f2", TypeDescription.createStruct()
- .addField("f3", TypeDescription.createDate())
- .addField("f4", TypeDescription.createDouble())
- .addField("f5", TypeDescription.createBoolean()))
- .addField("f6", TypeDescription.createChar().withMaxLength(80));
- SchemaEvolution both2diffChar = new SchemaEvolution(fileStruct2, readerStruct2diffChar, null);
- assertTrue(both2diffChar.hasConversion());
- }
-
- @Test
- public void testFloatToDoubleEvolution() throws Exception {
- testFilePath = new Path(workDir, "TestOrcFile." +
- testCaseName.getMethodName() + ".orc");
- TypeDescription schema = TypeDescription.createFloat();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
- .bufferSize(10000));
- VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024);
- DoubleColumnVector dcv = new DoubleColumnVector(1024);
- batch.cols[0] = dcv;
- batch.reset();
- batch.size = 1;
- dcv.vector[0] = 74.72f;
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- TypeDescription schemaOnRead = TypeDescription.createDouble();
- RecordReader rows = reader.rows(new Reader.Options().schema(schemaOnRead));
- batch = schemaOnRead.createRowBatch();
- rows.nextBatch(batch);
- assertEquals(74.72, ((DoubleColumnVector) batch.cols[0]).vector[0], 0.00000000001);
- rows.close();
- }
-
- @Test
- public void testSafePpdEvaluation() throws IOException {
- TypeDescription fileStruct1 = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
- SchemaEvolution same1 = new SchemaEvolution(fileStruct1, null);
- assertTrue(same1.isPPDSafeConversion(0));
- assertFalse(same1.hasConversion());
- TypeDescription readerStruct1 = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
- SchemaEvolution both1 = new SchemaEvolution(fileStruct1, readerStruct1, null);
- assertFalse(both1.hasConversion());
- assertTrue(both1.isPPDSafeConversion(0));
- assertTrue(both1.isPPDSafeConversion(1));
- assertTrue(both1.isPPDSafeConversion(2));
- assertTrue(both1.isPPDSafeConversion(3));
-
- // int -> long
- TypeDescription readerStruct1diff = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createLong())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
- SchemaEvolution both1diff = new SchemaEvolution(fileStruct1, readerStruct1diff, null);
- assertTrue(both1diff.hasConversion());
- assertFalse(both1diff.isPPDSafeConversion(0));
- assertTrue(both1diff.isPPDSafeConversion(1));
- assertTrue(both1diff.isPPDSafeConversion(2));
- assertTrue(both1diff.isPPDSafeConversion(3));
-
- // decimal(38,10) -> decimal(12, 10)
- TypeDescription readerStruct1diffPrecision = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal().withPrecision(12).withScale(10));
- SchemaEvolution both1diffPrecision = new SchemaEvolution(fileStruct1, readerStruct1diffPrecision,
- new boolean[] {true, false, false, true});
- assertTrue(both1diffPrecision.hasConversion());
- assertFalse(both1diffPrecision.isPPDSafeConversion(0));
- assertFalse(both1diffPrecision.isPPDSafeConversion(1)); // column not included
- assertFalse(both1diffPrecision.isPPDSafeConversion(2)); // column not included
- assertFalse(both1diffPrecision.isPPDSafeConversion(3));
-
- // add columns
- readerStruct1 = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10))
- .addField("f4", TypeDescription.createBoolean());
- both1 = new SchemaEvolution(fileStruct1, readerStruct1, null);
- assertTrue(both1.hasConversion());
- assertFalse(both1.isPPDSafeConversion(0));
- assertTrue(both1.isPPDSafeConversion(1));
- assertTrue(both1.isPPDSafeConversion(2));
- assertTrue(both1.isPPDSafeConversion(3));
- assertFalse(both1.isPPDSafeConversion(4));
- }
-
- @Test
- public void testSafePpdEvaluationForInts() throws IOException {
- // byte -> short -> int -> long
- TypeDescription fileSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createByte());
- SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema, null);
- assertFalse(schemaEvolution.hasConversion());
-
- // byte -> short
- TypeDescription readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createShort());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertTrue(schemaEvolution.isPPDSafeConversion(1));
-
- // byte -> int
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertTrue(schemaEvolution.isPPDSafeConversion(1));
-
- // byte -> long
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createLong());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertTrue(schemaEvolution.isPPDSafeConversion(1));
-
- // short -> int -> long
- fileSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createShort());
- schemaEvolution = new SchemaEvolution(fileSchema, null);
- assertFalse(schemaEvolution.hasConversion());
-
- // unsafe conversion short -> byte
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createByte());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // short -> int
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertTrue(schemaEvolution.isPPDSafeConversion(1));
-
- // short -> long
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createLong());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertTrue(schemaEvolution.isPPDSafeConversion(1));
-
- // int -> long
- fileSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt());
- schemaEvolution = new SchemaEvolution(fileSchema, null);
- assertFalse(schemaEvolution.hasConversion());
-
- // unsafe conversion int -> byte
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createByte());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // unsafe conversion int -> short
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createShort());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // int -> long
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createLong());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertTrue(schemaEvolution.isPPDSafeConversion(1));
-
- // long
- fileSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createLong());
- schemaEvolution = new SchemaEvolution(fileSchema, null);
- assertTrue(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.hasConversion());
-
- // unsafe conversion long -> byte
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createByte());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // unsafe conversion long -> short
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createShort());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // unsafe conversion long -> int
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // invalid
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createString());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // invalid
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createFloat());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // invalid
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createTimestamp());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
- }
-
- @Test
- public void testSafePpdEvaluationForStrings() throws IOException {
- TypeDescription fileSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createString());
- SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema, null);
- assertTrue(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.hasConversion());
-
- // string -> char
- TypeDescription readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createChar());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // string -> varchar
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createVarchar());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertTrue(schemaEvolution.isPPDSafeConversion(1));
-
- fileSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createChar());
- schemaEvolution = new SchemaEvolution(fileSchema, null);
- assertTrue(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.hasConversion());
-
- // char -> string
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createString());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // char -> varchar
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createVarchar());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- fileSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createVarchar());
- schemaEvolution = new SchemaEvolution(fileSchema, null);
- assertTrue(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.hasConversion());
-
- // varchar -> string
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createString());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertTrue(schemaEvolution.isPPDSafeConversion(1));
-
- // varchar -> char
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createChar());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // invalid
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createDecimal());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // invalid
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createDate());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
-
- // invalid
- readerSchema = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt());
- schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
- assertTrue(schemaEvolution.hasConversion());
- assertFalse(schemaEvolution.isPPDSafeConversion(0));
- assertFalse(schemaEvolution.isPPDSafeConversion(1));
- }
-
- @Test
- public void ensureFileIncluded() throws IOException {
- TypeDescription file = TypeDescription.fromString("struct<x:int,y:int>");
- SchemaEvolution evolution = new SchemaEvolution(file, null);
- boolean[] include = evolution.getFileIncluded();
- assertEquals(3, include.length);
- for(int i=0; i < include.length; ++i) {
- assertTrue("element " + i, include[i]);
- }
- }
-}
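
The recurring pattern in the removed tests: widening integer conversions (byte -> short -> int -> long) and string <-> varchar keep predicate pushdown safe, while any narrowing or cross-family conversion disables it. A condensed sketch of that pattern using the same TypeDescription and SchemaEvolution APIs the file exercises (class name PpdSafetySketch is hypothetical):

    import org.apache.orc.TypeDescription;
    import org.apache.orc.impl.SchemaEvolution;

    public class PpdSafetySketch {
      public static void main(String[] args) {
        TypeDescription file = TypeDescription.fromString("struct<f1:int>");

        // widening preserves ordering and min/max statistics, so PPD stays safe
        TypeDescription wider = TypeDescription.fromString("struct<f1:bigint>");
        SchemaEvolution widening = new SchemaEvolution(file, wider, null);
        assert widening.hasConversion();
        assert widening.isPPDSafeConversion(1);   // column ids: 0 = struct, 1 = f1

        // narrowing can overflow, so pushing predicates down would be unsound
        TypeDescription narrower = TypeDescription.fromString("struct<f1:smallint>");
        SchemaEvolution narrowing = new SchemaEvolution(file, narrower, null);
        assert narrowing.hasConversion();
        assert !narrowing.isPPDSafeConversion(1);
      }
    }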
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestSerializationUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestSerializationUtils.java b/orc/src/test/org/apache/orc/impl/TestSerializationUtils.java
deleted file mode 100644
index 4a8a0f2..0000000
--- a/orc/src/test/org/apache/orc/impl/TestSerializationUtils.java
+++ /dev/null
@@ -1,201 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import static org.junit.Assert.assertArrayEquals;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.InputStream;
-import java.math.BigInteger;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-
-import org.junit.Test;
-
-import com.google.common.math.LongMath;
-
-public class TestSerializationUtils {
-
- private InputStream fromBuffer(ByteArrayOutputStream buffer) {
- return new ByteArrayInputStream(buffer.toByteArray());
- }
-
- @Test
- public void testDoubles() throws Exception {
- double tolerance = 0.0000000000000001;
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- SerializationUtils utils = new SerializationUtils();
- utils.writeDouble(buffer, 1343822337.759);
- assertEquals(1343822337.759, utils.readDouble(fromBuffer(buffer)), tolerance);
- buffer = new ByteArrayOutputStream();
- utils.writeDouble(buffer, 0.8);
- double got = utils.readDouble(fromBuffer(buffer));
- assertEquals(0.8, got, tolerance);
- }
-
- @Test
- public void testBigIntegers() throws Exception {
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(0));
- assertArrayEquals(new byte[]{0}, buffer.toByteArray());
- assertEquals(0L,
- SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
- buffer.reset();
- SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(1));
- assertArrayEquals(new byte[]{2}, buffer.toByteArray());
- assertEquals(1L,
- SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
- buffer.reset();
- SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(-1));
- assertArrayEquals(new byte[]{1}, buffer.toByteArray());
- assertEquals(-1L,
- SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
- buffer.reset();
- SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(50));
- assertArrayEquals(new byte[]{100}, buffer.toByteArray());
- assertEquals(50L,
- SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
- buffer.reset();
- SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(-50));
- assertArrayEquals(new byte[]{99}, buffer.toByteArray());
- assertEquals(-50L,
- SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
- for(int i=-8192; i < 8192; ++i) {
- buffer.reset();
- SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(i));
- assertEquals("compare length for " + i,
- i >= -64 && i < 64 ? 1 : 2, buffer.size());
- assertEquals("compare result for " + i,
- i, SerializationUtils.readBigInteger(fromBuffer(buffer)).intValue());
- }
- buffer.reset();
- SerializationUtils.writeBigInteger(buffer,
- new BigInteger("123456789abcdef0",16));
- assertEquals(new BigInteger("123456789abcdef0",16),
- SerializationUtils.readBigInteger(fromBuffer(buffer)));
- buffer.reset();
- SerializationUtils.writeBigInteger(buffer,
- new BigInteger("-123456789abcdef0",16));
- assertEquals(new BigInteger("-123456789abcdef0",16),
- SerializationUtils.readBigInteger(fromBuffer(buffer)));
- StringBuilder buf = new StringBuilder();
- for(int i=0; i < 256; ++i) {
- String num = Integer.toHexString(i);
- if (num.length() == 1) {
- buf.append('0');
- }
- buf.append(num);
- }
- buffer.reset();
- SerializationUtils.writeBigInteger(buffer,
- new BigInteger(buf.toString(),16));
- assertEquals(new BigInteger(buf.toString(),16),
- SerializationUtils.readBigInteger(fromBuffer(buffer)));
- buffer.reset();
- SerializationUtils.writeBigInteger(buffer,
- new BigInteger("ff000000000000000000000000000000000000000000ff",16));
- assertEquals(
- new BigInteger("ff000000000000000000000000000000000000000000ff",16),
- SerializationUtils.readBigInteger(fromBuffer(buffer)));
- }
-
- @Test
- public void testSubtractionOverflow() {
- // cross check results with Guava results below
- SerializationUtils utils = new SerializationUtils();
- assertEquals(false, utils.isSafeSubtract(22222222222L, Long.MIN_VALUE));
- assertEquals(false, utils.isSafeSubtract(-22222222222L, Long.MAX_VALUE));
- assertEquals(false, utils.isSafeSubtract(Long.MIN_VALUE, Long.MAX_VALUE));
- assertEquals(true, utils.isSafeSubtract(-1553103058346370095L, 6553103058346370095L));
- assertEquals(true, utils.isSafeSubtract(0, Long.MAX_VALUE));
- assertEquals(true, utils.isSafeSubtract(Long.MIN_VALUE, 0));
- }
-
- @Test
- public void testSubtractionOverflowGuava() {
- try {
- LongMath.checkedSubtract(22222222222L, Long.MIN_VALUE);
- fail("expected ArithmeticException for overflow");
- } catch (ArithmeticException ex) {
- assertEquals(ex.getMessage(), "overflow");
- }
-
- try {
- LongMath.checkedSubtract(-22222222222L, Long.MAX_VALUE);
- fail("expected ArithmeticException for overflow");
- } catch (ArithmeticException ex) {
- assertEquals(ex.getMessage(), "overflow");
- }
-
- try {
- LongMath.checkedSubtract(Long.MIN_VALUE, Long.MAX_VALUE);
- fail("expected ArithmeticException for overflow");
- } catch (ArithmeticException ex) {
- assertEquals(ex.getMessage(), "overflow");
- }
-
- assertEquals(-8106206116692740190L,
- LongMath.checkedSubtract(-1553103058346370095L, 6553103058346370095L));
- assertEquals(-Long.MAX_VALUE, LongMath.checkedSubtract(0, Long.MAX_VALUE));
- assertEquals(Long.MIN_VALUE, LongMath.checkedSubtract(Long.MIN_VALUE, 0));
- }
-
- @Test
- public void testRandomFloats() throws Exception {
- float tolerance = 0.0000000000000001f;
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- SerializationUtils utils = new SerializationUtils();
- Random rand = new Random();
- int n = 100_000;
- float[] expected = new float[n];
- for (int i = 0; i < n; i++) {
- float f = rand.nextFloat();
- expected[i] = f;
- utils.writeFloat(buffer, f);
- }
- InputStream newBuffer = fromBuffer(buffer);
- for (int i = 0; i < n; i++) {
- float got = utils.readFloat(newBuffer);
- assertEquals(expected[i], got, tolerance);
- }
- }
-
- @Test
- public void testRandomDoubles() throws Exception {
- double tolerance = 0.0000000000000001;
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- SerializationUtils utils = new SerializationUtils();
- Random rand = new Random();
- int n = 100_000;
- double[] expected = new double[n];
- for (int i = 0; i < n; i++) {
- double d = rand.nextDouble();
- expected[i] = d;
- utils.writeDouble(buffer, d);
- }
- InputStream newBuffer = fromBuffer(buffer);
- for (int i = 0; i < n; i++) {
- double got = utils.readDouble(newBuffer);
- assertEquals(expected[i], got, tolerance);
- }
- }
-}
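
The byte-level assertions in testBigIntegers above spell the scheme out: 0 -> {0}, 1 -> {2}, -1 -> {1}, 50 -> {100}, -50 -> {99}, i.e. a zigzag mapping followed by a little-endian base-128 varint, which is why every value in [-64, 64) fits in one byte. A sketch of that mapping for longs (class name ZigzagSketch is hypothetical; writeBigInteger itself handles arbitrary precision):

    import java.io.ByteArrayOutputStream;
    import java.util.Arrays;

    public class ZigzagSketch {
      // Map signed to unsigned (0,-1,1,-2,... -> 0,1,2,3,...), then emit
      // 7 bits per byte, least significant first, high bit = continuation.
      static byte[] zigzagVarint(long n) {
        long z = (n << 1) ^ (n >> 63);      // zigzag: n>=0 -> 2n, n<0 -> -2n-1
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        while ((z & ~0x7fL) != 0) {
          out.write((int) (z & 0x7f) | 0x80);
          z >>>= 7;
        }
        out.write((int) z);
        return out.toByteArray();
      }

      public static void main(String[] args) {
        System.out.println(Arrays.toString(zigzagVarint(0)));    // [0]
        System.out.println(Arrays.toString(zigzagVarint(-50)));  // [99]
        System.out.println(Arrays.toString(zigzagVarint(64)));   // two bytes
      }
    }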
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestStreamName.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestStreamName.java b/orc/src/test/org/apache/orc/impl/TestStreamName.java
deleted file mode 100644
index be58d4c..0000000
--- a/orc/src/test/org/apache/orc/impl/TestStreamName.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.orc.OrcProto;
-import org.junit.Test;
-
-import static org.junit.Assert.assertEquals;
-
-public class TestStreamName {
-
- @Test
- public void test1() throws Exception {
- StreamName s1 = new StreamName(3, OrcProto.Stream.Kind.DATA);
- StreamName s2 = new StreamName(3,
- OrcProto.Stream.Kind.DICTIONARY_DATA);
- StreamName s3 = new StreamName(5, OrcProto.Stream.Kind.DATA);
- StreamName s4 = new StreamName(5,
- OrcProto.Stream.Kind.DICTIONARY_DATA);
- StreamName s1p = new StreamName(3, OrcProto.Stream.Kind.DATA);
- assertEquals(true, s1.equals(s1));
- assertEquals(false, s1.equals(s2));
- assertEquals(false, s1.equals(s3));
- assertEquals(true, s1.equals(s1p));
- assertEquals(true, s1.compareTo(null) < 0);
- assertEquals(false, s1.equals(null));
- assertEquals(true, s1.compareTo(s2) < 0);
- assertEquals(true, s2.compareTo(s3) < 0);
- assertEquals(true, s3.compareTo(s4) < 0);
- assertEquals(true, s4.compareTo(s1p) > 0);
- assertEquals(0, s1p.compareTo(s1));
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestStringRedBlackTree.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestStringRedBlackTree.java b/orc/src/test/org/apache/orc/impl/TestStringRedBlackTree.java
deleted file mode 100644
index 3d4612c..0000000
--- a/orc/src/test/org/apache/orc/impl/TestStringRedBlackTree.java
+++ /dev/null
@@ -1,234 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.orc.impl.RedBlackTree;
-import org.apache.orc.impl.StringRedBlackTree;
-import org.junit.Test;
-
-import java.io.IOException;
-
-import static junit.framework.Assert.assertEquals;
-
-/**
- * Test the red-black tree with string keys.
- */
-public class TestStringRedBlackTree {
-
- /**
- * Checks the red-black tree rules to make sure that we have correctly built
- * a valid tree.
- *
- * Properties:
- * 1. Red nodes must have black children
- * 2. Each node must have the same black height on both sides.
- *
- * @param node The id of the root of the subtree to check for the red-black
- * tree properties.
- * @return The black-height of the subtree.
- */
- private int checkSubtree(RedBlackTree tree, int node, IntWritable count
- ) throws IOException {
- if (node == RedBlackTree.NULL) {
- return 1;
- }
- count.set(count.get() + 1);
- boolean is_red = tree.isRed(node);
- int left = tree.getLeft(node);
- int right = tree.getRight(node);
- if (is_red) {
- if (tree.isRed(left)) {
- printTree(tree, "", tree.root);
- throw new IllegalStateException("Left node of " + node + " is " + left +
- " and both are red.");
- }
- if (tree.isRed(right)) {
- printTree(tree, "", tree.root);
- throw new IllegalStateException("Right node of " + node + " is " +
- right + " and both are red.");
- }
- }
- int left_depth = checkSubtree(tree, left, count);
- int right_depth = checkSubtree(tree, right, count);
- if (left_depth != right_depth) {
- printTree(tree, "", tree.root);
- throw new IllegalStateException("Lopsided tree at node " + node +
- " with depths " + left_depth + " and " + right_depth);
- }
- if (is_red) {
- return left_depth;
- } else {
- return left_depth + 1;
- }
- }
-
- /**
- * Checks the validity of the entire tree. Also ensures that the number of
- * nodes visited is the same as the size of the set.
- */
- void checkTree(RedBlackTree tree) throws IOException {
- IntWritable count = new IntWritable(0);
- if (tree.isRed(tree.root)) {
- printTree(tree, "", tree.root);
- throw new IllegalStateException("root is red");
- }
- checkSubtree(tree, tree.root, count);
- if (count.get() != tree.size) {
- printTree(tree, "", tree.root);
- throw new IllegalStateException("Broken tree! visited= " + count.get() +
- " size=" + tree.size);
- }
- }
-
- void printTree(RedBlackTree tree, String indent, int node
- ) throws IOException {
- if (node == RedBlackTree.NULL) {
- System.err.println(indent + "NULL");
- } else {
- System.err.println(indent + "Node " + node + " color " +
- (tree.isRed(node) ? "red" : "black"));
- printTree(tree, indent + " ", tree.getLeft(node));
- printTree(tree, indent + " ", tree.getRight(node));
- }
- }
-
- private static class MyVisitor implements StringRedBlackTree.Visitor {
- private final String[] words;
- private final int[] order;
- private final DataOutputBuffer buffer = new DataOutputBuffer();
- int current = 0;
-
- MyVisitor(String[] args, int[] order) {
- words = args;
- this.order = order;
- }
-
- @Override
- public void visit(StringRedBlackTree.VisitorContext context
- ) throws IOException {
- String word = context.getText().toString();
- assertEquals("in word " + current, words[current], word);
- assertEquals("in word " + current, order[current],
- context.getOriginalPosition());
- buffer.reset();
- context.writeBytes(buffer);
- assertEquals(word, new String(buffer.getData(),0,buffer.getLength()));
- current += 1;
- }
- }
-
- void checkContents(StringRedBlackTree tree, int[] order,
- String... params
- ) throws IOException {
- tree.visit(new MyVisitor(params, order));
- }
-
- StringRedBlackTree buildTree(String... params) throws IOException {
- StringRedBlackTree result = new StringRedBlackTree(1000);
- for(String word: params) {
- result.add(word);
- checkTree(result);
- }
- return result;
- }
-
- @Test
- public void test1() throws Exception {
- StringRedBlackTree tree = new StringRedBlackTree(5);
- assertEquals(0, tree.getSizeInBytes());
- checkTree(tree);
- assertEquals(0, tree.add("owen"));
- checkTree(tree);
- assertEquals(1, tree.add("ashutosh"));
- checkTree(tree);
- assertEquals(0, tree.add("owen"));
- checkTree(tree);
- assertEquals(2, tree.add("alan"));
- checkTree(tree);
- assertEquals(2, tree.add("alan"));
- checkTree(tree);
- assertEquals(1, tree.add("ashutosh"));
- checkTree(tree);
- assertEquals(3, tree.add("greg"));
- checkTree(tree);
- assertEquals(4, tree.add("eric"));
- checkTree(tree);
- assertEquals(5, tree.add("arun"));
- checkTree(tree);
- assertEquals(6, tree.size());
- checkTree(tree);
- assertEquals(6, tree.add("eric14"));
- checkTree(tree);
- assertEquals(7, tree.add("o"));
- checkTree(tree);
- assertEquals(8, tree.add("ziggy"));
- checkTree(tree);
- assertEquals(9, tree.add("z"));
- checkTree(tree);
- checkContents(tree, new int[]{2,5,1,4,6,3,7,0,9,8},
- "alan", "arun", "ashutosh", "eric", "eric14", "greg",
- "o", "owen", "z", "ziggy");
- assertEquals(32888, tree.getSizeInBytes());
- // check that adding greg again bumps the count
- assertEquals(3, tree.add("greg"));
- assertEquals(41, tree.getCharacterSize());
- // add some more strings to test the different branches of the
- // rebalancing
- assertEquals(10, tree.add("zak"));
- checkTree(tree);
- assertEquals(11, tree.add("eric1"));
- checkTree(tree);
- assertEquals(12, tree.add("ash"));
- checkTree(tree);
- assertEquals(13, tree.add("harry"));
- checkTree(tree);
- assertEquals(14, tree.add("john"));
- checkTree(tree);
- tree.clear();
- checkTree(tree);
- assertEquals(0, tree.getSizeInBytes());
- assertEquals(0, tree.getCharacterSize());
- }
-
- @Test
- public void test2() throws Exception {
- StringRedBlackTree tree =
- buildTree("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l",
- "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z");
- assertEquals(26, tree.size());
- checkContents(tree, new int[]{0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14,
- 15,16,17, 18,19,20, 21,22,23, 24,25},
- "a", "b", "c", "d", "e", "f", "g", "h", "i", "j","k", "l", "m", "n", "o",
- "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z");
- }
-
- @Test
- public void test3() throws Exception {
- StringRedBlackTree tree =
- buildTree("z", "y", "x", "w", "v", "u", "t", "s", "r", "q", "p", "o", "n",
- "m", "l", "k", "j", "i", "h", "g", "f", "e", "d", "c", "b", "a");
- assertEquals(26, tree.size());
- checkContents(tree, new int[]{25,24,23, 22,21,20, 19,18,17, 16,15,14,
- 13,12,11, 10,9,8, 7,6,5, 4,3,2, 1,0},
- "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
- "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z");
- }
-}
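
The checkSubtree javadoc above states the two invariants the whole file leans on: red nodes have only black children, and every path from a node down to its leaves crosses the same number of black nodes. The same check over a toy pointer-based node, as a standalone sketch (class names are hypothetical; the real test walks ORC's array-backed RedBlackTree by integer ids):

    public class RedBlackSketch {
      static final class Node {
        final boolean red;
        final Node left, right;             // null acts as a black leaf
        Node(boolean red, Node left, Node right) {
          this.red = red; this.left = left; this.right = right;
        }
      }

      // Returns the black-height of the subtree, throwing on any violation,
      // mirroring checkSubtree in the removed test.
      static int checkSubtree(Node n) {
        if (n == null) {
          return 1;                         // external leaves count as black
        }
        if (n.red && ((n.left != null && n.left.red)
                   || (n.right != null && n.right.red))) {
          throw new IllegalStateException("red node with a red child");
        }
        int left = checkSubtree(n.left);
        int right = checkSubtree(n.right);
        if (left != right) {
          throw new IllegalStateException("black heights differ: "
              + left + " vs " + right);
        }
        return n.red ? left : left + 1;
      }

      public static void main(String[] args) {
        // black root with two red children: valid, black-height 2
        Node ok = new Node(false, new Node(true, null, null),
                                  new Node(true, null, null));
        System.out.println(checkSubtree(ok));   // prints 2
      }
    }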
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestZlib.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestZlib.java b/orc/src/test/org/apache/orc/impl/TestZlib.java
deleted file mode 100644
index 327ecfc..0000000
--- a/orc/src/test/org/apache/orc/impl/TestZlib.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.orc.CompressionCodec;
-import org.junit.Test;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.fail;
-
-public class TestZlib {
-
- @Test
- public void testNoOverflow() throws Exception {
- ByteBuffer in = ByteBuffer.allocate(10);
- ByteBuffer out = ByteBuffer.allocate(10);
- in.put(new byte[]{1,2,3,4,5,6,7,10});
- in.flip();
- CompressionCodec codec = new ZlibCodec();
- assertEquals(false, codec.compress(in, out, null));
- }
-
- @Test
- public void testCorrupt() throws Exception {
- ByteBuffer buf = ByteBuffer.allocate(1000);
- buf.put(new byte[]{127,-128,0,99,98,-1});
- buf.flip();
- CompressionCodec codec = new ZlibCodec();
- ByteBuffer out = ByteBuffer.allocate(1000);
- try {
- codec.decompress(buf, out);
- fail();
- } catch (IOException ioe) {
- // EXPECTED
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/tools/TestFileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/tools/TestFileDump.java b/orc/src/test/org/apache/orc/tools/TestFileDump.java
deleted file mode 100644
index ce3381e..0000000
--- a/orc/src/test/org/apache/orc/tools/TestFileDump.java
+++ /dev/null
@@ -1,486 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.tools;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-import java.io.BufferedReader;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.PrintStream;
-import java.sql.Date;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcConf;
-import org.apache.orc.OrcFile;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestFileDump {
-
- Path workDir = new Path(System.getProperty("test.tmp.dir"));
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Before
- public void openFileSystem () throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- fs.setWorkingDirectory(workDir);
- testFilePath = new Path("TestFileDump.testDump.orc");
- fs.delete(testFilePath, false);
- }
-
- static TypeDescription getMyRecordType() {
- return TypeDescription.createStruct()
- .addField("i", TypeDescription.createInt())
- .addField("l", TypeDescription.createLong())
- .addField("s", TypeDescription.createString());
- }
-
- static void appendMyRecord(VectorizedRowBatch batch,
- int i,
- long l,
- String str) {
- ((LongColumnVector) batch.cols[0]).vector[batch.size] = i;
- ((LongColumnVector) batch.cols[1]).vector[batch.size] = l;
- if (str == null) {
- batch.cols[2].noNulls = false;
- batch.cols[2].isNull[batch.size] = true;
- } else {
- ((BytesColumnVector) batch.cols[2]).setVal(batch.size,
- str.getBytes());
- }
- batch.size += 1;
- }
-
- static TypeDescription getAllTypesType() {
- return TypeDescription.createStruct()
- .addField("b", TypeDescription.createBoolean())
- .addField("bt", TypeDescription.createByte())
- .addField("s", TypeDescription.createShort())
- .addField("i", TypeDescription.createInt())
- .addField("l", TypeDescription.createLong())
- .addField("f", TypeDescription.createFloat())
- .addField("d", TypeDescription.createDouble())
- .addField("de", TypeDescription.createDecimal())
- .addField("t", TypeDescription.createTimestamp())
- .addField("dt", TypeDescription.createDate())
- .addField("str", TypeDescription.createString())
- .addField("c", TypeDescription.createChar().withMaxLength(5))
- .addField("vc", TypeDescription.createVarchar().withMaxLength(10))
- .addField("m", TypeDescription.createMap(
- TypeDescription.createString(),
- TypeDescription.createString()))
- .addField("a", TypeDescription.createList(TypeDescription.createInt()))
- .addField("st", TypeDescription.createStruct()
- .addField("i", TypeDescription.createInt())
- .addField("s", TypeDescription.createString()));
- }
-
- static void appendAllTypes(VectorizedRowBatch batch,
- boolean b,
- byte bt,
- short s,
- int i,
- long l,
- float f,
- double d,
- HiveDecimalWritable de,
- Timestamp t,
- DateWritable dt,
- String str,
- String c,
- String vc,
- Map<String, String> m,
- List<Integer> a,
- int sti,
- String sts) {
- int row = batch.size++;
- ((LongColumnVector) batch.cols[0]).vector[row] = b ? 1 : 0;
- ((LongColumnVector) batch.cols[1]).vector[row] = bt;
- ((LongColumnVector) batch.cols[2]).vector[row] = s;
- ((LongColumnVector) batch.cols[3]).vector[row] = i;
- ((LongColumnVector) batch.cols[4]).vector[row] = l;
- ((DoubleColumnVector) batch.cols[5]).vector[row] = f;
- ((DoubleColumnVector) batch.cols[6]).vector[row] = d;
- ((DecimalColumnVector) batch.cols[7]).vector[row].set(de);
- ((TimestampColumnVector) batch.cols[8]).set(row, t);
- ((LongColumnVector) batch.cols[9]).vector[row] = dt.getDays();
- ((BytesColumnVector) batch.cols[10]).setVal(row, str.getBytes());
- ((BytesColumnVector) batch.cols[11]).setVal(row, c.getBytes());
- ((BytesColumnVector) batch.cols[12]).setVal(row, vc.getBytes());
- MapColumnVector map = (MapColumnVector) batch.cols[13];
- int offset = map.childCount;
- map.offsets[row] = offset;
- map.lengths[row] = m.size();
- map.childCount += map.lengths[row];
- for(Map.Entry<String, String> entry: m.entrySet()) {
- ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes());
- ((BytesColumnVector) map.values).setVal(offset++,
- entry.getValue().getBytes());
- }
- ListColumnVector list = (ListColumnVector) batch.cols[14];
- offset = list.childCount;
- list.offsets[row] = offset;
- list.lengths[row] = a.size();
- list.childCount += list.lengths[row];
- for(int e=0; e < a.size(); ++e) {
- ((LongColumnVector) list.child).vector[offset + e] = a.get(e);
- }
- StructColumnVector struct = (StructColumnVector) batch.cols[15];
- ((LongColumnVector) struct.fields[0]).vector[row] = sti;
- ((BytesColumnVector) struct.fields[1]).setVal(row, sts.getBytes());
- }
-
- public static void checkOutput(String expected,
- String actual) throws Exception {
- BufferedReader eStream =
- new BufferedReader(new FileReader
- (TestJsonFileDump.getFileFromClasspath(expected)));
- BufferedReader aStream =
- new BufferedReader(new FileReader(actual));
- String expectedLine = eStream.readLine().trim();
- while (expectedLine != null) {
- String actualLine = aStream.readLine().trim();
- System.out.println("actual: " + actualLine);
- System.out.println("expected: " + expectedLine);
- Assert.assertEquals(expectedLine, actualLine);
- expectedLine = eStream.readLine();
- expectedLine = expectedLine == null ? null : expectedLine.trim();
- }
- Assert.assertNull(eStream.readLine());
- Assert.assertNull(aStream.readLine());
- eStream.close();
- aStream.close();
- }
-
- @Test
- public void testDump() throws Exception {
- TypeDescription schema = getMyRecordType();
- conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .setSchema(schema)
- .compress(CompressionKind.ZLIB)
- .stripeSize(100000)
- .rowIndexStride(1000));
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- VectorizedRowBatch batch = schema.createRowBatch(1000);
- for(int i=0; i < 21000; ++i) {
- appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
- words[r1.nextInt(words.length)]);
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- }
- if (batch.size > 0) {
- writer.addRowBatch(batch);
- }
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=1,2,3"});
- System.out.flush();
- System.setOut(origOut);
-
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-
- @Test
- public void testDataDump() throws Exception {
- TypeDescription schema = getAllTypesType();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .rowIndexStride(1000));
- VectorizedRowBatch batch = schema.createRowBatch(1000);
- Map<String, String> m = new HashMap<String, String>(2);
- m.put("k1", "v1");
- appendAllTypes(batch,
- true,
- (byte) 10,
- (short) 100,
- 1000,
- 10000L,
- 4.0f,
- 20.0,
- new HiveDecimalWritable("4.2222"),
- new Timestamp(1416967764000L),
- new DateWritable(new Date(1416967764000L)),
- "string",
- "hello",
- "hello",
- m,
- Arrays.asList(100, 200),
- 10, "foo");
- m.clear();
- m.put("k3", "v3");
- appendAllTypes(
- batch,
- false,
- (byte)20,
- (short)200,
- 2000,
- 20000L,
- 8.0f,
- 40.0,
- new HiveDecimalWritable("2.2222"),
- new Timestamp(1416967364000L),
- new DateWritable(new Date(1411967764000L)),
- "abcd",
- "world",
- "world",
- m,
- Arrays.asList(200, 300),
- 20, "bar");
- writer.addRowBatch(batch);
-
- writer.close();
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "-d"});
- System.out.flush();
- System.setOut(origOut);
- String[] lines = myOut.toString().split("\n");
- Assert.assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24.0\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello\",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
- Assert.assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44.0\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world\",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
- }
-
- // Test that dictionary encoding is turned off when the fraction of rows with distinct strings
- // exceeds the configured threshold. When dictionary encoding is turned off, the length of the
- // dictionary stream for the column will be 0 in the ORC file dump (worked numbers follow this
- // test).
- @Test
- public void testDictionaryThreshold() throws Exception {
- TypeDescription schema = getMyRecordType();
- Configuration conf = new Configuration();
- conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
- conf.setFloat(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.49f);
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.ZLIB)
- .rowIndexStride(1000)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch(1000);
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- int nextInt = 0;
- for(int i=0; i < 21000; ++i) {
- // Write out the same string twice; this guarantees the fraction of rows with
- // distinct strings is 0.5.
- if (i % 2 == 0) {
- nextInt = r1.nextInt(words.length);
- // Append the value of i to the word; this guarantees that when an index or word
- // is repeated, the actual string is unique.
- words[nextInt] += "-" + i;
- }
- appendMyRecord(batch, r1.nextInt(), r1.nextLong(), words[nextInt]);
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- }
- if (batch.size != 0) {
- writer.addRowBatch(batch);
- }
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump-dictionary-threshold.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=1,2,3"});
- System.out.flush();
- System.setOut(origOut);
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
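// Worked numbers (a sketch, not part of this diff): the loop above writes 21,000 rows and
// mints a fresh string on every even iteration, so roughly 10,500 of the 21,000 values
// (a ratio of 0.5) are distinct. Since 0.5 exceeds the 0.49 configured for
// DICTIONARY_KEY_SIZE_THRESHOLD, the writer falls back to direct encoding and the dump is
// expected to report a dictionary stream length of 0 for that column.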
-
- @Test
- public void testBloomFilter() throws Exception {
- TypeDescription schema = getMyRecordType();
- conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
- OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.ZLIB)
- .bufferSize(10000)
- .rowIndexStride(1000)
- .bloomFilterColumns("S");
- Writer writer = OrcFile.createWriter(testFilePath, options);
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- VectorizedRowBatch batch = schema.createRowBatch(1000);
- for(int i=0; i < 21000; ++i) {
- appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
- words[r1.nextInt(words.length)]);
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- }
- if (batch.size > 0) {
- writer.addRowBatch(batch);
- }
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump-bloomfilter.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=3"});
- System.out.flush();
- System.setOut(origOut);
-
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-
- @Test
- public void testBloomFilter2() throws Exception {
- TypeDescription schema = getMyRecordType();
- conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
- OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.ZLIB)
- .bufferSize(10000)
- .rowIndexStride(1000)
- .bloomFilterColumns("l")
- .bloomFilterFpp(0.01);
- VectorizedRowBatch batch = schema.createRowBatch(1000);
- Writer writer = OrcFile.createWriter(testFilePath, options);
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- for(int i=0; i < 21000; ++i) {
- appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
- words[r1.nextInt(words.length)]);
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- }
- if (batch.size > 0) {
- writer.addRowBatch(batch);
- }
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump-bloomfilter2.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=2"});
- System.out.flush();
- System.setOut(origOut);
-
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java b/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java
deleted file mode 100644
index a514824..0000000
--- a/orc/src/test/org/apache/orc/tools/TestJsonFileDump.java
+++ /dev/null
@@ -1,150 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.tools;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.PrintStream;
-import java.net.URL;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcConf;
-import org.apache.orc.OrcFile;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestJsonFileDump {
- public static String getFileFromClasspath(String name) {
- URL url = ClassLoader.getSystemResource(name);
- if (url == null) {
- throw new IllegalArgumentException("Could not find " + name);
- }
- return url.getPath();
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir"));
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Before
- public void openFileSystem () throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- fs.setWorkingDirectory(workDir);
- testFilePath = new Path("TestFileDump.testDump.orc");
- fs.delete(testFilePath, false);
- }
-
- static void checkOutput(String expected,
- String actual) throws Exception {
- BufferedReader eStream =
- new BufferedReader(new FileReader(getFileFromClasspath(expected)));
- BufferedReader aStream =
- new BufferedReader(new FileReader(actual));
- String expectedLine = eStream.readLine();
- while (expectedLine != null) {
- String actualLine = aStream.readLine();
- System.out.println("actual: " + actualLine);
- System.out.println("expected: " + expectedLine);
- assertEquals(expectedLine, actualLine);
- expectedLine = eStream.readLine();
- }
- assertNull(eStream.readLine());
- assertNull(aStream.readLine());
- }
-
- @Test
- public void testJsonDump() throws Exception {
- TypeDescription schema = TypeDescription.createStruct()
- .addField("i", TypeDescription.createInt())
- .addField("l", TypeDescription.createLong())
- .addField("s", TypeDescription.createString());
- conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
- OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
- .fileSystem(fs)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.ZLIB)
- .bufferSize(10000)
- .rowIndexStride(1000)
- .bloomFilterColumns("s");
- Writer writer = OrcFile.createWriter(testFilePath, options);
- Random r1 = new Random(1);
- String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
- "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
- "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
- "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
- "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
- "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
- "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
- "we", "had", "everything", "before", "us,", "we", "had", "nothing",
- "before", "us,", "we", "were", "all", "going", "direct", "to",
- "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
- "way"};
- VectorizedRowBatch batch = schema.createRowBatch(1000);
- for(int i=0; i < 21000; ++i) {
- ((LongColumnVector) batch.cols[0]).vector[batch.size] = r1.nextInt();
- ((LongColumnVector) batch.cols[1]).vector[batch.size] = r1.nextLong();
- if (i % 100 == 0) {
- batch.cols[2].noNulls = false;
- batch.cols[2].isNull[batch.size] = true;
- } else {
- ((BytesColumnVector) batch.cols[2]).setVal(batch.size,
- words[r1.nextInt(words.length)].getBytes());
- }
- batch.size += 1;
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- }
- if (batch.size > 0) {
- writer.addRowBatch(batch);
- }
-
- writer.close();
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-dump.json";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "-j", "-p", "--rowindex=3"});
- System.out.flush();
- System.setOut(origOut);
-
-
- checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-}
[26/37] hive git commit: HIVE-17118. Move the hive-orc source files to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/TreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/TreeReaderFactory.java b/orc/src/java/org/apache/hive/orc/impl/TreeReaderFactory.java
new file mode 100644
index 0000000..d53ef34
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/TreeReaderFactory.java
@@ -0,0 +1,2162 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TimeZone;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.OrcProto;
+
+/**
+ * Factory for creating ORC tree readers.
+ */
+public class TreeReaderFactory {
+
+ public abstract static class TreeReader {
+ protected final int columnId;
+ protected BitFieldReader present = null;
+ protected boolean valuePresent = false;
+ protected int vectorColumnCount;
+
+ TreeReader(int columnId) throws IOException {
+ this(columnId, null);
+ }
+
+ protected TreeReader(int columnId, InStream in) throws IOException {
+ this.columnId = columnId;
+ if (in == null) {
+ present = null;
+ valuePresent = true;
+ } else {
+ present = new BitFieldReader(in, 1);
+ }
+ vectorColumnCount = -1;
+ }
+
+ void setVectorColumnCount(int vectorColumnCount) {
+ this.vectorColumnCount = vectorColumnCount;
+ }
+
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ protected static IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind,
+ InStream in,
+ boolean signed, boolean skipCorrupt) throws IOException {
+ switch (kind) {
+ case DIRECT_V2:
+ case DICTIONARY_V2:
+ return new RunLengthIntegerReaderV2(in, signed, skipCorrupt);
+ case DIRECT:
+ case DICTIONARY:
+ return new RunLengthIntegerReader(in, signed);
+ default:
+ throw new IllegalArgumentException("Unknown encoding " + kind);
+ }
+ }
+
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ checkEncoding(stripeFooter.getColumnsList().get(columnId));
+ InStream in = streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.PRESENT));
+ if (in == null) {
+ present = null;
+ valuePresent = true;
+ } else {
+ present = new BitFieldReader(in, 1);
+ }
+ }
+
+ /**
+ * Seek to the given position.
+ *
+ * @param index the indexes loaded from the file
+ * @throws IOException
+ */
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ public void seek(PositionProvider index) throws IOException {
+ if (present != null) {
+ present.seek(index);
+ }
+ }
+
+ protected long countNonNulls(long rows) throws IOException {
+ if (present != null) {
+ long result = 0;
+ for (long c = 0; c < rows; ++c) {
+ if (present.next() == 1) {
+ result += 1;
+ }
+ }
+ return result;
+ } else {
+ return rows;
+ }
+ }
+
+ abstract void skipRows(long rows) throws IOException;
+
+ /**
+ * Called at the top level to read into the given batch.
+ * @param batch the batch to read into
+ * @param batchSize the number of rows to read
+ * @throws IOException
+ */
+ public void nextBatch(VectorizedRowBatch batch,
+ int batchSize) throws IOException {
+ batch.cols[0].reset();
+ batch.cols[0].ensureSize(batchSize, false);
+ nextVector(batch.cols[0], null, batchSize);
+ }
+
+ /**
+ * Populates the isNull vector array in the previousVector object based on
+ * the present stream values. This function is called from all the child
+ * readers, and they all set the values based on the isNull field values.
+ *
+ * @param previous The ColumnVector object whose isNull values are populated
+ * @param isNull Whether each value was null at a higher level. If
+ * isNull is null, all values are non-null.
+ * @param batchSize Size of the column vector
+ * @throws IOException
+ */
+ // TODO: it looks like isNull is never used; it's always null. Remove/deprecate?
+ public void nextVector(ColumnVector previous,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (present != null || isNull != null) {
+ // Set noNulls and isNull vector of the ColumnVector based on
+ // present stream
+ previous.noNulls = true;
+ boolean allNull = true;
+ for (int i = 0; i < batchSize; i++) {
+ if (isNull == null || !isNull[i]) {
+ if (present != null && present.next() != 1) {
+ previous.noNulls = false;
+ previous.isNull[i] = true;
+ } else {
+ previous.isNull[i] = false;
+ allNull = false;
+ }
+ } else {
+ previous.noNulls = false;
+ previous.isNull[i] = true;
+ }
+ }
+ previous.isRepeating = !previous.noNulls && allNull;
+ } else {
+ // There is no present stream, which means that all the values are
+ // present.
+ previous.noNulls = true;
+ for (int i = 0; i < batchSize; i++) {
+ previous.isNull[i] = false;
+ }
+ }
+ }
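// Worked example (a sketch, not part of this diff): with a PRESENT stream yielding the bits
// 1, 0, 1 for batchSize = 3 and a null isNull argument, the loop above produces
// previous.isNull = {false, true, false}, previous.noNulls = false and
// previous.isRepeating = false; with no PRESENT stream at all, every row is marked
// non-null and noNulls stays true.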
+
+ public BitFieldReader getPresent() {
+ return present;
+ }
+
+ public int getColumnId() {
+ return columnId;
+ }
+ }
+
+ public static class NullTreeReader extends TreeReader {
+
+ public NullTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ @Override
+ public void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter footer) {
+ // PASS
+ }
+
+ @Override
+ void skipRows(long rows) {
+ // PASS
+ }
+
+ @Override
+ public void seek(PositionProvider position) {
+ // PASS
+ }
+
+ @Override
+ public void seek(PositionProvider[] position) {
+ // PASS
+ }
+
+ @Override
+ public void nextVector(ColumnVector vector, boolean[] isNull, int size) {
+ vector.noNulls = false;
+ vector.isNull[0] = true;
+ vector.isRepeating = true;
+ }
+ }
+
+ public static class BooleanTreeReader extends TreeReader {
+ protected BitFieldReader reader = null;
+
+ BooleanTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected BooleanTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ if (data != null) {
+ reader = new BitFieldReader(data, 1);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ reader = new BitFieldReader(streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)), 1);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, batchSize);
+ }
+ }
+
+ public static class ByteTreeReader extends TreeReader {
+ protected RunLengthByteReader reader = null;
+
+ ByteTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected ByteTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.reader = new RunLengthByteReader(data);
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ reader = new RunLengthByteReader(streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)));
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class ShortTreeReader extends TreeReader {
+ protected IntegerReader reader = null;
+
+ ShortTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected ShortTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class IntTreeReader extends TreeReader {
+ protected IntegerReader reader = null;
+
+ IntTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected IntTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class LongTreeReader extends TreeReader {
+ protected IntegerReader reader = null;
+
+ LongTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ this(columnId, null, null, null, skipCorrupt);
+ }
+
+ protected LongTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding,
+ boolean skipCorrupt)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.reader = createIntegerReader(encoding.getKind(), data, true, skipCorrupt);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class FloatTreeReader extends TreeReader {
+ protected InStream stream;
+ private final SerializationUtils utils;
+
+ FloatTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected FloatTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.utils = new SerializationUtils();
+ this.stream = data;
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final DoubleColumnVector result = (DoubleColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ final boolean hasNulls = !result.noNulls;
+ boolean allNulls = hasNulls;
+
+ if (hasNulls) {
+ // loop conditions arranged so array bounds checks can be skipped
+ for (int i = 0; batchSize <= result.isNull.length && i < batchSize; i++) {
+ allNulls = allNulls & result.isNull[i];
+ }
+ if (allNulls) {
+ result.vector[0] = Double.NaN;
+ result.isRepeating = true;
+ } else {
+ // some nulls
+ result.isRepeating = false;
+ // loop conditions arranged so array bounds checks can be skipped
+ for (int i = 0; batchSize <= result.isNull.length
+ && batchSize <= result.vector.length && i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ result.vector[i] = utils.readFloat(stream);
+ } else {
+ // If the value is not present then set NaN
+ result.vector[i] = Double.NaN;
+ }
+ }
+ }
+ } else {
+ // no nulls & > 1 row (check repeating)
+ boolean repeating = (batchSize > 1);
+ final float f1 = utils.readFloat(stream);
+ result.vector[0] = f1;
+ // loop conditions arranged so array bounds checks can be skipped
+ for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
+ final float f2 = utils.readFloat(stream);
+ repeating = repeating && (f1 == f2);
+ result.vector[i] = f2;
+ }
+ result.isRepeating = repeating;
+ }
+ }
+
+ @Override
+ protected void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ for (int i = 0; i < items; ++i) {
+ utils.readFloat(stream);
+ }
+ }
+ }
+
+ public static class DoubleTreeReader extends TreeReader {
+ protected InStream stream;
+ private final SerializationUtils utils;
+
+ DoubleTreeReader(int columnId) throws IOException {
+ this(columnId, null, null);
+ }
+
+ protected DoubleTreeReader(int columnId, InStream present, InStream data) throws IOException {
+ super(columnId, present);
+ this.utils = new SerializationUtils();
+ this.stream = data;
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name =
+ new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final DoubleColumnVector result = (DoubleColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ final boolean hasNulls = !result.noNulls;
+ boolean allNulls = hasNulls;
+
+ if (hasNulls) {
+ // loop conditions arranged so array bounds checks can be skipped
+ for (int i = 0; i < batchSize && batchSize <= result.isNull.length; i++) {
+ allNulls = allNulls & result.isNull[i];
+ }
+ if (allNulls) {
+ result.vector[0] = Double.NaN;
+ result.isRepeating = true;
+ } else {
+ // some nulls
+ result.isRepeating = false;
+ // loop conditions arranged so array bounds checks can be skipped
+ for (int i = 0; batchSize <= result.isNull.length
+ && batchSize <= result.vector.length && i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ result.vector[i] = utils.readDouble(stream);
+ } else {
+ // If the value is not present then set NaN
+ result.vector[i] = Double.NaN;
+ }
+ }
+ }
+ } else {
+ // no nulls
+ boolean repeating = (batchSize > 1);
+ final double d1 = utils.readDouble(stream);
+ result.vector[0] = d1;
+ // loop conditions arranged so array bounds checks can be skipped
+ for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
+ final double d2 = utils.readDouble(stream);
+ repeating = repeating && (d1 == d2);
+ result.vector[i] = d2;
+ }
+ result.isRepeating = repeating;
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long len = items * 8;
+ while (len > 0) {
+ len -= stream.skip(len);
+ }
+ }
+ }
+
+ public static class BinaryTreeReader extends TreeReader {
+ protected InStream stream;
+ protected IntegerReader lengths = null;
+ protected final LongColumnVector scratchlcv;
+
+ BinaryTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null);
+ }
+
+ protected BinaryTreeReader(int columnId, InStream present, InStream data, InStream length,
+ OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ scratchlcv = new LongColumnVector();
+ this.stream = data;
+ if (length != null && encoding != null) {
+ checkEncoding(encoding);
+ this.lengths = createIntegerReader(encoding.getKind(), length, false, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), false, false);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ lengths.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final BytesColumnVector result = (BytesColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ BytesColumnVectorUtil.readOrcByteArrays(stream, lengths, scratchlcv, result, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
+ for (int i = 0; i < items; ++i) {
+ lengthToSkip += lengths.next();
+ }
+ while (lengthToSkip > 0) {
+ lengthToSkip -= stream.skip(lengthToSkip);
+ }
+ }
+ }
+
+ public static class TimestampTreeReader extends TreeReader {
+ protected IntegerReader data = null;
+ protected IntegerReader nanos = null;
+ private final boolean skipCorrupt;
+ private Map<String, Long> baseTimestampMap;
+ protected long base_timestamp;
+ private final TimeZone readerTimeZone;
+ private TimeZone writerTimeZone;
+ private boolean hasSameTZRules;
+
+ TimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ this(columnId, null, null, null, null, skipCorrupt, null);
+ }
+
+ protected TimestampTreeReader(int columnId, InStream presentStream, InStream dataStream,
+ InStream nanosStream, OrcProto.ColumnEncoding encoding, boolean skipCorrupt, String writerTimezone)
+ throws IOException {
+ super(columnId, presentStream);
+ this.skipCorrupt = skipCorrupt;
+ this.baseTimestampMap = new HashMap<>();
+ this.readerTimeZone = TimeZone.getDefault();
+ this.writerTimeZone = readerTimeZone;
+ this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
+ this.base_timestamp = getBaseTimestamp(readerTimeZone.getID());
+ if (encoding != null) {
+ checkEncoding(encoding);
+
+ if (dataStream != null) {
+ this.data = createIntegerReader(encoding.getKind(), dataStream, true, skipCorrupt);
+ }
+
+ if (nanosStream != null) {
+ this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt);
+ }
+ base_timestamp = getBaseTimestamp(writerTimezone);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ data = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)), true, skipCorrupt);
+ nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt);
+ base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone());
+ }
+
+ protected long getBaseTimestamp(String timeZoneId) throws IOException {
+ // to make sure new readers read old files in the same way
+ if (timeZoneId == null || timeZoneId.isEmpty()) {
+ timeZoneId = readerTimeZone.getID();
+ }
+
+ if (!baseTimestampMap.containsKey(timeZoneId)) {
+ writerTimeZone = TimeZone.getTimeZone(timeZoneId);
+ hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ sdf.setTimeZone(writerTimeZone);
+ try {
+ long epoch =
+ sdf.parse(WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND;
+ baseTimestampMap.put(timeZoneId, epoch);
+ return epoch;
+ } catch (ParseException e) {
+ throw new IOException("Unable to create base timestamp", e);
+ } finally {
+ sdf.setTimeZone(readerTimeZone);
+ }
+ }
+
+ return baseTimestampMap.get(timeZoneId);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ data.seek(index);
+ nanos.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ TimestampColumnVector result = (TimestampColumnVector) previousVector;
+ super.nextVector(previousVector, isNull, batchSize);
+
+ for (int i = 0; i < batchSize; i++) {
+ if (result.noNulls || !result.isNull[i]) {
+ long millis = data.next() + base_timestamp;
+ int newNanos = parseNanos(nanos.next());
+ if (millis < 0 && newNanos != 0) {
+ millis -= 1;
+ }
+ millis *= WriterImpl.MILLIS_PER_SECOND;
+ long offset = 0;
+ // If the reader and writer time zones have different rules, adjust by the offset
+ // difference between them, taking daylight saving time into account.
+ if (!hasSameTZRules) {
+ offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis);
+ }
+ long adjustedMillis = millis + offset;
+ // The reader time zone's offset may be different at the adjusted instant (for
+ // example, when the adjustment crosses a DST boundary). If so, recompute the
+ // offset at adjustedMillis and apply that instead.
+ if (!hasSameTZRules &&
+ (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) {
+ long newOffset =
+ writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis);
+ adjustedMillis = millis + newOffset;
+ }
+ result.time[i] = adjustedMillis;
+ result.nanos[i] = newNanos;
+ if (result.isRepeating && i != 0 &&
+ (result.time[0] != result.time[i] ||
+ result.nanos[0] != result.nanos[i])) {
+ result.isRepeating = false;
+ }
+ }
+ }
+ }
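// Illustrative sketch of the adjustment above (assumed zone ids; not part of this diff):
//
//   TimeZone writerTZ = TimeZone.getTimeZone("America/Los_Angeles");
//   TimeZone readerTZ = TimeZone.getTimeZone("UTC");
//   long offset = writerTZ.getOffset(millis) - readerTZ.getOffset(millis);
//   long adjusted = millis + offset;   // recomputed once if the reader's offset differs
//                                      // at 'adjusted', e.g. across a DST boundary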
+
+ private static int parseNanos(long serialized) {
+ int zeros = 7 & (int) serialized;
+ int result = (int) (serialized >>> 3);
+ if (zeros != 0) {
+ for (int i = 0; i <= zeros; ++i) {
+ result *= 10;
+ }
+ }
+ return result;
+ }
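// Worked decoding (a sketch, not part of this diff): the low three bits act as a scale
// indicator and the remaining bits hold the significant digits; a non-zero indicator k
// restores k + 1 trailing decimal zeros. For example:
//
//   parseNanos(0L)              == 0           // zero nanoseconds
//   parseNanos((1L << 3) | 5)   == 1_000_000   // 1 scaled by 10^6, i.e. one millisecond
//   parseNanos(123456789L << 3) == 123456789   // indicator 0: digits used as-is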
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ data.skip(items);
+ nanos.skip(items);
+ }
+ }
+
+ public static class DateTreeReader extends TreeReader {
+ protected IntegerReader reader = null;
+
+ DateTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null);
+ }
+
+ protected DateTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ reader = createIntegerReader(encoding.getKind(), data, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), true, false);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final LongColumnVector result = (LongColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ // Read value entries based on isNull entries
+ reader.nextVector(result, result.vector, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+ }
+
+ public static class DecimalTreeReader extends TreeReader {
+ protected InStream valueStream;
+ protected IntegerReader scaleReader = null;
+ private int[] scratchScaleVector;
+ private byte[] scratchBytes;
+
+ private final int precision;
+ private final int scale;
+
+ DecimalTreeReader(int columnId, int precision, int scale) throws IOException {
+ this(columnId, precision, scale, null, null, null, null);
+ }
+
+ protected DecimalTreeReader(int columnId, int precision, int scale, InStream present,
+ InStream valueStream, InStream scaleStream, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ this.precision = precision;
+ this.scale = scale;
+ this.scratchScaleVector = new int[VectorizedRowBatch.DEFAULT_SIZE];
+ this.valueStream = valueStream;
+ this.scratchBytes = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_SERIALIZATION_UTILS_READ];
+ if (scaleStream != null && encoding != null) {
+ checkEncoding(encoding);
+ this.scaleReader = createIntegerReader(encoding.getKind(), scaleStream, true, false);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ valueStream = streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA));
+ scaleReader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId, OrcProto.Stream.Kind.SECONDARY)), true, false);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ valueStream.seek(index);
+ scaleReader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final DecimalColumnVector result = (DecimalColumnVector) previousVector;
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ if (batchSize > scratchScaleVector.length) {
+ scratchScaleVector = new int[(int) batchSize];
+ }
+ // read the scales
+ scaleReader.nextVector(result, scratchScaleVector, batchSize);
+ // Read value entries based on isNull entries
+ // Use the fast ORC deserialization method that emulates SerializationUtils.readBigInteger
+ // provided by HiveDecimalWritable.
+ HiveDecimalWritable[] vector = result.vector;
+ HiveDecimalWritable decWritable;
+ if (result.noNulls) {
+ for (int r=0; r < batchSize; ++r) {
+ decWritable = vector[r];
+ if (!decWritable.serializationUtilsRead(
+ valueStream, scratchScaleVector[r],
+ scratchBytes)) {
+ result.isNull[r] = true;
+ result.noNulls = false;
+ }
+ }
+ } else if (!result.isRepeating || !result.isNull[0]) {
+ for (int r=0; r < batchSize; ++r) {
+ if (!result.isNull[r]) {
+ decWritable = vector[r];
+ if (!decWritable.serializationUtilsRead(
+ valueStream, scratchScaleVector[r],
+ scratchBytes)) {
+ result.isNull[r] = true;
+ result.noNulls = false;
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ HiveDecimalWritable scratchDecWritable = new HiveDecimalWritable();
+ for (int i = 0; i < items; i++) {
+ scratchDecWritable.serializationUtilsRead(valueStream, 0, scratchBytes);
+ }
+ scaleReader.skip(items);
+ }
+ }
+
+ /**
+ * A tree reader that will read string columns. At the start of the
+ * stripe, it creates an internal reader based on whether a direct or
+ * dictionary encoding was used.
+ */
+ public static class StringTreeReader extends TreeReader {
+ protected TreeReader reader;
+
+ StringTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ protected StringTreeReader(int columnId, InStream present, InStream data, InStream length,
+ InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present);
+ if (encoding != null) {
+ switch (encoding.getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId, present, data, length,
+ encoding.getKind());
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary,
+ encoding);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ encoding.getKind());
+ }
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ reader.checkEncoding(encoding);
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ // For each stripe, check the encoding and initialize the appropriate
+ // reader
+ switch (stripeFooter.getColumnsList().get(columnId).getKind()) {
+ case DIRECT:
+ case DIRECT_V2:
+ reader = new StringDirectTreeReader(columnId);
+ break;
+ case DICTIONARY:
+ case DICTIONARY_V2:
+ reader = new StringDictionaryTreeReader(columnId);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported encoding " +
+ stripeFooter.getColumnsList().get(columnId).getKind());
+ }
+ reader.startStripe(streams, stripeFooter);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ reader.nextVector(previousVector, isNull, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skipRows(items);
+ }
+ }
+
+ private static org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(TreeReaderFactory.class);
+ // This class collects the closely related methods for reading an ORC vector of byte arrays
+ // and populating the BytesColumnVector.
+ //
+ public static class BytesColumnVectorUtil {
+
+ private static byte[] commonReadByteArrays(InStream stream, IntegerReader lengths,
+ LongColumnVector scratchlcv,
+ BytesColumnVector result, final int batchSize) throws IOException {
+ // Read lengths
+ scratchlcv.isNull = result.isNull; // Notice we are replacing the isNull vector here...
+ scratchlcv.ensureSize(batchSize, false);
+ lengths.nextVector(scratchlcv, scratchlcv.vector, batchSize);
+ int totalLength = 0;
+ if (!scratchlcv.isRepeating) {
+ for (int i = 0; i < batchSize; i++) {
+ if (!scratchlcv.isNull[i]) {
+ totalLength += (int) scratchlcv.vector[i];
+ }
+ }
+ } else {
+ if (!scratchlcv.isNull[0]) {
+ totalLength = (int) (batchSize * scratchlcv.vector[0]);
+ }
+ }
+
+ // Read all the strings for this batch
+ byte[] allBytes = new byte[totalLength];
+ int offset = 0;
+ int len = totalLength;
+ while (len > 0) {
+ int bytesRead = stream.read(allBytes, offset, len);
+ if (bytesRead < 0) {
+ throw new EOFException("Can't finish byte read from " + stream);
+ }
+ len -= bytesRead;
+ offset += bytesRead;
+ }
+
+ return allBytes;
+ }
+
+ // This method has the common code for reading in bytes into a BytesColumnVector.
+ public static void readOrcByteArrays(InStream stream,
+ IntegerReader lengths,
+ LongColumnVector scratchlcv,
+ BytesColumnVector result,
+ final int batchSize) throws IOException {
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ byte[] allBytes = commonReadByteArrays(stream, lengths, scratchlcv,
+ result, (int) batchSize);
+
+ // Too expensive to figure out 'repeating' by comparisons.
+ result.isRepeating = false;
+ int offset = 0;
+ if (!scratchlcv.isRepeating) {
+ for (int i = 0; i < batchSize; i++) {
+ if (!scratchlcv.isNull[i]) {
+ result.setRef(i, allBytes, offset, (int) scratchlcv.vector[i]);
+ offset += scratchlcv.vector[i];
+ } else {
+ result.setRef(i, allBytes, 0, 0);
+ }
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!scratchlcv.isNull[i]) {
+ result.setRef(i, allBytes, offset, (int) scratchlcv.vector[0]);
+ offset += scratchlcv.vector[0];
+ } else {
+ result.setRef(i, allBytes, 0, 0);
+ }
+ }
+ }
+ }
+ }
+ }
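// Illustrative sketch (not part of this diff): readOrcByteArrays avoids one allocation per
// value by reading the whole batch into a single backing array and pointing each row at its
// slice, roughly:
//
//   byte[] allBytes = "foobarbaz".getBytes(StandardCharsets.UTF_8);
//   BytesColumnVector col = new BytesColumnVector(3);
//   col.setRef(0, allBytes, 0, 3);   // "foo"
//   col.setRef(1, allBytes, 3, 3);   // "bar"
//   col.setRef(2, allBytes, 6, 3);   // "baz"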
+
+ /**
+ * A reader for string columns that are direct encoded in the current
+ * stripe.
+ */
+ public static class StringDirectTreeReader extends TreeReader {
+ private static final HadoopShims SHIMS = HadoopShims.Factory.get();
+ protected InStream stream;
+ protected HadoopShims.TextReaderShim data;
+ protected IntegerReader lengths;
+ private final LongColumnVector scratchlcv;
+
+ StringDirectTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null);
+ }
+
+ protected StringDirectTreeReader(int columnId, InStream present, InStream data,
+ InStream length, OrcProto.ColumnEncoding.Kind encoding) throws IOException {
+ super(columnId, present);
+ this.scratchlcv = new LongColumnVector();
+ this.stream = data;
+ if (length != null && encoding != null) {
+ this.lengths = createIntegerReader(encoding, length, false, false);
+ this.data = SHIMS.getTextReaderShim(this.stream);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT &&
+ encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA);
+ stream = streams.get(name);
+ data = SHIMS.getTextReaderShim(this.stream);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)),
+ false, false);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ stream.seek(index);
+ // the 'data' shim wraps 'stream', so it needs no separate seek
+ lengths.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final BytesColumnVector result = (BytesColumnVector) previousVector;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ BytesColumnVectorUtil.readOrcByteArrays(stream, lengths, scratchlcv,
+ result, batchSize);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long lengthToSkip = 0;
+ for (int i = 0; i < items; ++i) {
+ lengthToSkip += lengths.next();
+ }
+
+ while (lengthToSkip > 0) {
+ lengthToSkip -= stream.skip(lengthToSkip);
+ }
+ }
+
+ public IntegerReader getLengths() {
+ return lengths;
+ }
+
+ public InStream getStream() {
+ return stream;
+ }
+ }
+
+ /**
+ * A reader for string columns that are dictionary encoded in the current
+ * stripe.
+ */
+ public static class StringDictionaryTreeReader extends TreeReader {
+ private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];
+ private DynamicByteArray dictionaryBuffer;
+ private int[] dictionaryOffsets;
+ protected IntegerReader reader;
+
+ private byte[] dictionaryBufferInBytesCache = null;
+ private final LongColumnVector scratchlcv;
+
+ StringDictionaryTreeReader(int columnId) throws IOException {
+ this(columnId, null, null, null, null, null);
+ }
+
+ protected StringDictionaryTreeReader(int columnId, InStream present, InStream data,
+ InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ super(columnId, present);
+ scratchlcv = new LongColumnVector();
+ if (data != null && encoding != null) {
+ this.reader = createIntegerReader(encoding.getKind(), data, false, false);
+ }
+
+ if (dictionary != null && encoding != null) {
+ readDictionaryStream(dictionary);
+ }
+
+ if (length != null && encoding != null) {
+ readDictionaryLengthStream(length, encoding);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY &&
+ encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+
+ // read the dictionary blob
+ StreamName name = new StreamName(columnId,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ InStream in = streams.get(name);
+ readDictionaryStream(in);
+
+ // read the lengths
+ name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
+ in = streams.get(name);
+ readDictionaryLengthStream(in, stripeFooter.getColumnsList().get(columnId));
+
+ // set up the row reader
+ name = new StreamName(columnId, OrcProto.Stream.Kind.DATA);
+ reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(name), false, false);
+ }
+
+ private void readDictionaryLengthStream(InStream in, OrcProto.ColumnEncoding encoding)
+ throws IOException {
+ int dictionarySize = encoding.getDictionarySize();
+ if (in != null) { // Guard against empty LENGTH stream.
+ IntegerReader lenReader = createIntegerReader(encoding.getKind(), in, false, false);
+ int offset = 0;
+ if (dictionaryOffsets == null ||
+ dictionaryOffsets.length < dictionarySize + 1) {
+ dictionaryOffsets = new int[dictionarySize + 1];
+ }
+ for (int i = 0; i < dictionarySize; ++i) {
+ dictionaryOffsets[i] = offset;
+ offset += (int) lenReader.next();
+ }
+ dictionaryOffsets[dictionarySize] = offset;
+ in.close();
+ }
+
+ }
+
+ private void readDictionaryStream(InStream in) throws IOException {
+ if (in != null) { // Guard against empty dictionary stream.
+ if (in.available() > 0) {
+ dictionaryBuffer = new DynamicByteArray(64, in.available());
+ dictionaryBuffer.readAll(in);
+ // Since it's the start of a stripe, invalidate the cache.
+ dictionaryBufferInBytesCache = null;
+ }
+ in.close();
+ } else {
+ dictionaryBuffer = null;
+ }
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ seek(index[columnId]);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ super.seek(index);
+ reader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ final BytesColumnVector result = (BytesColumnVector) previousVector;
+ int offset;
+ int length;
+
+ // Read present/isNull stream
+ super.nextVector(result, isNull, batchSize);
+
+ if (dictionaryBuffer != null) {
+
+ // Load dictionaryBuffer into cache.
+ if (dictionaryBufferInBytesCache == null) {
+ dictionaryBufferInBytesCache = dictionaryBuffer.get();
+ }
+
+ // Read string offsets
+ scratchlcv.isNull = result.isNull;
+ scratchlcv.ensureSize((int) batchSize, false);
+ reader.nextVector(scratchlcv, scratchlcv.vector, batchSize);
+ if (!scratchlcv.isRepeating) {
+
+ // The vector has non-repeating strings. Iterate through the batch
+ // and set the strings one by one.
+ for (int i = 0; i < batchSize; i++) {
+ if (!scratchlcv.isNull[i]) {
+ offset = dictionaryOffsets[(int) scratchlcv.vector[i]];
+ length = getDictionaryEntryLength((int) scratchlcv.vector[i], offset);
+ result.setRef(i, dictionaryBufferInBytesCache, offset, length);
+ } else {
+ // If the value is null then set offset and length to zero (null string)
+ result.setRef(i, dictionaryBufferInBytesCache, 0, 0);
+ }
+ }
+ } else {
+ // If the value is repeating, just set the first value in the
+ // vector and set the isRepeating flag to true. There is no need to iterate
+ // through and set all the elements to the same value.
+ offset = dictionaryOffsets[(int) scratchlcv.vector[0]];
+ length = getDictionaryEntryLength((int) scratchlcv.vector[0], offset);
+ result.setRef(0, dictionaryBufferInBytesCache, offset, length);
+ }
+ result.isRepeating = scratchlcv.isRepeating;
+ } else {
+ if (dictionaryOffsets == null) {
+ // Entire stripe contains null strings.
+ result.isRepeating = true;
+ result.noNulls = false;
+ result.isNull[0] = true;
+ result.setRef(0, EMPTY_BYTE_ARRAY, 0, 0);
+ } else {
+ // stripe contains nulls and empty strings
+ for (int i = 0; i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ result.setRef(i, EMPTY_BYTE_ARRAY, 0, 0);
+ }
+ }
+ }
+ }
+ }
+
+ int getDictionaryEntryLength(int entry, int offset) {
+ final int length;
+ // If it isn't the last entry, subtract the offsets; otherwise use
+ // the buffer length.
+ if (entry < dictionaryOffsets.length - 1) {
+ length = dictionaryOffsets[entry + 1] - offset;
+ } else {
+ length = dictionaryBuffer.size() - offset;
+ }
+ return length;
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ reader.skip(countNonNulls(items));
+ }
+
+ public IntegerReader getReader() {
+ return reader;
+ }
+ }
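The dictionary reader above keeps every distinct value concatenated in one buffer, plus an offsets array of dictionarySize + 1 entries, so entry i occupies bytes [offsets[i], offsets[i + 1]). A minimal standalone sketch of that lookup, with hypothetical names and data rather than the ORC API:

final class DictionaryLookupSketch {
  // Returns dictionary entry entryId from the concatenated bytes and the offsets array.
  static String entry(byte[] dictionaryBytes, int[] offsets, int entryId) {
    int start = offsets[entryId];
    int length = offsets[entryId + 1] - start;
    return new String(dictionaryBytes, start, length, java.nio.charset.StandardCharsets.UTF_8);
  }

  public static void main(String[] args) {
    byte[] bytes = "applebananacherry".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    int[] offsets = {0, 5, 11, 17};               // "apple", "banana", "cherry"
    System.out.println(entry(bytes, offsets, 1)); // prints "banana"
  }
}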
+
+ public static class CharTreeReader extends StringTreeReader {
+ int maxLength;
+
+ CharTreeReader(int columnId, int maxLength) throws IOException {
+ this(columnId, maxLength, null, null, null, null, null);
+ }
+
+ protected CharTreeReader(int columnId, int maxLength, InStream present, InStream data,
+ InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present, data, length, dictionary, encoding);
+ this.maxLength = maxLength;
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ // Get the vector of strings from StringTreeReader, then make a 2nd pass to
+ // adjust down the length (right trim and truncate) if necessary.
+ super.nextVector(previousVector, isNull, batchSize);
+ BytesColumnVector result = (BytesColumnVector) previousVector;
+ int adjustedDownLen;
+ if (result.isRepeating) {
+ if (result.noNulls || !result.isNull[0]) {
+ adjustedDownLen = StringExpr
+ .rightTrimAndTruncate(result.vector[0], result.start[0], result.length[0], maxLength);
+ if (adjustedDownLen < result.length[0]) {
+ result.setRef(0, result.vector[0], result.start[0], adjustedDownLen);
+ }
+ }
+ } else {
+ if (result.noNulls) {
+ for (int i = 0; i < batchSize; i++) {
+ adjustedDownLen = StringExpr
+ .rightTrimAndTruncate(result.vector[i], result.start[i], result.length[i],
+ maxLength);
+ if (adjustedDownLen < result.length[i]) {
+ result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
+ }
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ adjustedDownLen = StringExpr
+ .rightTrimAndTruncate(result.vector[i], result.start[i], result.length[i],
+ maxLength);
+ if (adjustedDownLen < result.length[i]) {
+ result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ public static class VarcharTreeReader extends StringTreeReader {
+ int maxLength;
+
+ VarcharTreeReader(int columnId, int maxLength) throws IOException {
+ this(columnId, maxLength, null, null, null, null, null);
+ }
+
+ protected VarcharTreeReader(int columnId, int maxLength, InStream present, InStream data,
+ InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
+ super(columnId, present, data, length, dictionary, encoding);
+ this.maxLength = maxLength;
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ // Get the vector of strings from StringTreeReader, then make a 2nd pass to
+ // adjust down the length (truncate) if necessary.
+ super.nextVector(previousVector, isNull, batchSize);
+ BytesColumnVector result = (BytesColumnVector) previousVector;
+
+ int adjustedDownLen;
+ if (result.isRepeating) {
+ if (result.noNulls || !result.isNull[0]) {
+ adjustedDownLen = StringExpr
+ .truncate(result.vector[0], result.start[0], result.length[0], maxLength);
+ if (adjustedDownLen < result.length[0]) {
+ result.setRef(0, result.vector[0], result.start[0], adjustedDownLen);
+ }
+ }
+ } else {
+ if (result.noNulls) {
+ for (int i = 0; i < batchSize; i++) {
+ adjustedDownLen = StringExpr
+ .truncate(result.vector[i], result.start[i], result.length[i], maxLength);
+ if (adjustedDownLen < result.length[i]) {
+ result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
+ }
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!result.isNull[i]) {
+ adjustedDownLen = StringExpr
+ .truncate(result.vector[i], result.start[i], result.length[i], maxLength);
+ if (adjustedDownLen < result.length[i]) {
+ result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ public static class StructTreeReader extends TreeReader {
+ protected final TreeReader[] fields;
+
+ protected StructTreeReader(int columnId,
+ TypeDescription readerSchema,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+
+ List<TypeDescription> childrenTypes = readerSchema.getChildren();
+ this.fields = new TreeReader[childrenTypes.size()];
+ for (int i = 0; i < fields.length; ++i) {
+ TypeDescription subtype = childrenTypes.get(i);
+ this.fields[i] = createTreeReader(subtype, evolution, included, skipCorrupt);
+ }
+ }
+
+ public TreeReader[] getChildReaders() {
+ return fields;
+ }
+
+ protected StructTreeReader(int columnId, InStream present,
+ OrcProto.ColumnEncoding encoding, TreeReader[] childReaders) throws IOException {
+ super(columnId, present);
+ if (encoding != null) {
+ checkEncoding(encoding);
+ }
+ this.fields = childReaders;
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ for (TreeReader kid : fields) {
+ if (kid != null) {
+ kid.seek(index);
+ }
+ }
+ }
+
+ @Override
+ public void nextBatch(VectorizedRowBatch batch,
+ int batchSize) throws IOException {
+ for(int i=0; i < fields.length &&
+ (vectorColumnCount == -1 || i < vectorColumnCount); ++i) {
+ ColumnVector colVector = batch.cols[i];
+ if (colVector != null) {
+ colVector.reset();
+ colVector.ensureSize((int) batchSize, false);
+ fields[i].nextVector(colVector, null, batchSize);
+ }
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ super.nextVector(previousVector, isNull, batchSize);
+ StructColumnVector result = (StructColumnVector) previousVector;
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ result.isRepeating = false;
+
+ // Read all the members of struct as column vectors
+ boolean[] mask = result.noNulls ? null : result.isNull;
+ for (int f = 0; f < fields.length; f++) {
+ if (fields[f] != null) {
+ fields[f].nextVector(result.fields[f], mask, batchSize);
+ }
+ }
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ for (TreeReader field : fields) {
+ if (field != null) {
+ field.startStripe(streams, stripeFooter);
+ }
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ for (TreeReader field : fields) {
+ if (field != null) {
+ field.skipRows(items);
+ }
+ }
+ }
+ }
+
+ public static class UnionTreeReader extends TreeReader {
+ protected final TreeReader[] fields;
+ protected RunLengthByteReader tags;
+
+ protected UnionTreeReader(int fileColumn,
+ TypeDescription readerSchema,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+ super(fileColumn);
+ List<TypeDescription> childrenTypes = readerSchema.getChildren();
+ int fieldCount = childrenTypes.size();
+ this.fields = new TreeReader[fieldCount];
+ for (int i = 0; i < fieldCount; ++i) {
+ TypeDescription subtype = childrenTypes.get(i);
+ this.fields[i] = createTreeReader(subtype, evolution, included, skipCorrupt);
+ }
+ }
+
+ protected UnionTreeReader(int columnId, InStream present,
+ OrcProto.ColumnEncoding encoding, TreeReader[] childReaders) throws IOException {
+ super(columnId, present);
+ if (encoding != null) {
+ checkEncoding(encoding);
+ }
+ this.fields = childReaders;
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ tags.seek(index[columnId]);
+ for (TreeReader kid : fields) {
+ kid.seek(index);
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ UnionColumnVector result = (UnionColumnVector) previousVector;
+ super.nextVector(result, isNull, batchSize);
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ result.isRepeating = false;
+ tags.nextVector(result.noNulls ? null : result.isNull, result.tags,
+ batchSize);
+ boolean[] ignore = new boolean[(int) batchSize];
+ for (int f = 0; f < result.fields.length; ++f) {
+ // build the ignore list for this tag
+ for (int r = 0; r < batchSize; ++r) {
+ ignore[r] = (!result.noNulls && result.isNull[r]) ||
+ result.tags[r] != f;
+ }
+ fields[f].nextVector(result.fields[f], ignore, batchSize);
+ }
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ tags = new RunLengthByteReader(streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.DATA)));
+ for (TreeReader field : fields) {
+ if (field != null) {
+ field.startStripe(streams, stripeFooter);
+ }
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long[] counts = new long[fields.length];
+ for (int i = 0; i < items; ++i) {
+ counts[tags.next()] += 1;
+ }
+ for (int i = 0; i < counts.length; ++i) {
+ fields[i].skipRows(counts[i]);
+ }
+ }
+ }
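UnionTreeReader above reads a tag per row and then asks each child reader only for the rows carrying its tag; every other row is passed in as an ignore mask. A standalone sketch of building that mask, with hypothetical names rather than the ORC API:

static boolean[] ignoreMaskForTag(int[] tags, boolean[] isNull, int batchSize, int f) {
  boolean[] ignore = new boolean[batchSize];
  for (int r = 0; r < batchSize; ++r) {
    // Ignore the row if it is null at the union level or carries a different tag.
    ignore[r] = (isNull != null && isNull[r]) || tags[r] != f;
  }
  return ignore;
}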
+
+ public static class ListTreeReader extends TreeReader {
+ protected final TreeReader elementReader;
+ protected IntegerReader lengths = null;
+
+ protected ListTreeReader(int fileColumn,
+ TypeDescription readerSchema,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+ super(fileColumn);
+ TypeDescription elementType = readerSchema.getChildren().get(0);
+ elementReader = createTreeReader(elementType, evolution, included,
+ skipCorrupt);
+ }
+
+ protected ListTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding, TreeReader elementReader) throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.lengths = createIntegerReader(encoding.getKind(), data, false, false);
+ }
+ this.elementReader = elementReader;
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ lengths.seek(index[columnId]);
+ elementReader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previous,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ ListColumnVector result = (ListColumnVector) previous;
+ super.nextVector(result, isNull, batchSize);
+ // if we have some non-null values, then read them
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ lengths.nextVector(result, result.lengths, batchSize);
+ // even with repeating lengths, the list doesn't repeat
+ result.isRepeating = false;
+ // build the offsets vector and figure out how many children to read
+ result.childCount = 0;
+ for (int r = 0; r < batchSize; ++r) {
+ if (result.noNulls || !result.isNull[r]) {
+ result.offsets[r] = result.childCount;
+ result.childCount += result.lengths[r];
+ }
+ }
+ result.child.ensureSize(result.childCount, false);
+ elementReader.nextVector(result.child, null, result.childCount);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.LENGTH)), false, false);
+ if (elementReader != null) {
+ elementReader.startStripe(streams, stripeFooter);
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long childSkip = 0;
+ for (long i = 0; i < items; ++i) {
+ childSkip += lengths.next();
+ }
+ elementReader.skipRows(childSkip);
+ }
+ }
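ListTreeReader above turns the per-row lengths into the offsets/childCount layout of ListColumnVector: offsets[r] is where row r's children begin and childCount is the total number of child values to read. A standalone sketch of that step, with hypothetical names rather than the ORC API:

static int fillOffsets(long[] lengths, boolean[] isNull, long[] offsets, int batchSize) {
  int childCount = 0;
  for (int r = 0; r < batchSize; ++r) {
    if (isNull == null || !isNull[r]) {
      offsets[r] = childCount;          // children of row r start here
      childCount += (int) lengths[r];   // and occupy lengths[r] slots
    }
  }
  return childCount;                    // how many child values to read next
}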
+
+ public static class MapTreeReader extends TreeReader {
+ protected final TreeReader keyReader;
+ protected final TreeReader valueReader;
+ protected IntegerReader lengths = null;
+
+ protected MapTreeReader(int fileColumn,
+ TypeDescription readerSchema,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+ super(fileColumn);
+ TypeDescription keyType = readerSchema.getChildren().get(0);
+ TypeDescription valueType = readerSchema.getChildren().get(1);
+ keyReader = createTreeReader(keyType, evolution, included, skipCorrupt);
+ valueReader = createTreeReader(valueType, evolution, included, skipCorrupt);
+ }
+
+ protected MapTreeReader(int columnId, InStream present, InStream data,
+ OrcProto.ColumnEncoding encoding, TreeReader keyReader, TreeReader valueReader)
+ throws IOException {
+ super(columnId, present);
+ if (data != null && encoding != null) {
+ checkEncoding(encoding);
+ this.lengths = createIntegerReader(encoding.getKind(), data, false, false);
+ }
+ this.keyReader = keyReader;
+ this.valueReader = valueReader;
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ super.seek(index);
+ lengths.seek(index[columnId]);
+ keyReader.seek(index);
+ valueReader.seek(index);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previous,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ MapColumnVector result = (MapColumnVector) previous;
+ super.nextVector(result, isNull, batchSize);
+ if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
+ lengths.nextVector(result, result.lengths, batchSize);
+ // even with repeating lengths, the map doesn't repeat
+ result.isRepeating = false;
+ // build the offsets vector and figure out how many children to read
+ result.childCount = 0;
+ for (int r = 0; r < batchSize; ++r) {
+ if (result.noNulls || !result.isNull[r]) {
+ result.offsets[r] = result.childCount;
+ result.childCount += result.lengths[r];
+ }
+ }
+ result.keys.ensureSize(result.childCount, false);
+ result.values.ensureSize(result.childCount, false);
+ keyReader.nextVector(result.keys, null, result.childCount);
+ valueReader.nextVector(result.values, null, result.childCount);
+ }
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
+ (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
+ throw new IOException("Unknown encoding " + encoding + " in column " +
+ columnId);
+ }
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ super.startStripe(streams, stripeFooter);
+ lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
+ streams.get(new StreamName(columnId,
+ OrcProto.Stream.Kind.LENGTH)), false, false);
+ if (keyReader != null) {
+ keyReader.startStripe(streams, stripeFooter);
+ }
+ if (valueReader != null) {
+ valueReader.startStripe(streams, stripeFooter);
+ }
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ items = countNonNulls(items);
+ long childSkip = 0;
+ for (long i = 0; i < items; ++i) {
+ childSkip += lengths.next();
+ }
+ keyReader.skipRows(childSkip);
+ valueReader.skipRows(childSkip);
+ }
+ }
+
+ public static TreeReader createTreeReader(TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt
+ ) throws IOException {
+ TypeDescription fileType = evolution.getFileType(readerType);
+ if (fileType == null || !evolution.includeReaderColumn(readerType.getId())){
+ return new NullTreeReader(0);
+ }
+ TypeDescription.Category readerTypeCategory = readerType.getCategory();
+ if (!fileType.equals(readerType) &&
+ (readerTypeCategory != TypeDescription.Category.STRUCT &&
+ readerTypeCategory != TypeDescription.Category.MAP &&
+ readerTypeCategory != TypeDescription.Category.LIST &&
+ readerTypeCategory != TypeDescription.Category.UNION)) {
+ // We only convert complex children.
+ return ConvertTreeReaderFactory.createConvertTreeReader(readerType, evolution,
+ included, skipCorrupt);
+ }
+ switch (readerTypeCategory) {
+ case BOOLEAN:
+ return new BooleanTreeReader(fileType.getId());
+ case BYTE:
+ return new ByteTreeReader(fileType.getId());
+ case DOUBLE:
+ return new DoubleTreeReader(fileType.getId());
+ case FLOAT:
+ return new FloatTreeReader(fileType.getId());
+ case SHORT:
+ return new ShortTreeReader(fileType.getId());
+ case INT:
+ return new IntTreeReader(fileType.getId());
+ case LONG:
+ return new LongTreeReader(fileType.getId(), skipCorrupt);
+ case STRING:
+ return new StringTreeReader(fileType.getId());
+ case CHAR:
+ return new CharTreeReader(fileType.getId(), readerType.getMaxLength());
+ case VARCHAR:
+ return new VarcharTreeReader(fileType.getId(), readerType.getMaxLength());
+ case BINARY:
+ return new BinaryTreeReader(fileType.getId());
+ case TIMESTAMP:
+ return new TimestampTreeReader(fileType.getId(), skipCorrupt);
+ case DATE:
+ return new DateTreeReader(fileType.getId());
+ case DECIMAL:
+ return new DecimalTreeReader(fileType.getId(), readerType.getPrecision(),
+ readerType.getScale());
+ case STRUCT:
+ return new StructTreeReader(fileType.getId(), readerType,
+ evolution, included, skipCorrupt);
+ case LIST:
+ return new ListTreeReader(fileType.getId(), readerType,
+ evolution, included, skipCorrupt);
+ case MAP:
+ return new MapTreeReader(fileType.getId(), readerType, evolution,
+ included, skipCorrupt);
+ case UNION:
+ return new UnionTreeReader(fileType.getId(), readerType,
+ evolution, included, skipCorrupt);
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerTypeCategory);
+ }
+ }
+}
[15/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java b/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
deleted file mode 100644
index eceba17..0000000
--- a/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
+++ /dev/null
@@ -1,2163 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.math.BigInteger;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.TimeZone;
-
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.OrcProto;
-
-/**
- * Factory for creating ORC tree readers.
- */
-public class TreeReaderFactory {
-
- public abstract static class TreeReader {
- protected final int columnId;
- protected BitFieldReader present = null;
- protected boolean valuePresent = false;
- protected int vectorColumnCount;
-
- TreeReader(int columnId) throws IOException {
- this(columnId, null);
- }
-
- protected TreeReader(int columnId, InStream in) throws IOException {
- this.columnId = columnId;
- if (in == null) {
- present = null;
- valuePresent = true;
- } else {
- present = new BitFieldReader(in, 1);
- }
- vectorColumnCount = -1;
- }
-
- void setVectorColumnCount(int vectorColumnCount) {
- this.vectorColumnCount = vectorColumnCount;
- }
-
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- protected static IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind,
- InStream in,
- boolean signed, boolean skipCorrupt) throws IOException {
- switch (kind) {
- case DIRECT_V2:
- case DICTIONARY_V2:
- return new RunLengthIntegerReaderV2(in, signed, skipCorrupt);
- case DIRECT:
- case DICTIONARY:
- return new RunLengthIntegerReader(in, signed);
- default:
- throw new IllegalArgumentException("Unknown encoding " + kind);
- }
- }
-
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- checkEncoding(stripeFooter.getColumnsList().get(columnId));
- InStream in = streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.PRESENT));
- if (in == null) {
- present = null;
- valuePresent = true;
- } else {
- present = new BitFieldReader(in, 1);
- }
- }
-
- /**
- * Seek to the given position.
- *
- * @param index the indexes loaded from the file
- * @throws IOException
- */
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- public void seek(PositionProvider index) throws IOException {
- if (present != null) {
- present.seek(index);
- }
- }
-
- protected long countNonNulls(long rows) throws IOException {
- if (present != null) {
- long result = 0;
- for (long c = 0; c < rows; ++c) {
- if (present.next() == 1) {
- result += 1;
- }
- }
- return result;
- } else {
- return rows;
- }
- }
-
- abstract void skipRows(long rows) throws IOException;
-
- /**
- * Called at the top level to read into the given batch.
- * @param batch the batch to read into
- * @param batchSize the number of rows to read
- * @throws IOException
- */
- public void nextBatch(VectorizedRowBatch batch,
- int batchSize) throws IOException {
- batch.cols[0].reset();
- batch.cols[0].ensureSize(batchSize, false);
- nextVector(batch.cols[0], null, batchSize);
- }
-
- /**
- * Populates the isNull vector array in the previousVector object based on
- * the present stream values. This function is called from all the child
- * readers, and they all set the values based on the isNull field value.
- *
- * @param previous The columnVector object whose isNull value is populated
- * @param isNull Whether each value was null at a higher level. If
- * isNull is null, all values are non-null.
- * @param batchSize Size of the column vector
- * @throws IOException
- */
- // TODO: it looks like isNull is never used; it's always null. Remove/deprecate?
- public void nextVector(ColumnVector previous,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (present != null || isNull != null) {
- // Set noNulls and isNull vector of the ColumnVector based on
- // present stream
- previous.noNulls = true;
- boolean allNull = true;
- for (int i = 0; i < batchSize; i++) {
- if (isNull == null || !isNull[i]) {
- if (present != null && present.next() != 1) {
- previous.noNulls = false;
- previous.isNull[i] = true;
- } else {
- previous.isNull[i] = false;
- allNull = false;
- }
- } else {
- previous.noNulls = false;
- previous.isNull[i] = true;
- }
- }
- previous.isRepeating = !previous.noNulls && allNull;
- } else {
- // There is no present stream, this means that all the values are
- // present.
- previous.noNulls = true;
- for (int i = 0; i < batchSize; i++) {
- previous.isNull[i] = false;
- }
- }
- }
-
- public BitFieldReader getPresent() {
- return present;
- }
-
- public int getColumnId() {
- return columnId;
- }
- }
-
- public static class NullTreeReader extends TreeReader {
-
- public NullTreeReader(int columnId) throws IOException {
- super(columnId);
- }
-
- @Override
- public void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter footer) {
- // PASS
- }
-
- @Override
- void skipRows(long rows) {
- // PASS
- }
-
- @Override
- public void seek(PositionProvider position) {
- // PASS
- }
-
- @Override
- public void seek(PositionProvider[] position) {
- // PASS
- }
-
- @Override
- public void nextVector(ColumnVector vector, boolean[] isNull, int size) {
- vector.noNulls = false;
- vector.isNull[0] = true;
- vector.isRepeating = true;
- }
- }
-
- public static class BooleanTreeReader extends TreeReader {
- protected BitFieldReader reader = null;
-
- BooleanTreeReader(int columnId) throws IOException {
- this(columnId, null, null);
- }
-
- protected BooleanTreeReader(int columnId, InStream present, InStream data) throws IOException {
- super(columnId, present);
- if (data != null) {
- reader = new BitFieldReader(data, 1);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- reader = new BitFieldReader(streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA)), 1);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, batchSize);
- }
- }
-
- public static class ByteTreeReader extends TreeReader {
- protected RunLengthByteReader reader = null;
-
- ByteTreeReader(int columnId) throws IOException {
- this(columnId, null, null);
- }
-
- protected ByteTreeReader(int columnId, InStream present, InStream data) throws IOException {
- super(columnId, present);
- this.reader = new RunLengthByteReader(data);
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- reader = new RunLengthByteReader(streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA)));
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
-
- public static class ShortTreeReader extends TreeReader {
- protected IntegerReader reader = null;
-
- ShortTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null);
- }
-
- protected ShortTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding)
- throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- this.reader = createIntegerReader(encoding.getKind(), data, true, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), true, false);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
-
- public static class IntTreeReader extends TreeReader {
- protected IntegerReader reader = null;
-
- IntTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null);
- }
-
- protected IntTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding)
- throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- this.reader = createIntegerReader(encoding.getKind(), data, true, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), true, false);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
-
- public static class LongTreeReader extends TreeReader {
- protected IntegerReader reader = null;
-
- LongTreeReader(int columnId, boolean skipCorrupt) throws IOException {
- this(columnId, null, null, null, skipCorrupt);
- }
-
- protected LongTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding,
- boolean skipCorrupt)
- throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- this.reader = createIntegerReader(encoding.getKind(), data, true, skipCorrupt);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), true, false);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
-
- public static class FloatTreeReader extends TreeReader {
- protected InStream stream;
- private final SerializationUtils utils;
-
- FloatTreeReader(int columnId) throws IOException {
- this(columnId, null, null);
- }
-
- protected FloatTreeReader(int columnId, InStream present, InStream data) throws IOException {
- super(columnId, present);
- this.utils = new SerializationUtils();
- this.stream = data;
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- stream = streams.get(name);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- stream.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final DoubleColumnVector result = (DoubleColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- final boolean hasNulls = !result.noNulls;
- boolean allNulls = hasNulls;
-
- if (hasNulls) {
- // conditions arranged so bounds checks can be skipped
- for (int i = 0; batchSize <= result.isNull.length && i < batchSize; i++) {
- allNulls = allNulls & result.isNull[i];
- }
- if (allNulls) {
- result.vector[0] = Double.NaN;
- result.isRepeating = true;
- } else {
- // some nulls
- result.isRepeating = false;
- // conditions arranged so bounds checks can be skipped
- for (int i = 0; batchSize <= result.isNull.length
- && batchSize <= result.vector.length && i < batchSize; i++) {
- if (!result.isNull[i]) {
- result.vector[i] = utils.readFloat(stream);
- } else {
- // If the value is not present then set NaN
- result.vector[i] = Double.NaN;
- }
- }
- }
- } else {
- // no nulls & > 1 row (check repeating)
- boolean repeating = (batchSize > 1);
- final float f1 = utils.readFloat(stream);
- result.vector[0] = f1;
- // conditions arranged so bounds checks can be skipped
- for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
- final float f2 = utils.readFloat(stream);
- repeating = repeating && (f1 == f2);
- result.vector[i] = f2;
- }
- result.isRepeating = repeating;
- }
- }
-
- @Override
- protected void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- for (int i = 0; i < items; ++i) {
- utils.readFloat(stream);
- }
- }
- }
-
- public static class DoubleTreeReader extends TreeReader {
- protected InStream stream;
- private final SerializationUtils utils;
-
- DoubleTreeReader(int columnId) throws IOException {
- this(columnId, null, null);
- }
-
- protected DoubleTreeReader(int columnId, InStream present, InStream data) throws IOException {
- super(columnId, present);
- this.utils = new SerializationUtils();
- this.stream = data;
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name =
- new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- stream = streams.get(name);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- stream.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final DoubleColumnVector result = (DoubleColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- final boolean hasNulls = !result.noNulls;
- boolean allNulls = hasNulls;
-
- if (hasNulls) {
- // conditions arranged so bounds checks can be skipped
- for (int i = 0; i < batchSize && batchSize <= result.isNull.length; i++) {
- allNulls = allNulls & result.isNull[i];
- }
- if (allNulls) {
- result.vector[0] = Double.NaN;
- result.isRepeating = true;
- } else {
- // some nulls
- result.isRepeating = false;
- // conditions arranged so bounds checks can be skipped
- for (int i = 0; batchSize <= result.isNull.length
- && batchSize <= result.vector.length && i < batchSize; i++) {
- if (!result.isNull[i]) {
- result.vector[i] = utils.readDouble(stream);
- } else {
- // If the value is not present then set NaN
- result.vector[i] = Double.NaN;
- }
- }
- }
- } else {
- // no nulls
- boolean repeating = (batchSize > 1);
- final double d1 = utils.readDouble(stream);
- result.vector[0] = d1;
- // conditions arranged so bounds checks can be skipped
- for (int i = 1; i < batchSize && batchSize <= result.vector.length; i++) {
- final double d2 = utils.readDouble(stream);
- repeating = repeating && (d1 == d2);
- result.vector[i] = d2;
- }
- result.isRepeating = repeating;
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long len = items * 8;
- while (len > 0) {
- len -= stream.skip(len);
- }
- }
- }
-
- public static class BinaryTreeReader extends TreeReader {
- protected InStream stream;
- protected IntegerReader lengths = null;
- protected final LongColumnVector scratchlcv;
-
- BinaryTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null, null);
- }
-
- protected BinaryTreeReader(int columnId, InStream present, InStream data, InStream length,
- OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present);
- scratchlcv = new LongColumnVector();
- this.stream = data;
- if (length != null && encoding != null) {
- checkEncoding(encoding);
- this.lengths = createIntegerReader(encoding.getKind(), length, false, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- stream = streams.get(name);
- lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)), false, false);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- stream.seek(index);
- lengths.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final BytesColumnVector result = (BytesColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- BytesColumnVectorUtil.readOrcByteArrays(stream, lengths, scratchlcv, result, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long lengthToSkip = 0;
- for (int i = 0; i < items; ++i) {
- lengthToSkip += lengths.next();
- }
- while (lengthToSkip > 0) {
- lengthToSkip -= stream.skip(lengthToSkip);
- }
- }
- }
-
- public static class TimestampTreeReader extends TreeReader {
- protected IntegerReader data = null;
- protected IntegerReader nanos = null;
- private final boolean skipCorrupt;
- private Map<String, Long> baseTimestampMap;
- protected long base_timestamp;
- private final TimeZone readerTimeZone;
- private TimeZone writerTimeZone;
- private boolean hasSameTZRules;
-
- TimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
- this(columnId, null, null, null, null, skipCorrupt, null);
- }
-
- protected TimestampTreeReader(int columnId, InStream presentStream, InStream dataStream,
- InStream nanosStream, OrcProto.ColumnEncoding encoding, boolean skipCorrupt, String writerTimezone)
- throws IOException {
- super(columnId, presentStream);
- this.skipCorrupt = skipCorrupt;
- this.baseTimestampMap = new HashMap<>();
- this.readerTimeZone = TimeZone.getDefault();
- this.writerTimeZone = readerTimeZone;
- this.hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
- this.base_timestamp = getBaseTimestamp(readerTimeZone.getID());
- if (encoding != null) {
- checkEncoding(encoding);
-
- if (dataStream != null) {
- this.data = createIntegerReader(encoding.getKind(), dataStream, true, skipCorrupt);
- }
-
- if (nanosStream != null) {
- this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, skipCorrupt);
- }
- base_timestamp = getBaseTimestamp(writerTimezone);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- data = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA)), true, skipCorrupt);
- nanos = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.SECONDARY)), false, skipCorrupt);
- base_timestamp = getBaseTimestamp(stripeFooter.getWriterTimezone());
- }
-
- protected long getBaseTimestamp(String timeZoneId) throws IOException {
- // to make sure new readers read old files in the same way
- if (timeZoneId == null || timeZoneId.isEmpty()) {
- timeZoneId = readerTimeZone.getID();
- }
-
- if (!baseTimestampMap.containsKey(timeZoneId)) {
- writerTimeZone = TimeZone.getTimeZone(timeZoneId);
- hasSameTZRules = writerTimeZone.hasSameRules(readerTimeZone);
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- sdf.setTimeZone(writerTimeZone);
- try {
- long epoch =
- sdf.parse(WriterImpl.BASE_TIMESTAMP_STRING).getTime() / WriterImpl.MILLIS_PER_SECOND;
- baseTimestampMap.put(timeZoneId, epoch);
- return epoch;
- } catch (ParseException e) {
- throw new IOException("Unable to create base timestamp", e);
- } finally {
- sdf.setTimeZone(readerTimeZone);
- }
- }
-
- return baseTimestampMap.get(timeZoneId);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- data.seek(index);
- nanos.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- TimestampColumnVector result = (TimestampColumnVector) previousVector;
- super.nextVector(previousVector, isNull, batchSize);
-
- for (int i = 0; i < batchSize; i++) {
- if (result.noNulls || !result.isNull[i]) {
- long millis = data.next() + base_timestamp;
- int newNanos = parseNanos(nanos.next());
- if (millis < 0 && newNanos != 0) {
- millis -= 1;
- }
- millis *= WriterImpl.MILLIS_PER_SECOND;
- long offset = 0;
- // If reader and writer time zones have different rules, adjust for the offset
- // difference between reader and writer, taking daylight saving time into account.
- if (!hasSameTZRules) {
- offset = writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(millis);
- }
- long adjustedMillis = millis + offset;
- // The reader time zone offset may be different at adjustedMillis (for example across a
- // daylight saving boundary). To account for that, check whether the reader offset changes
- // after adding adjustedMillis and, if so, use the new offset (the offset at the
- // adjustedMillis point in time).
- if (!hasSameTZRules &&
- (readerTimeZone.getOffset(millis) != readerTimeZone.getOffset(adjustedMillis))) {
- long newOffset =
- writerTimeZone.getOffset(millis) - readerTimeZone.getOffset(adjustedMillis);
- adjustedMillis = millis + newOffset;
- }
- result.time[i] = adjustedMillis;
- result.nanos[i] = newNanos;
- if (result.isRepeating && i != 0 &&
- (result.time[0] != result.time[i] ||
- result.nanos[0] != result.nanos[i])) {
- result.isRepeating = false;
- }
- }
- }
- }
-
- private static int parseNanos(long serialized) {
- int zeros = 7 & (int) serialized;
- int result = (int) (serialized >>> 3);
- if (zeros != 0) {
- for (int i = 0; i <= zeros; ++i) {
- result *= 10;
- }
- }
- return result;
- }
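As a worked example of the nanosecond encoding that parseNanos above decodes (a reading of the code, not a format specification): the low 3 bits hold a trailing-zero count z, the remaining bits hold the significant digits, and a non-zero z means multiply by 10 to the power (z + 1). A standalone sketch:

final class NanosEncodingExample {
  public static void main(String[] args) {
    long serialized = (5L << 3) | 7;   // digits 5 with z = 7, i.e. 5 * 10^8 nanoseconds
    int zeros = 7 & (int) serialized;
    int nanos = (int) (serialized >>> 3);
    for (int i = 0; zeros != 0 && i <= zeros; ++i) {
      nanos *= 10;                     // restore the stripped trailing zeros
    }
    System.out.println(nanos);         // prints 500000000
  }
}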
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- data.skip(items);
- nanos.skip(items);
- }
- }
-
- public static class DateTreeReader extends TreeReader {
- protected IntegerReader reader = null;
-
- DateTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null);
- }
-
- protected DateTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- reader = createIntegerReader(encoding.getKind(), data, true, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), true, false);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final LongColumnVector result = (LongColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- // Read value entries based on isNull entries
- reader.nextVector(result, result.vector, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
- }
-
- public static class DecimalTreeReader extends TreeReader {
- protected InStream valueStream;
- protected IntegerReader scaleReader = null;
- private int[] scratchScaleVector;
- private byte[] scratchBytes;
-
- private final int precision;
- private final int scale;
-
- DecimalTreeReader(int columnId, int precision, int scale) throws IOException {
- this(columnId, precision, scale, null, null, null, null);
- }
-
- protected DecimalTreeReader(int columnId, int precision, int scale, InStream present,
- InStream valueStream, InStream scaleStream, OrcProto.ColumnEncoding encoding)
- throws IOException {
- super(columnId, present);
- this.precision = precision;
- this.scale = scale;
- this.scratchScaleVector = new int[VectorizedRowBatch.DEFAULT_SIZE];
- this.valueStream = valueStream;
- this.scratchBytes = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_SERIALIZATION_UTILS_READ];
- if (scaleStream != null && encoding != null) {
- checkEncoding(encoding);
- this.scaleReader = createIntegerReader(encoding.getKind(), scaleStream, true, false);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- valueStream = streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA));
- scaleReader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId, OrcProto.Stream.Kind.SECONDARY)), true, false);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- valueStream.seek(index);
- scaleReader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final DecimalColumnVector result = (DecimalColumnVector) previousVector;
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- if (batchSize > scratchScaleVector.length) {
- scratchScaleVector = new int[(int) batchSize];
- }
- // read the scales
- scaleReader.nextVector(result, scratchScaleVector, batchSize);
- // Read value entries based on isNull entries
- // Use the fast ORC deserialization method that emulates SerializationUtils.readBigInteger
- // provided by HiveDecimalWritable.
- HiveDecimalWritable[] vector = result.vector;
- HiveDecimalWritable decWritable;
- if (result.noNulls) {
- for (int r=0; r < batchSize; ++r) {
- decWritable = vector[r];
- if (!decWritable.serializationUtilsRead(
- valueStream, scratchScaleVector[r],
- scratchBytes)) {
- result.isNull[r] = true;
- result.noNulls = false;
- }
- }
- } else if (!result.isRepeating || !result.isNull[0]) {
- for (int r=0; r < batchSize; ++r) {
- if (!result.isNull[r]) {
- decWritable = vector[r];
- if (!decWritable.serializationUtilsRead(
- valueStream, scratchScaleVector[r],
- scratchBytes)) {
- result.isNull[r] = true;
- result.noNulls = false;
- }
- }
- }
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- HiveDecimalWritable scratchDecWritable = new HiveDecimalWritable();
- for (int i = 0; i < items; i++) {
- scratchDecWritable.serializationUtilsRead(valueStream, 0, scratchBytes);
- }
- scaleReader.skip(items);
- }
- }
-
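The decimal reader above combines two streams per row: the DATA stream carries the serialized unscaled value (read via HiveDecimalWritable.serializationUtilsRead) while the SECONDARY stream carries the scale. A minimal JDK-only sketch of that pairing, with BigDecimal standing in for HiveDecimalWritable (illustrative only, not the ORC API):

import java.math.BigDecimal;
import java.math.BigInteger;

public class DecimalPairingSketch {
  // Combine an unscaled value (what the DATA stream holds) with a per-row
  // scale (what the SECONDARY stream holds) into a single decimal value.
  static BigDecimal combine(BigInteger unscaled, int scale) {
    return new BigDecimal(unscaled, scale);
  }

  public static void main(String[] args) {
    // Unscaled 1234567 with scale 3 represents 1234.567.
    System.out.println(combine(BigInteger.valueOf(1234567), 3));
  }
}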
- /**
- * A tree reader that will read string columns. At the start of the
- * stripe, it creates an internal reader based on whether a direct or
- * dictionary encoding was used.
- */
- public static class StringTreeReader extends TreeReader {
- protected TreeReader reader;
-
- StringTreeReader(int columnId) throws IOException {
- super(columnId);
- }
-
- protected StringTreeReader(int columnId, InStream present, InStream data, InStream length,
- InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present);
- if (encoding != null) {
- switch (encoding.getKind()) {
- case DIRECT:
- case DIRECT_V2:
- reader = new StringDirectTreeReader(columnId, present, data, length,
- encoding.getKind());
- break;
- case DICTIONARY:
- case DICTIONARY_V2:
- reader = new StringDictionaryTreeReader(columnId, present, data, length, dictionary,
- encoding);
- break;
- default:
- throw new IllegalArgumentException("Unsupported encoding " +
- encoding.getKind());
- }
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- reader.checkEncoding(encoding);
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- // For each stripe, checks the encoding and initializes the appropriate
- // reader
- switch (stripeFooter.getColumnsList().get(columnId).getKind()) {
- case DIRECT:
- case DIRECT_V2:
- reader = new StringDirectTreeReader(columnId);
- break;
- case DICTIONARY:
- case DICTIONARY_V2:
- reader = new StringDictionaryTreeReader(columnId);
- break;
- default:
- throw new IllegalArgumentException("Unsupported encoding " +
- stripeFooter.getColumnsList().get(columnId).getKind());
- }
- reader.startStripe(streams, stripeFooter);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- reader.seek(index);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- reader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- reader.nextVector(previousVector, isNull, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skipRows(items);
- }
- }
-
- private static org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(TreeReaderFactory.class);
- // This class collects together very similar methods for reading an ORC vector of byte arrays and
- // creating the BytesColumnVector.
- //
- public static class BytesColumnVectorUtil {
-
- private static byte[] commonReadByteArrays(InStream stream, IntegerReader lengths,
- LongColumnVector scratchlcv,
- BytesColumnVector result, final int batchSize) throws IOException {
- // Read lengths
- scratchlcv.isNull = result.isNull; // Notice we are replacing the isNull vector here...
- scratchlcv.ensureSize(batchSize, false);
- lengths.nextVector(scratchlcv, scratchlcv.vector, batchSize);
- int totalLength = 0;
- if (!scratchlcv.isRepeating) {
- for (int i = 0; i < batchSize; i++) {
- if (!scratchlcv.isNull[i]) {
- totalLength += (int) scratchlcv.vector[i];
- }
- }
- } else {
- if (!scratchlcv.isNull[0]) {
- totalLength = (int) (batchSize * scratchlcv.vector[0]);
- }
- }
-
- // Read all the strings for this batch
- byte[] allBytes = new byte[totalLength];
- int offset = 0;
- int len = totalLength;
- while (len > 0) {
- int bytesRead = stream.read(allBytes, offset, len);
- if (bytesRead < 0) {
- throw new EOFException("Can't finish byte read from " + stream);
- }
- len -= bytesRead;
- offset += bytesRead;
- }
-
- return allBytes;
- }
-
- // This method has the common code for reading in bytes into a BytesColumnVector.
- public static void readOrcByteArrays(InStream stream,
- IntegerReader lengths,
- LongColumnVector scratchlcv,
- BytesColumnVector result,
- final int batchSize) throws IOException {
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- byte[] allBytes = commonReadByteArrays(stream, lengths, scratchlcv,
- result, (int) batchSize);
-
- // Too expensive to figure out 'repeating' by comparisons.
- result.isRepeating = false;
- int offset = 0;
- if (!scratchlcv.isRepeating) {
- for (int i = 0; i < batchSize; i++) {
- if (!scratchlcv.isNull[i]) {
- result.setRef(i, allBytes, offset, (int) scratchlcv.vector[i]);
- offset += scratchlcv.vector[i];
- } else {
- result.setRef(i, allBytes, 0, 0);
- }
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!scratchlcv.isNull[i]) {
- result.setRef(i, allBytes, offset, (int) scratchlcv.vector[0]);
- offset += scratchlcv.vector[0];
- } else {
- result.setRef(i, allBytes, 0, 0);
- }
- }
- }
- }
- }
- }
-
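commonReadByteArrays above sums the row lengths first, reads every value for the batch into a single byte[] with one allocation, and readOrcByteArrays then hands out (offset, length) slices via setRef instead of copying per row. A standalone sketch of that single-allocation pattern using plain JDK streams (illustrative only, not the ORC classes):

import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

public class SharedBufferSketch {
  public static void main(String[] args) throws IOException {
    int[] lengths = {3, 0, 5};                                   // per-row lengths (LENGTH stream)
    InputStream stream = new ByteArrayInputStream(
        "foohello".getBytes(StandardCharsets.UTF_8));            // concatenated values (DATA stream)

    // 1. Sum the lengths so one buffer covers the whole batch.
    int total = 0;
    for (int len : lengths) {
      total += len;
    }

    // 2. Read everything in one loop, tolerating short reads as the reader does.
    byte[] allBytes = new byte[total];
    int offset = 0;
    while (offset < total) {
      int read = stream.read(allBytes, offset, total - offset);
      if (read < 0) {
        throw new EOFException("Can't finish byte read");
      }
      offset += read;
    }

    // 3. Expose each row as an (offset, length) slice of the shared buffer.
    int pos = 0;
    for (int len : lengths) {
      System.out.println("[" + pos + ", len " + len + ") -> "
          + new String(allBytes, pos, len, StandardCharsets.UTF_8));
      pos += len;
    }
  }
}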
- /**
- * A reader for string columns that are direct encoded in the current
- * stripe.
- */
- public static class StringDirectTreeReader extends TreeReader {
- private static final HadoopShims SHIMS = HadoopShims.Factory.get();
- protected InStream stream;
- protected HadoopShims.TextReaderShim data;
- protected IntegerReader lengths;
- private final LongColumnVector scratchlcv;
-
- StringDirectTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null, null);
- }
-
- protected StringDirectTreeReader(int columnId, InStream present, InStream data,
- InStream length, OrcProto.ColumnEncoding.Kind encoding) throws IOException {
- super(columnId, present);
- this.scratchlcv = new LongColumnVector();
- this.stream = data;
- if (length != null && encoding != null) {
- this.lengths = createIntegerReader(encoding, length, false, false);
- this.data = SHIMS.getTextReaderShim(this.stream);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT &&
- encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DATA);
- stream = streams.get(name);
- data = SHIMS.getTextReaderShim(this.stream);
- lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId, OrcProto.Stream.Kind.LENGTH)),
- false, false);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- stream.seek(index);
- // don't seek data stream
- lengths.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final BytesColumnVector result = (BytesColumnVector) previousVector;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- BytesColumnVectorUtil.readOrcByteArrays(stream, lengths, scratchlcv,
- result, batchSize);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long lengthToSkip = 0;
- for (int i = 0; i < items; ++i) {
- lengthToSkip += lengths.next();
- }
-
- while (lengthToSkip > 0) {
- lengthToSkip -= stream.skip(lengthToSkip);
- }
- }
-
- public IntegerReader getLengths() {
- return lengths;
- }
-
- public InStream getStream() {
- return stream;
- }
- }
-
- /**
- * A reader for string columns that are dictionary encoded in the current
- * stripe.
- */
- public static class StringDictionaryTreeReader extends TreeReader {
- private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];
- private DynamicByteArray dictionaryBuffer;
- private int[] dictionaryOffsets;
- protected IntegerReader reader;
-
- private byte[] dictionaryBufferInBytesCache = null;
- private final LongColumnVector scratchlcv;
-
- StringDictionaryTreeReader(int columnId) throws IOException {
- this(columnId, null, null, null, null, null);
- }
-
- protected StringDictionaryTreeReader(int columnId, InStream present, InStream data,
- InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding)
- throws IOException {
- super(columnId, present);
- scratchlcv = new LongColumnVector();
- if (data != null && encoding != null) {
- this.reader = createIntegerReader(encoding.getKind(), data, false, false);
- }
-
- if (dictionary != null && encoding != null) {
- readDictionaryStream(dictionary);
- }
-
- if (length != null && encoding != null) {
- readDictionaryLengthStream(length, encoding);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY &&
- encoding.getKind() != OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
-
- // read the dictionary blob
- StreamName name = new StreamName(columnId,
- OrcProto.Stream.Kind.DICTIONARY_DATA);
- InStream in = streams.get(name);
- readDictionaryStream(in);
-
- // read the lengths
- name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
- in = streams.get(name);
- readDictionaryLengthStream(in, stripeFooter.getColumnsList().get(columnId));
-
- // set up the row reader
- name = new StreamName(columnId, OrcProto.Stream.Kind.DATA);
- reader = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(name), false, false);
- }
-
- private void readDictionaryLengthStream(InStream in, OrcProto.ColumnEncoding encoding)
- throws IOException {
- int dictionarySize = encoding.getDictionarySize();
- if (in != null) { // Guard against empty LENGTH stream.
- IntegerReader lenReader = createIntegerReader(encoding.getKind(), in, false, false);
- int offset = 0;
- if (dictionaryOffsets == null ||
- dictionaryOffsets.length < dictionarySize + 1) {
- dictionaryOffsets = new int[dictionarySize + 1];
- }
- for (int i = 0; i < dictionarySize; ++i) {
- dictionaryOffsets[i] = offset;
- offset += (int) lenReader.next();
- }
- dictionaryOffsets[dictionarySize] = offset;
- in.close();
- }
-
- }
-
- private void readDictionaryStream(InStream in) throws IOException {
- if (in != null) { // Guard against empty dictionary stream.
- if (in.available() > 0) {
- dictionaryBuffer = new DynamicByteArray(64, in.available());
- dictionaryBuffer.readAll(in);
- // Since it's the start of a stripe, invalidate the cache.
- dictionaryBufferInBytesCache = null;
- }
- in.close();
- } else {
- dictionaryBuffer = null;
- }
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- seek(index[columnId]);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- super.seek(index);
- reader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- final BytesColumnVector result = (BytesColumnVector) previousVector;
- int offset;
- int length;
-
- // Read present/isNull stream
- super.nextVector(result, isNull, batchSize);
-
- if (dictionaryBuffer != null) {
-
- // Load dictionaryBuffer into cache.
- if (dictionaryBufferInBytesCache == null) {
- dictionaryBufferInBytesCache = dictionaryBuffer.get();
- }
-
- // Read string offsets
- scratchlcv.isNull = result.isNull;
- scratchlcv.ensureSize((int) batchSize, false);
- reader.nextVector(scratchlcv, scratchlcv.vector, batchSize);
- if (!scratchlcv.isRepeating) {
-
- // The vector has non-repeating strings. Iterate thru the batch
- // and set strings one by one
- for (int i = 0; i < batchSize; i++) {
- if (!scratchlcv.isNull[i]) {
- offset = dictionaryOffsets[(int) scratchlcv.vector[i]];
- length = getDictionaryEntryLength((int) scratchlcv.vector[i], offset);
- result.setRef(i, dictionaryBufferInBytesCache, offset, length);
- } else {
- // If the value is null then set offset and length to zero (null string)
- result.setRef(i, dictionaryBufferInBytesCache, 0, 0);
- }
- }
- } else {
- // If the value is repeating then just set the first value in the
- // vector and set the isRepeating flag to true. No need to iterate thru and
- // set all the elements to the same value
- offset = dictionaryOffsets[(int) scratchlcv.vector[0]];
- length = getDictionaryEntryLength((int) scratchlcv.vector[0], offset);
- result.setRef(0, dictionaryBufferInBytesCache, offset, length);
- }
- result.isRepeating = scratchlcv.isRepeating;
- } else {
- if (dictionaryOffsets == null) {
- // Entire stripe contains null strings.
- result.isRepeating = true;
- result.noNulls = false;
- result.isNull[0] = true;
- result.setRef(0, EMPTY_BYTE_ARRAY, 0, 0);
- } else {
- // stripe contains nulls and empty strings
- for (int i = 0; i < batchSize; i++) {
- if (!result.isNull[i]) {
- result.setRef(i, EMPTY_BYTE_ARRAY, 0, 0);
- }
- }
- }
- }
- }
-
- int getDictionaryEntryLength(int entry, int offset) {
- final int length;
- // if it isn't the last entry, subtract the offsets otherwise use
- // the buffer length.
- if (entry < dictionaryOffsets.length - 1) {
- length = dictionaryOffsets[entry + 1] - offset;
- } else {
- length = dictionaryBuffer.size() - offset;
- }
- return length;
- }
-
- @Override
- void skipRows(long items) throws IOException {
- reader.skip(countNonNulls(items));
- }
-
- public IntegerReader getReader() {
- return reader;
- }
- }
-
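The dictionary reader above keeps the concatenated entries in one blob plus an offsets array with a trailing sentinel equal to the total size, so entry i occupies bytes [offsets[i], offsets[i+1]) and the row values from the DATA stream are simply indexes into that table. A small self-contained sketch of the lookup (illustrative only, not the ORC classes):

import java.nio.charset.StandardCharsets;

public class DictionaryLookupSketch {
  public static void main(String[] args) {
    // Concatenated dictionary entries "cat", "hat", "in" (DICTIONARY_DATA stream).
    byte[] blob = "cathatin".getBytes(StandardCharsets.UTF_8);
    // Offsets built from the LENGTH stream, with one extra slot holding the total size.
    int[] offsets = {0, 3, 6, 8};

    // Row values from the DATA stream are dictionary indexes.
    int[] rows = {2, 0, 1, 0};
    for (int entry : rows) {
      int start = offsets[entry];
      int length = offsets[entry + 1] - start;   // same idea as getDictionaryEntryLength
      System.out.println(new String(blob, start, length, StandardCharsets.UTF_8));
    }
  }
}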
- public static class CharTreeReader extends StringTreeReader {
- int maxLength;
-
- CharTreeReader(int columnId, int maxLength) throws IOException {
- this(columnId, maxLength, null, null, null, null, null);
- }
-
- protected CharTreeReader(int columnId, int maxLength, InStream present, InStream data,
- InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present, data, length, dictionary, encoding);
- this.maxLength = maxLength;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- // Get the vector of strings from StringTreeReader, then make a 2nd pass to
- // adjust down the length (right trim and truncate) if necessary.
- super.nextVector(previousVector, isNull, batchSize);
- BytesColumnVector result = (BytesColumnVector) previousVector;
- int adjustedDownLen;
- if (result.isRepeating) {
- if (result.noNulls || !result.isNull[0]) {
- adjustedDownLen = StringExpr
- .rightTrimAndTruncate(result.vector[0], result.start[0], result.length[0], maxLength);
- if (adjustedDownLen < result.length[0]) {
- result.setRef(0, result.vector[0], result.start[0], adjustedDownLen);
- }
- }
- } else {
- if (result.noNulls) {
- for (int i = 0; i < batchSize; i++) {
- adjustedDownLen = StringExpr
- .rightTrimAndTruncate(result.vector[i], result.start[i], result.length[i],
- maxLength);
- if (adjustedDownLen < result.length[i]) {
- result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
- }
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!result.isNull[i]) {
- adjustedDownLen = StringExpr
- .rightTrimAndTruncate(result.vector[i], result.start[i], result.length[i],
- maxLength);
- if (adjustedDownLen < result.length[i]) {
- result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
- }
- }
- }
- }
- }
- }
- }
-
- public static class VarcharTreeReader extends StringTreeReader {
- int maxLength;
-
- VarcharTreeReader(int columnId, int maxLength) throws IOException {
- this(columnId, maxLength, null, null, null, null, null);
- }
-
- protected VarcharTreeReader(int columnId, int maxLength, InStream present, InStream data,
- InStream length, InStream dictionary, OrcProto.ColumnEncoding encoding) throws IOException {
- super(columnId, present, data, length, dictionary, encoding);
- this.maxLength = maxLength;
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- // Get the vector of strings from StringTreeReader, then make a 2nd pass to
- // adjust down the length (truncate) if necessary.
- super.nextVector(previousVector, isNull, batchSize);
- BytesColumnVector result = (BytesColumnVector) previousVector;
-
- int adjustedDownLen;
- if (result.isRepeating) {
- if (result.noNulls || !result.isNull[0]) {
- adjustedDownLen = StringExpr
- .truncate(result.vector[0], result.start[0], result.length[0], maxLength);
- if (adjustedDownLen < result.length[0]) {
- result.setRef(0, result.vector[0], result.start[0], adjustedDownLen);
- }
- }
- } else {
- if (result.noNulls) {
- for (int i = 0; i < batchSize; i++) {
- adjustedDownLen = StringExpr
- .truncate(result.vector[i], result.start[i], result.length[i], maxLength);
- if (adjustedDownLen < result.length[i]) {
- result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
- }
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!result.isNull[i]) {
- adjustedDownLen = StringExpr
- .truncate(result.vector[i], result.start[i], result.length[i], maxLength);
- if (adjustedDownLen < result.length[i]) {
- result.setRef(i, result.vector[i], result.start[i], adjustedDownLen);
- }
- }
- }
- }
- }
- }
- }
-
- public static class StructTreeReader extends TreeReader {
- protected final TreeReader[] fields;
-
- protected StructTreeReader(int columnId,
- TypeDescription readerSchema,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
- super(columnId);
-
- List<TypeDescription> childrenTypes = readerSchema.getChildren();
- this.fields = new TreeReader[childrenTypes.size()];
- for (int i = 0; i < fields.length; ++i) {
- TypeDescription subtype = childrenTypes.get(i);
- this.fields[i] = createTreeReader(subtype, evolution, included, skipCorrupt);
- }
- }
-
- public TreeReader[] getChildReaders() {
- return fields;
- }
-
- protected StructTreeReader(int columnId, InStream present,
- OrcProto.ColumnEncoding encoding, TreeReader[] childReaders) throws IOException {
- super(columnId, present);
- if (encoding != null) {
- checkEncoding(encoding);
- }
- this.fields = childReaders;
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- super.seek(index);
- for (TreeReader kid : fields) {
- if (kid != null) {
- kid.seek(index);
- }
- }
- }
-
- @Override
- public void nextBatch(VectorizedRowBatch batch,
- int batchSize) throws IOException {
- for(int i=0; i < fields.length &&
- (vectorColumnCount == -1 || i < vectorColumnCount); ++i) {
- ColumnVector colVector = batch.cols[i];
- if (colVector != null) {
- colVector.reset();
- colVector.ensureSize((int) batchSize, false);
- fields[i].nextVector(colVector, null, batchSize);
- }
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- super.nextVector(previousVector, isNull, batchSize);
- StructColumnVector result = (StructColumnVector) previousVector;
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- result.isRepeating = false;
-
- // Read all the members of struct as column vectors
- boolean[] mask = result.noNulls ? null : result.isNull;
- for (int f = 0; f < fields.length; f++) {
- if (fields[f] != null) {
- fields[f].nextVector(result.fields[f], mask, batchSize);
- }
- }
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- for (TreeReader field : fields) {
- if (field != null) {
- field.startStripe(streams, stripeFooter);
- }
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- for (TreeReader field : fields) {
- if (field != null) {
- field.skipRows(items);
- }
- }
- }
- }
-
- public static class UnionTreeReader extends TreeReader {
- protected final TreeReader[] fields;
- protected RunLengthByteReader tags;
-
- protected UnionTreeReader(int fileColumn,
- TypeDescription readerSchema,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
- super(fileColumn);
- List<TypeDescription> childrenTypes = readerSchema.getChildren();
- int fieldCount = childrenTypes.size();
- this.fields = new TreeReader[fieldCount];
- for (int i = 0; i < fieldCount; ++i) {
- TypeDescription subtype = childrenTypes.get(i);
- this.fields[i] = createTreeReader(subtype, evolution, included, skipCorrupt);
- }
- }
-
- protected UnionTreeReader(int columnId, InStream present,
- OrcProto.ColumnEncoding encoding, TreeReader[] childReaders) throws IOException {
- super(columnId, present);
- if (encoding != null) {
- checkEncoding(encoding);
- }
- this.fields = childReaders;
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- super.seek(index);
- tags.seek(index[columnId]);
- for (TreeReader kid : fields) {
- kid.seek(index);
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- UnionColumnVector result = (UnionColumnVector) previousVector;
- super.nextVector(result, isNull, batchSize);
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- result.isRepeating = false;
- tags.nextVector(result.noNulls ? null : result.isNull, result.tags,
- batchSize);
- boolean[] ignore = new boolean[(int) batchSize];
- for (int f = 0; f < result.fields.length; ++f) {
- // build the ignore list for this tag
- for (int r = 0; r < batchSize; ++r) {
- ignore[r] = (!result.noNulls && result.isNull[r]) ||
- result.tags[r] != f;
- }
- fields[f].nextVector(result.fields[f], ignore, batchSize);
- }
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- tags = new RunLengthByteReader(streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.DATA)));
- for (TreeReader field : fields) {
- if (field != null) {
- field.startStripe(streams, stripeFooter);
- }
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long[] counts = new long[fields.length];
- for (int i = 0; i < items; ++i) {
- counts[tags.next()] += 1;
- }
- for (int i = 0; i < counts.length; ++i) {
- fields[i].skipRows(counts[i]);
- }
- }
- }
-
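The union reader above reads one tag byte per row and then calls every child reader with an ignore mask, so each child sees the full batch length but only fills the rows whose tag matches its field index. A toy version of that masking (illustrative sketch, not the ORC classes):

import java.util.Arrays;

public class UnionTagMaskSketch {
  public static void main(String[] args) {
    int[] tags = {0, 1, 0, 1, 1};        // per-row variant tags (DATA stream)
    int fieldCount = 2;

    for (int f = 0; f < fieldCount; ++f) {
      // Build the ignore list for this field: true means "not my row".
      boolean[] ignore = new boolean[tags.length];
      for (int r = 0; r < tags.length; ++r) {
        ignore[r] = tags[r] != f;
      }
      System.out.println("field " + f + " ignores " + Arrays.toString(ignore));
    }
  }
}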
- public static class ListTreeReader extends TreeReader {
- protected final TreeReader elementReader;
- protected IntegerReader lengths = null;
-
- protected ListTreeReader(int fileColumn,
- TypeDescription readerSchema,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
- super(fileColumn);
- TypeDescription elementType = readerSchema.getChildren().get(0);
- elementReader = createTreeReader(elementType, evolution, included,
- skipCorrupt);
- }
-
- protected ListTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding, TreeReader elementReader) throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- this.lengths = createIntegerReader(encoding.getKind(), data, false, false);
- }
- this.elementReader = elementReader;
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- super.seek(index);
- lengths.seek(index[columnId]);
- elementReader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previous,
- boolean[] isNull,
- final int batchSize) throws IOException {
- ListColumnVector result = (ListColumnVector) previous;
- super.nextVector(result, isNull, batchSize);
- // if we have some non-null values, then read them
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- lengths.nextVector(result, result.lengths, batchSize);
- // even with repeating lengths, the list doesn't repeat
- result.isRepeating = false;
- // build the offsets vector and figure out how many children to read
- result.childCount = 0;
- for (int r = 0; r < batchSize; ++r) {
- if (result.noNulls || !result.isNull[r]) {
- result.offsets[r] = result.childCount;
- result.childCount += result.lengths[r];
- }
- }
- result.child.ensureSize(result.childCount, false);
- elementReader.nextVector(result.child, null, result.childCount);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.LENGTH)), false, false);
- if (elementReader != null) {
- elementReader.startStripe(streams, stripeFooter);
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long childSkip = 0;
- for (long i = 0; i < items; ++i) {
- childSkip += lengths.next();
- }
- elementReader.skipRows(childSkip);
- }
- }
-
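Both the list reader above and the map reader below turn the per-row LENGTH values into a running offsets array plus a childCount, then read all children in a single call. The same accumulation in isolation (illustrative sketch):

import java.util.Arrays;

public class OffsetsFromLengthsSketch {
  public static void main(String[] args) {
    long[] lengths = {2, 0, 3, 1};      // per-row element counts (LENGTH stream)
    long[] offsets = new long[lengths.length];

    // Each row's offset is the running total of the lengths before it.
    long childCount = 0;
    for (int r = 0; r < lengths.length; ++r) {
      offsets[r] = childCount;
      childCount += lengths[r];
    }

    System.out.println("offsets  = " + Arrays.toString(offsets)); // [0, 2, 2, 5]
    System.out.println("children = " + childCount);               // 6
  }
}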
- public static class MapTreeReader extends TreeReader {
- protected final TreeReader keyReader;
- protected final TreeReader valueReader;
- protected IntegerReader lengths = null;
-
- protected MapTreeReader(int fileColumn,
- TypeDescription readerSchema,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
- super(fileColumn);
- TypeDescription keyType = readerSchema.getChildren().get(0);
- TypeDescription valueType = readerSchema.getChildren().get(1);
- keyReader = createTreeReader(keyType, evolution, included, skipCorrupt);
- valueReader = createTreeReader(valueType, evolution, included, skipCorrupt);
- }
-
- protected MapTreeReader(int columnId, InStream present, InStream data,
- OrcProto.ColumnEncoding encoding, TreeReader keyReader, TreeReader valueReader)
- throws IOException {
- super(columnId, present);
- if (data != null && encoding != null) {
- checkEncoding(encoding);
- this.lengths = createIntegerReader(encoding.getKind(), data, false, false);
- }
- this.keyReader = keyReader;
- this.valueReader = valueReader;
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- super.seek(index);
- lengths.seek(index[columnId]);
- keyReader.seek(index);
- valueReader.seek(index);
- }
-
- @Override
- public void nextVector(ColumnVector previous,
- boolean[] isNull,
- final int batchSize) throws IOException {
- MapColumnVector result = (MapColumnVector) previous;
- super.nextVector(result, isNull, batchSize);
- if (result.noNulls || !(result.isRepeating && result.isNull[0])) {
- lengths.nextVector(result, result.lengths, batchSize);
- // even with repeating lengths, the map doesn't repeat
- result.isRepeating = false;
- // build the offsets vector and figure out how many children to read
- result.childCount = 0;
- for (int r = 0; r < batchSize; ++r) {
- if (result.noNulls || !result.isNull[r]) {
- result.offsets[r] = result.childCount;
- result.childCount += result.lengths[r];
- }
- }
- result.keys.ensureSize(result.childCount, false);
- result.values.ensureSize(result.childCount, false);
- keyReader.nextVector(result.keys, null, result.childCount);
- valueReader.nextVector(result.values, null, result.childCount);
- }
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- if ((encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT) &&
- (encoding.getKind() != OrcProto.ColumnEncoding.Kind.DIRECT_V2)) {
- throw new IOException("Unknown encoding " + encoding + " in column " +
- columnId);
- }
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- super.startStripe(streams, stripeFooter);
- lengths = createIntegerReader(stripeFooter.getColumnsList().get(columnId).getKind(),
- streams.get(new StreamName(columnId,
- OrcProto.Stream.Kind.LENGTH)), false, false);
- if (keyReader != null) {
- keyReader.startStripe(streams, stripeFooter);
- }
- if (valueReader != null) {
- valueReader.startStripe(streams, stripeFooter);
- }
- }
-
- @Override
- void skipRows(long items) throws IOException {
- items = countNonNulls(items);
- long childSkip = 0;
- for (long i = 0; i < items; ++i) {
- childSkip += lengths.next();
- }
- keyReader.skipRows(childSkip);
- valueReader.skipRows(childSkip);
- }
- }
-
- public static TreeReader createTreeReader(TypeDescription readerType,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt
- ) throws IOException {
- TypeDescription fileType = evolution.getFileType(readerType);
- if (fileType == null || !evolution.includeReaderColumn(readerType.getId())){
- return new NullTreeReader(0);
- }
- TypeDescription.Category readerTypeCategory = readerType.getCategory();
- if (!fileType.equals(readerType) &&
- (readerTypeCategory != TypeDescription.Category.STRUCT &&
- readerTypeCategory != TypeDescription.Category.MAP &&
- readerTypeCategory != TypeDescription.Category.LIST &&
- readerTypeCategory != TypeDescription.Category.UNION)) {
- // We only convert complex children.
- return ConvertTreeReaderFactory.createConvertTreeReader(readerType, evolution,
- included, skipCorrupt);
- }
- switch (readerTypeCategory) {
- case BOOLEAN:
- return new BooleanTreeReader(fileType.getId());
- case BYTE:
- return new ByteTreeReader(fileType.getId());
- case DOUBLE:
- return new DoubleTreeReader(fileType.getId());
- case FLOAT:
- return new FloatTreeReader(fileType.getId());
- case SHORT:
- return new ShortTreeReader(fileType.getId());
- case INT:
- return new IntTreeReader(fileType.getId());
- case LONG:
- return new LongTreeReader(fileType.getId(), skipCorrupt);
- case STRING:
- return new StringTreeReader(fileType.getId());
- case CHAR:
- return new CharTreeReader(fileType.getId(), readerType.getMaxLength());
- case VARCHAR:
- return new VarcharTreeReader(fileType.getId(), readerType.getMaxLength());
- case BINARY:
- return new BinaryTreeReader(fileType.getId());
- case TIMESTAMP:
- return new TimestampTreeReader(fileType.getId(), skipCorrupt);
- case DATE:
- return new DateTreeReader(fileType.getId());
- case DECIMAL:
- return new DecimalTreeReader(fileType.getId(), readerType.getPrecision(),
- readerType.getScale());
- case STRUCT:
- return new StructTreeReader(fileType.getId(), readerType,
- evolution, included, skipCorrupt);
- case LIST:
- return new ListTreeReader(fileType.getId(), readerType,
- evolution, included, skipCorrupt);
- case MAP:
- return new MapTreeReader(fileType.getId(), readerType, evolution,
- included, skipCorrupt);
- case UNION:
- return new UnionTreeReader(fileType.getId(), readerType,
- evolution, included, skipCorrupt);
- default:
- throw new IllegalArgumentException("Unsupported type " +
- readerTypeCategory);
- }
- }
-}
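createTreeReader above makes a three-way decision: return a NullTreeReader when the column is missing from the file or not included, delegate to ConvertTreeReaderFactory when a non-complex reader type differs from the file type, and otherwise switch on the reader category. A stripped-down sketch of that decision order with hypothetical stand-ins for the ORC types (the real check compares full TypeDescriptions, not just categories):

public class TreeReaderDispatchSketch {
  enum Category { INT, STRING, STRUCT, LIST, MAP, UNION }

  interface Reader {}
  static class NullReader implements Reader {}
  static class ConvertingReader implements Reader {}
  static class DirectReader implements Reader {
    final Category category;
    DirectReader(Category category) { this.category = category; }
  }

  // Hypothetical stand-in for createTreeReader; only the decision order matters here.
  static Reader create(Category fileType, Category readerType, boolean included) {
    if (fileType == null || !included) {
      return new NullReader();                        // column missing or not selected
    }
    boolean complex = readerType == Category.STRUCT || readerType == Category.LIST
        || readerType == Category.MAP || readerType == Category.UNION;
    if (fileType != readerType && !complex) {
      return new ConvertingReader();                  // schema evolution on a leaf column
    }
    return new DirectReader(readerType);              // read the column as written
  }

  public static void main(String[] args) {
    System.out.println(create(Category.INT, Category.STRING, true).getClass().getSimpleName());
    System.out.println(create(null, Category.INT, true).getClass().getSimpleName());
    System.out.println(create(Category.INT, Category.INT, true).getClass().getSimpleName());
  }
}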
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestVectorOrcFile.java b/orc/src/test/org/apache/orc/TestVectorOrcFile.java
deleted file mode 100644
index 73abf9e..0000000
--- a/orc/src/test/org/apache/orc/TestVectorOrcFile.java
+++ /dev/null
@@ -1,2789 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import com.google.common.collect.Lists;
-
-import junit.framework.Assert;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.orc.impl.DataReaderProperties;
-import org.apache.orc.impl.MemoryManager;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.impl.RecordReaderImpl;
-import org.apache.orc.impl.RecordReaderUtils;
-import org.apache.orc.tools.TestJsonFileDump;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-import java.io.File;
-import java.io.IOException;
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.sql.Date;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-
-import static junit.framework.TestCase.assertNotNull;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-/**
- * Tests for the vectorized reader and writer for ORC files.
- */
-public class TestVectorOrcFile {
-
- public static class InnerStruct {
- int int1;
- Text string1 = new Text();
- InnerStruct(int int1, Text string1) {
- this.int1 = int1;
- this.string1.set(string1);
- }
- InnerStruct(int int1, String string1) {
- this.int1 = int1;
- this.string1.set(string1);
- }
-
- public String toString() {
- return "{" + int1 + ", " + string1 + "}";
- }
- }
-
- public static class MiddleStruct {
- List<InnerStruct> list = new ArrayList<InnerStruct>();
-
- MiddleStruct(InnerStruct... items) {
- list.clear();
- list.addAll(Arrays.asList(items));
- }
- }
-
- private static InnerStruct inner(int i, String s) {
- return new InnerStruct(i, s);
- }
-
- private static Map<String, InnerStruct> map(InnerStruct... items) {
- Map<String, InnerStruct> result = new HashMap<String, InnerStruct>();
- for(InnerStruct i: items) {
- result.put(i.string1.toString(), i);
- }
- return result;
- }
-
- private static List<InnerStruct> list(InnerStruct... items) {
- List<InnerStruct> result = new ArrayList<InnerStruct>();
- result.addAll(Arrays.asList(items));
- return result;
- }
-
- private static BytesWritable bytes(int... items) {
- BytesWritable result = new BytesWritable();
- result.setSize(items.length);
- for(int i=0; i < items.length; ++i) {
- result.getBytes()[i] = (byte) items[i];
- }
- return result;
- }
-
- private static byte[] bytesArray(int... items) {
- byte[] result = new byte[items.length];
- for(int i=0; i < items.length; ++i) {
- result[i] = (byte) items[i];
- }
- return result;
- }
-
- private static ByteBuffer byteBuf(int... items) {
- ByteBuffer result = ByteBuffer.allocate(items.length);
- for(int item: items) {
- result.put((byte) item);
- }
- result.flip();
- return result;
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem () throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestVectorOrcFile." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testReadFormat_0_11() throws Exception {
- Path oldFilePath =
- new Path(TestJsonFileDump.getFileFromClasspath("orc-file-11-format.orc"));
- Reader reader = OrcFile.createReader(oldFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- int stripeCount = 0;
- int rowCount = 0;
- long currentOffset = -1;
- for(StripeInformation stripe : reader.getStripes()) {
- stripeCount += 1;
- rowCount += stripe.getNumberOfRows();
- if (currentOffset < 0) {
- currentOffset = stripe.getOffset() + stripe.getIndexLength()
- + stripe.getDataLength() + stripe.getFooterLength();
- } else {
- assertEquals(currentOffset, stripe.getOffset());
- currentOffset += stripe.getIndexLength() + stripe.getDataLength()
- + stripe.getFooterLength();
- }
- }
- Assert.assertEquals(reader.getNumberOfRows(), rowCount);
- assertEquals(2, stripeCount);
-
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(7500, stats[1].getNumberOfValues());
- assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
- assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
- assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
-
- assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
- assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
- assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
- assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000",
- stats[3].toString());
-
- assertEquals(Long.MAX_VALUE,
- ((IntegerColumnStatistics) stats[5]).getMaximum());
- assertEquals(Long.MAX_VALUE,
- ((IntegerColumnStatistics) stats[5]).getMinimum());
- assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
- assertEquals(
- "count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807",
- stats[5].toString());
-
- assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
- assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
- assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(),
- 0.00001);
- assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0",
- stats[7].toString());
-
- assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
-
- // check the inspectors
- TypeDescription schema = reader.getSchema();
- assertEquals(TypeDescription.Category.STRUCT, schema.getCategory());
- assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint,"
- + "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
- + "binary,string1:string,middle:struct<list:array<struct<int1:int,"
- + "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
- + "map:map<string,struct<int1:int,string1:string>>,ts:timestamp,"
- + "decimal1:decimal(38,10)>", schema.toString());
- VectorizedRowBatch batch = schema.createRowBatch();
-
- RecordReader rows = reader.rows();
- Assert.assertEquals(true, rows.nextBatch(batch));
- assertEquals(1024, batch.size);
-
- // check the contents of the first row
- assertEquals(false, getBoolean(batch, 0));
- assertEquals(1, getByte(batch, 0));
- assertEquals(1024, getShort(batch, 0));
- assertEquals(65536, getInt(batch, 0));
- assertEquals(Long.MAX_VALUE, getLong(batch, 0));
- assertEquals(1.0, getFloat(batch, 0), 0.00001);
- assertEquals(-15.0, getDouble(batch, 0), 0.00001);
- assertEquals(bytes(0, 1, 2, 3, 4), getBinary(batch, 0));
- assertEquals("hi", getText(batch, 0).toString());
-
- StructColumnVector middle = (StructColumnVector) batch.cols[9];
- ListColumnVector midList = (ListColumnVector) middle.fields[0];
- StructColumnVector midListStruct = (StructColumnVector) midList.child;
- LongColumnVector midListInt = (LongColumnVector) midListStruct.fields[0];
- BytesColumnVector midListStr = (BytesColumnVector) midListStruct.fields[1];
- ListColumnVector list = (ListColumnVector) batch.cols[10];
- StructColumnVector listStruct = (StructColumnVector) list.child;
- LongColumnVector listInts = (LongColumnVector) listStruct.fields[0];
- BytesColumnVector listStrs = (BytesColumnVector) listStruct.fields[1];
- MapColumnVector map = (MapColumnVector) batch.cols[11];
- BytesColumnVector mapKey = (BytesColumnVector) map.keys;
- StructColumnVector mapValue = (StructColumnVector) map.values;
- LongColumnVector mapValueInts = (LongColumnVector) mapValue.fields[0];
- BytesColumnVector mapValueStrs = (BytesColumnVector) mapValue.fields[1];
- TimestampColumnVector timestamp = (TimestampColumnVector) batch.cols[12];
- DecimalColumnVector decs = (DecimalColumnVector) batch.cols[13];
-
- assertEquals(false, middle.isNull[0]);
- assertEquals(2, midList.lengths[0]);
- int start = (int) midList.offsets[0];
- assertEquals(1, midListInt.vector[start]);
- assertEquals("bye", midListStr.toString(start));
- assertEquals(2, midListInt.vector[start + 1]);
- assertEquals("sigh", midListStr.toString(start + 1));
-
- assertEquals(2, list.lengths[0]);
- start = (int) list.offsets[0];
- assertEquals(3, listInts.vector[start]);
- assertEquals("good", listStrs.toString(start));
- assertEquals(4, listInts.vector[start + 1]);
- assertEquals("bad", listStrs.toString(start + 1));
- assertEquals(0, map.lengths[0]);
- assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
- timestamp.asScratchTimestamp(0));
- assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")),
- decs.vector[0]);
-
- // check the contents of row 7499
- rows.seekToRow(7499);
- Assert.assertEquals(true, rows.nextBatch(batch));
- assertEquals(true, getBoolean(batch, 0));
- assertEquals(100, getByte(batch, 0));
- assertEquals(2048, getShort(batch, 0));
- assertEquals(65536, getInt(batch, 0));
- assertEquals(Long.MAX_VALUE, getLong(batch, 0));
- assertEquals(2.0, getFloat(batch, 0), 0.00001);
- assertEquals(-5.0, getDouble(batch, 0), 0.00001);
- assertEquals(bytes(), getBinary(batch, 0));
- assertEquals("bye", getText(batch, 0).toString());
- assertEquals(false, middle.isNull[0]);
- assertEquals(2, midList.lengths[0]);
- start = (int) midList.offsets[0];
- assertEquals(1, midListInt.vector[start]);
- assertEquals("bye", midListStr.toString(start));
- assertEquals(2, midListInt.vector[start + 1]);
- assertEquals("sigh", midListStr.toString(start + 1));
- assertEquals(3, list.lengths[0]);
- start = (int) list.offsets[0];
- assertEquals(100000000, listInts.vector[start]);
- assertEquals("cat", listStrs.toString(start));
- assertEquals(-100000, listInts.vector[start + 1]);
- assertEquals("in", listStrs.toString(start + 1));
- assertEquals(1234, listInts.vector[start + 2]);
- assertEquals("hat", listStrs.toString(start + 2));
- assertEquals(2, map.lengths[0]);
- start = (int) map.offsets[0];
- assertEquals("chani", mapKey.toString(start));
- assertEquals(5, mapValueInts.vector[start]);
- assertEquals("chani", mapValueStrs.toString(start));
- assertEquals("mauddib", mapKey.toString(start + 1));
- assertEquals(1, mapValueInts.vector[start + 1]);
- assertEquals("mauddib", mapValueStrs.toString(start + 1));
- assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"),
- timestamp.asScratchTimestamp(0));
- assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547457")),
- decs.vector[0]);
-
- // handle the close up
- Assert.assertEquals(false, rows.nextBatch(batch));
- rows.close();
- }
-
- @Test
- public void testTimestamp() throws Exception {
- TypeDescription schema = TypeDescription.createTimestamp();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
- .bufferSize(10000).version(org.apache.orc.OrcFile.Version.V_0_11));
- List<Timestamp> tslist = Lists.newArrayList();
- tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.000999"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.000000222"));
- tslist.add(Timestamp.valueOf("1999-01-01 00:00:00.999999999"));
- tslist.add(Timestamp.valueOf("1995-01-01 00:00:00.688888888"));
- tslist.add(Timestamp.valueOf("2002-01-01 00:00:00.1"));
- tslist.add(Timestamp.valueOf("2010-03-02 00:00:00.000009001"));
- tslist.add(Timestamp.valueOf("2005-01-01 00:00:00.000002229"));
- tslist.add(Timestamp.valueOf("2006-01-01 00:00:00.900203003"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.800000007"));
- tslist.add(Timestamp.valueOf("1996-08-02 00:00:00.723100809"));
- tslist.add(Timestamp.valueOf("1998-11-02 00:00:00.857340643"));
- tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
-
- VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024);
- TimestampColumnVector vec = new TimestampColumnVector(1024);
- batch.cols[0] = vec;
- batch.reset();
- batch.size = tslist.size();
- for (int i=0; i < tslist.size(); ++i) {
- Timestamp ts = tslist.get(i);
- vec.set(i, ts);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- TimestampColumnVector timestamps = (TimestampColumnVector) batch.cols[0];
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(tslist.get(idx++).getNanos(),
- timestamps.asScratchTimestamp(r).getNanos());
- }
- }
- Assert.assertEquals(tslist.size(), rows.getRowNumber());
- assertEquals(0, writer.getSchema().getMaximumId());
- boolean[] expected = new boolean[] {false};
- boolean[] included = OrcUtils.includeColumns("", writer.getSchema());
- assertEquals(true, Arrays.equals(expected, included));
- }
-
- @Test
- public void testStringAndBinaryStatistics() throws Exception {
-
- TypeDescription schema = TypeDescription.createStruct()
- .addField("bytes1", TypeDescription.createBinary())
- .addField("string1", TypeDescription.createString());
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 4;
- BytesColumnVector field1 = (BytesColumnVector) batch.cols[0];
- BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
- field1.setVal(0, bytesArray(0, 1, 2, 3, 4));
- field1.setVal(1, bytesArray(0, 1, 2, 3));
- field1.setVal(2, bytesArray(0, 1, 2, 3, 4, 5));
- field1.noNulls = false;
- field1.isNull[3] = true;
- field2.setVal(0, "foo".getBytes());
- field2.setVal(1, "bar".getBytes());
- field2.noNulls = false;
- field2.isNull[2] = true;
- field2.setVal(3, "hi".getBytes());
- writer.addRowBatch(batch);
- writer.close();
- schema = writer.getSchema();
- assertEquals(2, schema.getMaximumId());
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- boolean[] expected = new boolean[] {false, false, true};
- boolean[] included = OrcUtils.includeColumns("string1", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, false, false};
- included = OrcUtils.includeColumns("", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, false, false};
- included = OrcUtils.includeColumns(null, schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(4, stats[0].getNumberOfValues());
- assertEquals("count: 4 hasNull: false", stats[0].toString());
-
- assertEquals(3, stats[1].getNumberOfValues());
- assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
- assertEquals("count: 3 hasNull: true sum: 15", stats[1].toString());
-
- assertEquals(3, stats[2].getNumberOfValues());
- assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
- assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum());
- assertEquals(8, ((StringColumnStatistics) stats[2]).getSum());
- assertEquals("count: 3 hasNull: true min: bar max: hi sum: 8",
- stats[2].toString());
-
- // check the inspectors
- batch = reader.getSchema().createRowBatch();
- BytesColumnVector bytes = (BytesColumnVector) batch.cols[0];
- BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
- RecordReader rows = reader.rows();
- Assert.assertEquals(true, rows.nextBatch(batch));
- assertEquals(4, batch.size);
-
- // check the contents of the first row
- assertEquals(bytes(0,1,2,3,4), getBinary(bytes, 0));
- assertEquals("foo", strs.toString(0));
-
- // check the contents of second row
- assertEquals(bytes(0,1,2,3), getBinary(bytes, 1));
- assertEquals("bar", strs.toString(1));
-
- // check the contents of third row
- assertEquals(bytes(0,1,2,3,4,5), getBinary(bytes, 2));
- assertNull(strs.toString(2));
-
- // check the contents of fourth row
- assertNull(getBinary(bytes, 3));
- assertEquals("hi", strs.toString(3));
-
- // handle the close up
- Assert.assertEquals(false, rows.nextBatch(batch));
- rows.close();
- }
-
-
- @Test
- public void testStripeLevelStats() throws Exception {
- TypeDescription schema = TypeDescription.createStruct()
- .addField("int1", TypeDescription.createInt())
- .addField("string1", TypeDescription.createString());
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 1000;
- LongColumnVector field1 = (LongColumnVector) batch.cols[0];
- BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
- field1.isRepeating = true;
- field2.isRepeating = true;
- for (int b = 0; b < 11; b++) {
- if (b >= 5) {
- if (b >= 10) {
- field1.vector[0] = 3;
- field2.setVal(0, "three".getBytes());
- } else {
- field1.vector[0] = 2;
- field2.setVal(0, "two".getBytes());
- }
- } else {
- field1.vector[0] = 1;
- field2.setVal(0, "one".getBytes());
- }
- writer.addRowBatch(batch);
- }
-
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- schema = writer.getSchema();
- assertEquals(2, schema.getMaximumId());
- boolean[] expected = new boolean[] {false, true, false};
- boolean[] included = OrcUtils.includeColumns("int1", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- List<StripeStatistics> stats = reader.getStripeStatistics();
- int numStripes = stats.size();
- assertEquals(3, numStripes);
- StripeStatistics ss1 = stats.get(0);
- StripeStatistics ss2 = stats.get(1);
- StripeStatistics ss3 = stats.get(2);
-
- assertEquals(5000, ss1.getColumnStatistics()[0].getNumberOfValues());
- assertEquals(5000, ss2.getColumnStatistics()[0].getNumberOfValues());
- assertEquals(1000, ss3.getColumnStatistics()[0].getNumberOfValues());
-
- assertEquals(5000, (ss1.getColumnStatistics()[1]).getNumberOfValues());
- assertEquals(5000, (ss2.getColumnStatistics()[1]).getNumberOfValues());
- assertEquals(1000, (ss3.getColumnStatistics()[1]).getNumberOfValues());
- assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMinimum());
- assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMinimum());
- assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMinimum());
- assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMaximum());
- assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMaximum());
- assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMaximum());
- assertEquals(5000, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getSum());
- assertEquals(10000, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getSum());
- assertEquals(3000, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getSum());
-
- assertEquals(5000, (ss1.getColumnStatistics()[2]).getNumberOfValues());
- assertEquals(5000, (ss2.getColumnStatistics()[2]).getNumberOfValues());
- assertEquals(1000, (ss3.getColumnStatistics()[2]).getNumberOfValues());
- assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMinimum());
- assertEquals("two", ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getMinimum());
- assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMinimum());
- assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMaximum());
- assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMaximum());
- assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMaximum());
- assertEquals(15000, ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getSum());
- assertEquals(15000, ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getSum());
- assertEquals(5000, ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getSum());
-
- RecordReaderImpl recordReader = (RecordReaderImpl) reader.rows();
- OrcProto.RowIndex[] index = recordReader.readRowIndex(0, null, null).getRowGroupIndex();
- assertEquals(3, index.length);
- List<OrcProto.RowIndexEntry> items = index[1].getEntryList();
- assertEquals(1, items.size());
- assertEquals(3, items.get(0).getPositionsCount());
- assertEquals(0, items.get(0).getPositions(0));
- assertEquals(0, items.get(0).getPositions(1));
- assertEquals(0, items.get(0).getPositions(2));
- assertEquals(1,
- items.get(0).getStatistics().getIntStatistics().getMinimum());
- index = recordReader.readRowIndex(1, null, null).getRowGroupIndex();
- assertEquals(3, index.length);
- items = index[1].getEntryList();
- assertEquals(2,
- items.get(0).getStatistics().getIntStatistics().getMaximum());
- }
-
- private static void setInner(StructColumnVector inner, int rowId,
- int i, String value) {
- ((LongColumnVector) inner.fields[0]).vector[rowId] = i;
- if (value != null) {
- ((BytesColumnVector) inner.fields[1]).setVal(rowId, value.getBytes());
- } else {
- inner.fields[1].isNull[rowId] = true;
- inner.fields[1].noNulls = false;
- }
- }
-
- private static void checkInner(StructColumnVector inner, int rowId,
- int rowInBatch, int i, String value) {
- assertEquals("row " + rowId, i,
- ((LongColumnVector) inner.fields[0]).vector[rowInBatch]);
- if (value != null) {
- assertEquals("row " + rowId, value,
- ((BytesColumnVector) inner.fields[1]).toString(rowInBatch));
- } else {
- assertEquals("row " + rowId, true, inner.fields[1].isNull[rowInBatch]);
- assertEquals("row " + rowId, false, inner.fields[1].noNulls);
- }
- }
-
- private static void setInnerList(ListColumnVector list, int rowId,
- List<InnerStruct> value) {
- if (value != null) {
- if (list.childCount + value.size() > list.child.isNull.length) {
- list.child.ensureSize(list.childCount * 2, true);
- }
- list.lengths[rowId] = value.size();
- list.offsets[rowId] = list.childCount;
- for (int i = 0; i < list.lengths[rowId]; ++i) {
- InnerStruct inner = value.get(i);
- setInner((StructColumnVector) list.child, i + list.childCount,
- inner.int1, inner.string1.toString());
- }
- list.childCount += value.size();
- } else {
- list.isNull[rowId] = true;
- list.noNulls = false;
- }
- }
-
- private static void checkInnerList(ListColumnVector list, int rowId,
- int rowInBatch, List<InnerStruct> value) {
- if (value != null) {
- assertEquals("row " + rowId, value.size(), list.lengths[rowInBatch]);
- int start = (int) list.offsets[rowInBatch];
- for (int i = 0; i < list.lengths[rowInBatch]; ++i) {
- InnerStruct inner = value.get(i);
- checkInner((StructColumnVector) list.child, rowId, i + start,
- inner.int1, inner.string1.toString());
- }
- list.childCount += value.size();
- } else {
- assertEquals("row " + rowId, true, list.isNull[rowInBatch]);
- assertEquals("row " + rowId, false, list.noNulls);
- }
- }
-
- private static void setInnerMap(MapColumnVector map, int rowId,
- Map<String, InnerStruct> value) {
- if (value != null) {
- if (map.childCount >= map.keys.isNull.length) {
- map.keys.ensureSize(map.childCount * 2, true);
- map.values.ensureSize(map.childCount * 2, true);
- }
- map.lengths[rowId] = value.size();
- int offset = map.childCount;
- map.offsets[rowId] = offset;
-
- for (Map.Entry<String, InnerStruct> entry : value.entrySet()) {
- ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes());
- InnerStruct inner = entry.getValue();
- setInner((StructColumnVector) map.values, offset, inner.int1,
- inner.string1.toString());
- offset += 1;
- }
- map.childCount = offset;
- } else {
- map.isNull[rowId] = true;
- map.noNulls = false;
- }
- }
-
- private static void checkInnerMap(MapColumnVector map, int rowId,
- int rowInBatch,
- Map<String, InnerStruct> value) {
- if (value != null) {
- assertEquals("row " + rowId, value.size(), map.lengths[rowInBatch]);
- int offset = (int) map.offsets[rowInBatch];
- for(int i=0; i < value.size(); ++i) {
- String key = ((BytesColumnVector) map.keys).toString(offset + i);
- InnerStruct expected = value.get(key);
- checkInner((StructColumnVector) map.values, rowId, offset + i,
- expected.int1, expected.string1.toString());
- }
- } else {
- assertEquals("row " + rowId, true, map.isNull[rowId]);
- assertEquals("row " + rowId, false, map.noNulls);
- }
- }
-
- private static void setMiddleStruct(StructColumnVector middle, int rowId,
- MiddleStruct value) {
- if (value != null) {
- setInnerList((ListColumnVector) middle.fields[0], rowId, value.list);
- } else {
- middle.isNull[rowId] = true;
- middle.noNulls = false;
- }
- }
-
- private static void checkMiddleStruct(StructColumnVector middle, int rowId,
- int rowInBatch, MiddleStruct value) {
- if (value != null) {
- checkInnerList((ListColumnVector) middle.fields[0], rowId, rowInBatch,
- value.list);
- } else {
- assertEquals("row " + rowId, true, middle.isNull[rowInBatch]);
- assertEquals("row " + rowId, false, middle.noNulls);
- }
- }
-
- private static void setBigRow(VectorizedRowBatch batch, int rowId,
- Boolean b1, Byte b2, Short s1,
- Integer i1, Long l1, Float f1,
- Double d1, BytesWritable b3, String s2,
- MiddleStruct m1, List<InnerStruct> l2,
- Map<String, InnerStruct> m2) {
- ((LongColumnVector) batch.cols[0]).vector[rowId] = b1 ? 1 : 0;
- ((LongColumnVector) batch.cols[1]).vector[rowId] = b2;
- ((LongColumnVector) batch.cols[2]).vector[rowId] = s1;
- ((LongColumnVector) batch.cols[3]).vector[rowId] = i1;
- ((LongColumnVector) batch.cols[4]).vector[rowId] = l1;
- ((DoubleColumnVector) batch.cols[5]).vector[rowId] = f1;
- ((DoubleColumnVector) batch.cols[6]).vector[rowId] = d1;
- if (b3 != null) {
- ((BytesColumnVector) batch.cols[7]).setVal(rowId, b3.getBytes(), 0,
- b3.getLength());
- } else {
- batch.cols[7].isNull[rowId] = true;
- batch.cols[7].noNulls = false;
- }
- if (s2 != null) {
- ((BytesColumnVector) batch.cols[8]).setVal(rowId, s2.getBytes());
- } else {
- batch.cols[8].isNull[rowId] = true;
- batch.cols[8].noNulls = false;
- }
- setMiddleStruct((StructColumnVector) batch.cols[9], rowId, m1);
- setInnerList((ListColumnVector) batch.cols[10], rowId, l2);
- setInnerMap((MapColumnVector) batch.cols[11], rowId, m2);
- }
-
- private static void checkBigRow(VectorizedRowBatch batch,
- int rowInBatch,
- int rowId,
- boolean b1, byte b2, short s1,
- int i1, long l1, float f1,
- double d1, BytesWritable b3, String s2,
- MiddleStruct m1, List<InnerStruct> l2,
- Map<String, InnerStruct> m2) {
- assertEquals("row " + rowId, b1, getBoolean(batch, rowInBatch));
- assertEquals("row " + rowId, b2, getByte(batch, rowInBatch));
- assertEquals("row " + rowId, s1, getShort(batch, rowInBatch));
- assertEquals("row " + rowId, i1, getInt(batch, rowInBatch));
- assertEquals("row " + rowId, l1, getLong(batch, rowInBatch));
- assertEquals("row " + rowId, f1, getFloat(batch, rowInBatch), 0.0001);
- assertEquals("row " + rowId, d1, getDouble(batch, rowInBatch), 0.0001);
- if (b3 != null) {
- BytesColumnVector bytes = (BytesColumnVector) batch.cols[7];
- assertEquals("row " + rowId, b3.getLength(), bytes.length[rowInBatch]);
- for(int i=0; i < b3.getLength(); ++i) {
- assertEquals("row " + rowId + " byte " + i, b3.getBytes()[i],
- bytes.vector[rowInBatch][bytes.start[rowInBatch] + i]);
- }
- } else {
- assertEquals("row " + rowId, true, batch.cols[7].isNull[rowInBatch]);
- assertEquals("row " + rowId, false, batch.cols[7].noNulls);
- }
- if (s2 != null) {
- assertEquals("row " + rowId, s2, getText(batch, rowInBatch).toString());
- } else {
- assertEquals("row " + rowId, true, batch.cols[8].isNull[rowInBatch]);
- assertEquals("row " + rowId, false, batch.cols[8].noNulls);
- }
- checkMiddleStruct((StructColumnVector) batch.cols[9], rowId, rowInBatch,
- m1);
- checkInnerList((ListColumnVector) batch.cols[10], rowId, rowInBatch, l2);
- checkInnerMap((MapColumnVector) batch.cols[11], rowId, rowInBatch, m2);
- }
-
- private static boolean getBoolean(VectorizedRowBatch batch, int rowId) {
- return ((LongColumnVector) batch.cols[0]).vector[rowId] != 0;
- }
-
- private static byte getByte(VectorizedRowBatch batch, int rowId) {
- return (byte) ((LongColumnVector) batch.cols[1]).vector[rowId];
- }
-
- private static short getShort(VectorizedRowBatch batch, int rowId) {
- return (short) ((LongColumnVector) batch.cols[2]).vector[rowId];
- }
-
- private static int getInt(VectorizedRowBatch batch, int rowId) {
- return (int) ((LongColumnVector) batch.cols[3]).vector[rowId];
- }
-
- private static long getLong(VectorizedRowBatch batch, int rowId) {
- return ((LongColumnVector) batch.cols[4]).vector[rowId];
- }
-
- private static float getFloat(VectorizedRowBatch batch, int rowId) {
- return (float) ((DoubleColumnVector) batch.cols[5]).vector[rowId];
- }
-
- private static double getDouble(VectorizedRowBatch batch, int rowId) {
- return ((DoubleColumnVector) batch.cols[6]).vector[rowId];
- }
-
- private static BytesWritable getBinary(BytesColumnVector column, int rowId) {
- if (column.isRepeating) {
- rowId = 0;
- }
- if (column.noNulls || !column.isNull[rowId]) {
- return new BytesWritable(Arrays.copyOfRange(column.vector[rowId],
- column.start[rowId], column.start[rowId] + column.length[rowId]));
- } else {
- return null;
- }
- }
-
- private static BytesWritable getBinary(VectorizedRowBatch batch, int rowId) {
- return getBinary((BytesColumnVector) batch.cols[7], rowId);
- }
-
- private static Text getText(BytesColumnVector vector, int rowId) {
- if (vector.isRepeating) {
- rowId = 0;
- }
- if (vector.noNulls || !vector.isNull[rowId]) {
- return new Text(Arrays.copyOfRange(vector.vector[rowId],
- vector.start[rowId], vector.start[rowId] + vector.length[rowId]));
- } else {
- return null;
- }
- }
-
- private static Text getText(VectorizedRowBatch batch, int rowId) {
- return getText((BytesColumnVector) batch.cols[8], rowId);
- }
-
- private static InnerStruct getInner(StructColumnVector vector,
- int rowId) {
- return new InnerStruct(
- (int) ((LongColumnVector) vector.fields[0]).vector[rowId],
- getText((BytesColumnVector) vector.fields[1], rowId));
- }
-
- private static List<InnerStruct> getList(ListColumnVector cv,
- int rowId) {
- if (cv.isRepeating) {
- rowId = 0;
- }
- if (cv.noNulls || !cv.isNull[rowId]) {
- List<InnerStruct> result =
- new ArrayList<InnerStruct>((int) cv.lengths[rowId]);
- for(long i=cv.offsets[rowId];
- i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
- result.add(getInner((StructColumnVector) cv.child, (int) i));
- }
- return result;
- } else {
- return null;
- }
- }
-
- private static List<InnerStruct> getMidList(VectorizedRowBatch batch,
- int rowId) {
- return getList((ListColumnVector) ((StructColumnVector) batch.cols[9])
- .fields[0], rowId);
- }
-
- private static List<InnerStruct> getList(VectorizedRowBatch batch,
- int rowId) {
- return getList((ListColumnVector) batch.cols[10], rowId);
- }
-
- private static Map<Text, InnerStruct> getMap(VectorizedRowBatch batch,
- int rowId) {
- MapColumnVector cv = (MapColumnVector) batch.cols[11];
- if (cv.isRepeating) {
- rowId = 0;
- }
- if (cv.noNulls || !cv.isNull[rowId]) {
- Map<Text, InnerStruct> result =
- new HashMap<Text, InnerStruct>((int) cv.lengths[rowId]);
- for(long i=cv.offsets[rowId];
- i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
- result.put(getText((BytesColumnVector) cv.keys, (int) i),
- getInner((StructColumnVector) cv.values, (int) i));
- }
- return result;
- } else {
- return null;
- }
- }
-
- private static TypeDescription createInnerSchema() {
- return TypeDescription.createStruct()
- .addField("int1", TypeDescription.createInt())
- .addField("string1", TypeDescription.createString());
- }
-
- private static TypeDescription createBigRowSchema() {
- return TypeDescription.createStruct()
- .addField("boolean1", TypeDescription.createBoolean())
- .addField("byte1", TypeDescription.createByte())
- .addField("short1", TypeDescription.createShort())
- .addField("int1", TypeDescription.createInt())
- .addField("long1", TypeDescription.createLong())
- .addField("float1", TypeDescription.createFloat())
- .addField("double1", TypeDescription.createDouble())
- .addField("bytes1", TypeDescription.createBinary())
- .addField("string1", TypeDescription.createString())
- .addField("middle", TypeDescription.createStruct()
- .addField("list", TypeDescription.createList(createInnerSchema())))
- .addField("list", TypeDescription.createList(createInnerSchema()))
- .addField("map", TypeDescription.createMap(
- TypeDescription.createString(),
- createInnerSchema()));
- }
-
- static void assertArrayEquals(boolean[] expected, boolean[] actual) {
- assertEquals(expected.length, actual.length);
- boolean diff = false;
- for(int i=0; i < expected.length; ++i) {
- if (expected[i] != actual[i]) {
- System.out.println("Difference at " + i + " expected: " + expected[i] +
- " actual: " + actual[i]);
- diff = true;
- }
- }
- assertEquals(false, diff);
- }
-
- @Test
- public void test1() throws Exception {
- TypeDescription schema = createBigRowSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 2;
- setBigRow(batch, 0, false, (byte) 1, (short) 1024, 65536,
- Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
- new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
- list(inner(3, "good"), inner(4, "bad")),
- map());
- setBigRow(batch, 1, true, (byte) 100, (short) 2048, 65536,
- Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye",
- new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
- list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
- map(inner(5, "chani"), inner(1, "mauddib")));
- writer.addRowBatch(batch);
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- schema = writer.getSchema();
- assertEquals(23, schema.getMaximumId());
- boolean[] expected = new boolean[] {false, false, false, false, false,
- false, false, false, false, false,
- false, false, false, false, false,
- false, false, false, false, false,
- false, false, false, false};
- boolean[] included = OrcUtils.includeColumns("", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, true, false, false, false,
- false, false, false, false, true,
- true, true, true, true, true,
- false, false, false, false, true,
- true, true, true, true};
- included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema);
-
- assertArrayEquals(expected, included);
-
- expected = new boolean[] {false, true, false, false, false,
- false, false, false, false, true,
- true, true, true, true, true,
- false, false, false, false, true,
- true, true, true, true};
- included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema);
- assertArrayEquals(expected, included);
-
- expected = new boolean[] {false, true, true, true, true,
- true, true, true, true, true,
- true, true, true, true, true,
- true, true, true, true, true,
- true, true, true, true};
- included = OrcUtils.includeColumns(
- "boolean1,byte1,short1,int1,long1,float1,double1,bytes1,string1,middle,list,map",
- schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(2, stats[1].getNumberOfValues());
- assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
- assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
- assertEquals("count: 2 hasNull: false true: 1", stats[1].toString());
-
- assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
- assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
- assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
- assertEquals("count: 2 hasNull: false min: 1024 max: 2048 sum: 3072",
- stats[3].toString());
-
- StripeStatistics ss = reader.getStripeStatistics().get(0);
- assertEquals(2, ss.getColumnStatistics()[0].getNumberOfValues());
- assertEquals(1, ((BooleanColumnStatistics) ss.getColumnStatistics()[1]).getTrueCount());
- assertEquals(1024, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMinimum());
- assertEquals(2048, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMaximum());
- assertEquals(3072, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getSum());
- assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
- assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
- assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
- assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0",
- stats[7].toString());
-
- assertEquals("count: 2 hasNull: false min: bye max: hi sum: 5", stats[9].toString());
-
- // check the schema
- TypeDescription readerSchema = reader.getSchema();
- assertEquals(TypeDescription.Category.STRUCT, readerSchema.getCategory());
- assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint,"
- + "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
- + "binary,string1:string,middle:struct<list:array<struct<int1:int,"
- + "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
- + "map:map<string,struct<int1:int,string1:string>>>",
- readerSchema.toString());
- List<String> fieldNames = readerSchema.getFieldNames();
- List<TypeDescription> fieldTypes = readerSchema.getChildren();
- assertEquals("boolean1", fieldNames.get(0));
- assertEquals(TypeDescription.Category.BOOLEAN, fieldTypes.get(0).getCategory());
- assertEquals("byte1", fieldNames.get(1));
- assertEquals(TypeDescription.Category.BYTE, fieldTypes.get(1).getCategory());
- assertEquals("short1", fieldNames.get(2));
- assertEquals(TypeDescription.Category.SHORT, fieldTypes.get(2).getCategory());
- assertEquals("int1", fieldNames.get(3));
- assertEquals(TypeDescription.Category.INT, fieldTypes.get(3).getCategory());
- assertEquals("long1", fieldNames.get(4));
- assertEquals(TypeDescription.Category.LONG, fieldTypes.get(4).getCategory());
- assertEquals("float1", fieldNames.get(5));
- assertEquals(TypeDescription.Category.FLOAT, fieldTypes.get(5).getCategory());
- assertEquals("double1", fieldNames.get(6));
- assertEquals(TypeDescription.Category.DOUBLE, fieldTypes.get(6).getCategory());
- assertEquals("bytes1", fieldNames.get(7));
- assertEquals(TypeDescription.Category.BINARY, fieldTypes.get(7).getCategory());
- assertEquals("string1", fieldNames.get(8));
- assertEquals(TypeDescription.Category.STRING, fieldTypes.get(8).getCategory());
- assertEquals("middle", fieldNames.get(9));
- TypeDescription middle = fieldTypes.get(9);
- assertEquals(TypeDescription.Category.STRUCT, middle.getCategory());
- TypeDescription midList = middle.getChildren().get(0);
- assertEquals(TypeDescription.Category.LIST, midList.getCategory());
- TypeDescription inner = midList.getChildren().get(0);
- assertEquals(TypeDescription.Category.STRUCT, inner.getCategory());
- assertEquals("int1", inner.getFieldNames().get(0));
- assertEquals("string1", inner.getFieldNames().get(1));
-
- RecordReader rows = reader.rows();
- // create a new batch
- batch = readerSchema.createRowBatch();
- Assert.assertEquals(true, rows.nextBatch(batch));
- assertEquals(2, batch.size);
- Assert.assertEquals(false, rows.nextBatch(batch));
-
- // check the contents of the first row
- assertEquals(false, getBoolean(batch, 0));
- assertEquals(1, getByte(batch, 0));
- assertEquals(1024, getShort(batch, 0));
- assertEquals(65536, getInt(batch, 0));
- assertEquals(Long.MAX_VALUE, getLong(batch, 0));
- assertEquals(1.0, getFloat(batch, 0), 0.00001);
- assertEquals(-15.0, getDouble(batch, 0), 0.00001);
- assertEquals(bytes(0,1,2,3,4), getBinary(batch, 0));
- assertEquals("hi", getText(batch, 0).toString());
- List<InnerStruct> midRow = getMidList(batch, 0);
- assertNotNull(midRow);
- assertEquals(2, midRow.size());
- assertEquals(1, midRow.get(0).int1);
- assertEquals("bye", midRow.get(0).string1.toString());
- assertEquals(2, midRow.get(1).int1);
- assertEquals("sigh", midRow.get(1).string1.toString());
- List<InnerStruct> list = getList(batch, 0);
- assertEquals(2, list.size());
- assertEquals(3, list.get(0).int1);
- assertEquals("good", list.get(0).string1.toString());
- assertEquals(4, list.get(1).int1);
- assertEquals("bad", list.get(1).string1.toString());
- Map<Text, InnerStruct> map = getMap(batch, 0);
- assertEquals(0, map.size());
-
- // check the contents of second row
- assertEquals(true, getBoolean(batch, 1));
- assertEquals(100, getByte(batch, 1));
- assertEquals(2048, getShort(batch, 1));
- assertEquals(65536, getInt(batch, 1));
- assertEquals(Long.MAX_VALUE, getLong(batch, 1));
- assertEquals(2.0, getFloat(batch, 1), 0.00001);
- assertEquals(-5.0, getDouble(batch, 1), 0.00001);
- assertEquals(bytes(), getBinary(batch, 1));
- assertEquals("bye", getText(batch, 1).toString());
- midRow = getMidList(batch, 1);
- assertNotNull(midRow);
- assertEquals(2, midRow.size());
- assertEquals(1, midRow.get(0).int1);
- assertEquals("bye", midRow.get(0).string1.toString());
- assertEquals(2, midRow.get(1).int1);
- assertEquals("sigh", midRow.get(1).string1.toString());
- list = getList(batch, 1);
- assertEquals(3, list.size());
- assertEquals(100000000, list.get(0).int1);
- assertEquals("cat", list.get(0).string1.toString());
- assertEquals(-100000, list.get(1).int1);
- assertEquals("in", list.get(1).string1.toString());
- assertEquals(1234, list.get(2).int1);
- assertEquals("hat", list.get(2).string1.toString());
- map = getMap(batch, 1);
- assertEquals(2, map.size());
- InnerStruct value = map.get(new Text("chani"));
- assertEquals(5, value.int1);
- assertEquals("chani", value.string1.toString());
- value = map.get(new Text("mauddib"));
- assertEquals(1, value.int1);
- assertEquals("mauddib", value.string1.toString());
-
-    // verify that the reader is exhausted and close it
- Assert.assertEquals(false, rows.nextBatch(batch));
- rows.close();
- }
-
- @Test
- public void testColumnProjection() throws Exception {
- TypeDescription schema = createInnerSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.NONE)
- .bufferSize(100)
- .rowIndexStride(1000));
- VectorizedRowBatch batch = schema.createRowBatch();
- Random r1 = new Random(1);
- Random r2 = new Random(2);
- int x;
- int minInt=0, maxInt=0;
- String y;
- String minStr = null, maxStr = null;
- batch.size = 1000;
- boolean first = true;
- for(int b=0; b < 21; ++b) {
- for(int r=0; r < 1000; ++r) {
- x = r1.nextInt();
- y = Long.toHexString(r2.nextLong());
- if (first || x < minInt) {
- minInt = x;
- }
- if (first || x > maxInt) {
- maxInt = x;
- }
- if (first || y.compareTo(minStr) < 0) {
- minStr = y;
- }
- if (first || y.compareTo(maxStr) > 0) {
- maxStr = y;
- }
- first = false;
- ((LongColumnVector) batch.cols[0]).vector[r] = x;
- ((BytesColumnVector) batch.cols[1]).setVal(r, y.getBytes());
- }
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- // check out the statistics
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(3, stats.length);
- for(ColumnStatistics s: stats) {
- assertEquals(21000, s.getNumberOfValues());
- if (s instanceof IntegerColumnStatistics) {
- assertEquals(minInt, ((IntegerColumnStatistics) s).getMinimum());
- assertEquals(maxInt, ((IntegerColumnStatistics) s).getMaximum());
- } else if (s instanceof StringColumnStatistics) {
- assertEquals(maxStr, ((StringColumnStatistics) s).getMaximum());
- assertEquals(minStr, ((StringColumnStatistics) s).getMinimum());
- }
- }
-
- // check out the types
- TypeDescription type = reader.getSchema();
- assertEquals(TypeDescription.Category.STRUCT, type.getCategory());
- assertEquals(2, type.getChildren().size());
- TypeDescription type1 = type.getChildren().get(0);
- TypeDescription type2 = type.getChildren().get(1);
- assertEquals(TypeDescription.Category.INT, type1.getCategory());
- assertEquals(TypeDescription.Category.STRING, type2.getCategory());
- assertEquals("struct<int1:int,string1:string>", type.toString());
-
- // read the contents and make sure they match
- RecordReader rows1 = reader.rows(
- new Reader.Options().include(new boolean[]{true, true, false}));
- RecordReader rows2 = reader.rows(
- new Reader.Options().include(new boolean[]{true, false, true}));
- r1 = new Random(1);
- r2 = new Random(2);
- VectorizedRowBatch batch1 = reader.getSchema().createRowBatch(1000);
- VectorizedRowBatch batch2 = reader.getSchema().createRowBatch(1000);
- for(int i = 0; i < 21000; i += 1000) {
- Assert.assertEquals(true, rows1.nextBatch(batch1));
- Assert.assertEquals(true, rows2.nextBatch(batch2));
- assertEquals(1000, batch1.size);
- assertEquals(1000, batch2.size);
- for(int j=0; j < 1000; ++j) {
- assertEquals(r1.nextInt(),
- ((LongColumnVector) batch1.cols[0]).vector[j]);
- assertEquals(Long.toHexString(r2.nextLong()),
- ((BytesColumnVector) batch2.cols[1]).toString(j));
- }
- }
- Assert.assertEquals(false, rows1.nextBatch(batch1));
- Assert.assertEquals(false, rows2.nextBatch(batch2));
- rows1.close();
- rows2.close();
- }
-
- @Test
- public void testEmptyFile() throws Exception {
- TypeDescription schema = createBigRowSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.NONE)
- .bufferSize(100));
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- VectorizedRowBatch batch = reader.getSchema().createRowBatch();
- Assert.assertEquals(false, reader.rows().nextBatch(batch));
- Assert.assertEquals(CompressionKind.NONE, reader.getCompressionKind());
- Assert.assertEquals(0, reader.getNumberOfRows());
- Assert.assertEquals(0, reader.getCompressionSize());
- Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
- Assert.assertEquals(3, reader.getContentLength());
- Assert.assertEquals(false, reader.getStripes().iterator().hasNext());
- }
-
- @Test
- public void metaData() throws Exception {
- TypeDescription schema = createBigRowSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.NONE)
- .bufferSize(100));
- writer.addUserMetadata("my.meta", byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127,
- -128));
- writer.addUserMetadata("clobber", byteBuf(1, 2, 3));
- writer.addUserMetadata("clobber", byteBuf(4, 3, 2, 1));
- ByteBuffer bigBuf = ByteBuffer.allocate(40000);
- Random random = new Random(0);
- random.nextBytes(bigBuf.array());
- writer.addUserMetadata("big", bigBuf);
- bigBuf.position(0);
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 1;
- setBigRow(batch, 0, true, (byte) 127, (short) 1024, 42,
- 42L * 1024 * 1024 * 1024, (float) 3.1415, -2.713, null,
- null, null, null, null);
- writer.addRowBatch(batch);
- writer.addUserMetadata("clobber", byteBuf(5,7,11,13,17,19));
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- Assert.assertEquals(byteBuf(5, 7, 11, 13, 17, 19), reader.getMetadataValue("clobber"));
- Assert.assertEquals(byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, -128),
- reader.getMetadataValue("my.meta"));
- Assert.assertEquals(bigBuf, reader.getMetadataValue("big"));
- try {
- reader.getMetadataValue("unknown");
- assertTrue(false);
- } catch (IllegalArgumentException iae) {
- // PASS
- }
- int i = 0;
- for(String key: reader.getMetadataKeys()) {
- if ("my.meta".equals(key) ||
- "clobber".equals(key) ||
- "big".equals(key)) {
- i += 1;
- } else {
- throw new IllegalArgumentException("unknown key " + key);
- }
- }
- assertEquals(3, i);
- int numStripes = reader.getStripeStatistics().size();
- assertEquals(1, numStripes);
- }
-
- /**
- * Generate an ORC file with a range of dates and times.
- */
- public void createOrcDateFile(Path file, int minYear, int maxYear
- ) throws IOException {
- TypeDescription schema = TypeDescription.createStruct()
- .addField("time", TypeDescription.createTimestamp())
- .addField("date", TypeDescription.createDate());
- Writer writer = OrcFile.createWriter(file,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .blockPadding(false));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 1000;
- for (int year = minYear; year < maxYear; ++year) {
- for (int ms = 1000; ms < 2000; ++ms) {
- TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
- timestampColVector.set(ms - 1000,
- Timestamp.valueOf(year +
- "-05-05 12:34:56." + ms));
- ((LongColumnVector) batch.cols[1]).vector[ms - 1000] =
- new DateWritable(new Date(year - 1900, 11, 25)).getDays();
- }
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(file,
- OrcFile.readerOptions(conf));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch(1000);
- TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
- LongColumnVector dates = (LongColumnVector) batch.cols[1];
- for (int year = minYear; year < maxYear; ++year) {
- rows.nextBatch(batch);
- assertEquals(1000, batch.size);
- for(int ms = 1000; ms < 2000; ++ms) {
- StringBuilder buffer = new StringBuilder();
- times.stringifyValue(buffer, ms - 1000);
- String expected = Integer.toString(year) + "-05-05 12:34:56.";
- // suppress the final zeros on the string by dividing by the largest
- // power of 10 that divides evenly.
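-        // e.g. ms=1500 was written as ".1500" (0.15 s) and prints as ".15",
-        // so the expected suffix is roundedMs = 15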
- int roundedMs = ms;
- for(int round = 1000; round > 0; round /= 10) {
- if (ms % round == 0) {
- roundedMs = ms / round;
- break;
- }
- }
- expected += roundedMs;
- assertEquals(expected, buffer.toString());
- assertEquals(Integer.toString(year) + "-12-25",
- new DateWritable((int) dates.vector[ms - 1000]).toString());
- }
- }
- rows.nextBatch(batch);
- assertEquals(0, batch.size);
- }
-
- @Test
- public void testDate1900() throws Exception {
- createOrcDateFile(testFilePath, 1900, 1970);
- }
-
- @Test
- public void testDate2038() throws Exception {
- createOrcDateFile(testFilePath, 2038, 2250);
- }
-
- private static void setUnion(VectorizedRowBatch batch, int rowId,
- Timestamp ts, Integer tag, Integer i, String s,
- HiveDecimalWritable dec) {
- UnionColumnVector union = (UnionColumnVector) batch.cols[1];
- if (ts != null) {
- TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
- timestampColVector.set(rowId, ts);
- } else {
- batch.cols[0].isNull[rowId] = true;
- batch.cols[0].noNulls = false;
- }
- if (tag != null) {
- union.tags[rowId] = tag;
- if (tag == 0) {
- if (i != null) {
- ((LongColumnVector) union.fields[tag]).vector[rowId] = i;
- } else {
- union.fields[tag].isNull[rowId] = true;
- union.fields[tag].noNulls = false;
- }
- } else if (tag == 1) {
- if (s != null) {
- ((BytesColumnVector) union.fields[tag]).setVal(rowId, s.getBytes());
- } else {
- union.fields[tag].isNull[rowId] = true;
- union.fields[tag].noNulls = false;
- }
- } else {
- throw new IllegalArgumentException("Bad tag " + tag);
- }
- } else {
- batch.cols[1].isNull[rowId] = true;
- batch.cols[1].noNulls = false;
- }
- if (dec != null) {
- ((DecimalColumnVector) batch.cols[2]).vector[rowId] = dec;
- } else {
- batch.cols[2].isNull[rowId] = true;
- batch.cols[2].noNulls = false;
- }
- }
-
- /**
-   * We test union, timestamp, and decimal separately since we need to make the
-   * object inspector manually. (The Hive reflection-based object inspectors
-   * don't handle them properly.)
- */
- @Test
- public void testUnionAndTimestamp() throws Exception {
- TypeDescription schema = TypeDescription.createStruct()
- .addField("time", TypeDescription.createTimestamp())
- .addField("union", TypeDescription.createUnion()
- .addUnionChild(TypeDescription.createInt())
- .addUnionChild(TypeDescription.createString()))
- .addField("decimal", TypeDescription.createDecimal()
- .withPrecision(38)
- .withScale(18));
- HiveDecimal maxValue = HiveDecimal.create("10000000000000000000");
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.NONE)
- .bufferSize(100)
- .blockPadding(false));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 6;
- setUnion(batch, 0, Timestamp.valueOf("2000-03-12 15:00:00"), 0, 42, null,
- new HiveDecimalWritable("12345678.6547456"));
- setUnion(batch, 1, Timestamp.valueOf("2000-03-20 12:00:00.123456789"),
- 1, null, "hello", new HiveDecimalWritable("-5643.234"));
-
- setUnion(batch, 2, null, null, null, null, null);
- setUnion(batch, 3, null, 0, null, null, null);
- setUnion(batch, 4, null, 1, null, null, null);
-
- setUnion(batch, 5, Timestamp.valueOf("1970-01-01 00:00:00"), 0, 200000,
- null, new HiveDecimalWritable("10000000000000000000"));
- writer.addRowBatch(batch);
-
- batch.reset();
- Random rand = new Random(42);
- for(int i=1970; i < 2038; ++i) {
- Timestamp ts = Timestamp.valueOf(i + "-05-05 12:34:56." + i);
- HiveDecimal dec =
- HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18));
- if ((i & 1) == 0) {
- setUnion(batch, batch.size++, ts, 0, i*i, null,
- new HiveDecimalWritable(dec));
- } else {
- setUnion(batch, batch.size++, ts, 1, null, Integer.toString(i*i),
- new HiveDecimalWritable(dec));
- }
- if (maxValue.compareTo(dec) < 0) {
- maxValue = dec;
- }
- }
- writer.addRowBatch(batch);
- batch.reset();
-
- // let's add a lot of constant rows to test the rle
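-    // with isRepeating set, element 0 of each column supplies the value for
-    // every row in the batch, which exercises the writer's run-length encoding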
- batch.size = 1000;
- for(int c=0; c < batch.cols.length; ++c) {
- batch.cols[c].setRepeating(true);
- }
- ((UnionColumnVector) batch.cols[1]).fields[0].isRepeating = true;
- setUnion(batch, 0, null, 0, 1732050807, null, null);
- for(int i=0; i < 5; ++i) {
- writer.addRowBatch(batch);
- }
-
- batch.reset();
- batch.size = 3;
- setUnion(batch, 0, null, 0, 0, null, null);
- setUnion(batch, 1, null, 0, 10, null, null);
- setUnion(batch, 2, null, 0, 138, null, null);
- writer.addRowBatch(batch);
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- schema = writer.getSchema();
- assertEquals(5, schema.getMaximumId());
- boolean[] expected = new boolean[] {false, false, false, false, false, false};
- boolean[] included = OrcUtils.includeColumns("", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, true, false, false, false, true};
- included = OrcUtils.includeColumns("time,decimal", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- expected = new boolean[] {false, false, true, true, true, false};
- included = OrcUtils.includeColumns("union", schema);
- assertEquals(true, Arrays.equals(expected, included));
-
- Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
- Assert.assertEquals(5077, reader.getNumberOfRows());
- DecimalColumnStatistics stats =
- (DecimalColumnStatistics) reader.getStatistics()[5];
- assertEquals(71, stats.getNumberOfValues());
- assertEquals(HiveDecimal.create("-5643.234"), stats.getMinimum());
- assertEquals(maxValue, stats.getMaximum());
- // TODO: fix this
-// assertEquals(null,stats.getSum());
- int stripeCount = 0;
- int rowCount = 0;
- long currentOffset = -1;
- for(StripeInformation stripe: reader.getStripes()) {
- stripeCount += 1;
- rowCount += stripe.getNumberOfRows();
- if (currentOffset < 0) {
- currentOffset = stripe.getOffset() + stripe.getLength();
- } else {
- assertEquals(currentOffset, stripe.getOffset());
- currentOffset += stripe.getLength();
- }
- }
- Assert.assertEquals(reader.getNumberOfRows(), rowCount);
- assertEquals(2, stripeCount);
- Assert.assertEquals(reader.getContentLength(), currentOffset);
- RecordReader rows = reader.rows();
- Assert.assertEquals(0, rows.getRowNumber());
- Assert.assertEquals(0.0, rows.getProgress(), 0.000001);
-
- schema = reader.getSchema();
- batch = schema.createRowBatch(74);
- Assert.assertEquals(0, rows.getRowNumber());
- rows.nextBatch(batch);
- assertEquals(74, batch.size);
- Assert.assertEquals(74, rows.getRowNumber());
- TimestampColumnVector ts = (TimestampColumnVector) batch.cols[0];
- UnionColumnVector union = (UnionColumnVector) batch.cols[1];
- LongColumnVector longs = (LongColumnVector) union.fields[0];
- BytesColumnVector strs = (BytesColumnVector) union.fields[1];
- DecimalColumnVector decs = (DecimalColumnVector) batch.cols[2];
-
- assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>",
- schema.toString());
- assertEquals("2000-03-12 15:00:00.0", ts.asScratchTimestamp(0).toString());
- assertEquals(0, union.tags[0]);
- assertEquals(42, longs.vector[0]);
- assertEquals("12345678.6547456", decs.vector[0].toString());
-
- assertEquals("2000-03-20 12:00:00.123456789", ts.asScratchTimestamp(1).toString());
- assertEquals(1, union.tags[1]);
- assertEquals("hello", strs.toString(1));
- assertEquals("-5643.234", decs.vector[1].toString());
-
- assertEquals(false, ts.noNulls);
- assertEquals(false, union.noNulls);
- assertEquals(false, decs.noNulls);
- assertEquals(true, ts.isNull[2]);
- assertEquals(true, union.isNull[2]);
- assertEquals(true, decs.isNull[2]);
-
- assertEquals(true, ts.isNull[3]);
- assertEquals(false, union.isNull[3]);
- assertEquals(0, union.tags[3]);
- assertEquals(true, longs.isNull[3]);
- assertEquals(true, decs.isNull[3]);
-
- assertEquals(true, ts.isNull[4]);
- assertEquals(false, union.isNull[4]);
- assertEquals(1, union.tags[4]);
- assertEquals(true, strs.isNull[4]);
- assertEquals(true, decs.isNull[4]);
-
- assertEquals(false, ts.isNull[5]);
- assertEquals("1970-01-01 00:00:00.0", ts.asScratchTimestamp(5).toString());
- assertEquals(false, union.isNull[5]);
- assertEquals(0, union.tags[5]);
- assertEquals(false, longs.isNull[5]);
- assertEquals(200000, longs.vector[5]);
- assertEquals(false, decs.isNull[5]);
- assertEquals("10000000000000000000", decs.vector[5].toString());
-
- rand = new Random(42);
- for(int i=1970; i < 2038; ++i) {
- int row = 6 + i - 1970;
- assertEquals(Timestamp.valueOf(i + "-05-05 12:34:56." + i),
- ts.asScratchTimestamp(row));
- if ((i & 1) == 0) {
- assertEquals(0, union.tags[row]);
- assertEquals(i*i, longs.vector[row]);
- } else {
- assertEquals(1, union.tags[row]);
- assertEquals(Integer.toString(i * i), strs.toString(row));
- }
- assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand),
- rand.nextInt(18))), decs.vector[row]);
- }
-
- // rebuild the row batch, so that we can read by 1000 rows
- batch = schema.createRowBatch(1000);
- ts = (TimestampColumnVector) batch.cols[0];
- union = (UnionColumnVector) batch.cols[1];
- longs = (LongColumnVector) union.fields[0];
- strs = (BytesColumnVector) union.fields[1];
- decs = (DecimalColumnVector) batch.cols[2];
-
- for(int i=0; i < 5; ++i) {
- rows.nextBatch(batch);
- assertEquals("batch " + i, 1000, batch.size);
- assertEquals("batch " + i, false, union.isRepeating);
- assertEquals("batch " + i, true, union.noNulls);
- for(int r=0; r < batch.size; ++r) {
- assertEquals("bad tag at " + i + "." +r, 0, union.tags[r]);
- }
- assertEquals("batch " + i, true, longs.isRepeating);
- assertEquals("batch " + i, 1732050807, longs.vector[0]);
- }
-
- rows.nextBatch(batch);
- assertEquals(3, batch.size);
- assertEquals(0, union.tags[0]);
- assertEquals(0, longs.vector[0]);
- assertEquals(0, union.tags[1]);
- assertEquals(10, longs.vector[1]);
- assertEquals(0, union.tags[2]);
- assertEquals(138, longs.vector[2]);
-
- rows.nextBatch(batch);
- assertEquals(0, batch.size);
- Assert.assertEquals(1.0, rows.getProgress(), 0.00001);
- Assert.assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
- rows.seekToRow(1);
- rows.nextBatch(batch);
- assertEquals(1000, batch.size);
- assertEquals(Timestamp.valueOf("2000-03-20 12:00:00.123456789"), ts.asScratchTimestamp(0));
- assertEquals(1, union.tags[0]);
- assertEquals("hello", strs.toString(0));
- assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), decs.vector[0]);
- rows.close();
- }
-
- /**
- * Read and write a randomly generated snappy file.
- * @throws Exception
- */
- @Test
- public void testSnappy() throws Exception {
- TypeDescription schema = createInnerSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(1000)
- .compress(CompressionKind.SNAPPY)
- .bufferSize(100));
- VectorizedRowBatch batch = schema.createRowBatch();
- Random rand = new Random(12);
- batch.size = 1000;
- for(int b=0; b < 10; ++b) {
- for (int r=0; r < 1000; ++r) {
- ((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
- ((BytesColumnVector) batch.cols[1]).setVal(r,
- Integer.toHexString(rand.nextInt()).getBytes());
- }
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind());
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch(1000);
- rand = new Random(12);
- LongColumnVector longs = (LongColumnVector) batch.cols[0];
- BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
- for(int b=0; b < 10; ++b) {
- rows.nextBatch(batch);
- assertEquals(1000, batch.size);
- for(int r=0; r < batch.size; ++r) {
- assertEquals(rand.nextInt(), longs.vector[r]);
- assertEquals(Integer.toHexString(rand.nextInt()), strs.toString(r));
- }
- }
- rows.nextBatch(batch);
- assertEquals(0, batch.size);
- rows.close();
- }
-
- /**
-   * Write a file with no row index (rowIndexStride = 0) and verify that it
-   * still reads back correctly.
- * @throws Exception
- */
- @Test
- public void testWithoutIndex() throws Exception {
- TypeDescription schema = createInnerSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(5000)
- .compress(CompressionKind.SNAPPY)
- .bufferSize(1000)
- .rowIndexStride(0));
- VectorizedRowBatch batch = schema.createRowBatch();
- Random rand = new Random(24);
- batch.size = 5;
- for(int c=0; c < batch.cols.length; ++c) {
- batch.cols[c].setRepeating(true);
- }
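-    // each batch carries 5 identical (repeating) rows, so 10000 batches yield
-    // the 50000 rows asserted below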
- for(int i=0; i < 10000; ++i) {
- ((LongColumnVector) batch.cols[0]).vector[0] = rand.nextInt();
- ((BytesColumnVector) batch.cols[1])
- .setVal(0, Integer.toBinaryString(rand.nextInt()).getBytes());
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- Assert.assertEquals(50000, reader.getNumberOfRows());
- Assert.assertEquals(0, reader.getRowIndexStride());
- StripeInformation stripe = reader.getStripes().iterator().next();
- assertEquals(true, stripe.getDataLength() != 0);
- assertEquals(0, stripe.getIndexLength());
- RecordReader rows = reader.rows();
- rand = new Random(24);
- batch = reader.getSchema().createRowBatch(1000);
- LongColumnVector longs = (LongColumnVector) batch.cols[0];
- BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
- for(int i=0; i < 50; ++i) {
- rows.nextBatch(batch);
- assertEquals("batch " + i, 1000, batch.size);
- for(int j=0; j < 200; ++j) {
- int intVal = rand.nextInt();
- String strVal = Integer.toBinaryString(rand.nextInt());
- for (int k = 0; k < 5; ++k) {
- assertEquals(intVal, longs.vector[j * 5 + k]);
- assertEquals(strVal, strs.toString(j * 5 + k));
- }
- }
- }
- rows.nextBatch(batch);
- assertEquals(0, batch.size);
- rows.close();
- }
-
- @Test
- public void testSeek() throws Exception {
- TypeDescription schema = createBigRowSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(200000)
- .bufferSize(65536)
- .rowIndexStride(1000));
- VectorizedRowBatch batch = schema.createRowBatch();
- Random rand = new Random(42);
- final int COUNT=32768;
- long[] intValues= new long[COUNT];
- double[] doubleValues = new double[COUNT];
- String[] stringValues = new String[COUNT];
- BytesWritable[] byteValues = new BytesWritable[COUNT];
- String[] words = new String[128];
- for(int i=0; i < words.length; ++i) {
- words[i] = Integer.toHexString(rand.nextInt());
- }
- for(int i=0; i < COUNT/2; ++i) {
- intValues[2*i] = rand.nextLong();
- intValues[2*i+1] = intValues[2*i];
- stringValues[2*i] = words[rand.nextInt(words.length)];
- stringValues[2*i+1] = stringValues[2*i];
- }
- for(int i=0; i < COUNT; ++i) {
- doubleValues[i] = rand.nextDouble();
- byte[] buf = new byte[20];
- rand.nextBytes(buf);
- byteValues[i] = new BytesWritable(buf);
- }
- for(int i=0; i < COUNT; ++i) {
- appendRandomRow(batch, intValues, doubleValues, stringValues,
- byteValues, words, i);
- if (batch.size == 1024) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- }
- if (batch.size != 0) {
- writer.addRowBatch(batch);
- }
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- Assert.assertEquals(COUNT, reader.getNumberOfRows());
- RecordReader rows = reader.rows();
- // get the row index
- DataReader meta = RecordReaderUtils.createDefaultDataReader(
- DataReaderProperties.builder()
- .withBufferSize(reader.getCompressionSize())
- .withFileSystem(fs)
- .withPath(testFilePath)
- .withCompression(reader.getCompressionKind())
- .withTypeCount(reader.getSchema().getMaximumId() + 1)
- .withZeroCopy(false)
- .build());
- OrcIndex index =
- meta.readRowIndex(reader.getStripes().get(0), null, null, null, null,
- null);
- // check the primitive columns to make sure they have the right number of
- // items in the first row group
- for(int c=1; c < 9; ++c) {
- OrcProto.RowIndex colIndex = index.getRowGroupIndex()[c];
- assertEquals(1000,
- colIndex.getEntry(0).getStatistics().getNumberOfValues());
- }
- batch = reader.getSchema().createRowBatch();
- int nextRowInBatch = -1;
- for(int i=COUNT-1; i >= 0; --i, --nextRowInBatch) {
- // if we have consumed the previous batch read a new one
- if (nextRowInBatch < 0) {
- long base = Math.max(i - 1023, 0);
- rows.seekToRow(base);
- Assert.assertEquals("row " + i, true, rows.nextBatch(batch));
- nextRowInBatch = batch.size - 1;
- }
- checkRandomRow(batch, intValues, doubleValues,
- stringValues, byteValues, words, i, nextRowInBatch);
- }
- rows.close();
- Iterator<StripeInformation> stripeIterator =
- reader.getStripes().iterator();
- long offsetOfStripe2 = 0;
- long offsetOfStripe4 = 0;
- long lastRowOfStripe2 = 0;
- for(int i = 0; i < 5; ++i) {
- StripeInformation stripe = stripeIterator.next();
- if (i < 2) {
- lastRowOfStripe2 += stripe.getNumberOfRows();
- } else if (i == 2) {
- offsetOfStripe2 = stripe.getOffset();
- lastRowOfStripe2 += stripe.getNumberOfRows() - 1;
- } else if (i == 4) {
- offsetOfStripe4 = stripe.getOffset();
- }
- }
- boolean[] columns = new boolean[reader.getStatistics().length];
-    columns[5] = true; // long column
- columns[9] = true; // text column
- rows = reader.rows(new Reader.Options()
- .range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2)
- .include(columns));
- rows.seekToRow(lastRowOfStripe2);
- // we only want two rows
- batch = reader.getSchema().createRowBatch(2);
- Assert.assertEquals(true, rows.nextBatch(batch));
- assertEquals(1, batch.size);
- assertEquals(intValues[(int) lastRowOfStripe2], getLong(batch, 0));
- assertEquals(stringValues[(int) lastRowOfStripe2],
- getText(batch, 0).toString());
- Assert.assertEquals(true, rows.nextBatch(batch));
- assertEquals(intValues[(int) lastRowOfStripe2 + 1], getLong(batch, 0));
- assertEquals(stringValues[(int) lastRowOfStripe2 + 1],
- getText(batch, 0).toString());
- rows.close();
- }
-
- private void appendRandomRow(VectorizedRowBatch batch,
- long[] intValues, double[] doubleValues,
- String[] stringValues,
- BytesWritable[] byteValues,
- String[] words, int i) {
- InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
- InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
- words[i % words.length] + "-x");
- setBigRow(batch, batch.size++, (intValues[i] & 1) == 0, (byte) intValues[i],
- (short) intValues[i], (int) intValues[i], intValues[i],
- (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
- new MiddleStruct(inner, inner2), list(), map(inner, inner2));
- }
-
- private void checkRandomRow(VectorizedRowBatch batch,
- long[] intValues, double[] doubleValues,
- String[] stringValues,
- BytesWritable[] byteValues,
- String[] words, int i, int rowInBatch) {
- InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
- InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
- words[i % words.length] + "-x");
- checkBigRow(batch, rowInBatch, i, (intValues[i] & 1) == 0, (byte) intValues[i],
- (short) intValues[i], (int) intValues[i], intValues[i],
- (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
- new MiddleStruct(inner, inner2), list(), map(inner, inner2));
- }
-
- private static class MyMemoryManager extends MemoryManager {
- final long totalSpace;
- double rate;
- Path path = null;
- long lastAllocation = 0;
- int rows = 0;
- Callback callback;
-
- MyMemoryManager(Configuration conf, long totalSpace, double rate) {
- super(conf);
- this.totalSpace = totalSpace;
- this.rate = rate;
- }
-
- @Override
- public void addWriter(Path path, long requestedAllocation,
- Callback callback) {
- this.path = path;
- this.lastAllocation = requestedAllocation;
- this.callback = callback;
- }
-
- @Override
- public synchronized void removeWriter(Path path) {
- this.path = null;
- this.lastAllocation = 0;
- }
-
- @Override
- public long getTotalMemoryPool() {
- return totalSpace;
- }
-
- @Override
- public double getAllocationScale() {
- return rate;
- }
-
- @Override
- public void addedRow(int count) throws IOException {
- rows += count;
- if (rows % 100 == 0) {
- callback.checkMemory(rate);
- }
- }
- }
-
- @Test
- public void testMemoryManagementV11() throws Exception {
- TypeDescription schema = createInnerSchema();
- MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .compress(CompressionKind.NONE)
- .stripeSize(50000)
- .bufferSize(100)
- .rowIndexStride(0)
- .memory(memory)
- .version(OrcFile.Version.V_0_11));
- assertEquals(testFilePath, memory.path);
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.size = 1;
- for(int i=0; i < 2500; ++i) {
- ((LongColumnVector) batch.cols[0]).vector[0] = i * 300;
- ((BytesColumnVector) batch.cols[1]).setVal(0,
- Integer.toHexString(10*i).getBytes());
- writer.addRowBatch(batch);
- }
- writer.close();
- assertEquals(null, memory.path);
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- int i = 0;
- for(StripeInformation stripe: reader.getStripes()) {
- i += 1;
- assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
- stripe.getDataLength() < 5000);
- }
- assertEquals(25, i);
- assertEquals(2500, reader.getNumberOfRows());
- }
-
- @Test
- public void testMemoryManagementV12() throws Exception {
- TypeDescription schema = createInnerSchema();
- MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .compress(CompressionKind.NONE)
- .stripeSize(50000)
- .bufferSize(100)
- .rowIndexStride(0)
- .memory(memory)
- .version(OrcFile.Version.V_0_12));
- VectorizedRowBatch batch = schema.createRowBatch();
- assertEquals(testFilePath, memory.path);
- batch.size = 1;
- for(int i=0; i < 2500; ++i) {
- ((LongColumnVector) batch.cols[0]).vector[0] = i * 300;
- ((BytesColumnVector) batch.cols[1]).setVal(0,
- Integer.toHexString(10*i).getBytes());
- writer.addRowBatch(batch);
- }
- writer.close();
- assertEquals(null, memory.path);
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- int i = 0;
- for(StripeInformation stripe: reader.getStripes()) {
- i += 1;
- assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
- stripe.getDataLength() < 5000);
- }
- // with HIVE-7832, the dictionaries will be disabled after writing the first
- // stripe as there are too many distinct values. Hence only 3 stripes as
- // compared to 25 stripes in version 0.11 (above test case)
- assertEquals(3, i);
- assertEquals(2500, reader.getNumberOfRows());
- }
-
- @Test
- public void testPredicatePushdown() throws Exception {
- TypeDescription schema = createInnerSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(400000L)
- .compress(CompressionKind.NONE)
- .bufferSize(500)
- .rowIndexStride(1000));
- VectorizedRowBatch batch = schema.createRowBatch();
- batch.ensureSize(3500);
- batch.size = 3500;
- for(int i=0; i < 3500; ++i) {
- ((LongColumnVector) batch.cols[0]).vector[i] = i * 300;
- ((BytesColumnVector) batch.cols[1]).setVal(i,
- Integer.toHexString(10*i).getBytes());
- }
- writer.addRowBatch(batch);
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(3500, reader.getNumberOfRows());
-
- SearchArgument sarg = SearchArgumentFactory.newBuilder()
- .startAnd()
- .startNot()
- .lessThan("int1", PredicateLeaf.Type.LONG, 300000L)
- .end()
- .lessThan("int1", PredicateLeaf.Type.LONG, 600000L)
- .end()
- .build();
- RecordReader rows = reader.rows(new Reader.Options()
- .range(0L, Long.MAX_VALUE)
- .include(new boolean[]{true, true, true})
- .searchArgument(sarg, new String[]{null, "int1", "string1"}));
- batch = reader.getSchema().createRowBatch(2000);
- LongColumnVector ints = (LongColumnVector) batch.cols[0];
- BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
-
- Assert.assertEquals(1000L, rows.getRowNumber());
- Assert.assertEquals(true, rows.nextBatch(batch));
- assertEquals(1000, batch.size);
-
- for(int i=1000; i < 2000; ++i) {
- assertEquals(300 * i, ints.vector[i - 1000]);
- assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000));
- }
- Assert.assertEquals(false, rows.nextBatch(batch));
- Assert.assertEquals(3500, rows.getRowNumber());
-
- // look through the file with no rows selected
- sarg = SearchArgumentFactory.newBuilder()
- .startAnd()
- .lessThan("int1", PredicateLeaf.Type.LONG, 0L)
- .end()
- .build();
- rows = reader.rows(new Reader.Options()
- .range(0L, Long.MAX_VALUE)
- .include(new boolean[]{true, true, true})
- .searchArgument(sarg, new String[]{null, "int1", "string1"}));
- Assert.assertEquals(3500L, rows.getRowNumber());
- assertTrue(!rows.nextBatch(batch));
-
- // select first 100 and last 100 rows
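-    // note: predicate pushdown skips whole 1000-row groups, so the reader
-    // returns the groups covering rows 0-999 and 3000-3499, not exactly 200 rows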
- sarg = SearchArgumentFactory.newBuilder()
- .startOr()
- .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 100)
- .startNot()
- .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 3400)
- .end()
- .end()
- .build();
- rows = reader.rows(new Reader.Options()
- .range(0L, Long.MAX_VALUE)
- .include(new boolean[]{true, true, true})
- .searchArgument(sarg, new String[]{null, "int1", "string1"}));
- Assert.assertEquals(0, rows.getRowNumber());
- Assert.assertEquals(true, rows.nextBatch(batch));
- assertEquals(1000, batch.size);
- Assert.assertEquals(3000, rows.getRowNumber());
- for(int i=0; i < 1000; ++i) {
- assertEquals(300 * i, ints.vector[i]);
- assertEquals(Integer.toHexString(10*i), strs.toString(i));
- }
-
- Assert.assertEquals(true, rows.nextBatch(batch));
- assertEquals(500, batch.size);
- Assert.assertEquals(3500, rows.getRowNumber());
- for(int i=3000; i < 3500; ++i) {
- assertEquals(300 * i, ints.vector[i - 3000]);
- assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000));
- }
- Assert.assertEquals(false, rows.nextBatch(batch));
- Assert.assertEquals(3500, rows.getRowNumber());
- }
-
- /**
- * Test all of the types that have distinct ORC writers using the vectorized
- * writer with different combinations of repeating and null values.
- * @throws Exception
- */
- @T
<TRUNCATED>
[05/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestOrcTimezone2.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestOrcTimezone2.java b/orc/src/test/org/apache/orc/TestOrcTimezone2.java
deleted file mode 100644
index 4a02855..0000000
--- a/orc/src/test/org/apache/orc/TestOrcTimezone2.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.io.File;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.Random;
-import java.util.TimeZone;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-
-import com.google.common.collect.Lists;
-
-/**
- * Tests that timestamps written under one JVM default time zone are read back
- * unchanged under a different default time zone.
- */
-@RunWith(Parameterized.class)
-public class TestOrcTimezone2 {
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
- String writerTimeZone;
- String readerTimeZone;
- static TimeZone defaultTimeZone = TimeZone.getDefault();
-
- public TestOrcTimezone2(String writerTZ, String readerTZ) {
- this.writerTimeZone = writerTZ;
- this.readerTimeZone = readerTZ;
- }
-
- @Parameterized.Parameters
- public static Collection<Object[]> data() {
- String[] allTimeZones = TimeZone.getAvailableIDs();
- Random rand = new Random(123);
- int len = allTimeZones.length;
- int n = 500;
- Object[][] data = new Object[n][];
- for (int i = 0; i < n; i++) {
- int wIdx = rand.nextInt(len);
- int rIdx = rand.nextInt(len);
- data[i] = new Object[2];
- data[i][0] = allTimeZones[wIdx];
- data[i][1] = allTimeZones[rIdx];
- }
- return Arrays.asList(data);
- }
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @After
- public void restoreTimeZone() {
- TimeZone.setDefault(defaultTimeZone);
- }
-
- @Test
- public void testTimestampWriter() throws Exception {
- TypeDescription schema = TypeDescription.createTimestamp();
-
- TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema)
- .stripeSize(100000).bufferSize(10000));
- assertEquals(writerTimeZone, TimeZone.getDefault().getID());
- List<String> ts = Lists.newArrayList();
- ts.add("2003-01-01 01:00:00.000000222");
- ts.add("1999-01-01 02:00:00.999999999");
- ts.add("1995-01-02 03:00:00.688888888");
- ts.add("2002-01-01 04:00:00.1");
- ts.add("2010-03-02 05:00:00.000009001");
- ts.add("2005-01-01 06:00:00.000002229");
- ts.add("2006-01-01 07:00:00.900203003");
- ts.add("2003-01-01 08:00:00.800000007");
- ts.add("1996-08-02 09:00:00.723100809");
- ts.add("1998-11-02 10:00:00.857340643");
- ts.add("2008-10-02 11:00:00.0");
- ts.add("2037-01-01 00:00:00.000999");
- VectorizedRowBatch batch = schema.createRowBatch();
- TimestampColumnVector tsc = (TimestampColumnVector) batch.cols[0];
- for (String t : ts) {
- tsc.set(batch.size++, Timestamp.valueOf(t));
- }
- writer.addRowBatch(batch);
- writer.close();
-
- TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(readerTimeZone, TimeZone.getDefault().getID());
- RecordReader rows = reader.rows();
- int idx = 0;
- batch = reader.getSchema().createRowBatch();
- tsc = (TimestampColumnVector) batch.cols[0];
- while (rows.nextBatch(batch)) {
- for (int r=0; r < batch.size; ++r) {
- assertEquals(ts.get(idx++), tsc.asScratchTimestamp(r).toString());
- }
- }
- rows.close();
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestOrcTimezone3.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestOrcTimezone3.java b/orc/src/test/org/apache/orc/TestOrcTimezone3.java
deleted file mode 100644
index 40ab0c9..0000000
--- a/orc/src/test/org/apache/orc/TestOrcTimezone3.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.io.File;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.TimeZone;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-
-import com.google.common.collect.Lists;
-
-import junit.framework.Assert;
-
-/**
- * Same timestamp round-trip check as TestOrcTimezone2, but for pre-epoch
- * timestamps with a fixed America/Chicago writer and America/Los_Angeles reader.
- */
-@RunWith(Parameterized.class)
-public class TestOrcTimezone3 {
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
- String writerTimeZone;
- String readerTimeZone;
- static TimeZone defaultTimeZone = TimeZone.getDefault();
-
- public TestOrcTimezone3(String writerTZ, String readerTZ) {
- this.writerTimeZone = writerTZ;
- this.readerTimeZone = readerTZ;
- }
-
- @Parameterized.Parameters
- public static Collection<Object[]> data() {
- List<Object[]> result = Arrays.asList(new Object[][]{
- {"America/Chicago", "America/Los_Angeles"},
- });
- return result;
- }
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcTimezone3." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @After
- public void restoreTimeZone() {
- TimeZone.setDefault(defaultTimeZone);
- }
-
- @Test
- public void testTimestampWriter() throws Exception {
- TypeDescription schema = TypeDescription.createTimestamp();
-
- TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
- .bufferSize(10000));
- assertEquals(writerTimeZone, TimeZone.getDefault().getID());
- List<String> ts = Lists.newArrayList();
- ts.add("1969-12-31 16:00:14.007");
- ts.add("1969-12-31 16:00:06.021");
- ts.add("1969-12-31 16:00:03.963");
- VectorizedRowBatch batch = schema.createRowBatch();
- TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
- for (String t : ts) {
- times.set(batch.size++, Timestamp.valueOf(t));
- }
- writer.addRowBatch(batch);
- writer.close();
-
- TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(readerTimeZone, TimeZone.getDefault().getID());
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- times = (TimestampColumnVector) batch.cols[0];
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
- }
- }
- rows.close();
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestStringDictionary.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestStringDictionary.java b/orc/src/test/org/apache/orc/TestStringDictionary.java
deleted file mode 100644
index 46209bb..0000000
--- a/orc/src/test/org/apache/orc/TestStringDictionary.java
+++ /dev/null
@@ -1,290 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-
-import org.apache.orc.impl.RecordReaderImpl;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-public class TestStringDictionary {
-
- Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
- + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testTooManyDistinct() throws Exception {
- TypeDescription schema = TypeDescription.createString();
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema)
- .compress(CompressionKind.NONE)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- BytesColumnVector col = (BytesColumnVector) batch.cols[0];
- for (int i = 0; i < 20000; i++) {
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- col.setVal(batch.size++, String.valueOf(i).getBytes());
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- col = (BytesColumnVector) batch.cols[0];
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(String.valueOf(idx++), col.toString(r));
- }
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
- // hacky but does the job; this casting will work as long as this test resides
- // within the same package as the ORC reader
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
- }
- }
- }
-
- @Test
- public void testHalfDistinct() throws Exception {
- TypeDescription schema = TypeDescription.createString();
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE)
- .bufferSize(10000));
- Random rand = new Random(123);
- int[] input = new int[20000];
- for (int i = 0; i < 20000; i++) {
- input[i] = rand.nextInt(10000);
- }
-
- VectorizedRowBatch batch = schema.createRowBatch();
- BytesColumnVector col = (BytesColumnVector) batch.cols[0];
- for (int i = 0; i < 20000; i++) {
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- col.setVal(batch.size++, String.valueOf(input[i]).getBytes());
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- col = (BytesColumnVector) batch.cols[0];
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(String.valueOf(input[idx++]), col.toString(r));
- }
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
- // hacky but does the job; this casting will work as long as this test resides
- // within the same package as the ORC reader
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
- }
- }
- }
-
- @Test
- public void testTooManyDistinctCheckDisabled() throws Exception {
- TypeDescription schema = TypeDescription.createString();
-
- conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), false);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- BytesColumnVector string = (BytesColumnVector) batch.cols[0];
- for (int i = 0; i < 20000; i++) {
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- string.setVal(batch.size++, String.valueOf(i).getBytes());
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- string = (BytesColumnVector) batch.cols[0];
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(String.valueOf(idx++), string.toString(r));
- }
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
- // hacky but does the job; this casting will work as long as this test resides
- // within the same package as the ORC reader
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
- }
- }
- }
-
- @Test
- public void testHalfDistinctCheckDisabled() throws Exception {
- TypeDescription schema = TypeDescription.createString();
-
- conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(),
- false);
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema)
- .compress(CompressionKind.NONE)
- .bufferSize(10000));
- Random rand = new Random(123);
- int[] input = new int[20000];
- for (int i = 0; i < 20000; i++) {
- input[i] = rand.nextInt(10000);
- }
- VectorizedRowBatch batch = schema.createRowBatch();
- BytesColumnVector string = (BytesColumnVector) batch.cols[0];
- for (int i = 0; i < 20000; i++) {
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- string.setVal(batch.size++, String.valueOf(input[i]).getBytes());
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- string = (BytesColumnVector) batch.cols[0];
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(String.valueOf(input[idx++]), string.toString(r));
- }
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
- // hacky but does the job; this casting will work as long as this test resides
- // within the same package as the ORC reader
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
- }
- }
- }
-
- @Test
- public void testTooManyDistinctV11AlwaysDictionary() throws Exception {
- TypeDescription schema = TypeDescription.createString();
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema)
- .compress(CompressionKind.NONE)
- .version(OrcFile.Version.V_0_11).bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- BytesColumnVector string = (BytesColumnVector) batch.cols[0];
- for (int i = 0; i < 20000; i++) {
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- string.setVal(batch.size++, String.valueOf(i).getBytes());
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- batch = reader.getSchema().createRowBatch();
- string = (BytesColumnVector) batch.cols[0];
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(String.valueOf(idx++), string.toString(r));
- }
- }
-
- // make sure the encoding type is correct
- for (StripeInformation stripe : reader.getStripes()) {
- // hacky but does the job; this casting will work as long as this test resides
- // within the same package as the ORC reader
- OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY, encoding.getKind());
- }
- }
-
- }
-
-}
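The tests above flip a string column between DICTIONARY_V2 and DIRECT_V2 by varying the number of distinct values and by disabling the per-stride dictionary check. For orientation, a minimal sketch of steering that decision from the writer's configuration, assuming the same OrcConf accessors used above (ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute() appears in the tests; a matching getAttribute() accessor for DICTIONARY_KEY_SIZE_THRESHOLD, which WriterImpl reads later in this change, is assumed):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.CompressionKind;
    import org.apache.orc.OrcConf;
    import org.apache.orc.OrcFile;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.Writer;

    // Illustrative sketch only; not part of this change.
    public class DictionaryTuningSketch {
      public static Writer createWriter(Configuration conf, Path path) throws java.io.IOException {
        // A threshold of 0 disables dictionary encoding up front: WriterImpl only
        // enables it when the threshold is at least a small epsilon.
        conf.setDouble(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.0);
        // Skip the per-stride re-check, as testTooManyDistinctCheckDisabled does above.
        conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), false);
        TypeDescription schema = TypeDescription.createString();
        return OrcFile.createWriter(path,
            OrcFile.writerOptions(conf)
                .setSchema(schema)
                .compress(CompressionKind.NONE)
                .bufferSize(10000));
      }
    }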
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestTypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestTypeDescription.java b/orc/src/test/org/apache/orc/TestTypeDescription.java
deleted file mode 100644
index 27516be..0000000
--- a/orc/src/test/org/apache/orc/TestTypeDescription.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.orc.TypeDescription;
-import org.junit.Test;
-
-public class TestTypeDescription {
-
- @Test
- public void testJson() {
- TypeDescription bin = TypeDescription.createBinary();
- assertEquals("{\"category\": \"binary\", \"id\": 0, \"max\": 0}",
- bin.toJson());
- assertEquals("binary", bin.toString());
- TypeDescription struct = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createInt())
- .addField("f2", TypeDescription.createString())
- .addField("f3", TypeDescription.createDecimal());
- assertEquals("struct<f1:int,f2:string,f3:decimal(38,10)>",
- struct.toString());
- assertEquals("{\"category\": \"struct\", \"id\": 0, \"max\": 3, \"fields\": [\n"
- + " \"f1\": {\"category\": \"int\", \"id\": 1, \"max\": 1},\n"
- + " \"f2\": {\"category\": \"string\", \"id\": 2, \"max\": 2},\n"
- + " \"f3\": {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 38, \"scale\": 10}]}",
- struct.toJson());
- struct = TypeDescription.createStruct()
- .addField("f1", TypeDescription.createUnion()
- .addUnionChild(TypeDescription.createByte())
- .addUnionChild(TypeDescription.createDecimal()
- .withPrecision(20).withScale(10)))
- .addField("f2", TypeDescription.createStruct()
- .addField("f3", TypeDescription.createDate())
- .addField("f4", TypeDescription.createDouble())
- .addField("f5", TypeDescription.createBoolean()))
- .addField("f6", TypeDescription.createChar().withMaxLength(100));
- assertEquals("struct<f1:uniontype<tinyint,decimal(20,10)>,f2:struct<f3:date,f4:double,f5:boolean>,f6:char(100)>",
- struct.toString());
- assertEquals(
- "{\"category\": \"struct\", \"id\": 0, \"max\": 8, \"fields\": [\n" +
- " \"f1\": {\"category\": \"uniontype\", \"id\": 1, \"max\": 3, \"children\": [\n" +
- " {\"category\": \"tinyint\", \"id\": 2, \"max\": 2},\n" +
- " {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 20, \"scale\": 10}]},\n" +
- " \"f2\": {\"category\": \"struct\", \"id\": 4, \"max\": 7, \"fields\": [\n" +
- " \"f3\": {\"category\": \"date\", \"id\": 5, \"max\": 5},\n" +
- " \"f4\": {\"category\": \"double\", \"id\": 6, \"max\": 6},\n" +
- " \"f5\": {\"category\": \"boolean\", \"id\": 7, \"max\": 7}]},\n" +
- " \"f6\": {\"category\": \"char\", \"id\": 8, \"max\": 8, \"length\": 100}]}",
- struct.toJson());
- }
-
- @Test
- public void testEquals() {
- TypeDescription type1 =
- TypeDescription.createStruct()
- .addField("a", TypeDescription.createInt())
- .addField("b", TypeDescription.createStruct()
- .addField("x", TypeDescription.createString())
- .addField("y", TypeDescription.createBinary())
- .addField("z", TypeDescription.createDouble()))
- .addField("c", TypeDescription.createString());
- assertEquals(0, type1.getId());
- assertEquals(6, type1.getMaximumId());
- TypeDescription type2 =
- TypeDescription.createStruct()
- .addField("x", TypeDescription.createString())
- .addField("y", TypeDescription.createBinary())
- .addField("z", TypeDescription.createDouble());
- assertEquals(0, type2.getId());
- assertEquals(3, type2.getMaximumId());
- assertEquals(type2, type1.getChildren().get(1));
- assertEquals(type2.hashCode(), type1.getChildren().get(1).hashCode());
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestUnrolledBitPack.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestUnrolledBitPack.java b/orc/src/test/org/apache/orc/TestUnrolledBitPack.java
deleted file mode 100644
index ef8fcd0..0000000
--- a/orc/src/test/org/apache/orc/TestUnrolledBitPack.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import com.google.common.collect.Lists;
-import com.google.common.primitives.Longs;
-
-@RunWith(value = Parameterized.class)
-public class TestUnrolledBitPack {
-
- private long val;
-
- public TestUnrolledBitPack(long val) {
- this.val = val;
- }
-
- @Parameters
- public static Collection<Object[]> data() {
- Object[][] data = new Object[][] { { -1 }, { 1 }, { 7 }, { -128 }, { 32000 }, { 8300000 },
- { Integer.MAX_VALUE }, { 540000000000L }, { 140000000000000L }, { 36000000000000000L },
- { Long.MAX_VALUE } };
- return Arrays.asList(data);
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
- + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testBitPacking() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0,
- val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val,
- 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0,
- 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0,
- val, 0, val, 0, 0, val, 0, val, 0, 0, val, val };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- for (Long l : input) {
- int row = batch.size++;
- ((LongColumnVector) batch.cols[0]).vector[row] = l;
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
-}
[14/37] hive git commit: HIVE-17118. Move the hive-orc source files to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/WriterImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/WriterImpl.java b/orc/src/java/org/apache/orc/impl/WriterImpl.java
deleted file mode 100644
index d703563..0000000
--- a/orc/src/java/org/apache/orc/impl/WriterImpl.java
+++ /dev/null
@@ -1,2446 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import static com.google.common.base.Preconditions.checkArgument;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.TimeZone;
-import java.util.TreeMap;
-
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.hive.ql.util.JavaDataModel;
-import org.apache.orc.BinaryColumnStatistics;
-import org.apache.orc.BloomFilterIO;
-import org.apache.orc.OrcConf;
-import org.apache.orc.OrcFile;
-import org.apache.orc.OrcProto;
-import org.apache.orc.OrcProto.BloomFilterIndex;
-import org.apache.orc.OrcProto.RowIndex;
-import org.apache.orc.OrcProto.Stream;
-import org.apache.orc.OrcUtils;
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.io.Text;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.Lists;
-import com.google.common.primitives.Longs;
-import com.google.protobuf.ByteString;
-
-/**
- * An ORC file writer. The file is divided into stripes, each of which is a
- * natural unit of work when reading. Each stripe is buffered in memory until the
- * memory reaches the stripe size and then it is written out broken down by
- * columns. Each column is written by a TreeWriter that is specific to that
- * type of column. TreeWriters may have children TreeWriters that handle the
- * sub-types. Each of the TreeWriters writes the column's data as a set of
- * streams.
- *
- * This class is unsynchronized like most Stream objects, so the creation of
- * an OrcFile and all access to a single instance must happen from a single
- * thread.
- *
- * There are no known cases today where these happen from different threads.
- *
- * Caveat: the MemoryManager is created during WriterOptions creation and has
- * to be confined to a single thread as well.
- *
- */
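The class comment above describes the write path: row batches are buffered into the current stripe, each column is handled by a type-specific TreeWriter, and the stripe is flushed once it reaches the configured size. A minimal, illustrative sketch of the caller's side of that contract, using only the public API already exercised by the deleted tests in this change (the path and schema are arbitrary placeholders):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.OrcFile;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.Writer;

    // Usage sketch only; mirrors the pattern in the tests above.
    public class WriterUsageSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        TypeDescription schema = TypeDescription.createStruct()
            .addField("x", TypeDescription.createLong());
        Writer writer = OrcFile.createWriter(new Path("/tmp/writer-impl-sketch.orc"),
            OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000));
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector x = (LongColumnVector) batch.cols[0];
        for (long i = 0; i < 100000; ++i) {
          if (batch.size == batch.getMaxSize()) {
            writer.addRowBatch(batch);  // buffered into the current stripe by the TreeWriters
            batch.reset();
          }
          x.vector[batch.size++] = i;
        }
        writer.addRowBatch(batch);
        writer.close();                 // flushes the final stripe and writes the file footer
      }
    }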
-public class WriterImpl implements Writer, MemoryManager.Callback {
-
- private static final Logger LOG = LoggerFactory.getLogger(WriterImpl.class);
-
- private static final int MIN_ROW_INDEX_STRIDE = 1000;
-
- private final Path path;
- private final int rowIndexStride;
- private final TypeDescription schema;
-
- @VisibleForTesting
- protected final PhysicalWriter physWriter;
- private int columnCount;
- private long rowCount = 0;
- private long rowsInStripe = 0;
- private long rawDataSize = 0;
- private int rowsInIndex = 0;
- private int stripesAtLastFlush = -1;
- private final List<OrcProto.StripeInformation> stripes =
- new ArrayList<OrcProto.StripeInformation>();
- private final Map<String, ByteString> userMetadata =
- new TreeMap<String, ByteString>();
- private final StreamFactory streamFactory = new StreamFactory();
- private final TreeWriter treeWriter;
- private final boolean buildIndex;
- private final MemoryManager memoryManager;
- private final OrcFile.Version version;
- private final Configuration conf;
- private final OrcFile.WriterCallback callback;
- private final OrcFile.WriterContext callbackContext;
- private final OrcFile.EncodingStrategy encodingStrategy;
- private final boolean[] bloomFilterColumns;
- private final double bloomFilterFpp;
- private boolean writeTimeZone;
-
- public WriterImpl(FileSystem fs,
- Path path,
- OrcFile.WriterOptions opts) throws IOException {
- this(new PhysicalFsWriter(fs, path, opts.getSchema().getMaximumId() + 1, opts), path, opts);
- }
-
- public WriterImpl(PhysicalWriter writer,
- Path pathForMem,
- OrcFile.WriterOptions opts) throws IOException {
- this.physWriter = writer;
- this.path = pathForMem;
- this.conf = opts.getConfiguration();
- this.schema = opts.getSchema();
- this.callback = opts.getCallback();
- if (callback != null) {
- callbackContext = new OrcFile.WriterContext(){
-
- @Override
- public Writer getWriter() {
- return WriterImpl.this;
- }
- };
- } else {
- callbackContext = null;
- }
- this.version = opts.getVersion();
- this.encodingStrategy = opts.getEncodingStrategy();
- this.rowIndexStride = opts.getRowIndexStride();
- this.memoryManager = opts.getMemoryManager();
- buildIndex = rowIndexStride > 0;
- if (version == OrcFile.Version.V_0_11) {
- /* do not write bloom filters for ORC v11 */
- this.bloomFilterColumns = new boolean[schema.getMaximumId() + 1];
- } else {
- this.bloomFilterColumns =
- OrcUtils.includeColumns(opts.getBloomFilterColumns(), schema);
- }
- this.bloomFilterFpp = opts.getBloomFilterFpp();
- treeWriter = createTreeWriter(schema, streamFactory, false);
- if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
- throw new IllegalArgumentException("Row stride must be at least " +
- MIN_ROW_INDEX_STRIDE);
- }
-
- // ensure that we are able to handle callbacks before we register ourselves
- if (path != null) {
- memoryManager.addWriter(path, opts.getStripeSize(), this);
- }
- }
-
- @Override
- public boolean checkMemory(double newScale) throws IOException {
- long limit = (long) Math.round(physWriter.getPhysicalStripeSize() * newScale);
- long size = estimateStripeSize();
- if (LOG.isDebugEnabled()) {
- LOG.debug("ORC writer " + path + " size = " + size + " limit = " +
- limit);
- }
- if (size > limit) {
- flushStripe();
- return true;
- }
- return false;
- }
-
- private static class RowIndexPositionRecorder implements PositionRecorder {
- private final OrcProto.RowIndexEntry.Builder builder;
-
- RowIndexPositionRecorder(OrcProto.RowIndexEntry.Builder builder) {
- this.builder = builder;
- }
-
- @Override
- public void addPosition(long position) {
- builder.addPositions(position);
- }
- }
-
- /**
- * Interface from the Writer to the TreeWriters. This limits the visibility
- * that the TreeWriters have into the Writer.
- */
- private class StreamFactory {
- /**
- * Create a stream to store part of a column.
- * @param column the column id for the stream
- * @param kind the kind of stream
- * @return The output outStream that the section needs to be written to.
- * @throws IOException
- */
- public OutStream createStream(int column,
- OrcProto.Stream.Kind kind
- ) throws IOException {
- final StreamName name = new StreamName(column, kind);
- return physWriter.getOrCreatePhysicalStream(name);
- }
-
- public void writeIndex(int column, RowIndex.Builder rowIndex) throws IOException {
- physWriter.writeIndexStream(new StreamName(column, Stream.Kind.ROW_INDEX), rowIndex);
- }
-
- public void writeBloomFilter(
- int column, BloomFilterIndex.Builder bloomFilterIndex) throws IOException {
- physWriter.writeBloomFilterStream(
- new StreamName(column, Stream.Kind.BLOOM_FILTER), bloomFilterIndex);
- }
- /**
- * Get the next column id.
- * @return a number from 0 to the number of columns - 1
- */
- public int getNextColumnId() {
- return columnCount++;
- }
-
- /**
- * Get the stride rate of the row index.
- */
- public int getRowIndexStride() {
- return rowIndexStride;
- }
-
- /**
- * Should the writer build the row index?
- * @return true if we are building the index
- */
- public boolean buildIndex() {
- return buildIndex;
- }
-
- /**
- * Is the ORC file compressed?
- * @return are the streams compressed
- */
- public boolean isCompressed() {
- return physWriter.isCompressed();
- }
-
- /**
- * Get the encoding strategy to use.
- * @return encoding strategy
- */
- public OrcFile.EncodingStrategy getEncodingStrategy() {
- return encodingStrategy;
- }
-
- /**
- * Get the bloom filter columns
- * @return bloom filter columns
- */
- public boolean[] getBloomFilterColumns() {
- return bloomFilterColumns;
- }
-
- /**
- * Get bloom filter false positive percentage.
- * @return fpp
- */
- public double getBloomFilterFPP() {
- return bloomFilterFpp;
- }
-
- /**
- * Get the writer's configuration.
- * @return configuration
- */
- public Configuration getConfiguration() {
- return conf;
- }
-
- /**
- * Get the version of the file to write.
- */
- public OrcFile.Version getVersion() {
- return version;
- }
-
- public void useWriterTimeZone(boolean val) {
- writeTimeZone = val;
- }
-
- public boolean hasWriterTimeZone() {
- return writeTimeZone;
- }
- }
-
- /**
- * The parent class of all of the writers for each column. Each column
- * is written by an instance of this class. The compound types (struct,
- * list, map, and union) have children tree writers that write the children
- * types.
- */
- private abstract static class TreeWriter {
- protected final int id;
- protected final BitFieldWriter isPresent;
- private final boolean isCompressed;
- protected final ColumnStatisticsImpl indexStatistics;
- protected final ColumnStatisticsImpl stripeColStatistics;
- private final ColumnStatisticsImpl fileStatistics;
- protected TreeWriter[] childrenWriters;
- protected final RowIndexPositionRecorder rowIndexPosition;
- private final OrcProto.RowIndex.Builder rowIndex;
- private final OrcProto.RowIndexEntry.Builder rowIndexEntry;
- protected final BloomFilterIO bloomFilter;
- protected final boolean createBloomFilter;
- private final OrcProto.BloomFilterIndex.Builder bloomFilterIndex;
- private final OrcProto.BloomFilter.Builder bloomFilterEntry;
- private boolean foundNulls;
- private OutStream isPresentOutStream;
- private final List<OrcProto.StripeStatistics.Builder> stripeStatsBuilders;
- private final StreamFactory streamFactory;
-
- /**
- * Create a tree writer.
- * @param columnId the column id of the column to write
- * @param schema the row schema
- * @param streamFactory limited access to the Writer's data.
- * @param nullable can the value be null?
- * @throws IOException
- */
- TreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory streamFactory,
- boolean nullable) throws IOException {
- this.streamFactory = streamFactory;
- this.isCompressed = streamFactory.isCompressed();
- this.id = columnId;
- if (nullable) {
- isPresentOutStream = streamFactory.createStream(id,
- OrcProto.Stream.Kind.PRESENT);
- isPresent = new BitFieldWriter(isPresentOutStream, 1);
- } else {
- isPresent = null;
- }
- this.foundNulls = false;
- createBloomFilter = streamFactory.getBloomFilterColumns()[columnId];
- indexStatistics = ColumnStatisticsImpl.create(schema);
- stripeColStatistics = ColumnStatisticsImpl.create(schema);
- fileStatistics = ColumnStatisticsImpl.create(schema);
- childrenWriters = new TreeWriter[0];
- rowIndex = OrcProto.RowIndex.newBuilder();
- rowIndexEntry = OrcProto.RowIndexEntry.newBuilder();
- rowIndexPosition = new RowIndexPositionRecorder(rowIndexEntry);
- stripeStatsBuilders = Lists.newArrayList();
- if (createBloomFilter) {
- bloomFilterEntry = OrcProto.BloomFilter.newBuilder();
- bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder();
- bloomFilter = new BloomFilterIO(streamFactory.getRowIndexStride(),
- streamFactory.getBloomFilterFPP());
- } else {
- bloomFilterEntry = null;
- bloomFilterIndex = null;
- bloomFilter = null;
- }
- }
-
- protected OrcProto.RowIndex.Builder getRowIndex() {
- return rowIndex;
- }
-
- protected ColumnStatisticsImpl getStripeStatistics() {
- return stripeColStatistics;
- }
-
- protected OrcProto.RowIndexEntry.Builder getRowIndexEntry() {
- return rowIndexEntry;
- }
-
- IntegerWriter createIntegerWriter(PositionedOutputStream output,
- boolean signed, boolean isDirectV2,
- StreamFactory writer) {
- if (isDirectV2) {
- boolean alignedBitpacking = false;
- if (writer.getEncodingStrategy().equals(OrcFile.EncodingStrategy.SPEED)) {
- alignedBitpacking = true;
- }
- return new RunLengthIntegerWriterV2(output, signed, alignedBitpacking);
- } else {
- return new RunLengthIntegerWriter(output, signed);
- }
- }
-
- boolean isNewWriteFormat(StreamFactory writer) {
- return writer.getVersion() != OrcFile.Version.V_0_11;
- }
-
- /**
- * Handle the top level object write.
- *
- * This default method is used for all types except structs, which are the
- * typical case. VectorizedRowBatch assumes the top level object is a
- * struct, so we use the first column for all other types.
- * @param batch the batch to write from
- * @param offset the row to start on
- * @param length the number of rows to write
- * @throws IOException
- */
- void writeRootBatch(VectorizedRowBatch batch, int offset,
- int length) throws IOException {
- writeBatch(batch.cols[0], offset, length);
- }
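As the comment above explains, VectorizedRowBatch always models the top level as a struct, so a non-struct schema still yields a one-column batch and this default implementation simply forwards cols[0]. A condensed sketch of the caller's side of that convention (the helper name is hypothetical; the pattern is the one used by the deleted timestamp tests in this change):

    // Hypothetical helper, condensed from TestOrcTimezone2.testTimestampWriter above.
    static void addTopLevelTimestamp(java.sql.Timestamp value) {
      TypeDescription schema = TypeDescription.createTimestamp();          // top level is not a struct
      VectorizedRowBatch batch = schema.createRowBatch();                  // still a one-column batch
      TimestampColumnVector col = (TimestampColumnVector) batch.cols[0];   // so the writer uses cols[0]
      col.set(batch.size++, value);
    }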
-
- /**
- * Write the values from the given vector from offset for length elements.
- * @param vector the vector to write from
- * @param offset the first value from the vector to write
- * @param length the number of values from the vector to write
- * @throws IOException
- */
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- if (vector.noNulls) {
- indexStatistics.increment(length);
- if (isPresent != null) {
- for (int i = 0; i < length; ++i) {
- isPresent.write(1);
- }
- }
- } else {
- if (vector.isRepeating) {
- boolean isNull = vector.isNull[0];
- if (isPresent != null) {
- for (int i = 0; i < length; ++i) {
- isPresent.write(isNull ? 0 : 1);
- }
- }
- if (isNull) {
- foundNulls = true;
- indexStatistics.setNull();
- } else {
- indexStatistics.increment(length);
- }
- } else {
- // count the number of non-null values
- int nonNullCount = 0;
- for(int i = 0; i < length; ++i) {
- boolean isNull = vector.isNull[i + offset];
- if (!isNull) {
- nonNullCount += 1;
- }
- if (isPresent != null) {
- isPresent.write(isNull ? 0 : 1);
- }
- }
- indexStatistics.increment(nonNullCount);
- if (nonNullCount != length) {
- foundNulls = true;
- indexStatistics.setNull();
- }
- }
- }
- }
-
- private void removeIsPresentPositions() {
- for(int i=0; i < rowIndex.getEntryCount(); ++i) {
- OrcProto.RowIndexEntry.Builder entry = rowIndex.getEntryBuilder(i);
- List<Long> positions = entry.getPositionsList();
- // bit streams use 3 positions if uncompressed, 4 if compressed
- positions = positions.subList(isCompressed ? 4 : 3, positions.size());
- entry.clearPositions();
- entry.addAllPositions(positions);
- }
- }
-
- /**
- * Write the stripe out to the file.
- * @param builder the stripe footer that contains the information about the
- * layout of the stripe. The TreeWriter is required to update
- * the footer with its information.
- * @param requiredIndexEntries the number of index entries that are
- * required. This is used to check that the
- * row index is well formed.
- * @throws IOException
- */
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- if (isPresent != null) {
- isPresent.flush();
-
- // if no nulls are found in a stream, then suppress the stream
- if (!foundNulls) {
- isPresentOutStream.suppress();
- // since isPresent bitstream is suppressed, update the index to
- // remove the positions of the isPresent stream
- if (streamFactory.buildIndex()) {
- removeIsPresentPositions();
- }
- }
- }
-
- // merge stripe-level column statistics to file statistics and write it to
- // stripe statistics
- OrcProto.StripeStatistics.Builder stripeStatsBuilder = OrcProto.StripeStatistics.newBuilder();
- writeStripeStatistics(stripeStatsBuilder, this);
- stripeStatsBuilders.add(stripeStatsBuilder);
-
- // reset the flag for next stripe
- foundNulls = false;
-
- builder.addColumns(getEncoding());
- if (streamFactory.hasWriterTimeZone()) {
- builder.setWriterTimezone(TimeZone.getDefault().getID());
- }
- if (streamFactory.buildIndex()) {
- if (rowIndex.getEntryCount() != requiredIndexEntries) {
- throw new IllegalArgumentException("Column has wrong number of " +
- "index entries found: " + rowIndex.getEntryCount() + " expected: " +
- requiredIndexEntries);
- }
- streamFactory.writeIndex(id, rowIndex);
- }
-
- rowIndex.clear();
- rowIndexEntry.clear();
-
- // write the bloom filter to out stream
- if (createBloomFilter) {
- streamFactory.writeBloomFilter(id, bloomFilterIndex);
- bloomFilterIndex.clear();
- bloomFilterEntry.clear();
- }
- }
-
- private void writeStripeStatistics(OrcProto.StripeStatistics.Builder builder,
- TreeWriter treeWriter) {
- treeWriter.fileStatistics.merge(treeWriter.stripeColStatistics);
- builder.addColStats(treeWriter.stripeColStatistics.serialize().build());
- treeWriter.stripeColStatistics.reset();
- for (TreeWriter child : treeWriter.getChildrenWriters()) {
- writeStripeStatistics(builder, child);
- }
- }
-
- TreeWriter[] getChildrenWriters() {
- return childrenWriters;
- }
-
- /**
- * Get the encoding for this column.
- * @return the information about the encoding of this column
- */
- OrcProto.ColumnEncoding getEncoding() {
- return OrcProto.ColumnEncoding.newBuilder().setKind(
- OrcProto.ColumnEncoding.Kind.DIRECT).build();
- }
-
- /**
- * Create a row index entry with the previous location and the current
- * index statistics. Also merges the index statistics into the file
- * statistics before they are cleared. Finally, it records the start of the
- * next index and ensures all of the children columns also create an entry.
- * @throws IOException
- */
- void createRowIndexEntry() throws IOException {
- stripeColStatistics.merge(indexStatistics);
- rowIndexEntry.setStatistics(indexStatistics.serialize());
- indexStatistics.reset();
- rowIndex.addEntry(rowIndexEntry);
- rowIndexEntry.clear();
- addBloomFilterEntry();
- recordPosition(rowIndexPosition);
- for(TreeWriter child: childrenWriters) {
- child.createRowIndexEntry();
- }
- }
-
- void addBloomFilterEntry() {
- if (createBloomFilter) {
- bloomFilterEntry.setNumHashFunctions(bloomFilter.getNumHashFunctions());
- bloomFilterEntry.addAllBitset(Longs.asList(bloomFilter.getBitSet()));
- bloomFilterIndex.addBloomFilter(bloomFilterEntry.build());
- bloomFilter.reset();
- bloomFilterEntry.clear();
- }
- }
-
- /**
- * Record the current position in each of this column's streams.
- * @param recorder where should the locations be recorded
- * @throws IOException
- */
- void recordPosition(PositionRecorder recorder) throws IOException {
- if (isPresent != null) {
- isPresent.getPosition(recorder);
- }
- }
-
- /**
- * Estimate how much memory the writer is consuming excluding the streams.
- * @return the number of bytes.
- */
- long estimateMemory() {
- long result = 0;
- for (TreeWriter child: childrenWriters) {
- result += child.estimateMemory();
- }
- return result;
- }
- }
-
- private static class BooleanTreeWriter extends TreeWriter {
- private final BitFieldWriter writer;
-
- BooleanTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- PositionedOutputStream out = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.writer = new BitFieldWriter(out, 1);
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- LongColumnVector vec = (LongColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int value = vec.vector[0] == 0 ? 0 : 1;
- indexStatistics.updateBoolean(value != 0, length);
- for(int i=0; i < length; ++i) {
- writer.write(value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- int value = vec.vector[i + offset] == 0 ? 0 : 1;
- writer.write(value);
- indexStatistics.updateBoolean(value != 0, 1);
- }
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- writer.flush();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- writer.getPosition(recorder);
- }
- }
-
- private static class ByteTreeWriter extends TreeWriter {
- private final RunLengthByteWriter writer;
-
- ByteTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.writer = new RunLengthByteWriter(writer.createStream(id,
- OrcProto.Stream.Kind.DATA));
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- LongColumnVector vec = (LongColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- byte value = (byte) vec.vector[0];
- indexStatistics.updateInteger(value, length);
- if (createBloomFilter) {
- bloomFilter.addLong(value);
- }
- for(int i=0; i < length; ++i) {
- writer.write(value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- byte value = (byte) vec.vector[i + offset];
- writer.write(value);
- indexStatistics.updateInteger(value, 1);
- if (createBloomFilter) {
- bloomFilter.addLong(value);
- }
- }
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- writer.flush();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- writer.getPosition(recorder);
- }
- }
-
- private static class IntegerTreeWriter extends TreeWriter {
- private final IntegerWriter writer;
- private boolean isDirectV2 = true;
-
- IntegerTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- OutStream out = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.isDirectV2 = isNewWriteFormat(writer);
- this.writer = createIntegerWriter(out, true, isDirectV2, writer);
- recordPosition(rowIndexPosition);
- }
-
- @Override
- OrcProto.ColumnEncoding getEncoding() {
- if (isDirectV2) {
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
- }
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- LongColumnVector vec = (LongColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- long value = vec.vector[0];
- indexStatistics.updateInteger(value, length);
- if (createBloomFilter) {
- bloomFilter.addLong(value);
- }
- for(int i=0; i < length; ++i) {
- writer.write(value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- long value = vec.vector[i + offset];
- writer.write(value);
- indexStatistics.updateInteger(value, 1);
- if (createBloomFilter) {
- bloomFilter.addLong(value);
- }
- }
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- writer.flush();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- writer.getPosition(recorder);
- }
- }
-
- private static class FloatTreeWriter extends TreeWriter {
- private final PositionedOutputStream stream;
- private final SerializationUtils utils;
-
- FloatTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.stream = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.utils = new SerializationUtils();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- DoubleColumnVector vec = (DoubleColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- float value = (float) vec.vector[0];
- indexStatistics.updateDouble(value);
- if (createBloomFilter) {
- bloomFilter.addDouble(value);
- }
- for(int i=0; i < length; ++i) {
- utils.writeFloat(stream, value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- float value = (float) vec.vector[i + offset];
- utils.writeFloat(stream, value);
- indexStatistics.updateDouble(value);
- if (createBloomFilter) {
- bloomFilter.addDouble(value);
- }
- }
- }
- }
- }
-
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- stream.flush();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- stream.getPosition(recorder);
- }
- }
-
- private static class DoubleTreeWriter extends TreeWriter {
- private final PositionedOutputStream stream;
- private final SerializationUtils utils;
-
- DoubleTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.stream = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.utils = new SerializationUtils();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- DoubleColumnVector vec = (DoubleColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- double value = vec.vector[0];
- indexStatistics.updateDouble(value);
- if (createBloomFilter) {
- bloomFilter.addDouble(value);
- }
- for(int i=0; i < length; ++i) {
- utils.writeDouble(stream, value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- double value = vec.vector[i + offset];
- utils.writeDouble(stream, value);
- indexStatistics.updateDouble(value);
- if (createBloomFilter) {
- bloomFilter.addDouble(value);
- }
- }
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- stream.flush();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- stream.getPosition(recorder);
- }
- }
-
- private static abstract class StringBaseTreeWriter extends TreeWriter {
- private static final int INITIAL_DICTIONARY_SIZE = 4096;
- private final OutStream stringOutput;
- private final IntegerWriter lengthOutput;
- private final IntegerWriter rowOutput;
- protected final StringRedBlackTree dictionary =
- new StringRedBlackTree(INITIAL_DICTIONARY_SIZE);
- protected final DynamicIntArray rows = new DynamicIntArray();
- protected final PositionedOutputStream directStreamOutput;
- protected final IntegerWriter directLengthOutput;
- private final List<OrcProto.RowIndexEntry> savedRowIndex =
- new ArrayList<OrcProto.RowIndexEntry>();
- private final boolean buildIndex;
- private final List<Long> rowIndexValueCount = new ArrayList<Long>();
- // If the number of keys in a dictionary is greater than this fraction of
- // the total number of non-null rows, turn off dictionary encoding
- private final double dictionaryKeySizeThreshold;
- protected boolean useDictionaryEncoding;
- private boolean isDirectV2 = true;
- private boolean doneDictionaryCheck;
- private final boolean strideDictionaryCheck;
-
- StringBaseTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- stringOutput = writer.createStream(id,
- OrcProto.Stream.Kind.DICTIONARY_DATA);
- lengthOutput = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- rowOutput = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.DATA), false, isDirectV2, writer);
- recordPosition(rowIndexPosition);
- rowIndexValueCount.add(0L);
- buildIndex = writer.buildIndex();
- directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA);
- directLengthOutput = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- Configuration conf = writer.getConfiguration();
- dictionaryKeySizeThreshold =
- OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf);
- strideDictionaryCheck =
- OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getBoolean(conf);
- useDictionaryEncoding = dictionaryKeySizeThreshold >= 0.000001; // Epsilon.
- doneDictionaryCheck = !useDictionaryEncoding;
- }
-
- private boolean checkDictionaryEncoding() {
- if (!doneDictionaryCheck) {
- // Set the flag indicating whether or not to use dictionary encoding
- // based on whether or not the fraction of distinct keys over number of
- // non-null rows is less than the configured threshold
- float ratio = rows.size() > 0 ? (float) (dictionary.size()) / rows.size() : 0.0f;
- useDictionaryEncoding = !isDirectV2 || ratio <= dictionaryKeySizeThreshold;
- doneDictionaryCheck = true;
- }
- return useDictionaryEncoding;
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- // if the stripe has fewer rows than dictionaryCheckAfterRows, the dictionary
- // check would not have happened yet, so do it again here.
- checkDictionaryEncoding();
-
- if (useDictionaryEncoding) {
- flushDictionary();
- } else {
- // flush out any leftover entries from the dictionary
- if (rows.size() > 0) {
- flushDictionary();
- }
-
- // suppress the stream for every stripe if dictionary is disabled
- stringOutput.suppress();
- }
-
- // we need to build the row index before calling super, since it
- // writes it out.
- super.writeStripe(builder, requiredIndexEntries);
- stringOutput.flush();
- lengthOutput.flush();
- rowOutput.flush();
- directStreamOutput.flush();
- directLengthOutput.flush();
- // reset all of the fields to be ready for the next stripe.
- dictionary.clear();
- savedRowIndex.clear();
- rowIndexValueCount.clear();
- recordPosition(rowIndexPosition);
- rowIndexValueCount.add(0L);
-
- if (!useDictionaryEncoding) {
- // record the start positions of the first index stride of the next stripe,
- // i.e. the beginning of the direct streams, when dictionary encoding is disabled
- recordDirectStreamPosition();
- }
- }
-
- private void flushDictionary() throws IOException {
- final int[] dumpOrder = new int[dictionary.size()];
-
- if (useDictionaryEncoding) {
- // Write the dictionary by traversing the red-black tree writing out
- // the bytes and lengths; and creating the map from the original order
- // to the final sorted order.
-
- dictionary.visit(new StringRedBlackTree.Visitor() {
- private int currentId = 0;
- @Override
- public void visit(StringRedBlackTree.VisitorContext context
- ) throws IOException {
- context.writeBytes(stringOutput);
- lengthOutput.write(context.getLength());
- dumpOrder[context.getOriginalPosition()] = currentId++;
- }
- });
- } else {
- // for direct encoding, we don't want the dictionary data stream
- stringOutput.suppress();
- }
- int length = rows.size();
- int rowIndexEntry = 0;
- OrcProto.RowIndex.Builder rowIndex = getRowIndex();
- Text text = new Text();
- // write the values translated into the dump order.
- for(int i = 0; i <= length; ++i) {
- // now that we are writing out the row values, we can finalize the
- // row index
- if (buildIndex) {
- while (i == rowIndexValueCount.get(rowIndexEntry) &&
- rowIndexEntry < savedRowIndex.size()) {
- OrcProto.RowIndexEntry.Builder base =
- savedRowIndex.get(rowIndexEntry++).toBuilder();
- if (useDictionaryEncoding) {
- rowOutput.getPosition(new RowIndexPositionRecorder(base));
- } else {
- PositionRecorder posn = new RowIndexPositionRecorder(base);
- directStreamOutput.getPosition(posn);
- directLengthOutput.getPosition(posn);
- }
- rowIndex.addEntry(base.build());
- }
- }
- if (i != length) {
- if (useDictionaryEncoding) {
- rowOutput.write(dumpOrder[rows.get(i)]);
- } else {
- dictionary.getText(text, rows.get(i));
- directStreamOutput.write(text.getBytes(), 0, text.getLength());
- directLengthOutput.write(text.getLength());
- }
- }
- }
- rows.clear();
- }
-
- @Override
- OrcProto.ColumnEncoding getEncoding() {
- // Returns the encoding used for the last call to writeStripe
- if (useDictionaryEncoding) {
- if(isDirectV2) {
- return OrcProto.ColumnEncoding.newBuilder().setKind(
- OrcProto.ColumnEncoding.Kind.DICTIONARY_V2).
- setDictionarySize(dictionary.size()).build();
- }
- return OrcProto.ColumnEncoding.newBuilder().setKind(
- OrcProto.ColumnEncoding.Kind.DICTIONARY).
- setDictionarySize(dictionary.size()).build();
- } else {
- if(isDirectV2) {
- return OrcProto.ColumnEncoding.newBuilder().setKind(
- OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
- }
- return OrcProto.ColumnEncoding.newBuilder().setKind(
- OrcProto.ColumnEncoding.Kind.DIRECT).build();
- }
- }
-
- /**
- * This method doesn't call the super method, because unlike most of the
- * other TreeWriters, this one can't record the position in the streams
- * until the stripe is being flushed. Therefore it saves all of the entries
- * and augments them with the final information as the stripe is written.
- * @throws IOException
- */
- @Override
- void createRowIndexEntry() throws IOException {
- getStripeStatistics().merge(indexStatistics);
- OrcProto.RowIndexEntry.Builder rowIndexEntry = getRowIndexEntry();
- rowIndexEntry.setStatistics(indexStatistics.serialize());
- indexStatistics.reset();
- OrcProto.RowIndexEntry base = rowIndexEntry.build();
- savedRowIndex.add(base);
- rowIndexEntry.clear();
- addBloomFilterEntry();
- recordPosition(rowIndexPosition);
- rowIndexValueCount.add(Long.valueOf(rows.size()));
- if (strideDictionaryCheck) {
- checkDictionaryEncoding();
- }
- if (!useDictionaryEncoding) {
- if (rows.size() > 0) {
- flushDictionary();
- // just record the start positions of next index stride
- recordDirectStreamPosition();
- } else {
- // record the start positions of next index stride
- recordDirectStreamPosition();
- getRowIndex().addEntry(base);
- }
- }
- }
-
- private void recordDirectStreamPosition() throws IOException {
- directStreamOutput.getPosition(rowIndexPosition);
- directLengthOutput.getPosition(rowIndexPosition);
- }
-
- @Override
- long estimateMemory() {
- return rows.getSizeInBytes() + dictionary.getSizeInBytes();
- }
- }
-
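For context (an illustrative sketch, not part of the deleted file): checkDictionaryEncoding() above decides between dictionary and direct encoding by comparing the ratio of distinct keys to non-null rows against hive.exec.orc.dictionary.key.size.threshold (default 0.8). Assuming DIRECT_V2 is available, the decision reduces to:

    // Hedged sketch of the dictionary check; the threshold and row counts below are made up.
    static boolean useDictionary(int distinctKeys, int nonNullRows, double threshold) {
      double ratio = nonNullRows > 0 ? (double) distinctKeys / nonNullRows : 0.0;
      return ratio <= threshold;
    }
    // 1,000 distinct values over 100,000 rows  -> ratio 0.01 -> keep DICTIONARY_V2
    // 95,000 distinct values over 100,000 rows -> ratio 0.95 -> fall back to DIRECT_V2
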
- private static class StringTreeWriter extends StringBaseTreeWriter {
- StringTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- BytesColumnVector vec = (BytesColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- if (useDictionaryEncoding) {
- int id = dictionary.add(vec.vector[0], vec.start[0], vec.length[0]);
- for(int i=0; i < length; ++i) {
- rows.add(id);
- }
- } else {
- for(int i=0; i < length; ++i) {
- directStreamOutput.write(vec.vector[0], vec.start[0],
- vec.length[0]);
- directLengthOutput.write(vec.length[0]);
- }
- }
- indexStatistics.updateString(vec.vector[0], vec.start[0],
- vec.length[0], length);
- if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- if (useDictionaryEncoding) {
- rows.add(dictionary.add(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]));
- } else {
- directStreamOutput.write(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- directLengthOutput.write(vec.length[offset + i]);
- }
- indexStatistics.updateString(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i], 1);
- if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- }
- }
- }
- }
- }
- }
-
- /**
- * Under the covers, char is written to ORC the same way as string.
- */
- private static class CharTreeWriter extends StringBaseTreeWriter {
- private final int itemLength;
- private final byte[] padding;
-
- CharTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- itemLength = schema.getMaxLength();
- padding = new byte[itemLength];
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- BytesColumnVector vec = (BytesColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- byte[] ptr;
- int ptrOffset;
- if (vec.length[0] >= itemLength) {
- ptr = vec.vector[0];
- ptrOffset = vec.start[0];
- } else {
- ptr = padding;
- ptrOffset = 0;
- System.arraycopy(vec.vector[0], vec.start[0], ptr, 0,
- vec.length[0]);
- Arrays.fill(ptr, vec.length[0], itemLength, (byte) ' ');
- }
- if (useDictionaryEncoding) {
- int id = dictionary.add(ptr, ptrOffset, itemLength);
- for(int i=0; i < length; ++i) {
- rows.add(id);
- }
- } else {
- for(int i=0; i < length; ++i) {
- directStreamOutput.write(ptr, ptrOffset, itemLength);
- directLengthOutput.write(itemLength);
- }
- }
- indexStatistics.updateString(ptr, ptrOffset, itemLength, length);
- if (createBloomFilter) {
- bloomFilter.addBytes(ptr, ptrOffset, itemLength);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- byte[] ptr;
- int ptrOffset;
- if (vec.length[offset + i] >= itemLength) {
- ptr = vec.vector[offset + i];
- ptrOffset = vec.start[offset + i];
- } else {
- // the value is too short, so copy it into the padding buffer and pad with spaces
- ptr = padding;
- ptrOffset = 0;
- System.arraycopy(vec.vector[offset + i], vec.start[offset + i],
- ptr, 0, vec.length[offset + i]);
- Arrays.fill(ptr, vec.length[offset + i], itemLength, (byte) ' ');
- }
- if (useDictionaryEncoding) {
- rows.add(dictionary.add(ptr, ptrOffset, itemLength));
- } else {
- directStreamOutput.write(ptr, ptrOffset, itemLength);
- directLengthOutput.write(itemLength);
- }
- indexStatistics.updateString(ptr, ptrOffset, itemLength, 1);
- if (createBloomFilter) {
- bloomFilter.addBytes(ptr, ptrOffset, itemLength);
- }
- }
- }
- }
- }
- }
-
- /**
- * Under the covers, varchar is written to ORC the same way as string.
- */
- private static class VarcharTreeWriter extends StringBaseTreeWriter {
- private final int maxLength;
-
- VarcharTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- maxLength = schema.getMaxLength();
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- BytesColumnVector vec = (BytesColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int itemLength = Math.min(vec.length[0], maxLength);
- if (useDictionaryEncoding) {
- int id = dictionary.add(vec.vector[0], vec.start[0], itemLength);
- for(int i=0; i < length; ++i) {
- rows.add(id);
- }
- } else {
- for(int i=0; i < length; ++i) {
- directStreamOutput.write(vec.vector[0], vec.start[0],
- itemLength);
- directLengthOutput.write(itemLength);
- }
- }
- indexStatistics.updateString(vec.vector[0], vec.start[0],
- itemLength, length);
- if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[0], vec.start[0], itemLength);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- int itemLength = Math.min(vec.length[offset + i], maxLength);
- if (useDictionaryEncoding) {
- rows.add(dictionary.add(vec.vector[offset + i],
- vec.start[offset + i], itemLength));
- } else {
- directStreamOutput.write(vec.vector[offset + i],
- vec.start[offset + i], itemLength);
- directLengthOutput.write(itemLength);
- }
- indexStatistics.updateString(vec.vector[offset + i],
- vec.start[offset + i], itemLength, 1);
- if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[offset + i],
- vec.start[offset + i], itemLength);
- }
- }
- }
- }
- }
- }
-
- private static class BinaryTreeWriter extends TreeWriter {
- private final PositionedOutputStream stream;
- private final IntegerWriter length;
- private boolean isDirectV2 = true;
-
- BinaryTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.stream = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.isDirectV2 = isNewWriteFormat(writer);
- this.length = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- recordPosition(rowIndexPosition);
- }
-
- @Override
- OrcProto.ColumnEncoding getEncoding() {
- if (isDirectV2) {
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
- }
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- BytesColumnVector vec = (BytesColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- for(int i=0; i < length; ++i) {
- stream.write(vec.vector[0], vec.start[0],
- vec.length[0]);
- this.length.write(vec.length[0]);
- }
- indexStatistics.updateBinary(vec.vector[0], vec.start[0],
- vec.length[0], length);
- if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- stream.write(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- this.length.write(vec.length[offset + i]);
- indexStatistics.updateBinary(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i], 1);
- if (createBloomFilter) {
- bloomFilter.addBytes(vec.vector[offset + i],
- vec.start[offset + i], vec.length[offset + i]);
- }
- }
- }
- }
- }
-
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- stream.flush();
- length.flush();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- stream.getPosition(recorder);
- length.getPosition(recorder);
- }
- }
-
- public static long MILLIS_PER_DAY = 24 * 60 * 60 * 1000;
- public static long NANOS_PER_MILLI = 1000000;
- public static final int MILLIS_PER_SECOND = 1000;
- static final int NANOS_PER_SECOND = 1000000000;
- public static final String BASE_TIMESTAMP_STRING = "2015-01-01 00:00:00";
-
- private static class TimestampTreeWriter extends TreeWriter {
- private final IntegerWriter seconds;
- private final IntegerWriter nanos;
- private final boolean isDirectV2;
- private final long base_timestamp;
-
- TimestampTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- this.seconds = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.DATA), true, isDirectV2, writer);
- this.nanos = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer);
- recordPosition(rowIndexPosition);
- // for unit tests to set different time zones
- this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND;
- writer.useWriterTimeZone(true);
- }
-
- @Override
- OrcProto.ColumnEncoding getEncoding() {
- if (isDirectV2) {
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
- }
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- TimestampColumnVector vec = (TimestampColumnVector) vector;
- Timestamp val;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- val = vec.asScratchTimestamp(0);
- long millis = val.getTime();
- indexStatistics.updateTimestamp(millis);
- if (createBloomFilter) {
- bloomFilter.addLong(millis);
- }
- final long secs = millis / MILLIS_PER_SECOND - base_timestamp;
- final long nano = formatNanos(val.getNanos());
- for(int i=0; i < length; ++i) {
- seconds.write(secs);
- nanos.write(nano);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- val = vec.asScratchTimestamp(i + offset);
- long millis = val.getTime();
- long secs = millis / MILLIS_PER_SECOND - base_timestamp;
- seconds.write(secs);
- nanos.write(formatNanos(val.getNanos()));
- indexStatistics.updateTimestamp(millis);
- if (createBloomFilter) {
- bloomFilter.addLong(millis);
- }
- }
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- seconds.flush();
- nanos.flush();
- recordPosition(rowIndexPosition);
- }
-
- private static long formatNanos(int nanos) {
- if (nanos == 0) {
- return 0;
- } else if (nanos % 100 != 0) {
- return ((long) nanos) << 3;
- } else {
- nanos /= 100;
- int trailingZeros = 1;
- while (nanos % 10 == 0 && trailingZeros < 7) {
- nanos /= 10;
- trailingZeros += 1;
- }
- return ((long) nanos) << 3 | trailingZeros;
- }
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- seconds.getPosition(recorder);
- nanos.getPosition(recorder);
- }
- }
-
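As an aside (not part of the deleted file): formatNanos() strips trailing decimal zeros from the nanosecond value and records a zero count in the low three bits; the reader multiplies those powers of ten back in. A round-trip sketch, with parseNanos written the way the ORC reader plausibly decodes it:

    // formatNanos(123456000) == (123456 << 3) | 2
    static int parseNanos(long serialized) {
      int zeros = 7 & (int) serialized;      // low 3 bits: how many powers of ten to restore
      int result = (int) (serialized >>> 3);
      if (zeros != 0) {
        for (int i = 0; i <= zeros; ++i) {   // zeros + 1 multiplications by 10
          result *= 10;
        }
      }
      return result;
    }
    // parseNanos((123456 << 3) | 2) == 123456000
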
- private static class DateTreeWriter extends TreeWriter {
- private final IntegerWriter writer;
- private final boolean isDirectV2;
-
- DateTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- OutStream out = writer.createStream(id,
- OrcProto.Stream.Kind.DATA);
- this.isDirectV2 = isNewWriteFormat(writer);
- this.writer = createIntegerWriter(out, true, isDirectV2, writer);
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- LongColumnVector vec = (LongColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int value = (int) vec.vector[0];
- indexStatistics.updateDate(value);
- if (createBloomFilter) {
- bloomFilter.addLong(value);
- }
- for(int i=0; i < length; ++i) {
- writer.write(value);
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- int value = (int) vec.vector[i + offset];
- writer.write(value);
- indexStatistics.updateDate(value);
- if (createBloomFilter) {
- bloomFilter.addLong(value);
- }
- }
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- writer.flush();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- writer.getPosition(recorder);
- }
-
- @Override
- OrcProto.ColumnEncoding getEncoding() {
- if (isDirectV2) {
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
- }
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
- }
- }
-
- private static class DecimalTreeWriter extends TreeWriter {
- private final PositionedOutputStream valueStream;
-
- // These scratch buffers allow us to serialize decimals much faster.
- private final long[] scratchLongs;
- private final byte[] scratchBuffer;
-
- private final IntegerWriter scaleStream;
- private final boolean isDirectV2;
-
- DecimalTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- valueStream = writer.createStream(id, OrcProto.Stream.Kind.DATA);
- scratchLongs = new long[HiveDecimal.SCRATCH_LONGS_LEN];
- scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES];
- this.scaleStream = createIntegerWriter(writer.createStream(id,
- OrcProto.Stream.Kind.SECONDARY), true, isDirectV2, writer);
- recordPosition(rowIndexPosition);
- }
-
- @Override
- OrcProto.ColumnEncoding getEncoding() {
- if (isDirectV2) {
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
- }
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- DecimalColumnVector vec = (DecimalColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- HiveDecimalWritable value = vec.vector[0];
- indexStatistics.updateDecimal(value);
- if (createBloomFilter) {
-
- // Use the HiveDecimalWritable toString() method with a scratch buffer for good
- // performance when creating the String. We need to use a String hash code and not a
- // UTF-8 byte[] hash code in order to get the right hash code.
- bloomFilter.addString(value.toString(scratchBuffer));
- }
- for(int i=0; i < length; ++i) {
-
- // Use the fast serialization method provided by HiveDecimalWritable, which
- // emulates SerializationUtils.writeBigInteger.
- value.serializationUtilsWrite(
- valueStream,
- scratchLongs);
- scaleStream.write(value.scale());
- }
- }
- } else {
- for(int i=0; i < length; ++i) {
- if (vec.noNulls || !vec.isNull[i + offset]) {
- HiveDecimalWritable value = vec.vector[i + offset];
- value.serializationUtilsWrite(
- valueStream,
- scratchLongs);
- scaleStream.write(value.scale());
- indexStatistics.updateDecimal(value);
- if (createBloomFilter) {
- bloomFilter.addString(value.toString(scratchBuffer));
- }
- }
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- valueStream.flush();
- scaleStream.flush();
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- valueStream.getPosition(recorder);
- scaleStream.getPosition(recorder);
- }
- }
-
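For context (illustrative only, not part of the deleted file): each decimal is split across two streams — the unscaled value goes to the DATA stream via HiveDecimalWritable.serializationUtilsWrite (which, per the comment above, emulates SerializationUtils.writeBigInteger), and the scale goes to the signed SECONDARY stream. For a value such as 123.45 that works out to:

    // Inside writeBatch, per non-null value; valueStream, scratchLongs and scaleStream
    // are the fields declared in DecimalTreeWriter above.
    HiveDecimalWritable value = new HiveDecimalWritable("123.45");
    value.serializationUtilsWrite(valueStream, scratchLongs);  // DATA: unscaled 12345
    scaleStream.write(value.scale());                          // SECONDARY: scale 2
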
- private static class StructTreeWriter extends TreeWriter {
- StructTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- List<TypeDescription> children = schema.getChildren();
- childrenWriters = new TreeWriter[children.size()];
- for(int i=0; i < childrenWriters.length; ++i) {
- childrenWriters[i] = createTreeWriter(
- children.get(i), writer,
- true);
- }
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void writeRootBatch(VectorizedRowBatch batch, int offset,
- int length) throws IOException {
- // update the statistics for the root column
- indexStatistics.increment(length);
- // I'm assuming that the root column isn't nullable so that I don't need
- // to update isPresent.
- for(int i=0; i < childrenWriters.length; ++i) {
- childrenWriters[i].writeBatch(batch.cols[i], offset, length);
- }
- }
-
- private static void writeFields(StructColumnVector vector,
- TreeWriter[] childrenWriters,
- int offset, int length) throws IOException {
- for(int field=0; field < childrenWriters.length; ++field) {
- childrenWriters[field].writeBatch(vector.fields[field], offset, length);
- }
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- StructColumnVector vec = (StructColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- writeFields(vec, childrenWriters, offset, length);
- }
- } else if (vector.noNulls) {
- writeFields(vec, childrenWriters, offset, length);
- } else {
- // write the records in runs
- int currentRun = 0;
- boolean started = false;
- for(int i=0; i < length; ++i) {
- if (!vec.isNull[i + offset]) {
- if (!started) {
- started = true;
- currentRun = i;
- }
- } else if (started) {
- started = false;
- writeFields(vec, childrenWriters, offset + currentRun,
- i - currentRun);
- }
- }
- if (started) {
- writeFields(vec, childrenWriters, offset + currentRun,
- length - currentRun);
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- for(TreeWriter child: childrenWriters) {
- child.writeStripe(builder, requiredIndexEntries);
- }
- recordPosition(rowIndexPosition);
- }
- }
-
- private static class ListTreeWriter extends TreeWriter {
- private final IntegerWriter lengths;
- private final boolean isDirectV2;
-
- ListTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- childrenWriters = new TreeWriter[1];
- childrenWriters[0] =
- createTreeWriter(schema.getChildren().get(0), writer, true);
- lengths = createIntegerWriter(writer.createStream(columnId,
- OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- recordPosition(rowIndexPosition);
- }
-
- @Override
- OrcProto.ColumnEncoding getEncoding() {
- if (isDirectV2) {
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
- }
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- ListColumnVector vec = (ListColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int childOffset = (int) vec.offsets[0];
- int childLength = (int) vec.lengths[0];
- for(int i=0; i < length; ++i) {
- lengths.write(childLength);
- childrenWriters[0].writeBatch(vec.child, childOffset, childLength);
- }
- if (createBloomFilter) {
- bloomFilter.addLong(childLength);
- }
- }
- } else {
- // write the elements in runs
- int currentOffset = 0;
- int currentLength = 0;
- for(int i=0; i < length; ++i) {
- if (!vec.isNull[i + offset]) {
- int nextLength = (int) vec.lengths[offset + i];
- int nextOffset = (int) vec.offsets[offset + i];
- lengths.write(nextLength);
- if (currentLength == 0) {
- currentOffset = nextOffset;
- currentLength = nextLength;
- } else if (currentOffset + currentLength != nextOffset) {
- childrenWriters[0].writeBatch(vec.child, currentOffset,
- currentLength);
- currentOffset = nextOffset;
- currentLength = nextLength;
- } else {
- currentLength += nextLength;
- }
- }
- }
- if (currentLength != 0) {
- childrenWriters[0].writeBatch(vec.child, currentOffset,
- currentLength);
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- lengths.flush();
- for(TreeWriter child: childrenWriters) {
- child.writeStripe(builder, requiredIndexEntries);
- }
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- lengths.getPosition(recorder);
- }
- }
-
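A worked example, not part of the deleted file: the non-repeating branch above writes one length per row but merges contiguous child ranges before recursing, so the element writer sees as few writeBatch calls as possible:

    // Three non-null list rows with child (offset, length) pairs:
    //   row 0 -> (0, 3)
    //   row 1 -> (3, 2)   contiguous with row 0, so the run grows to (0, 5)
    //   row 2 -> (10, 4)  gap, so the first run is flushed
    // Resulting child calls:
    //   childrenWriters[0].writeBatch(vec.child, 0, 5);
    //   childrenWriters[0].writeBatch(vec.child, 10, 4);
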
- private static class MapTreeWriter extends TreeWriter {
- private final IntegerWriter lengths;
- private final boolean isDirectV2;
-
- MapTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- this.isDirectV2 = isNewWriteFormat(writer);
- childrenWriters = new TreeWriter[2];
- List<TypeDescription> children = schema.getChildren();
- childrenWriters[0] =
- createTreeWriter(children.get(0), writer, true);
- childrenWriters[1] =
- createTreeWriter(children.get(1), writer, true);
- lengths = createIntegerWriter(writer.createStream(columnId,
- OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
- recordPosition(rowIndexPosition);
- }
-
- @Override
- OrcProto.ColumnEncoding getEncoding() {
- if (isDirectV2) {
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
- }
- return OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- MapColumnVector vec = (MapColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- int childOffset = (int) vec.offsets[0];
- int childLength = (int) vec.lengths[0];
- for(int i=0; i < length; ++i) {
- lengths.write(childLength);
- childrenWriters[0].writeBatch(vec.keys, childOffset, childLength);
- childrenWriters[1].writeBatch(vec.values, childOffset, childLength);
- }
- if (createBloomFilter) {
- bloomFilter.addLong(childLength);
- }
- }
- } else {
- // write the elements in runs
- int currentOffset = 0;
- int currentLength = 0;
- for(int i=0; i < length; ++i) {
- if (!vec.isNull[i + offset]) {
- int nextLength = (int) vec.lengths[offset + i];
- int nextOffset = (int) vec.offsets[offset + i];
- lengths.write(nextLength);
- if (currentLength == 0) {
- currentOffset = nextOffset;
- currentLength = nextLength;
- } else if (currentOffset + currentLength != nextOffset) {
- childrenWriters[0].writeBatch(vec.keys, currentOffset,
- currentLength);
- childrenWriters[1].writeBatch(vec.values, currentOffset,
- currentLength);
- currentOffset = nextOffset;
- currentLength = nextLength;
- } else {
- currentLength += nextLength;
- }
- }
- }
- if (currentLength != 0) {
- childrenWriters[0].writeBatch(vec.keys, currentOffset,
- currentLength);
- childrenWriters[1].writeBatch(vec.values, currentOffset,
- currentLength);
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- lengths.flush();
- for(TreeWriter child: childrenWriters) {
- child.writeStripe(builder, requiredIndexEntries);
- }
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- lengths.getPosition(recorder);
- }
- }
-
- private static class UnionTreeWriter extends TreeWriter {
- private final RunLengthByteWriter tags;
-
- UnionTreeWriter(int columnId,
- TypeDescription schema,
- StreamFactory writer,
- boolean nullable) throws IOException {
- super(columnId, schema, writer, nullable);
- List<TypeDescription> children = schema.getChildren();
- childrenWriters = new TreeWriter[children.size()];
- for(int i=0; i < childrenWriters.length; ++i) {
- childrenWriters[i] =
- createTreeWriter(children.get(i), writer, true);
- }
- tags =
- new RunLengthByteWriter(writer.createStream(columnId,
- OrcProto.Stream.Kind.DATA));
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void writeBatch(ColumnVector vector, int offset,
- int length) throws IOException {
- super.writeBatch(vector, offset, length);
- UnionColumnVector vec = (UnionColumnVector) vector;
- if (vector.isRepeating) {
- if (vector.noNulls || !vector.isNull[0]) {
- byte tag = (byte) vec.tags[0];
- for(int i=0; i < length; ++i) {
- tags.write(tag);
- }
- if (createBloomFilter) {
- bloomFilter.addLong(tag);
- }
- childrenWriters[tag].writeBatch(vec.fields[tag], offset, length);
- }
- } else {
- // write the records in runs of the same tag
- int[] currentStart = new int[vec.fields.length];
- int[] currentLength = new int[vec.fields.length];
- for(int i=0; i < length; ++i) {
- // only need to deal with the non-nulls, since the nulls were dealt
- // with in the super method.
- if (vec.noNulls || !vec.isNull[i + offset]) {
- byte tag = (byte) vec.tags[offset + i];
- tags.write(tag);
- if (currentLength[tag] == 0) {
- // start a new sequence
- currentStart[tag] = i + offset;
- currentLength[tag] = 1;
- } else if (currentStart[tag] + currentLength[tag] == i + offset) {
- // ok, we are extending the current run for that tag.
- currentLength[tag] += 1;
- } else {
- // otherwise, we need to close off the old run and start a new one
- childrenWriters[tag].writeBatch(vec.fields[tag],
- currentStart[tag], currentLength[tag]);
- currentStart[tag] = i + offset;
- currentLength[tag] = 1;
- }
- }
- }
- // write out any left over sequences
- for(int tag=0; tag < currentStart.length; ++tag) {
- if (currentLength[tag] != 0) {
- childrenWriters[tag].writeBatch(vec.fields[tag], currentStart[tag],
- currentLength[tag]);
- }
- }
- }
- }
-
- @Override
- void writeStripe(OrcProto.StripeFooter.Builder builder,
- int requiredIndexEntries) throws IOException {
- super.writeStripe(builder, requiredIndexEntries);
- tags.flush();
- for(TreeWriter child: childrenWriters) {
- child.writeStripe(builder, requiredIndexEntries);
- }
- recordPosition(rowIndexPosition);
- }
-
- @Override
- void recordPosition(PositionRecorder recorder) throws IOException {
- super.recordPosition(recorder);
- tags.getPosition(recorder);
- }
- }
-
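A worked example, not part of the deleted file: the union writer keeps one open (start, length) run per tag, extends it while rows of that tag stay adjacent, flushes it when they do not, and flushes any still-open runs at the end:

    // tags per row (offset 0): 0, 0, 1, 0, 1, 1
    // tag stream written:      0, 0, 1, 0, 1, 1
    // child 0 batches:         (0, 2) then (3, 1)
    // child 1 batches:         (2, 1) then (4, 2)
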
- private static TreeWriter createTreeWriter(TypeDescription schema,
- StreamFactory streamFactory,
- boolean nullable) throws IOException {
- switch (schema.getCategory()) {
- case BOOLEAN:
- return new BooleanTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case BYTE:
- return new ByteTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case SHORT:
- case INT:
- case LONG:
- return new IntegerTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case FLOAT:
- return new FloatTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case DOUBLE:
- return new DoubleTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case STRING:
- return new StringTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case CHAR:
- return new CharTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case VARCHAR:
- return new VarcharTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case BINARY:
- return new BinaryTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case TIMESTAMP:
- return new TimestampTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case DATE:
- return new DateTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case DECIMAL:
- return new DecimalTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case STRUCT:
- return new StructTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case MAP:
- return new MapTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case LIST:
- return new ListTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- case UNION:
- return new UnionTreeWriter(streamFactory.getNextColumnId(),
- schema, streamFactory, nullable);
- default:
- throw new IllegalArgumentException("Bad category: " +
- schema.getCategory());
- }
- }
-
- private static void writeTypes(OrcProto.Footer.Builder builder,
- TypeDescription schema) {
- OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
- List<TypeDescription> children = OrcUtils.setTypeBuilderFromSchema(type, schema);
- builder.addTypes(type);
- if (children != null) {
- for(TypeDescription child: children) {
- writeTypes(builder, child);
- }
- }
- }
-
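For context (hypothetical schema, not part of the deleted file): writeTypes emits the schema as a pre-order list of OrcProto.Type entries, which keeps the footer type ids aligned with the column ids handed out by createTreeWriter:

    // struct<name:string,age:int> flattens to three footer types:
    //   0: STRUCT  (fieldNames: name, age; subtypes: 1, 2)
    //   1: STRING
    //   2: INT
    TypeDescription schema = TypeDescription.fromString("struct<name:string,age:int>");
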
- @VisibleForTesting
- public void ensureStream() throws IOException {
- physWriter.initialize();
- }
-
- private void createRowIndexEntry() throws IOException {
- treeWriter.createRowIndexEntry();
- rowsInIndex = 0;
- }
-
- private void flushStripe() throws IOException {
- ensureStream();
- if (buildIndex && rowsInIndex != 0) {
- createRowIndexEntry();
- }
- if (rowsInStripe != 0) {
- if (callback != null) {
- callback.preStripeWrite(callbackContext);
- }
- // finalize the data for the stripe
- int requiredIndexEntries = rowIndexStride == 0 ? 0 :
- (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
- OrcProto.StripeFooter.Builder builder = OrcProto.StripeFooter.newBuilder();
- OrcProto.StripeInformation.Builder dirEntry = OrcProto.StripeInformation
- .newBuilder().setNumberOfRows(rowsInStripe);
- treeWriter.writeStripe(builder, requiredIndexEntries);
- physWriter.finalizeStripe(builder, dirEntry);
- stripes.add(dirEntry.build());
- rowCount += rowsInStripe;
- rowsInStripe = 0;
- }
- }
-
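For context (not part of the deleted file): requiredIndexEntries is a ceiling division — every started row-index stride in the stripe needs an entry, and a stride of 0 disables the index:

    // With the usual 10,000-row stride, a 25,000-row stripe needs
    // (25000 + 10000 - 1) / 10000 == 3 index entries per column.
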
- private long computeRawDataSize() {
- return getRawDataSize(treeWriter, schema);
- }
-
- private long getRawDataSize(TreeWriter child,
- TypeDescription schema) {
- long total = 0;
- long numVals = child.fileStatistics.getNumberOfValues();
- switch (schema.getCategory()) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case FLOAT:
- return numVals * JavaDataModel.get().primitive1();
- case LONG:
- case DOUBLE:
- return numVals * JavaDataModel.get().primitive2();
- case STRING:
- case VARCHAR:
- case CHAR:
- // ORC strings are converted to Java Strings, so use JavaDataModel to
- // compute the overall size of the strings
- StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics;
- numVals = numVals == 0 ? 1 : numVals;
- int avgStringLen = (int) (scs.getSum() / numVals);
- return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen);
- case DECIMAL:
- return numVals * JavaDataModel.get().lengthOfDecimal();
- case DATE:
- return numVals * JavaDataModel.get().lengthOfDate();
- case BINARY:
- // get total length of binary blob
- BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics;
- return bcs.getSum();
- case TIMESTAMP:
- return numVals * JavaDataModel.get().lengthOfTimestamp();
- case LIST:
- case MAP:
- case UNION:
- case STRUCT: {
- TreeWriter[] childWriters = child.getChildrenWriters();
- List<TypeDescription> childTypes = schema.getChildren();
- for (int i=0; i < childWriters.length; ++i) {
- total += getRawDataSize(childWriters[i], childTypes.get(i));
- }
- break;
- }
- default:
- LOG.debug("Unknown object inspector category.");
- break;
- }
- return total;
- }
-
- private void writeFileStatistics(OrcProto.Footer.Builder builder,
- TreeWriter writer) throws IOException {
- builder.addStatistics(writer.fileStatistics.serialize());
- for(TreeWriter child: writer.getChildrenWriters()) {
- writeFileStatistics(builder, child);
- }
- }
-
- private void writeMetadata() throws IOException {
- ensureStream();
- OrcProto.Metadata.Builder builder = OrcProto.Metadata.newBuilder();
- for(OrcProto.StripeStatistics.Builder ssb : treeWriter.stripeStatsBuilders) {
- builder.addStripeStats(ssb.build());
- }
-
- physWriter.writeFileMetadata(builder);
- }
-
- private void writeFooter() throws IOException {
- ensureStream();
- OrcProto.Footer.Builder builder = OrcProto.Footer.newBuilder();
- builder.setNumberOfRows(rowCount);
- builder.setRowIndexStride(rowIndexStride);
- // populate raw data size
- rawDataSize = computeRawDataSize();
- // serialize the types
- writeTypes(builder, schema);
- // add the stripe information
- for(OrcProto.StripeInformation stripe: stripes) {
- builder.addStripes(stripe);
- }
- // add the column statistics
- writeFileStatistics(builder, treeWriter);
- // add all of the user metadata
- for(Map.Entry<String, ByteString> entry: userMetadata.entrySet()) {
- builder.addMetadata(OrcProto.UserMetadataItem.newBuilder()
- .setName(entry.getKey()).setValue(entry.getValue()));
- }
- physWriter.writeFileFooter(builder);
- }
-
- private void writePostScript() throws IOException {
- OrcProto.PostScript.Builder builder =
- OrcProto.PostScript.newBuilder()
- .setMagic(OrcFile.MAGIC)
- .addVersion(version.getMajor())
- .addVersion(version.getMinor())
- .setWriterVersion(OrcFile.CURRENT_WRITER.getId());
- physWriter.writePostScript(builder);
- }
-
- private long estimateStripeSize() {
- return physWriter.estimateMemory() + treeWriter.estimateMemory();
- }
-
- @Override
- public TypeDescription getSchema() {
- return schema;
- }
-
- @Override
- public void addUserMetadata(String name, ByteBuffer value) {
- userMetadata.put(name, ByteString.copyFrom(value));
- }
-
- @Override
- public void addRowBatch(VectorizedRowBatch batch) throws IOException {
- if (buildIndex) {
- // Batch the writes up to the rowIndexStride so that we can get the
- // right size indexes.
- int posn = 0;
- while (posn < batch.size) {
- int chunkSize = Math.min(batch.size - posn,
- rowIndexStride - rowsInIndex);
- treeWriter.writeRootBatch(batch, posn, chunkSize);
- posn += chunkSize;
- rowsInIndex += chunkSize;
- rowsInStripe += chunkSize;
- if (rowsInIndex >= rowIndexStride) {
- createRowIndexEntry();
- }
- }
- } else {
- rowsInStripe += batch.size;
- treeWriter.writeRootBatch(batch, 0, batch.size);
- }
- if (path != null) {
- memoryManager.addedRow(batch.size);
- }
- }
-
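A worked example, not part of the deleted file: when the index is being built, addRowBatch splits each incoming batch at row-index-stride boundaries so that createRowIndexEntry fires exactly on stride multiples:

    // rowIndexStride = 10000, rowsInIndex = 7000, batch.size = 5000
    // chunk 1: min(5000, 10000 - 7000) = 3000 rows -> rowsInIndex hits 10000 -> createRowIndexEntry()
    // chunk 2: min(2000, 10000)        = 2000 rows -> rowsInIndex = 2000
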
- @Override
- public void close() throws IOException {
- if (callback != null) {
- callback.preFooterWrite(callbackContext);
- }
- // remove us from the memory manager so that we don't get any callbacks
- if (path != null) {
- memoryManager.removeWriter(path);
- }
- // actually close the file
- flushStripe();
- writeMetadata();
- writeFooter();
- writePostScript();
- physWriter.close();
- }
-
- /**
- * Raw data size will be computed when writing the file footer. Hence the raw
- * data size value will be available only after closing the writer.
- */
- @Override
- public long getRawDataSize() {
- return rawDataSize;
- }
-
- /**
- * Row count gets updated when flushing the stripes. To get an accurate row
- * count, call this method after the writer is closed.
- */
- @Override
- public long getNumberOfRows() {
- return rowCount;
- }
-
- @Override
- public long writeIntermediateFooter() throws IOException {
- // flush any buffered rows
- flushStripe();
- // write a footer
- if (stripesAtLastFlush != stripes.size()) {
- if (callback != null) {
- callback.preFooterWrite(callbackContext);
- }
- writeMetadata();
- writeFooter();
- writePostScript();
- stripesAtLastFlush = stripes.size();
- physWriter.flush();
- }
- return physWriter.getRawWriterPosition();
- }
-
- @Override
- public void appendStripe(byte[] stripe, int offset, int length,
- StripeInformation stripeInfo,
- OrcProto.StripeStatistics stripeStatistics) throws IOException {
- checkArgument(stripe != null, "Stripe must not be null");
- checkArgument(length <= stripe.length,
- "Specified length must not be greater specified array length");
- checkArgument(stripeInfo != null, "Stripe information must not be null");
- checkArgument(stripeStatistics != null,
- "Stripe statistics must not be null");
-
- ensureStream();
- OrcProto.StripeInformation.Builder dirEntry = OrcProto.StripeInformation.newBuilder();
- physWriter.appendRawStripe(stripe, offset, length, dirEntry);
-
- rowsInStripe = stripeStatistics.getColStats(0).getNumberOfValues();
- rowCount += rowsInStripe;
-
- // since we have already written the stripe, just update stripe statistics
- treeWriter.stripeStatsBuilders.add(stripeStatistics.toBuilder());
-
- // update file level statistics
- updateFileStatistics(stripeStatistics);
-
- // update stripe information
- stripes.add(dirEntry.setNumberOfRows(rowsInStripe)
- .setIndexLength(stripeInfo.getIndexLength())
- .setDataLength(stripeInfo.getDataLength())
- .setFooterLength(stripeInfo.getFooterLength())
- .build());
-
- // reset it after writing the stripe
- rowsInStripe = 0;
- }
-
- private void updateFileStatistics(OrcProto.StripeStatistics stripeStatistics) {
- List<OrcProto.ColumnStatistics> cs =
<TRUNCATED>
[37/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
HIVE-17118. Move the hive-orc source files to make the package names unique.
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/df8921d8
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/df8921d8
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/df8921d8
Branch: refs/heads/branch-2.2
Commit: df8921d851744c7c0aca956f5f3420fd08ee2134
Parents: 3e42672
Author: Owen O'Malley <om...@apache.org>
Authored: Tue Jul 18 12:57:07 2017 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue Jul 18 14:15:51 2017 -0700
----------------------------------------------------------------------
bin/ext/orcfiledump.cmd | 2 +-
bin/ext/orcfiledump.sh | 4 +-
orc/pom.xml | 32 -
.../org/apache/hive/orc/OrcProto.java | 20179 +++++++++++++++++
.../protobuf-java/org/apache/orc/OrcProto.java | 20179 -----------------
.../apache/hive/orc/BinaryColumnStatistics.java | 25 +
.../java/org/apache/hive/orc/BloomFilterIO.java | 43 +
.../hive/orc/BooleanColumnStatistics.java | 27 +
.../org/apache/hive/orc/ColumnStatistics.java | 36 +
.../org/apache/hive/orc/CompressionCodec.java | 69 +
.../org/apache/hive/orc/CompressionKind.java | 27 +
.../java/org/apache/hive/orc/DataReader.java | 76 +
.../apache/hive/orc/DateColumnStatistics.java | 37 +
.../hive/orc/DecimalColumnStatistics.java | 45 +
.../apache/hive/orc/DoubleColumnStatistics.java | 44 +
.../apache/hive/orc/FileFormatException.java | 30 +
.../java/org/apache/hive/orc/FileMetadata.java | 60 +
.../hive/orc/IntegerColumnStatistics.java | 50 +
orc/src/java/org/apache/hive/orc/OrcConf.java | 193 +
orc/src/java/org/apache/hive/orc/OrcFile.java | 574 +
orc/src/java/org/apache/hive/orc/OrcUtils.java | 623 +
orc/src/java/org/apache/hive/orc/Reader.java | 375 +
.../java/org/apache/hive/orc/RecordReader.java | 64 +
.../apache/hive/orc/StringColumnStatistics.java | 41 +
.../org/apache/hive/orc/StripeInformation.java | 59 +
.../org/apache/hive/orc/StripeStatistics.java | 44 +
.../hive/orc/TimestampColumnStatistics.java | 38 +
.../org/apache/hive/orc/TypeDescription.java | 870 +
orc/src/java/org/apache/hive/orc/Writer.java | 110 +
.../org/apache/hive/orc/impl/AcidStats.java | 60 +
.../apache/hive/orc/impl/BitFieldReader.java | 214 +
.../apache/hive/orc/impl/BitFieldWriter.java | 69 +
.../org/apache/hive/orc/impl/BufferChunk.java | 85 +
.../hive/orc/impl/ColumnStatisticsImpl.java | 1101 +
.../hive/orc/impl/ConvertTreeReaderFactory.java | 2892 +++
.../hive/orc/impl/DataReaderProperties.java | 122 +
.../hive/orc/impl/DirectDecompressionCodec.java | 28 +
.../apache/hive/orc/impl/DynamicByteArray.java | 303 +
.../apache/hive/orc/impl/DynamicIntArray.java | 142 +
.../org/apache/hive/orc/impl/HadoopShims.java | 143 +
.../hive/orc/impl/HadoopShimsCurrent.java | 92 +
.../apache/hive/orc/impl/HadoopShims_2_2.java | 101 +
.../java/org/apache/hive/orc/impl/InStream.java | 498 +
.../org/apache/hive/orc/impl/IntegerReader.java | 82 +
.../org/apache/hive/orc/impl/IntegerWriter.java | 47 +
.../org/apache/hive/orc/impl/MemoryManager.java | 212 +
.../org/apache/hive/orc/impl/OrcAcidUtils.java | 88 +
.../java/org/apache/hive/orc/impl/OrcIndex.java | 43 +
.../java/org/apache/hive/orc/impl/OrcTail.java | 138 +
.../org/apache/hive/orc/impl/OutStream.java | 289 +
.../apache/hive/orc/impl/PhysicalFsWriter.java | 529 +
.../apache/hive/orc/impl/PhysicalWriter.java | 122 +
.../apache/hive/orc/impl/PositionProvider.java | 26 +
.../apache/hive/orc/impl/PositionRecorder.java | 25 +
.../hive/orc/impl/PositionedOutputStream.java | 39 +
.../org/apache/hive/orc/impl/ReaderImpl.java | 763 +
.../apache/hive/orc/impl/RecordReaderImpl.java | 1238 +
.../apache/hive/orc/impl/RecordReaderUtils.java | 578 +
.../org/apache/hive/orc/impl/RedBlackTree.java | 309 +
.../hive/orc/impl/RunLengthByteReader.java | 174 +
.../hive/orc/impl/RunLengthByteWriter.java | 106 +
.../hive/orc/impl/RunLengthIntegerReader.java | 173 +
.../hive/orc/impl/RunLengthIntegerReaderV2.java | 406 +
.../hive/orc/impl/RunLengthIntegerWriter.java | 143 +
.../hive/orc/impl/RunLengthIntegerWriterV2.java | 831 +
.../apache/hive/orc/impl/SchemaEvolution.java | 399 +
.../hive/orc/impl/SerializationUtils.java | 1311 ++
.../orc/impl/SettableUncompressedStream.java | 43 +
.../org/apache/hive/orc/impl/SnappyCodec.java | 108 +
.../org/apache/hive/orc/impl/StreamName.java | 97 +
.../hive/orc/impl/StringRedBlackTree.java | 207 +
.../apache/hive/orc/impl/TreeReaderFactory.java | 2162 ++
.../org/apache/hive/orc/impl/WriterImpl.java | 2443 ++
.../org/apache/hive/orc/impl/ZeroCopyShims.java | 89 +
.../org/apache/hive/orc/impl/ZlibCodec.java | 168 +
.../org/apache/hive/orc/tools/FileDump.java | 946 +
.../org/apache/hive/orc/tools/JsonFileDump.java | 411 +
.../org/apache/orc/BinaryColumnStatistics.java | 27 -
orc/src/java/org/apache/orc/BloomFilterIO.java | 43 -
.../org/apache/orc/BooleanColumnStatistics.java | 29 -
.../java/org/apache/orc/ColumnStatistics.java | 36 -
.../java/org/apache/orc/CompressionCodec.java | 69 -
.../java/org/apache/orc/CompressionKind.java | 27 -
orc/src/java/org/apache/orc/DataReader.java | 76 -
.../org/apache/orc/DateColumnStatistics.java | 39 -
.../org/apache/orc/DecimalColumnStatistics.java | 46 -
.../org/apache/orc/DoubleColumnStatistics.java | 46 -
.../org/apache/orc/FileFormatException.java | 30 -
orc/src/java/org/apache/orc/FileMetadata.java | 64 -
.../org/apache/orc/IntegerColumnStatistics.java | 52 -
orc/src/java/org/apache/orc/OrcConf.java | 193 -
orc/src/java/org/apache/orc/OrcFile.java | 574 -
orc/src/java/org/apache/orc/OrcUtils.java | 624 -
orc/src/java/org/apache/orc/Reader.java | 375 -
orc/src/java/org/apache/orc/RecordReader.java | 64 -
.../org/apache/orc/StringColumnStatistics.java | 43 -
.../java/org/apache/orc/StripeInformation.java | 59 -
.../java/org/apache/orc/StripeStatistics.java | 44 -
.../apache/orc/TimestampColumnStatistics.java | 38 -
.../java/org/apache/orc/TypeDescription.java | 870 -
orc/src/java/org/apache/orc/Writer.java | 114 -
orc/src/java/org/apache/orc/impl/AcidStats.java | 60 -
.../org/apache/orc/impl/BitFieldReader.java | 217 -
.../org/apache/orc/impl/BitFieldWriter.java | 73 -
.../java/org/apache/orc/impl/BufferChunk.java | 85 -
.../apache/orc/impl/ColumnStatisticsImpl.java | 1101 -
.../orc/impl/ConvertTreeReaderFactory.java | 2893 ---
.../apache/orc/impl/DataReaderProperties.java | 124 -
.../orc/impl/DirectDecompressionCodec.java | 28 -
.../org/apache/orc/impl/DynamicByteArray.java | 303 -
.../org/apache/orc/impl/DynamicIntArray.java | 142 -
.../java/org/apache/orc/impl/HadoopShims.java | 143 -
.../org/apache/orc/impl/HadoopShimsCurrent.java | 92 -
.../org/apache/orc/impl/HadoopShims_2_2.java | 101 -
orc/src/java/org/apache/orc/impl/InStream.java | 498 -
.../java/org/apache/orc/impl/IntegerReader.java | 82 -
.../java/org/apache/orc/impl/IntegerWriter.java | 47 -
.../java/org/apache/orc/impl/MemoryManager.java | 214 -
.../java/org/apache/orc/impl/OrcAcidUtils.java | 88 -
orc/src/java/org/apache/orc/impl/OrcIndex.java | 43 -
orc/src/java/org/apache/orc/impl/OrcTail.java | 140 -
orc/src/java/org/apache/orc/impl/OutStream.java | 289 -
.../org/apache/orc/impl/PhysicalFsWriter.java | 529 -
.../org/apache/orc/impl/PhysicalWriter.java | 122 -
.../org/apache/orc/impl/PositionProvider.java | 26 -
.../org/apache/orc/impl/PositionRecorder.java | 25 -
.../apache/orc/impl/PositionedOutputStream.java | 39 -
.../java/org/apache/orc/impl/ReaderImpl.java | 764 -
.../org/apache/orc/impl/RecordReaderImpl.java | 1238 -
.../org/apache/orc/impl/RecordReaderUtils.java | 578 -
.../java/org/apache/orc/impl/RedBlackTree.java | 311 -
.../apache/orc/impl/RunLengthByteReader.java | 174 -
.../apache/orc/impl/RunLengthByteWriter.java | 106 -
.../apache/orc/impl/RunLengthIntegerReader.java | 173 -
.../orc/impl/RunLengthIntegerReaderV2.java | 406 -
.../apache/orc/impl/RunLengthIntegerWriter.java | 143 -
.../orc/impl/RunLengthIntegerWriterV2.java | 831 -
.../org/apache/orc/impl/SchemaEvolution.java | 399 -
.../org/apache/orc/impl/SerializationUtils.java | 1311 --
.../orc/impl/SettableUncompressedStream.java | 44 -
.../java/org/apache/orc/impl/SnappyCodec.java | 108 -
.../java/org/apache/orc/impl/StreamName.java | 97 -
.../org/apache/orc/impl/StringRedBlackTree.java | 210 -
.../org/apache/orc/impl/TreeReaderFactory.java | 2163 --
.../java/org/apache/orc/impl/WriterImpl.java | 2446 --
.../java/org/apache/orc/impl/ZeroCopyShims.java | 89 -
orc/src/java/org/apache/orc/impl/ZlibCodec.java | 169 -
orc/src/java/org/apache/orc/tools/FileDump.java | 946 -
.../java/org/apache/orc/tools/JsonFileDump.java | 412 -
orc/src/protobuf/orc_proto.proto | 2 +-
.../apache/hive/orc/TestColumnStatistics.java | 364 +
.../apache/hive/orc/TestNewIntegerEncoding.java | 1373 ++
.../hive/orc/TestOrcNullOptimization.java | 415 +
.../org/apache/hive/orc/TestOrcTimezone1.java | 189 +
.../org/apache/hive/orc/TestOrcTimezone2.java | 143 +
.../org/apache/hive/orc/TestOrcTimezone3.java | 124 +
.../apache/hive/orc/TestStringDictionary.java | 291 +
.../apache/hive/orc/TestTypeDescription.java | 90 +
.../apache/hive/orc/TestUnrolledBitPack.java | 114 +
.../org/apache/hive/orc/TestVectorOrcFile.java | 2789 +++
.../hive/orc/impl/TestBitFieldReader.java | 145 +
.../org/apache/hive/orc/impl/TestBitPack.java | 279 +
.../hive/orc/impl/TestColumnStatisticsImpl.java | 64 +
.../hive/orc/impl/TestDataReaderProperties.java | 85 +
.../apache/hive/orc/impl/TestDynamicArray.java | 88 +
.../org/apache/hive/orc/impl/TestInStream.java | 314 +
.../orc/impl/TestIntegerCompressionReader.java | 130 +
.../apache/hive/orc/impl/TestMemoryManager.java | 132 +
.../apache/hive/orc/impl/TestOrcWideTable.java | 64 +
.../org/apache/hive/orc/impl/TestOutStream.java | 43 +
.../org/apache/hive/orc/impl/TestRLEv2.java | 307 +
.../apache/hive/orc/impl/TestReaderImpl.java | 152 +
.../hive/orc/impl/TestRecordReaderImpl.java | 1708 ++
.../hive/orc/impl/TestRunLengthByteReader.java | 143 +
.../orc/impl/TestRunLengthIntegerReader.java | 125 +
.../hive/orc/impl/TestSchemaEvolution.java | 480 +
.../hive/orc/impl/TestSerializationUtils.java | 199 +
.../apache/hive/orc/impl/TestStreamName.java | 49 +
.../hive/orc/impl/TestStringRedBlackTree.java | 232 +
.../test/org/apache/hive/orc/impl/TestZlib.java | 56 +
.../org/apache/hive/orc/tools/TestFileDump.java | 485 +
.../apache/hive/orc/tools/TestJsonFileDump.java | 150 +
.../org/apache/orc/TestColumnStatistics.java | 365 -
.../org/apache/orc/TestNewIntegerEncoding.java | 1373 --
.../org/apache/orc/TestOrcNullOptimization.java | 415 -
.../test/org/apache/orc/TestOrcTimezone1.java | 189 -
.../test/org/apache/orc/TestOrcTimezone2.java | 143 -
.../test/org/apache/orc/TestOrcTimezone3.java | 126 -
.../org/apache/orc/TestStringDictionary.java | 290 -
.../org/apache/orc/TestTypeDescription.java | 91 -
.../org/apache/orc/TestUnrolledBitPack.java | 114 -
.../test/org/apache/orc/TestVectorOrcFile.java | 2789 ---
.../org/apache/orc/impl/TestBitFieldReader.java | 145 -
.../test/org/apache/orc/impl/TestBitPack.java | 279 -
.../orc/impl/TestColumnStatisticsImpl.java | 64 -
.../orc/impl/TestDataReaderProperties.java | 86 -
.../org/apache/orc/impl/TestDynamicArray.java | 90 -
.../test/org/apache/orc/impl/TestInStream.java | 314 -
.../orc/impl/TestIntegerCompressionReader.java | 130 -
.../org/apache/orc/impl/TestMemoryManager.java | 133 -
.../org/apache/orc/impl/TestOrcWideTable.java | 64 -
.../test/org/apache/orc/impl/TestOutStream.java | 43 -
orc/src/test/org/apache/orc/impl/TestRLEv2.java | 307 -
.../org/apache/orc/impl/TestReaderImpl.java | 152 -
.../apache/orc/impl/TestRecordReaderImpl.java | 1709 --
.../orc/impl/TestRunLengthByteReader.java | 143 -
.../orc/impl/TestRunLengthIntegerReader.java | 125 -
.../apache/orc/impl/TestSchemaEvolution.java | 480 -
.../apache/orc/impl/TestSerializationUtils.java | 201 -
.../org/apache/orc/impl/TestStreamName.java | 49 -
.../apache/orc/impl/TestStringRedBlackTree.java | 234 -
orc/src/test/org/apache/orc/impl/TestZlib.java | 56 -
.../test/org/apache/orc/tools/TestFileDump.java | 486 -
.../org/apache/orc/tools/TestJsonFileDump.java | 150 -
214 files changed, 55660 insertions(+), 55754 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/bin/ext/orcfiledump.cmd
----------------------------------------------------------------------
diff --git a/bin/ext/orcfiledump.cmd b/bin/ext/orcfiledump.cmd
index ff4b410..2e7e2ca 100644
--- a/bin/ext/orcfiledump.cmd
+++ b/bin/ext/orcfiledump.cmd
@@ -14,7 +14,7 @@
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
-set CLASS=org.apache.orc.tools.FileDump
+set CLASS=org.apache.hive.orc.tools.FileDump
set HIVE_OPTS=
set HADOOP_CLASSPATH=
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/bin/ext/orcfiledump.sh
----------------------------------------------------------------------
diff --git a/bin/ext/orcfiledump.sh b/bin/ext/orcfiledump.sh
index c84e61c..3e08280 100644
--- a/bin/ext/orcfiledump.sh
+++ b/bin/ext/orcfiledump.sh
@@ -17,7 +17,7 @@ THISSERVICE=orcfiledump
export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
orcfiledump () {
- CLASS=org.apache.orc.tools.FileDump
+ CLASS=org.apache.hive.orc.tools.FileDump
HIVE_OPTS=''
execHiveCmd $CLASS "$@"
}
@@ -34,4 +34,4 @@ orcfiledump_help () {
echo " --skip-dump Used along with --recover to directly recover files without dumping"
echo " --backup-path <new_path> Specify a backup path to store the corrupted files (default: /tmp)"
echo " --help (-h) Print help message"
-}
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/pom.xml
----------------------------------------------------------------------
diff --git a/orc/pom.xml b/orc/pom.xml
index de9c417..f75b91c 100644
--- a/orc/pom.xml
+++ b/orc/pom.xml
@@ -179,38 +179,6 @@
</execution>
</executions>
</plugin>
-
- <!-- Below we shade all of the org.apache.orc class names in hive-orc
- to a unique org.apache.hive.orc prefix. This allows clients to
- use both Hive 2.2 and the standalone ORC project. The uses in
- Hive 2.2 have been changed to use the org.apache.hive.orc prefix.
- In Hive 2.3 and beyond, Hive uses the standalone ORC project. -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-shade-plugin</artifactId>
- <version>3.0.0</version>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>shade</goal>
- </goals>
- <configuration>
- <artifactSet>
- <includes>
- <include>org.apache.hive:hive-orc</include>
- </includes>
- </artifactSet>
- <relocations>
- <relocation>
- <pattern>org.apache.orc</pattern>
- <shadedPattern>org.apache.hive.orc</shadedPattern>
- </relocation>
- </relocations>
- </configuration>
- </execution>
- </executions>
- </plugin>
</plugins>
</build>
</project>
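For context on what the shading block removed in the pom.xml change above was doing: with Hive 2.2's bundled copy of ORC living under org.apache.hive.orc (now via the source move rather than shade-time relocation), it can sit on one classpath next to the standalone Apache ORC artifact without class-name collisions. Below is a minimal illustrative sketch of that coexistence; it is not part of this commit, the class name DualOrcReaders and the command-line argument are made up, and it assumes both the hive-orc jar from this branch and a standalone orc-core jar are on the classpath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

// Illustrative sketch only -- not part of HIVE-17118. Fully qualified names are
// used so the two OrcFile/Reader classes can appear side by side without
// import clashes.
public class DualOrcReaders {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path(args[0]);   // any existing ORC file

    // Hive 2.2's bundled reader, now under the renamed package.
    org.apache.hive.orc.Reader hiveReader =
        org.apache.hive.orc.OrcFile.createReader(file,
            org.apache.hive.orc.OrcFile.readerOptions(conf));

    // The standalone Apache ORC reader, unchanged.
    org.apache.orc.Reader orcReader =
        org.apache.orc.OrcFile.createReader(file,
            org.apache.orc.OrcFile.readerOptions(conf));

    System.out.println("rows via hive-orc:   " + hiveReader.getNumberOfRows());
    System.out.println("rows via apache-orc: " + orcReader.getNumberOfRows());
  }
}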
[10/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestVectorOrcFile.java b/orc/src/test/org/apache/hive/orc/TestVectorOrcFile.java
new file mode 100644
index 0000000..d78a2ef
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestVectorOrcFile.java
@@ -0,0 +1,2789 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import com.google.common.collect.Lists;
+
+import junit.framework.Assert;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hive.orc.impl.DataReaderProperties;
+import org.apache.hive.orc.impl.OrcIndex;
+import org.apache.hive.orc.impl.RecordReaderImpl;
+import org.apache.hive.orc.tools.TestJsonFileDump;
+import org.apache.hive.orc.impl.MemoryManager;
+import org.apache.hive.orc.impl.RecordReaderUtils;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import java.io.File;
+import java.io.IOException;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import static junit.framework.TestCase.assertNotNull;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests for the vectorized reader and writer for ORC files.
+ */
+public class TestVectorOrcFile {
+
+ public static class InnerStruct {
+ int int1;
+ Text string1 = new Text();
+ InnerStruct(int int1, Text string1) {
+ this.int1 = int1;
+ this.string1.set(string1);
+ }
+ InnerStruct(int int1, String string1) {
+ this.int1 = int1;
+ this.string1.set(string1);
+ }
+
+ public String toString() {
+ return "{" + int1 + ", " + string1 + "}";
+ }
+ }
+
+ public static class MiddleStruct {
+ List<InnerStruct> list = new ArrayList<InnerStruct>();
+
+ MiddleStruct(InnerStruct... items) {
+ list.clear();
+ list.addAll(Arrays.asList(items));
+ }
+ }
+
+ private static InnerStruct inner(int i, String s) {
+ return new InnerStruct(i, s);
+ }
+
+ private static Map<String, InnerStruct> map(InnerStruct... items) {
+ Map<String, InnerStruct> result = new HashMap<String, InnerStruct>();
+ for(InnerStruct i: items) {
+ result.put(i.string1.toString(), i);
+ }
+ return result;
+ }
+
+ private static List<InnerStruct> list(InnerStruct... items) {
+ List<InnerStruct> result = new ArrayList<InnerStruct>();
+ result.addAll(Arrays.asList(items));
+ return result;
+ }
+
+ private static BytesWritable bytes(int... items) {
+ BytesWritable result = new BytesWritable();
+ result.setSize(items.length);
+ for(int i=0; i < items.length; ++i) {
+ result.getBytes()[i] = (byte) items[i];
+ }
+ return result;
+ }
+
+ private static byte[] bytesArray(int... items) {
+ byte[] result = new byte[items.length];
+ for(int i=0; i < items.length; ++i) {
+ result[i] = (byte) items[i];
+ }
+ return result;
+ }
+
+ private static ByteBuffer byteBuf(int... items) {
+ ByteBuffer result = ByteBuffer.allocate(items.length);
+ for(int item: items) {
+ result.put((byte) item);
+ }
+ result.flip();
+ return result;
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem () throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestVectorOrcFile." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testReadFormat_0_11() throws Exception {
+ Path oldFilePath =
+ new Path(TestJsonFileDump.getFileFromClasspath("orc-file-11-format.orc"));
+ Reader reader = OrcFile.createReader(oldFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ int stripeCount = 0;
+ int rowCount = 0;
+ long currentOffset = -1;
+ for(StripeInformation stripe : reader.getStripes()) {
+ stripeCount += 1;
+ rowCount += stripe.getNumberOfRows();
+ if (currentOffset < 0) {
+ currentOffset = stripe.getOffset() + stripe.getIndexLength()
+ + stripe.getDataLength() + stripe.getFooterLength();
+ } else {
+ assertEquals(currentOffset, stripe.getOffset());
+ currentOffset += stripe.getIndexLength() + stripe.getDataLength()
+ + stripe.getFooterLength();
+ }
+ }
+ Assert.assertEquals(reader.getNumberOfRows(), rowCount);
+ assertEquals(2, stripeCount);
+
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(7500, stats[1].getNumberOfValues());
+ assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
+ assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
+ assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
+
+ assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
+ assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
+ assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
+ assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000",
+ stats[3].toString());
+
+ assertEquals(Long.MAX_VALUE,
+ ((IntegerColumnStatistics) stats[5]).getMaximum());
+ assertEquals(Long.MAX_VALUE,
+ ((IntegerColumnStatistics) stats[5]).getMinimum());
+ assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
+ assertEquals(
+ "count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807",
+ stats[5].toString());
+
+ assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
+ assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
+ assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(),
+ 0.00001);
+ assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0",
+ stats[7].toString());
+
+ assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
+
+ // check the inspectors
+ TypeDescription schema = reader.getSchema();
+ assertEquals(TypeDescription.Category.STRUCT, schema.getCategory());
+ assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint,"
+ + "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
+ + "binary,string1:string,middle:struct<list:array<struct<int1:int,"
+ + "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
+ + "map:map<string,struct<int1:int,string1:string>>,ts:timestamp,"
+ + "decimal1:decimal(38,10)>", schema.toString());
+ VectorizedRowBatch batch = schema.createRowBatch();
+
+ RecordReader rows = reader.rows();
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1024, batch.size);
+
+ // check the contents of the first row
+ assertEquals(false, getBoolean(batch, 0));
+ assertEquals(1, getByte(batch, 0));
+ assertEquals(1024, getShort(batch, 0));
+ assertEquals(65536, getInt(batch, 0));
+ assertEquals(Long.MAX_VALUE, getLong(batch, 0));
+ assertEquals(1.0, getFloat(batch, 0), 0.00001);
+ assertEquals(-15.0, getDouble(batch, 0), 0.00001);
+ assertEquals(bytes(0, 1, 2, 3, 4), getBinary(batch, 0));
+ assertEquals("hi", getText(batch, 0).toString());
+
+ StructColumnVector middle = (StructColumnVector) batch.cols[9];
+ ListColumnVector midList = (ListColumnVector) middle.fields[0];
+ StructColumnVector midListStruct = (StructColumnVector) midList.child;
+ LongColumnVector midListInt = (LongColumnVector) midListStruct.fields[0];
+ BytesColumnVector midListStr = (BytesColumnVector) midListStruct.fields[1];
+ ListColumnVector list = (ListColumnVector) batch.cols[10];
+ StructColumnVector listStruct = (StructColumnVector) list.child;
+ LongColumnVector listInts = (LongColumnVector) listStruct.fields[0];
+ BytesColumnVector listStrs = (BytesColumnVector) listStruct.fields[1];
+ MapColumnVector map = (MapColumnVector) batch.cols[11];
+ BytesColumnVector mapKey = (BytesColumnVector) map.keys;
+ StructColumnVector mapValue = (StructColumnVector) map.values;
+ LongColumnVector mapValueInts = (LongColumnVector) mapValue.fields[0];
+ BytesColumnVector mapValueStrs = (BytesColumnVector) mapValue.fields[1];
+ TimestampColumnVector timestamp = (TimestampColumnVector) batch.cols[12];
+ DecimalColumnVector decs = (DecimalColumnVector) batch.cols[13];
+
+ assertEquals(false, middle.isNull[0]);
+ assertEquals(2, midList.lengths[0]);
+ int start = (int) midList.offsets[0];
+ assertEquals(1, midListInt.vector[start]);
+ assertEquals("bye", midListStr.toString(start));
+ assertEquals(2, midListInt.vector[start + 1]);
+ assertEquals("sigh", midListStr.toString(start + 1));
+
+ assertEquals(2, list.lengths[0]);
+ start = (int) list.offsets[0];
+ assertEquals(3, listInts.vector[start]);
+ assertEquals("good", listStrs.toString(start));
+ assertEquals(4, listInts.vector[start + 1]);
+ assertEquals("bad", listStrs.toString(start + 1));
+ assertEquals(0, map.lengths[0]);
+ assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
+ timestamp.asScratchTimestamp(0));
+ assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")),
+ decs.vector[0]);
+
+ // check the contents of row 7499
+ rows.seekToRow(7499);
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(true, getBoolean(batch, 0));
+ assertEquals(100, getByte(batch, 0));
+ assertEquals(2048, getShort(batch, 0));
+ assertEquals(65536, getInt(batch, 0));
+ assertEquals(Long.MAX_VALUE, getLong(batch, 0));
+ assertEquals(2.0, getFloat(batch, 0), 0.00001);
+ assertEquals(-5.0, getDouble(batch, 0), 0.00001);
+ assertEquals(bytes(), getBinary(batch, 0));
+ assertEquals("bye", getText(batch, 0).toString());
+ assertEquals(false, middle.isNull[0]);
+ assertEquals(2, midList.lengths[0]);
+ start = (int) midList.offsets[0];
+ assertEquals(1, midListInt.vector[start]);
+ assertEquals("bye", midListStr.toString(start));
+ assertEquals(2, midListInt.vector[start + 1]);
+ assertEquals("sigh", midListStr.toString(start + 1));
+ assertEquals(3, list.lengths[0]);
+ start = (int) list.offsets[0];
+ assertEquals(100000000, listInts.vector[start]);
+ assertEquals("cat", listStrs.toString(start));
+ assertEquals(-100000, listInts.vector[start + 1]);
+ assertEquals("in", listStrs.toString(start + 1));
+ assertEquals(1234, listInts.vector[start + 2]);
+ assertEquals("hat", listStrs.toString(start + 2));
+ assertEquals(2, map.lengths[0]);
+ start = (int) map.offsets[0];
+ assertEquals("chani", mapKey.toString(start));
+ assertEquals(5, mapValueInts.vector[start]);
+ assertEquals("chani", mapValueStrs.toString(start));
+ assertEquals("mauddib", mapKey.toString(start + 1));
+ assertEquals(1, mapValueInts.vector[start + 1]);
+ assertEquals("mauddib", mapValueStrs.toString(start + 1));
+ assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"),
+ timestamp.asScratchTimestamp(0));
+ assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547457")),
+ decs.vector[0]);
+
+ // handle the close up
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+
+ @Test
+ public void testTimestamp() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .bufferSize(10000).version(OrcFile.Version.V_0_11));
+ List<Timestamp> tslist = Lists.newArrayList();
+ tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.000999"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.000000222"));
+ tslist.add(Timestamp.valueOf("1999-01-01 00:00:00.999999999"));
+ tslist.add(Timestamp.valueOf("1995-01-01 00:00:00.688888888"));
+ tslist.add(Timestamp.valueOf("2002-01-01 00:00:00.1"));
+ tslist.add(Timestamp.valueOf("2010-03-02 00:00:00.000009001"));
+ tslist.add(Timestamp.valueOf("2005-01-01 00:00:00.000002229"));
+ tslist.add(Timestamp.valueOf("2006-01-01 00:00:00.900203003"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.800000007"));
+ tslist.add(Timestamp.valueOf("1996-08-02 00:00:00.723100809"));
+ tslist.add(Timestamp.valueOf("1998-11-02 00:00:00.857340643"));
+ tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
+
+ VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024);
+ TimestampColumnVector vec = new TimestampColumnVector(1024);
+ batch.cols[0] = vec;
+ batch.reset();
+ batch.size = tslist.size();
+ for (int i=0; i < tslist.size(); ++i) {
+ Timestamp ts = tslist.get(i);
+ vec.set(i, ts);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ TimestampColumnVector timestamps = (TimestampColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(tslist.get(idx++).getNanos(),
+ timestamps.asScratchTimestamp(r).getNanos());
+ }
+ }
+ Assert.assertEquals(tslist.size(), rows.getRowNumber());
+ assertEquals(0, writer.getSchema().getMaximumId());
+ boolean[] expected = new boolean[] {false};
+ boolean[] included = OrcUtils.includeColumns("", writer.getSchema());
+ assertEquals(true, Arrays.equals(expected, included));
+ }
+
+ @Test
+ public void testStringAndBinaryStatistics() throws Exception {
+
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("bytes1", TypeDescription.createBinary())
+ .addField("string1", TypeDescription.createString());
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 4;
+ BytesColumnVector field1 = (BytesColumnVector) batch.cols[0];
+ BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
+ field1.setVal(0, bytesArray(0, 1, 2, 3, 4));
+ field1.setVal(1, bytesArray(0, 1, 2, 3));
+ field1.setVal(2, bytesArray(0, 1, 2, 3, 4, 5));
+ field1.noNulls = false;
+ field1.isNull[3] = true;
+ field2.setVal(0, "foo".getBytes());
+ field2.setVal(1, "bar".getBytes());
+ field2.noNulls = false;
+ field2.isNull[2] = true;
+ field2.setVal(3, "hi".getBytes());
+ writer.addRowBatch(batch);
+ writer.close();
+ schema = writer.getSchema();
+ assertEquals(2, schema.getMaximumId());
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ boolean[] expected = new boolean[] {false, false, true};
+ boolean[] included = OrcUtils.includeColumns("string1", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, false, false};
+ included = OrcUtils.includeColumns("", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, false, false};
+ included = OrcUtils.includeColumns(null, schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(4, stats[0].getNumberOfValues());
+ assertEquals("count: 4 hasNull: false", stats[0].toString());
+
+ assertEquals(3, stats[1].getNumberOfValues());
+ assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 3 hasNull: true sum: 15", stats[1].toString());
+
+ assertEquals(3, stats[2].getNumberOfValues());
+ assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals(8, ((StringColumnStatistics) stats[2]).getSum());
+ assertEquals("count: 3 hasNull: true min: bar max: hi sum: 8",
+ stats[2].toString());
+
+ // check the inspectors
+ batch = reader.getSchema().createRowBatch();
+ BytesColumnVector bytes = (BytesColumnVector) batch.cols[0];
+ BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
+ RecordReader rows = reader.rows();
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(4, batch.size);
+
+ // check the contents of the first row
+ assertEquals(bytes(0,1,2,3,4), getBinary(bytes, 0));
+ assertEquals("foo", strs.toString(0));
+
+ // check the contents of second row
+ assertEquals(bytes(0,1,2,3), getBinary(bytes, 1));
+ assertEquals("bar", strs.toString(1));
+
+ // check the contents of third row
+ assertEquals(bytes(0,1,2,3,4,5), getBinary(bytes, 2));
+ assertNull(strs.toString(2));
+
+ // check the contents of fourth row
+ assertNull(getBinary(bytes, 3));
+ assertEquals("hi", strs.toString(3));
+
+ // handle the close up
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+
+
+ @Test
+ public void testStripeLevelStats() throws Exception {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("int1", TypeDescription.createInt())
+ .addField("string1", TypeDescription.createString());
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1000;
+ LongColumnVector field1 = (LongColumnVector) batch.cols[0];
+ BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
+ field1.isRepeating = true;
+ field2.isRepeating = true;
+ for (int b = 0; b < 11; b++) {
+ if (b >= 5) {
+ if (b >= 10) {
+ field1.vector[0] = 3;
+ field2.setVal(0, "three".getBytes());
+ } else {
+ field1.vector[0] = 2;
+ field2.setVal(0, "two".getBytes());
+ }
+ } else {
+ field1.vector[0] = 1;
+ field2.setVal(0, "one".getBytes());
+ }
+ writer.addRowBatch(batch);
+ }
+
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ schema = writer.getSchema();
+ assertEquals(2, schema.getMaximumId());
+ boolean[] expected = new boolean[] {false, true, false};
+ boolean[] included = OrcUtils.includeColumns("int1", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ List<StripeStatistics> stats = reader.getStripeStatistics();
+ int numStripes = stats.size();
+ assertEquals(3, numStripes);
+ StripeStatistics ss1 = stats.get(0);
+ StripeStatistics ss2 = stats.get(1);
+ StripeStatistics ss3 = stats.get(2);
+
+ assertEquals(5000, ss1.getColumnStatistics()[0].getNumberOfValues());
+ assertEquals(5000, ss2.getColumnStatistics()[0].getNumberOfValues());
+ assertEquals(1000, ss3.getColumnStatistics()[0].getNumberOfValues());
+
+ assertEquals(5000, (ss1.getColumnStatistics()[1]).getNumberOfValues());
+ assertEquals(5000, (ss2.getColumnStatistics()[1]).getNumberOfValues());
+ assertEquals(1000, (ss3.getColumnStatistics()[1]).getNumberOfValues());
+ assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMinimum());
+ assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMinimum());
+ assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMinimum());
+ assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMaximum());
+ assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMaximum());
+ assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMaximum());
+ assertEquals(5000, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getSum());
+ assertEquals(10000, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getSum());
+ assertEquals(3000, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getSum());
+
+ assertEquals(5000, (ss1.getColumnStatistics()[2]).getNumberOfValues());
+ assertEquals(5000, (ss2.getColumnStatistics()[2]).getNumberOfValues());
+ assertEquals(1000, (ss3.getColumnStatistics()[2]).getNumberOfValues());
+ assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMinimum());
+ assertEquals("two", ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getMinimum());
+ assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMinimum());
+ assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMaximum());
+ assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMaximum());
+ assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMaximum());
+ assertEquals(15000, ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getSum());
+ assertEquals(15000, ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getSum());
+ assertEquals(5000, ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getSum());
+
+ RecordReaderImpl recordReader = (RecordReaderImpl) reader.rows();
+ OrcProto.RowIndex[] index = recordReader.readRowIndex(0, null, null).getRowGroupIndex();
+ assertEquals(3, index.length);
+ List<OrcProto.RowIndexEntry> items = index[1].getEntryList();
+ assertEquals(1, items.size());
+ assertEquals(3, items.get(0).getPositionsCount());
+ assertEquals(0, items.get(0).getPositions(0));
+ assertEquals(0, items.get(0).getPositions(1));
+ assertEquals(0, items.get(0).getPositions(2));
+ assertEquals(1,
+ items.get(0).getStatistics().getIntStatistics().getMinimum());
+ index = recordReader.readRowIndex(1, null, null).getRowGroupIndex();
+ assertEquals(3, index.length);
+ items = index[1].getEntryList();
+ assertEquals(2,
+ items.get(0).getStatistics().getIntStatistics().getMaximum());
+ }
+
+ private static void setInner(StructColumnVector inner, int rowId,
+ int i, String value) {
+ ((LongColumnVector) inner.fields[0]).vector[rowId] = i;
+ if (value != null) {
+ ((BytesColumnVector) inner.fields[1]).setVal(rowId, value.getBytes());
+ } else {
+ inner.fields[1].isNull[rowId] = true;
+ inner.fields[1].noNulls = false;
+ }
+ }
+
+ private static void checkInner(StructColumnVector inner, int rowId,
+ int rowInBatch, int i, String value) {
+ assertEquals("row " + rowId, i,
+ ((LongColumnVector) inner.fields[0]).vector[rowInBatch]);
+ if (value != null) {
+ assertEquals("row " + rowId, value,
+ ((BytesColumnVector) inner.fields[1]).toString(rowInBatch));
+ } else {
+ assertEquals("row " + rowId, true, inner.fields[1].isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, inner.fields[1].noNulls);
+ }
+ }
+
+ private static void setInnerList(ListColumnVector list, int rowId,
+ List<InnerStruct> value) {
+ if (value != null) {
+ if (list.childCount + value.size() > list.child.isNull.length) {
+ list.child.ensureSize(list.childCount * 2, true);
+ }
+ list.lengths[rowId] = value.size();
+ list.offsets[rowId] = list.childCount;
+ for (int i = 0; i < list.lengths[rowId]; ++i) {
+ InnerStruct inner = value.get(i);
+ setInner((StructColumnVector) list.child, i + list.childCount,
+ inner.int1, inner.string1.toString());
+ }
+ list.childCount += value.size();
+ } else {
+ list.isNull[rowId] = true;
+ list.noNulls = false;
+ }
+ }
+
+ private static void checkInnerList(ListColumnVector list, int rowId,
+ int rowInBatch, List<InnerStruct> value) {
+ if (value != null) {
+ assertEquals("row " + rowId, value.size(), list.lengths[rowInBatch]);
+ int start = (int) list.offsets[rowInBatch];
+ for (int i = 0; i < list.lengths[rowInBatch]; ++i) {
+ InnerStruct inner = value.get(i);
+ checkInner((StructColumnVector) list.child, rowId, i + start,
+ inner.int1, inner.string1.toString());
+ }
+ list.childCount += value.size();
+ } else {
+ assertEquals("row " + rowId, true, list.isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, list.noNulls);
+ }
+ }
+
+ private static void setInnerMap(MapColumnVector map, int rowId,
+ Map<String, InnerStruct> value) {
+ if (value != null) {
+ if (map.childCount >= map.keys.isNull.length) {
+ map.keys.ensureSize(map.childCount * 2, true);
+ map.values.ensureSize(map.childCount * 2, true);
+ }
+ map.lengths[rowId] = value.size();
+ int offset = map.childCount;
+ map.offsets[rowId] = offset;
+
+ for (Map.Entry<String, InnerStruct> entry : value.entrySet()) {
+ ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes());
+ InnerStruct inner = entry.getValue();
+ setInner((StructColumnVector) map.values, offset, inner.int1,
+ inner.string1.toString());
+ offset += 1;
+ }
+ map.childCount = offset;
+ } else {
+ map.isNull[rowId] = true;
+ map.noNulls = false;
+ }
+ }
+
+ private static void checkInnerMap(MapColumnVector map, int rowId,
+ int rowInBatch,
+ Map<String, InnerStruct> value) {
+ if (value != null) {
+ assertEquals("row " + rowId, value.size(), map.lengths[rowInBatch]);
+ int offset = (int) map.offsets[rowInBatch];
+ for(int i=0; i < value.size(); ++i) {
+ String key = ((BytesColumnVector) map.keys).toString(offset + i);
+ InnerStruct expected = value.get(key);
+ checkInner((StructColumnVector) map.values, rowId, offset + i,
+ expected.int1, expected.string1.toString());
+ }
+ } else {
+ assertEquals("row " + rowId, true, map.isNull[rowId]);
+ assertEquals("row " + rowId, false, map.noNulls);
+ }
+ }
+
+ private static void setMiddleStruct(StructColumnVector middle, int rowId,
+ MiddleStruct value) {
+ if (value != null) {
+ setInnerList((ListColumnVector) middle.fields[0], rowId, value.list);
+ } else {
+ middle.isNull[rowId] = true;
+ middle.noNulls = false;
+ }
+ }
+
+ private static void checkMiddleStruct(StructColumnVector middle, int rowId,
+ int rowInBatch, MiddleStruct value) {
+ if (value != null) {
+ checkInnerList((ListColumnVector) middle.fields[0], rowId, rowInBatch,
+ value.list);
+ } else {
+ assertEquals("row " + rowId, true, middle.isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, middle.noNulls);
+ }
+ }
+
+ private static void setBigRow(VectorizedRowBatch batch, int rowId,
+ Boolean b1, Byte b2, Short s1,
+ Integer i1, Long l1, Float f1,
+ Double d1, BytesWritable b3, String s2,
+ MiddleStruct m1, List<InnerStruct> l2,
+ Map<String, InnerStruct> m2) {
+ ((LongColumnVector) batch.cols[0]).vector[rowId] = b1 ? 1 : 0;
+ ((LongColumnVector) batch.cols[1]).vector[rowId] = b2;
+ ((LongColumnVector) batch.cols[2]).vector[rowId] = s1;
+ ((LongColumnVector) batch.cols[3]).vector[rowId] = i1;
+ ((LongColumnVector) batch.cols[4]).vector[rowId] = l1;
+ ((DoubleColumnVector) batch.cols[5]).vector[rowId] = f1;
+ ((DoubleColumnVector) batch.cols[6]).vector[rowId] = d1;
+ if (b3 != null) {
+ ((BytesColumnVector) batch.cols[7]).setVal(rowId, b3.getBytes(), 0,
+ b3.getLength());
+ } else {
+ batch.cols[7].isNull[rowId] = true;
+ batch.cols[7].noNulls = false;
+ }
+ if (s2 != null) {
+ ((BytesColumnVector) batch.cols[8]).setVal(rowId, s2.getBytes());
+ } else {
+ batch.cols[8].isNull[rowId] = true;
+ batch.cols[8].noNulls = false;
+ }
+ setMiddleStruct((StructColumnVector) batch.cols[9], rowId, m1);
+ setInnerList((ListColumnVector) batch.cols[10], rowId, l2);
+ setInnerMap((MapColumnVector) batch.cols[11], rowId, m2);
+ }
+
+ private static void checkBigRow(VectorizedRowBatch batch,
+ int rowInBatch,
+ int rowId,
+ boolean b1, byte b2, short s1,
+ int i1, long l1, float f1,
+ double d1, BytesWritable b3, String s2,
+ MiddleStruct m1, List<InnerStruct> l2,
+ Map<String, InnerStruct> m2) {
+ assertEquals("row " + rowId, b1, getBoolean(batch, rowInBatch));
+ assertEquals("row " + rowId, b2, getByte(batch, rowInBatch));
+ assertEquals("row " + rowId, s1, getShort(batch, rowInBatch));
+ assertEquals("row " + rowId, i1, getInt(batch, rowInBatch));
+ assertEquals("row " + rowId, l1, getLong(batch, rowInBatch));
+ assertEquals("row " + rowId, f1, getFloat(batch, rowInBatch), 0.0001);
+ assertEquals("row " + rowId, d1, getDouble(batch, rowInBatch), 0.0001);
+ if (b3 != null) {
+ BytesColumnVector bytes = (BytesColumnVector) batch.cols[7];
+ assertEquals("row " + rowId, b3.getLength(), bytes.length[rowInBatch]);
+ for(int i=0; i < b3.getLength(); ++i) {
+ assertEquals("row " + rowId + " byte " + i, b3.getBytes()[i],
+ bytes.vector[rowInBatch][bytes.start[rowInBatch] + i]);
+ }
+ } else {
+ assertEquals("row " + rowId, true, batch.cols[7].isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, batch.cols[7].noNulls);
+ }
+ if (s2 != null) {
+ assertEquals("row " + rowId, s2, getText(batch, rowInBatch).toString());
+ } else {
+ assertEquals("row " + rowId, true, batch.cols[8].isNull[rowInBatch]);
+ assertEquals("row " + rowId, false, batch.cols[8].noNulls);
+ }
+ checkMiddleStruct((StructColumnVector) batch.cols[9], rowId, rowInBatch,
+ m1);
+ checkInnerList((ListColumnVector) batch.cols[10], rowId, rowInBatch, l2);
+ checkInnerMap((MapColumnVector) batch.cols[11], rowId, rowInBatch, m2);
+ }
+
+ private static boolean getBoolean(VectorizedRowBatch batch, int rowId) {
+ return ((LongColumnVector) batch.cols[0]).vector[rowId] != 0;
+ }
+
+ private static byte getByte(VectorizedRowBatch batch, int rowId) {
+ return (byte) ((LongColumnVector) batch.cols[1]).vector[rowId];
+ }
+
+ private static short getShort(VectorizedRowBatch batch, int rowId) {
+ return (short) ((LongColumnVector) batch.cols[2]).vector[rowId];
+ }
+
+ private static int getInt(VectorizedRowBatch batch, int rowId) {
+ return (int) ((LongColumnVector) batch.cols[3]).vector[rowId];
+ }
+
+ private static long getLong(VectorizedRowBatch batch, int rowId) {
+ return ((LongColumnVector) batch.cols[4]).vector[rowId];
+ }
+
+ private static float getFloat(VectorizedRowBatch batch, int rowId) {
+ return (float) ((DoubleColumnVector) batch.cols[5]).vector[rowId];
+ }
+
+ private static double getDouble(VectorizedRowBatch batch, int rowId) {
+ return ((DoubleColumnVector) batch.cols[6]).vector[rowId];
+ }
+
+ private static BytesWritable getBinary(BytesColumnVector column, int rowId) {
+ if (column.isRepeating) {
+ rowId = 0;
+ }
+ if (column.noNulls || !column.isNull[rowId]) {
+ return new BytesWritable(Arrays.copyOfRange(column.vector[rowId],
+ column.start[rowId], column.start[rowId] + column.length[rowId]));
+ } else {
+ return null;
+ }
+ }
+
+ private static BytesWritable getBinary(VectorizedRowBatch batch, int rowId) {
+ return getBinary((BytesColumnVector) batch.cols[7], rowId);
+ }
+
+ private static Text getText(BytesColumnVector vector, int rowId) {
+ if (vector.isRepeating) {
+ rowId = 0;
+ }
+ if (vector.noNulls || !vector.isNull[rowId]) {
+ return new Text(Arrays.copyOfRange(vector.vector[rowId],
+ vector.start[rowId], vector.start[rowId] + vector.length[rowId]));
+ } else {
+ return null;
+ }
+ }
+
+ private static Text getText(VectorizedRowBatch batch, int rowId) {
+ return getText((BytesColumnVector) batch.cols[8], rowId);
+ }
+
+ private static InnerStruct getInner(StructColumnVector vector,
+ int rowId) {
+ return new InnerStruct(
+ (int) ((LongColumnVector) vector.fields[0]).vector[rowId],
+ getText((BytesColumnVector) vector.fields[1], rowId));
+ }
+
+ private static List<InnerStruct> getList(ListColumnVector cv,
+ int rowId) {
+ if (cv.isRepeating) {
+ rowId = 0;
+ }
+ if (cv.noNulls || !cv.isNull[rowId]) {
+ List<InnerStruct> result =
+ new ArrayList<InnerStruct>((int) cv.lengths[rowId]);
+ for(long i=cv.offsets[rowId];
+ i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
+ result.add(getInner((StructColumnVector) cv.child, (int) i));
+ }
+ return result;
+ } else {
+ return null;
+ }
+ }
+
+ private static List<InnerStruct> getMidList(VectorizedRowBatch batch,
+ int rowId) {
+ return getList((ListColumnVector) ((StructColumnVector) batch.cols[9])
+ .fields[0], rowId);
+ }
+
+ private static List<InnerStruct> getList(VectorizedRowBatch batch,
+ int rowId) {
+ return getList((ListColumnVector) batch.cols[10], rowId);
+ }
+
+ private static Map<Text, InnerStruct> getMap(VectorizedRowBatch batch,
+ int rowId) {
+ MapColumnVector cv = (MapColumnVector) batch.cols[11];
+ if (cv.isRepeating) {
+ rowId = 0;
+ }
+ if (cv.noNulls || !cv.isNull[rowId]) {
+ Map<Text, InnerStruct> result =
+ new HashMap<Text, InnerStruct>((int) cv.lengths[rowId]);
+ for(long i=cv.offsets[rowId];
+ i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
+ result.put(getText((BytesColumnVector) cv.keys, (int) i),
+ getInner((StructColumnVector) cv.values, (int) i));
+ }
+ return result;
+ } else {
+ return null;
+ }
+ }
+
+ private static TypeDescription createInnerSchema() {
+ return TypeDescription.createStruct()
+ .addField("int1", TypeDescription.createInt())
+ .addField("string1", TypeDescription.createString());
+ }
+
+ private static TypeDescription createBigRowSchema() {
+ return TypeDescription.createStruct()
+ .addField("boolean1", TypeDescription.createBoolean())
+ .addField("byte1", TypeDescription.createByte())
+ .addField("short1", TypeDescription.createShort())
+ .addField("int1", TypeDescription.createInt())
+ .addField("long1", TypeDescription.createLong())
+ .addField("float1", TypeDescription.createFloat())
+ .addField("double1", TypeDescription.createDouble())
+ .addField("bytes1", TypeDescription.createBinary())
+ .addField("string1", TypeDescription.createString())
+ .addField("middle", TypeDescription.createStruct()
+ .addField("list", TypeDescription.createList(createInnerSchema())))
+ .addField("list", TypeDescription.createList(createInnerSchema()))
+ .addField("map", TypeDescription.createMap(
+ TypeDescription.createString(),
+ createInnerSchema()));
+ }
+
+ static void assertArrayEquals(boolean[] expected, boolean[] actual) {
+ assertEquals(expected.length, actual.length);
+ boolean diff = false;
+ for(int i=0; i < expected.length; ++i) {
+ if (expected[i] != actual[i]) {
+ System.out.println("Difference at " + i + " expected: " + expected[i] +
+ " actual: " + actual[i]);
+ diff = true;
+ }
+ }
+ assertEquals(false, diff);
+ }
+
+ @Test
+ public void test1() throws Exception {
+ TypeDescription schema = createBigRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 2;
+ setBigRow(batch, 0, false, (byte) 1, (short) 1024, 65536,
+ Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
+ new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
+ list(inner(3, "good"), inner(4, "bad")),
+ map());
+ setBigRow(batch, 1, true, (byte) 100, (short) 2048, 65536,
+ Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye",
+ new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
+ list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
+ map(inner(5, "chani"), inner(1, "mauddib")));
+ writer.addRowBatch(batch);
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ schema = writer.getSchema();
+ assertEquals(23, schema.getMaximumId());
+ boolean[] expected = new boolean[] {false, false, false, false, false,
+ false, false, false, false, false,
+ false, false, false, false, false,
+ false, false, false, false, false,
+ false, false, false, false};
+ boolean[] included = OrcUtils.includeColumns("", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, true, false, false, false,
+ false, false, false, false, true,
+ true, true, true, true, true,
+ false, false, false, false, true,
+ true, true, true, true};
+ included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema);
+
+ assertArrayEquals(expected, included);
+
+ expected = new boolean[] {false, true, false, false, false,
+ false, false, false, false, true,
+ true, true, true, true, true,
+ false, false, false, false, true,
+ true, true, true, true};
+ included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema);
+ assertArrayEquals(expected, included);
+
+ expected = new boolean[] {false, true, true, true, true,
+ true, true, true, true, true,
+ true, true, true, true, true,
+ true, true, true, true, true,
+ true, true, true, true};
+ included = OrcUtils.includeColumns(
+ "boolean1,byte1,short1,int1,long1,float1,double1,bytes1,string1,middle,list,map",
+ schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(2, stats[1].getNumberOfValues());
+ assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
+ assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
+ assertEquals("count: 2 hasNull: false true: 1", stats[1].toString());
+
+ assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
+ assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
+ assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
+ assertEquals("count: 2 hasNull: false min: 1024 max: 2048 sum: 3072",
+ stats[3].toString());
+
+ StripeStatistics ss = reader.getStripeStatistics().get(0);
+ assertEquals(2, ss.getColumnStatistics()[0].getNumberOfValues());
+ assertEquals(1, ((BooleanColumnStatistics) ss.getColumnStatistics()[1]).getTrueCount());
+ assertEquals(1024, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMinimum());
+ assertEquals(2048, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMaximum());
+ assertEquals(3072, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getSum());
+ assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
+ assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
+ assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
+ assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0",
+ stats[7].toString());
+
+ assertEquals("count: 2 hasNull: false min: bye max: hi sum: 5", stats[9].toString());
+
+ // check the schema
+ TypeDescription readerSchema = reader.getSchema();
+ assertEquals(TypeDescription.Category.STRUCT, readerSchema.getCategory());
+ assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint,"
+ + "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
+ + "binary,string1:string,middle:struct<list:array<struct<int1:int,"
+ + "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
+ + "map:map<string,struct<int1:int,string1:string>>>",
+ readerSchema.toString());
+ List<String> fieldNames = readerSchema.getFieldNames();
+ List<TypeDescription> fieldTypes = readerSchema.getChildren();
+ assertEquals("boolean1", fieldNames.get(0));
+ assertEquals(TypeDescription.Category.BOOLEAN, fieldTypes.get(0).getCategory());
+ assertEquals("byte1", fieldNames.get(1));
+ assertEquals(TypeDescription.Category.BYTE, fieldTypes.get(1).getCategory());
+ assertEquals("short1", fieldNames.get(2));
+ assertEquals(TypeDescription.Category.SHORT, fieldTypes.get(2).getCategory());
+ assertEquals("int1", fieldNames.get(3));
+ assertEquals(TypeDescription.Category.INT, fieldTypes.get(3).getCategory());
+ assertEquals("long1", fieldNames.get(4));
+ assertEquals(TypeDescription.Category.LONG, fieldTypes.get(4).getCategory());
+ assertEquals("float1", fieldNames.get(5));
+ assertEquals(TypeDescription.Category.FLOAT, fieldTypes.get(5).getCategory());
+ assertEquals("double1", fieldNames.get(6));
+ assertEquals(TypeDescription.Category.DOUBLE, fieldTypes.get(6).getCategory());
+ assertEquals("bytes1", fieldNames.get(7));
+ assertEquals(TypeDescription.Category.BINARY, fieldTypes.get(7).getCategory());
+ assertEquals("string1", fieldNames.get(8));
+ assertEquals(TypeDescription.Category.STRING, fieldTypes.get(8).getCategory());
+ assertEquals("middle", fieldNames.get(9));
+ TypeDescription middle = fieldTypes.get(9);
+ assertEquals(TypeDescription.Category.STRUCT, middle.getCategory());
+ TypeDescription midList = middle.getChildren().get(0);
+ assertEquals(TypeDescription.Category.LIST, midList.getCategory());
+ TypeDescription inner = midList.getChildren().get(0);
+ assertEquals(TypeDescription.Category.STRUCT, inner.getCategory());
+ assertEquals("int1", inner.getFieldNames().get(0));
+ assertEquals("string1", inner.getFieldNames().get(1));
+
+ RecordReader rows = reader.rows();
+ // create a new batch
+ batch = readerSchema.createRowBatch();
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(2, batch.size);
+ Assert.assertEquals(false, rows.nextBatch(batch));
+
+ // check the contents of the first row
+ assertEquals(false, getBoolean(batch, 0));
+ assertEquals(1, getByte(batch, 0));
+ assertEquals(1024, getShort(batch, 0));
+ assertEquals(65536, getInt(batch, 0));
+ assertEquals(Long.MAX_VALUE, getLong(batch, 0));
+ assertEquals(1.0, getFloat(batch, 0), 0.00001);
+ assertEquals(-15.0, getDouble(batch, 0), 0.00001);
+ assertEquals(bytes(0,1,2,3,4), getBinary(batch, 0));
+ assertEquals("hi", getText(batch, 0).toString());
+ List<InnerStruct> midRow = getMidList(batch, 0);
+ assertNotNull(midRow);
+ assertEquals(2, midRow.size());
+ assertEquals(1, midRow.get(0).int1);
+ assertEquals("bye", midRow.get(0).string1.toString());
+ assertEquals(2, midRow.get(1).int1);
+ assertEquals("sigh", midRow.get(1).string1.toString());
+ List<InnerStruct> list = getList(batch, 0);
+ assertEquals(2, list.size());
+ assertEquals(3, list.get(0).int1);
+ assertEquals("good", list.get(0).string1.toString());
+ assertEquals(4, list.get(1).int1);
+ assertEquals("bad", list.get(1).string1.toString());
+ Map<Text, InnerStruct> map = getMap(batch, 0);
+ assertEquals(0, map.size());
+
+ // check the contents of second row
+ assertEquals(true, getBoolean(batch, 1));
+ assertEquals(100, getByte(batch, 1));
+ assertEquals(2048, getShort(batch, 1));
+ assertEquals(65536, getInt(batch, 1));
+ assertEquals(Long.MAX_VALUE, getLong(batch, 1));
+ assertEquals(2.0, getFloat(batch, 1), 0.00001);
+ assertEquals(-5.0, getDouble(batch, 1), 0.00001);
+ assertEquals(bytes(), getBinary(batch, 1));
+ assertEquals("bye", getText(batch, 1).toString());
+ midRow = getMidList(batch, 1);
+ assertNotNull(midRow);
+ assertEquals(2, midRow.size());
+ assertEquals(1, midRow.get(0).int1);
+ assertEquals("bye", midRow.get(0).string1.toString());
+ assertEquals(2, midRow.get(1).int1);
+ assertEquals("sigh", midRow.get(1).string1.toString());
+ list = getList(batch, 1);
+ assertEquals(3, list.size());
+ assertEquals(100000000, list.get(0).int1);
+ assertEquals("cat", list.get(0).string1.toString());
+ assertEquals(-100000, list.get(1).int1);
+ assertEquals("in", list.get(1).string1.toString());
+ assertEquals(1234, list.get(2).int1);
+ assertEquals("hat", list.get(2).string1.toString());
+ map = getMap(batch, 1);
+ assertEquals(2, map.size());
+ InnerStruct value = map.get(new Text("chani"));
+ assertEquals(5, value.int1);
+ assertEquals("chani", value.string1.toString());
+ value = map.get(new Text("mauddib"));
+ assertEquals(1, value.int1);
+ assertEquals("mauddib", value.string1.toString());
+
+ // handle the close up
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+
+ @Test
+ public void testColumnProjection() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(100)
+ .rowIndexStride(1000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ Random r1 = new Random(1);
+ Random r2 = new Random(2);
+ int x;
+ int minInt=0, maxInt=0;
+ String y;
+ String minStr = null, maxStr = null;
+ batch.size = 1000;
+ boolean first = true;
+ for(int b=0; b < 21; ++b) {
+ for(int r=0; r < 1000; ++r) {
+ x = r1.nextInt();
+ y = Long.toHexString(r2.nextLong());
+ if (first || x < minInt) {
+ minInt = x;
+ }
+ if (first || x > maxInt) {
+ maxInt = x;
+ }
+ if (first || y.compareTo(minStr) < 0) {
+ minStr = y;
+ }
+ if (first || y.compareTo(maxStr) > 0) {
+ maxStr = y;
+ }
+ first = false;
+ ((LongColumnVector) batch.cols[0]).vector[r] = x;
+ ((BytesColumnVector) batch.cols[1]).setVal(r, y.getBytes());
+ }
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ // check out the statistics
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(3, stats.length);
+ for(ColumnStatistics s: stats) {
+ assertEquals(21000, s.getNumberOfValues());
+ if (s instanceof IntegerColumnStatistics) {
+ assertEquals(minInt, ((IntegerColumnStatistics) s).getMinimum());
+ assertEquals(maxInt, ((IntegerColumnStatistics) s).getMaximum());
+ } else if (s instanceof StringColumnStatistics) {
+ assertEquals(maxStr, ((StringColumnStatistics) s).getMaximum());
+ assertEquals(minStr, ((StringColumnStatistics) s).getMinimum());
+ }
+ }
+
+ // check out the types
+ TypeDescription type = reader.getSchema();
+ assertEquals(TypeDescription.Category.STRUCT, type.getCategory());
+ assertEquals(2, type.getChildren().size());
+ TypeDescription type1 = type.getChildren().get(0);
+ TypeDescription type2 = type.getChildren().get(1);
+ assertEquals(TypeDescription.Category.INT, type1.getCategory());
+ assertEquals(TypeDescription.Category.STRING, type2.getCategory());
+ assertEquals("struct<int1:int,string1:string>", type.toString());
+
+ // read the contents and make sure they match
+ RecordReader rows1 = reader.rows(
+ new Reader.Options().include(new boolean[]{true, true, false}));
+ RecordReader rows2 = reader.rows(
+ new Reader.Options().include(new boolean[]{true, false, true}));
+ r1 = new Random(1);
+ r2 = new Random(2);
+ VectorizedRowBatch batch1 = reader.getSchema().createRowBatch(1000);
+ VectorizedRowBatch batch2 = reader.getSchema().createRowBatch(1000);
+ for(int i = 0; i < 21000; i += 1000) {
+ Assert.assertEquals(true, rows1.nextBatch(batch1));
+ Assert.assertEquals(true, rows2.nextBatch(batch2));
+ assertEquals(1000, batch1.size);
+ assertEquals(1000, batch2.size);
+ for(int j=0; j < 1000; ++j) {
+ assertEquals(r1.nextInt(),
+ ((LongColumnVector) batch1.cols[0]).vector[j]);
+ assertEquals(Long.toHexString(r2.nextLong()),
+ ((BytesColumnVector) batch2.cols[1]).toString(j));
+ }
+ }
+ Assert.assertEquals(false, rows1.nextBatch(batch1));
+ Assert.assertEquals(false, rows2.nextBatch(batch2));
+ rows1.close();
+ rows2.close();
+ }
+
+ @Test
+ public void testEmptyFile() throws Exception {
+ TypeDescription schema = createBigRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(100));
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ VectorizedRowBatch batch = reader.getSchema().createRowBatch();
+ Assert.assertEquals(false, reader.rows().nextBatch(batch));
+ Assert.assertEquals(CompressionKind.NONE, reader.getCompressionKind());
+ Assert.assertEquals(0, reader.getNumberOfRows());
+ Assert.assertEquals(0, reader.getCompressionSize());
+ Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
+ Assert.assertEquals(3, reader.getContentLength());
+ Assert.assertEquals(false, reader.getStripes().iterator().hasNext());
+ }
+
+ @Test
+ public void metaData() throws Exception {
+ TypeDescription schema = createBigRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(100));
+ writer.addUserMetadata("my.meta", byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127,
+ -128));
+ writer.addUserMetadata("clobber", byteBuf(1, 2, 3));
+ writer.addUserMetadata("clobber", byteBuf(4, 3, 2, 1));
+ ByteBuffer bigBuf = ByteBuffer.allocate(40000);
+ Random random = new Random(0);
+ random.nextBytes(bigBuf.array());
+ writer.addUserMetadata("big", bigBuf);
+ bigBuf.position(0);
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1;
+ setBigRow(batch, 0, true, (byte) 127, (short) 1024, 42,
+ 42L * 1024 * 1024 * 1024, (float) 3.1415, -2.713, null,
+ null, null, null, null);
+ writer.addRowBatch(batch);
+ writer.addUserMetadata("clobber", byteBuf(5,7,11,13,17,19));
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ Assert.assertEquals(byteBuf(5, 7, 11, 13, 17, 19), reader.getMetadataValue("clobber"));
+ Assert.assertEquals(byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, -128),
+ reader.getMetadataValue("my.meta"));
+ Assert.assertEquals(bigBuf, reader.getMetadataValue("big"));
+ try {
+ reader.getMetadataValue("unknown");
+ assertTrue(false);
+ } catch (IllegalArgumentException iae) {
+ // PASS
+ }
+ int i = 0;
+ for(String key: reader.getMetadataKeys()) {
+ if ("my.meta".equals(key) ||
+ "clobber".equals(key) ||
+ "big".equals(key)) {
+ i += 1;
+ } else {
+ throw new IllegalArgumentException("unknown key " + key);
+ }
+ }
+ assertEquals(3, i);
+ int numStripes = reader.getStripeStatistics().size();
+ assertEquals(1, numStripes);
+ }
+
+ /**
+ * Generate an ORC file with a range of dates and times.
+ */
+ public void createOrcDateFile(Path file, int minYear, int maxYear
+ ) throws IOException {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("time", TypeDescription.createTimestamp())
+ .addField("date", TypeDescription.createDate());
+ Writer writer = OrcFile.createWriter(file,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .blockPadding(false));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1000;
+ for (int year = minYear; year < maxYear; ++year) {
+ for (int ms = 1000; ms < 2000; ++ms) {
+ TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
+ timestampColVector.set(ms - 1000,
+ Timestamp.valueOf(year +
+ "-05-05 12:34:56." + ms));
+ ((LongColumnVector) batch.cols[1]).vector[ms - 1000] =
+ new DateWritable(new Date(year - 1900, 11, 25)).getDays();
+ }
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(file,
+ OrcFile.readerOptions(conf));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch(1000);
+ TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+ LongColumnVector dates = (LongColumnVector) batch.cols[1];
+ for (int year = minYear; year < maxYear; ++year) {
+ rows.nextBatch(batch);
+ assertEquals(1000, batch.size);
+ for(int ms = 1000; ms < 2000; ++ms) {
+ StringBuilder buffer = new StringBuilder();
+ times.stringifyValue(buffer, ms - 1000);
+ String expected = Integer.toString(year) + "-05-05 12:34:56.";
+ // suppress the final zeros on the string by dividing by the largest
+ // power of 10 that divides evenly.
+ int roundedMs = ms;
+ for(int round = 1000; round > 0; round /= 10) {
+ if (ms % round == 0) {
+ roundedMs = ms / round;
+ break;
+ }
+ }
+ expected += roundedMs;
+ assertEquals(expected, buffer.toString());
+ assertEquals(Integer.toString(year) + "-12-25",
+ new DateWritable((int) dates.vector[ms - 1000]).toString());
+ }
+ }
+ rows.nextBatch(batch);
+ assertEquals(0, batch.size);
+ }
+
+ @Test
+ public void testDate1900() throws Exception {
+ createOrcDateFile(testFilePath, 1900, 1970);
+ }
+
+ @Test
+ public void testDate2038() throws Exception {
+ createOrcDateFile(testFilePath, 2038, 2250);
+ }
+
+ private static void setUnion(VectorizedRowBatch batch, int rowId,
+ Timestamp ts, Integer tag, Integer i, String s,
+ HiveDecimalWritable dec) {
+ UnionColumnVector union = (UnionColumnVector) batch.cols[1];
+ if (ts != null) {
+ TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
+ timestampColVector.set(rowId, ts);
+ } else {
+ batch.cols[0].isNull[rowId] = true;
+ batch.cols[0].noNulls = false;
+ }
+ if (tag != null) {
+ union.tags[rowId] = tag;
+ if (tag == 0) {
+ if (i != null) {
+ ((LongColumnVector) union.fields[tag]).vector[rowId] = i;
+ } else {
+ union.fields[tag].isNull[rowId] = true;
+ union.fields[tag].noNulls = false;
+ }
+ } else if (tag == 1) {
+ if (s != null) {
+ ((BytesColumnVector) union.fields[tag]).setVal(rowId, s.getBytes());
+ } else {
+ union.fields[tag].isNull[rowId] = true;
+ union.fields[tag].noNulls = false;
+ }
+ } else {
+ throw new IllegalArgumentException("Bad tag " + tag);
+ }
+ } else {
+ batch.cols[1].isNull[rowId] = true;
+ batch.cols[1].noNulls = false;
+ }
+ if (dec != null) {
+ ((DecimalColumnVector) batch.cols[2]).vector[rowId] = dec;
+ } else {
+ batch.cols[2].isNull[rowId] = true;
+ batch.cols[2].noNulls = false;
+ }
+ }
+
+ /**
+ * We test union, timestamp, and decimal separately since we need to make the
+ * object inspector manually. (The Hive reflection-based object inspector
+ * doesn't handle them properly.)
+ */
+ @Test
+ public void testUnionAndTimestamp() throws Exception {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("time", TypeDescription.createTimestamp())
+ .addField("union", TypeDescription.createUnion()
+ .addUnionChild(TypeDescription.createInt())
+ .addUnionChild(TypeDescription.createString()))
+ .addField("decimal", TypeDescription.createDecimal()
+ .withPrecision(38)
+ .withScale(18));
+ HiveDecimal maxValue = HiveDecimal.create("10000000000000000000");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(100)
+ .blockPadding(false));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 6;
+ setUnion(batch, 0, Timestamp.valueOf("2000-03-12 15:00:00"), 0, 42, null,
+ new HiveDecimalWritable("12345678.6547456"));
+ setUnion(batch, 1, Timestamp.valueOf("2000-03-20 12:00:00.123456789"),
+ 1, null, "hello", new HiveDecimalWritable("-5643.234"));
+
+ setUnion(batch, 2, null, null, null, null, null);
+ setUnion(batch, 3, null, 0, null, null, null);
+ setUnion(batch, 4, null, 1, null, null, null);
+
+ setUnion(batch, 5, Timestamp.valueOf("1970-01-01 00:00:00"), 0, 200000,
+ null, new HiveDecimalWritable("10000000000000000000"));
+ writer.addRowBatch(batch);
+
+ batch.reset();
+ Random rand = new Random(42);
+ for(int i=1970; i < 2038; ++i) {
+ Timestamp ts = Timestamp.valueOf(i + "-05-05 12:34:56." + i);
+ HiveDecimal dec =
+ HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18));
+ if ((i & 1) == 0) {
+ setUnion(batch, batch.size++, ts, 0, i*i, null,
+ new HiveDecimalWritable(dec));
+ } else {
+ setUnion(batch, batch.size++, ts, 1, null, Integer.toString(i*i),
+ new HiveDecimalWritable(dec));
+ }
+ if (maxValue.compareTo(dec) < 0) {
+ maxValue = dec;
+ }
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+
+ // let's add a lot of constant rows to test the rle
+ batch.size = 1000;
+ for(int c=0; c < batch.cols.length; ++c) {
+ batch.cols[c].setRepeating(true);
+ }
+ ((UnionColumnVector) batch.cols[1]).fields[0].isRepeating = true;
+ setUnion(batch, 0, null, 0, 1732050807, null, null);
+ for(int i=0; i < 5; ++i) {
+ writer.addRowBatch(batch);
+ }
+
+ batch.reset();
+ batch.size = 3;
+ setUnion(batch, 0, null, 0, 0, null, null);
+ setUnion(batch, 1, null, 0, 10, null, null);
+ setUnion(batch, 2, null, 0, 138, null, null);
+ writer.addRowBatch(batch);
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ schema = writer.getSchema();
+ assertEquals(5, schema.getMaximumId());
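+ // flattened column ids: 0 = root struct, 1 = time, 2 = union,
+ // 3 = union int child, 4 = union string child, 5 = decimal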
+ boolean[] expected = new boolean[] {false, false, false, false, false, false};
+ boolean[] included = OrcUtils.includeColumns("", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, true, false, false, false, true};
+ included = OrcUtils.includeColumns("time,decimal", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ expected = new boolean[] {false, false, true, true, true, false};
+ included = OrcUtils.includeColumns("union", schema);
+ assertEquals(true, Arrays.equals(expected, included));
+
+ Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
+ Assert.assertEquals(5077, reader.getNumberOfRows());
+ DecimalColumnStatistics stats =
+ (DecimalColumnStatistics) reader.getStatistics()[5];
+ assertEquals(71, stats.getNumberOfValues());
+ assertEquals(HiveDecimal.create("-5643.234"), stats.getMinimum());
+ assertEquals(maxValue, stats.getMaximum());
+ // TODO: fix this
+// assertEquals(null,stats.getSum());
+ int stripeCount = 0;
+ int rowCount = 0;
+ long currentOffset = -1;
+ for(StripeInformation stripe: reader.getStripes()) {
+ stripeCount += 1;
+ rowCount += stripe.getNumberOfRows();
+ if (currentOffset < 0) {
+ currentOffset = stripe.getOffset() + stripe.getLength();
+ } else {
+ assertEquals(currentOffset, stripe.getOffset());
+ currentOffset += stripe.getLength();
+ }
+ }
+ Assert.assertEquals(reader.getNumberOfRows(), rowCount);
+ assertEquals(2, stripeCount);
+ Assert.assertEquals(reader.getContentLength(), currentOffset);
+ RecordReader rows = reader.rows();
+ Assert.assertEquals(0, rows.getRowNumber());
+ Assert.assertEquals(0.0, rows.getProgress(), 0.000001);
+
+ schema = reader.getSchema();
+ batch = schema.createRowBatch(74);
+ Assert.assertEquals(0, rows.getRowNumber());
+ rows.nextBatch(batch);
+ assertEquals(74, batch.size);
+ Assert.assertEquals(74, rows.getRowNumber());
+ TimestampColumnVector ts = (TimestampColumnVector) batch.cols[0];
+ UnionColumnVector union = (UnionColumnVector) batch.cols[1];
+ LongColumnVector longs = (LongColumnVector) union.fields[0];
+ BytesColumnVector strs = (BytesColumnVector) union.fields[1];
+ DecimalColumnVector decs = (DecimalColumnVector) batch.cols[2];
+
+ assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>",
+ schema.toString());
+ assertEquals("2000-03-12 15:00:00.0", ts.asScratchTimestamp(0).toString());
+ assertEquals(0, union.tags[0]);
+ assertEquals(42, longs.vector[0]);
+ assertEquals("12345678.6547456", decs.vector[0].toString());
+
+ assertEquals("2000-03-20 12:00:00.123456789", ts.asScratchTimestamp(1).toString());
+ assertEquals(1, union.tags[1]);
+ assertEquals("hello", strs.toString(1));
+ assertEquals("-5643.234", decs.vector[1].toString());
+
+ assertEquals(false, ts.noNulls);
+ assertEquals(false, union.noNulls);
+ assertEquals(false, decs.noNulls);
+ assertEquals(true, ts.isNull[2]);
+ assertEquals(true, union.isNull[2]);
+ assertEquals(true, decs.isNull[2]);
+
+ assertEquals(true, ts.isNull[3]);
+ assertEquals(false, union.isNull[3]);
+ assertEquals(0, union.tags[3]);
+ assertEquals(true, longs.isNull[3]);
+ assertEquals(true, decs.isNull[3]);
+
+ assertEquals(true, ts.isNull[4]);
+ assertEquals(false, union.isNull[4]);
+ assertEquals(1, union.tags[4]);
+ assertEquals(true, strs.isNull[4]);
+ assertEquals(true, decs.isNull[4]);
+
+ assertEquals(false, ts.isNull[5]);
+ assertEquals("1970-01-01 00:00:00.0", ts.asScratchTimestamp(5).toString());
+ assertEquals(false, union.isNull[5]);
+ assertEquals(0, union.tags[5]);
+ assertEquals(false, longs.isNull[5]);
+ assertEquals(200000, longs.vector[5]);
+ assertEquals(false, decs.isNull[5]);
+ assertEquals("10000000000000000000", decs.vector[5].toString());
+
+ rand = new Random(42);
+ for(int i=1970; i < 2038; ++i) {
+ int row = 6 + i - 1970;
+ assertEquals(Timestamp.valueOf(i + "-05-05 12:34:56." + i),
+ ts.asScratchTimestamp(row));
+ if ((i & 1) == 0) {
+ assertEquals(0, union.tags[row]);
+ assertEquals(i*i, longs.vector[row]);
+ } else {
+ assertEquals(1, union.tags[row]);
+ assertEquals(Integer.toString(i * i), strs.toString(row));
+ }
+ assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand),
+ rand.nextInt(18))), decs.vector[row]);
+ }
+
+ // rebuild the row batch, so that we can read by 1000 rows
+ batch = schema.createRowBatch(1000);
+ ts = (TimestampColumnVector) batch.cols[0];
+ union = (UnionColumnVector) batch.cols[1];
+ longs = (LongColumnVector) union.fields[0];
+ strs = (BytesColumnVector) union.fields[1];
+ decs = (DecimalColumnVector) batch.cols[2];
+
+ for(int i=0; i < 5; ++i) {
+ rows.nextBatch(batch);
+ assertEquals("batch " + i, 1000, batch.size);
+ assertEquals("batch " + i, false, union.isRepeating);
+ assertEquals("batch " + i, true, union.noNulls);
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals("bad tag at " + i + "." +r, 0, union.tags[r]);
+ }
+ assertEquals("batch " + i, true, longs.isRepeating);
+ assertEquals("batch " + i, 1732050807, longs.vector[0]);
+ }
+
+ rows.nextBatch(batch);
+ assertEquals(3, batch.size);
+ assertEquals(0, union.tags[0]);
+ assertEquals(0, longs.vector[0]);
+ assertEquals(0, union.tags[1]);
+ assertEquals(10, longs.vector[1]);
+ assertEquals(0, union.tags[2]);
+ assertEquals(138, longs.vector[2]);
+
+ rows.nextBatch(batch);
+ assertEquals(0, batch.size);
+ Assert.assertEquals(1.0, rows.getProgress(), 0.00001);
+ Assert.assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
+ rows.seekToRow(1);
+ rows.nextBatch(batch);
+ assertEquals(1000, batch.size);
+ assertEquals(Timestamp.valueOf("2000-03-20 12:00:00.123456789"), ts.asScratchTimestamp(0));
+ assertEquals(1, union.tags[0]);
+ assertEquals("hello", strs.toString(0));
+ assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), decs.vector[0]);
+ rows.close();
+ }
+
+ /**
+ * Read and write a randomly generated snappy file.
+ * @throws Exception
+ */
+ @Test
+ public void testSnappy() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(1000)
+ .compress(CompressionKind.SNAPPY)
+ .bufferSize(100));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ Random rand = new Random(12);
+ batch.size = 1000;
+ for(int b=0; b < 10; ++b) {
+ for (int r=0; r < 1000; ++r) {
+ ((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
+ ((BytesColumnVector) batch.cols[1]).setVal(r,
+ Integer.toHexString(rand.nextInt()).getBytes());
+ }
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind());
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch(1000);
+ rand = new Random(12);
+ LongColumnVector longs = (LongColumnVector) batch.cols[0];
+ BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
+ for(int b=0; b < 10; ++b) {
+ rows.nextBatch(batch);
+ assertEquals(1000, batch.size);
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(rand.nextInt(), longs.vector[r]);
+ assertEquals(Integer.toHexString(rand.nextInt()), strs.toString(r));
+ }
+ }
+ rows.nextBatch(batch);
+ assertEquals(0, batch.size);
+ rows.close();
+ }
+
+ /**
+ * Read and write a randomly generated snappy file with the row index
+ * disabled (rowIndexStride = 0).
+ * @throws Exception
+ */
+ @Test
+ public void testWithoutIndex() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(5000)
+ .compress(CompressionKind.SNAPPY)
+ .bufferSize(1000)
+ .rowIndexStride(0));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ Random rand = new Random(24);
+ batch.size = 5;
+ for(int c=0; c < batch.cols.length; ++c) {
+ batch.cols[c].setRepeating(true);
+ }
+ for(int i=0; i < 10000; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[0] = rand.nextInt();
+ ((BytesColumnVector) batch.cols[1])
+ .setVal(0, Integer.toBinaryString(rand.nextInt()).getBytes());
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ Assert.assertEquals(50000, reader.getNumberOfRows());
+ Assert.assertEquals(0, reader.getRowIndexStride());
+ StripeInformation stripe = reader.getStripes().iterator().next();
+ assertEquals(true, stripe.getDataLength() != 0);
+ assertEquals(0, stripe.getIndexLength());
+ RecordReader rows = reader.rows();
+ rand = new Random(24);
+ batch = reader.getSchema().createRowBatch(1000);
+ LongColumnVector longs = (LongColumnVector) batch.cols[0];
+ BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
+ for(int i=0; i < 50; ++i) {
+ rows.nextBatch(batch);
+ assertEquals("batch " + i, 1000, batch.size);
+ for(int j=0; j < 200; ++j) {
+ int intVal = rand.nextInt();
+ String strVal = Integer.toBinaryString(rand.nextInt());
+ for (int k = 0; k < 5; ++k) {
+ assertEquals(intVal, longs.vector[j * 5 + k]);
+ assertEquals(strVal, strs.toString(j * 5 + k));
+ }
+ }
+ }
+ rows.nextBatch(batch);
+ assertEquals(0, batch.size);
+ rows.close();
+ }
+
+ @Test
+ public void testSeek() throws Exception {
+ TypeDescription schema = createBigRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(200000)
+ .bufferSize(65536)
+ .rowIndexStride(1000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ Random rand = new Random(42);
+ final int COUNT=32768;
+ long[] intValues= new long[COUNT];
+ double[] doubleValues = new double[COUNT];
+ String[] stringValues = new String[COUNT];
+ BytesWritable[] byteValues = new BytesWritable[COUNT];
+ String[] words = new String[128];
+ for(int i=0; i < words.length; ++i) {
+ words[i] = Integer.toHexString(rand.nextInt());
+ }
+ for(int i=0; i < COUNT/2; ++i) {
+ intValues[2*i] = rand.nextLong();
+ intValues[2*i+1] = intValues[2*i];
+ stringValues[2*i] = words[rand.nextInt(words.length)];
+ stringValues[2*i+1] = stringValues[2*i];
+ }
+ for(int i=0; i < COUNT; ++i) {
+ doubleValues[i] = rand.nextDouble();
+ byte[] buf = new byte[20];
+ rand.nextBytes(buf);
+ byteValues[i] = new BytesWritable(buf);
+ }
+ for(int i=0; i < COUNT; ++i) {
+ appendRandomRow(batch, intValues, doubleValues, stringValues,
+ byteValues, words, i);
+ if (batch.size == 1024) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size != 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ Assert.assertEquals(COUNT, reader.getNumberOfRows());
+ RecordReader rows = reader.rows();
+ // get the row index
+ DataReader meta = RecordReaderUtils.createDefaultDataReader(
+ DataReaderProperties.builder()
+ .withBufferSize(reader.getCompressionSize())
+ .withFileSystem(fs)
+ .withPath(testFilePath)
+ .withCompression(reader.getCompressionKind())
+ .withTypeCount(reader.getSchema().getMaximumId() + 1)
+ .withZeroCopy(false)
+ .build());
+ OrcIndex index =
+ meta.readRowIndex(reader.getStripes().get(0), null, null, null, null,
+ null);
+ // check the primitive columns to make sure they have the right number of
+ // items in the first row group
+ for(int c=1; c < 9; ++c) {
+ OrcProto.RowIndex colIndex = index.getRowGroupIndex()[c];
+ assertEquals(1000,
+ colIndex.getEntry(0).getStatistics().getNumberOfValues());
+ }
+ batch = reader.getSchema().createRowBatch();
+ int nextRowInBatch = -1;
+ for(int i=COUNT-1; i >= 0; --i, --nextRowInBatch) {
+ // if we have consumed the previous batch read a new one
+ if (nextRowInBatch < 0) {
+ long base = Math.max(i - 1023, 0);
+ rows.seekToRow(base);
+ Assert.assertEquals("row " + i, true, rows.nextBatch(batch));
+ nextRowInBatch = batch.size - 1;
+ }
+ checkRandomRow(batch, intValues, doubleValues,
+ stringValues, byteValues, words, i, nextRowInBatch);
+ }
+ rows.close();
+ Iterator<StripeInformation> stripeIterator =
+ reader.getStripes().iterator();
+ long offsetOfStripe2 = 0;
+ long offsetOfStripe4 = 0;
+ long lastRowOfStripe2 = 0;
+ for(int i = 0; i < 5; ++i) {
+ StripeInformation stripe = stripeIterator.next();
+ if (i < 2) {
+ lastRowOfStripe2 += stripe.getNumberOfRows();
+ } else if (i == 2) {
+ offsetOfStripe2 = stripe.getOffset();
+ lastRowOfStripe2 += stripe.getNumberOfRows() - 1;
+ } else if (i == 4) {
+ offsetOfStripe4 = stripe.getOffset();
+ }
+ }
+ boolean[] columns = new boolean[reader.getStatistics().length];
+ columns[5] = true; // long column
+ columns[9] = true; // text column
+ rows = reader.rows(new Reader.Options()
+ .range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2)
+ .include(columns));
+ rows.seekToRow(lastRowOfStripe2);
+ // we only want two rows
+ batch = reader.getSchema().createRowBatch(2);
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1, batch.size);
+ assertEquals(intValues[(int) lastRowOfStripe2], getLong(batch, 0));
+ assertEquals(stringValues[(int) lastRowOfStripe2],
+ getText(batch, 0).toString());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(intValues[(int) lastRowOfStripe2 + 1], getLong(batch, 0));
+ assertEquals(stringValues[(int) lastRowOfStripe2 + 1],
+ getText(batch, 0).toString());
+ rows.close();
+ }
+
+ private void appendRandomRow(VectorizedRowBatch batch,
+ long[] intValues, double[] doubleValues,
+ String[] stringValues,
+ BytesWritable[] byteValues,
+ String[] words, int i) {
+ InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
+ InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
+ words[i % words.length] + "-x");
+ setBigRow(batch, batch.size++, (intValues[i] & 1) == 0, (byte) intValues[i],
+ (short) intValues[i], (int) intValues[i], intValues[i],
+ (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
+ new MiddleStruct(inner, inner2), list(), map(inner, inner2));
+ }
+
+ private void checkRandomRow(VectorizedRowBatch batch,
+ long[] intValues, double[] doubleValues,
+ String[] stringValues,
+ BytesWritable[] byteValues,
+ String[] words, int i, int rowInBatch) {
+ InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
+ InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
+ words[i % words.length] + "-x");
+ checkBigRow(batch, rowInBatch, i, (intValues[i] & 1) == 0, (byte) intValues[i],
+ (short) intValues[i], (int) intValues[i], intValues[i],
+ (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
+ new MiddleStruct(inner, inner2), list(), map(inner, inner2));
+ }
+
+ private static class MyMemoryManager extends MemoryManager {
+ final long totalSpace;
+ double rate;
+ Path path = null;
+ long lastAllocation = 0;
+ int rows = 0;
+ Callback callback;
+
+ MyMemoryManager(Configuration conf, long totalSpace, double rate) {
+ super(conf);
+ this.totalSpace = totalSpace;
+ this.rate = rate;
+ }
+
+ @Override
+ public void addWriter(Path path, long requestedAllocation,
+ Callback callback) {
+ this.path = path;
+ this.lastAllocation = requestedAllocation;
+ this.callback = callback;
+ }
+
+ @Override
+ public synchronized void removeWriter(Path path) {
+ this.path = null;
+ this.lastAllocation = 0;
+ }
+
+ @Override
+ public long getTotalMemoryPool() {
+ return totalSpace;
+ }
+
+ @Override
+ public double getAllocationScale() {
+ return rate;
+ }
+
+ @Override
+ public void addedRow(int count) throws IOException {
+ rows += count;
+ if (rows % 100 == 0) {
+ callback.checkMemory(rate);
+ }
+ }
+ }
+
+ @Test
+ public void testMemoryManagementV11() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .stripeSize(50000)
+ .bufferSize(100)
+ .rowIndexStride(0)
+ .memory(memory)
+ .version(OrcFile.Version.V_0_11));
+ assertEquals(testFilePath, memory.path);
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.size = 1;
+ for(int i=0; i < 2500; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[0] = i * 300;
+ ((BytesColumnVector) batch.cols[1]).setVal(0,
+ Integer.toHexString(10*i).getBytes());
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ assertEquals(null, memory.path);
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ int i = 0;
+ for(StripeInformation stripe: reader.getStripes()) {
+ i += 1;
+ assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
+ stripe.getDataLength() < 5000);
+ }
+ assertEquals(25, i);
+ assertEquals(2500, reader.getNumberOfRows());
+ }
+
+ @Test
+ public void testMemoryManagementV12() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .stripeSize(50000)
+ .bufferSize(100)
+ .rowIndexStride(0)
+ .memory(memory)
+ .version(OrcFile.Version.V_0_12));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ assertEquals(testFilePath, memory.path);
+ batch.size = 1;
+ for(int i=0; i < 2500; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[0] = i * 300;
+ ((BytesColumnVector) batch.cols[1]).setVal(0,
+ Integer.toHexString(10*i).getBytes());
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ assertEquals(null, memory.path);
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ int i = 0;
+ for(StripeInformation stripe: reader.getStripes()) {
+ i += 1;
+ assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
+ stripe.getDataLength() < 5000);
+ }
+ // with HIVE-7832, the dictionaries will be disabled after writing the first
+ // stripe as there are too many distinct values. Hence only 3 stripes as
+ // compared to 25 stripes in version 0.11 (above test case)
+ assertEquals(3, i);
+ assertEquals(2500, reader.getNumberOfRows());
+ }
+
+ @Test
+ public void testPredicatePushdown() throws Exception {
+ TypeDescription schema = createInnerSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(400000L)
+ .compress(CompressionKind.NONE)
+ .bufferSize(500)
+ .rowIndexStride(1000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.ensureSize(3500);
+ batch.size = 3500;
+ for(int i=0; i < 3500; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[i] = i * 300;
+ ((BytesColumnVector) batch.cols[1]).setVal(i,
+ Integer.toHexString(10*i).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(3500, reader.getNumberOfRows());
+
+ SearchArgument sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .startNot()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 300000L)
+ .end()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 600000L)
+ .end()
+ .build();
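+ // int1 is 300 * row, so this selects 300000 <= int1 < 600000, i.e. rows
+ // 1000-1999; with rowIndexStride(1000) the reader can skip straight to
+ // the second row group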
+ RecordReader rows = reader.rows(new Reader.Options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true})
+ .searchArgument(sarg, new String[]{null, "int1", "string1"}));
+ batch = reader.getSchema().createRowBatch(2000);
+ LongColumnVector ints = (LongColumnVector) batch.cols[0];
+ BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
+
+ Assert.assertEquals(1000L, rows.getRowNumber());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+
+ for(int i=1000; i < 2000; ++i) {
+ assertEquals(300 * i, ints.vector[i - 1000]);
+ assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000));
+ }
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ Assert.assertEquals(3500, rows.getRowNumber());
+
+ // look through the file with no rows selected
+ sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 0L)
+ .end()
+ .build();
+ rows = reader.rows(new Reader.Options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true})
+ .searchArgument(sarg, new String[]{null, "int1", "string1"}));
+ Assert.assertEquals(3500L, rows.getRowNumber());
+ assertTrue(!rows.nextBatch(batch));
+
+ // select first 100 and last 100 rows
+ sarg = SearchArgumentFactory.newBuilder()
+ .startOr()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 100)
+ .startNot()
+ .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 3400)
+ .end()
+ .end()
+ .build();
+ rows = reader.rows(new Reader.Options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true})
+ .searchArgument(sarg, new String[]{null, "int1", "string1"}));
+ Assert.assertEquals(0, rows.getRowNumber());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+ Assert.assertEquals(3000, rows.getRowNumber());
+ for(int i=0; i < 1000; ++i) {
+ assertEquals(300 * i, ints.vector[i]);
+ assertEquals(Integer.toHexString(10*i), strs.toString(i));
+ }
+
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(500, batch.size);
+ Assert.assertEquals(3500, rows.getRowNumber());
+ for(int i=3000; i < 3500; ++i) {
+ assertEquals(300 * i, ints.vector[i - 3000]);
+ assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000));
+ }
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ Assert.assertEquals(3500, rows.getRowNumber());
+ }
+
+ /**
+ * Test all of the types that have distinct ORC writers using the vectorized
+ * writer with different combinations of repeating and null values.
<TRUNCATED>
[27/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/SerializationUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/SerializationUtils.java b/orc/src/java/org/apache/hive/orc/impl/SerializationUtils.java
new file mode 100644
index 0000000..dabc3ca
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/SerializationUtils.java
@@ -0,0 +1,1311 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.math.BigInteger;
+
+public final class SerializationUtils {
+
+ private final static int BUFFER_SIZE = 64;
+ private final byte[] readBuffer;
+ private final byte[] writeBuffer;
+
+ public SerializationUtils() {
+ this.readBuffer = new byte[BUFFER_SIZE];
+ this.writeBuffer = new byte[BUFFER_SIZE];
+ }
+
+ public void writeVulong(OutputStream output,
+ long value) throws IOException {
+ while (true) {
+ if ((value & ~0x7f) == 0) {
+ output.write((byte) value);
+ return;
+ } else {
+ output.write((byte) (0x80 | (value & 0x7f)));
+ value >>>= 7;
+ }
+ }
+ }
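+
+ // example: writeVulong(output, 300) emits 0xAC 0x02 -- the low seven bits
+ // (0x2C) first with the continuation bit set, then the remaining bits (0x02)
+ // with the high bit clear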
+
+ public void writeVslong(OutputStream output,
+ long value) throws IOException {
+ writeVulong(output, (value << 1) ^ (value >> 63));
+ }
+
+
+ public long readVulong(InputStream in) throws IOException {
+ long result = 0;
+ long b;
+ int offset = 0;
+ do {
+ b = in.read();
+ if (b == -1) {
+ throw new EOFException("Reading Vulong past EOF");
+ }
+ result |= (0x7f & b) << offset;
+ offset += 7;
+ } while (b >= 0x80);
+ return result;
+ }
+
+ public long readVslong(InputStream in) throws IOException {
+ long result = readVulong(in);
+ return (result >>> 1) ^ -(result & 1);
+ }
+
+ public float readFloat(InputStream in) throws IOException {
+ readFully(in, readBuffer, 0, 4);
+ int val = (((readBuffer[0] & 0xff) << 0)
+ + ((readBuffer[1] & 0xff) << 8)
+ + ((readBuffer[2] & 0xff) << 16)
+ + ((readBuffer[3] & 0xff) << 24));
+ return Float.intBitsToFloat(val);
+ }
+
+ public void writeFloat(OutputStream output,
+ float value) throws IOException {
+ int ser = Float.floatToIntBits(value);
+ writeBuffer[0] = (byte) ((ser >> 0) & 0xff);
+ writeBuffer[1] = (byte) ((ser >> 8) & 0xff);
+ writeBuffer[2] = (byte) ((ser >> 16) & 0xff);
+ writeBuffer[3] = (byte) ((ser >> 24) & 0xff);
+ output.write(writeBuffer, 0, 4);
+ }
+
+ public double readDouble(InputStream in) throws IOException {
+ return Double.longBitsToDouble(readLongLE(in));
+ }
+
+ public long readLongLE(InputStream in) throws IOException {
+ readFully(in, readBuffer, 0, 8);
+ return (((readBuffer[0] & 0xff) << 0)
+ + ((readBuffer[1] & 0xff) << 8)
+ + ((readBuffer[2] & 0xff) << 16)
+ + ((long) (readBuffer[3] & 0xff) << 24)
+ + ((long) (readBuffer[4] & 0xff) << 32)
+ + ((long) (readBuffer[5] & 0xff) << 40)
+ + ((long) (readBuffer[6] & 0xff) << 48)
+ + ((long) (readBuffer[7] & 0xff) << 56));
+ }
+
+ private void readFully(final InputStream in, final byte[] buffer, final int off, final int len)
+ throws IOException {
+ int n = 0;
+ while (n < len) {
+ int count = in.read(buffer, off + n, len - n);
+ if (count < 0) {
+ throw new EOFException("Read past EOF for " + in);
+ }
+ n += count;
+ }
+ }
+
+ public void writeDouble(OutputStream output,
+ double value) throws IOException {
+ writeLongLE(output, Double.doubleToLongBits(value));
+ }
+
+ private void writeLongLE(OutputStream output, long value) throws IOException {
+ writeBuffer[0] = (byte) ((value >> 0) & 0xff);
+ writeBuffer[1] = (byte) ((value >> 8) & 0xff);
+ writeBuffer[2] = (byte) ((value >> 16) & 0xff);
+ writeBuffer[3] = (byte) ((value >> 24) & 0xff);
+ writeBuffer[4] = (byte) ((value >> 32) & 0xff);
+ writeBuffer[5] = (byte) ((value >> 40) & 0xff);
+ writeBuffer[6] = (byte) ((value >> 48) & 0xff);
+ writeBuffer[7] = (byte) ((value >> 56) & 0xff);
+ output.write(writeBuffer, 0, 8);
+ }
+
+ /**
+ * Write the arbitrarily sized signed BigInteger in vint format.
+ *
+ * Signed integers are zigzag encoded, using the low bit as the sign bit.
+ *
+ * Each byte uses the low 7 bits for data and the high bit for stop/continue.
+ *
+ * Bytes are stored LSB first.
+ * @param output the stream to write to
+ * @param value the value to output
+ * @throws IOException
+ */
+ public static void writeBigInteger(OutputStream output,
+ BigInteger value) throws IOException {
+ // encode the signed number as a positive integer
+ value = value.shiftLeft(1);
+ int sign = value.signum();
+ if (sign < 0) {
+ value = value.negate();
+ value = value.subtract(BigInteger.ONE);
+ }
+ int length = value.bitLength();
+ while (true) {
+ long lowBits = value.longValue() & 0x7fffffffffffffffL;
+ length -= 63;
+ // write out the next 63 bits worth of data
+ for(int i=0; i < 9; ++i) {
+ // if this is the last byte, leave the high bit off
+ if (length <= 0 && (lowBits & ~0x7f) == 0) {
+ output.write((byte) lowBits);
+ return;
+ } else {
+ output.write((byte) (0x80 | (lowBits & 0x7f)));
+ lowBits >>>= 7;
+ }
+ }
+ value = value.shiftRight(63);
+ }
+ }
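+
+ // example: BigInteger.valueOf(-1) zigzags to 1 and is written as the single
+ // byte 0x01; BigInteger.valueOf(1) zigzags to 2 and is written as 0x02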
+
+ /**
+ * Read an arbitrarily sized signed BigInteger in vint format
+ * @param input the stream to read from
+ * @return the read BigInteger
+ * @throws IOException
+ */
+ public static BigInteger readBigInteger(InputStream input) throws IOException {
+ BigInteger result = BigInteger.ZERO;
+ long work = 0;
+ int offset = 0;
+ long b;
+ do {
+ b = input.read();
+ if (b == -1) {
+ throw new EOFException("Reading BigInteger past EOF from " + input);
+ }
+ work |= (0x7f & b) << (offset % 63);
+ offset += 7;
+ // if we've read 63 bits, roll them into the result
+ if (offset == 63) {
+ result = BigInteger.valueOf(work);
+ work = 0;
+ } else if (offset % 63 == 0) {
+ result = result.or(BigInteger.valueOf(work).shiftLeft(offset-63));
+ work = 0;
+ }
+ } while (b >= 0x80);
+ if (work != 0) {
+ result = result.or(BigInteger.valueOf(work).shiftLeft((offset/63)*63));
+ }
+ // convert back to a signed number
+ boolean isNegative = result.testBit(0);
+ if (isNegative) {
+ result = result.add(BigInteger.ONE);
+ result = result.negate();
+ }
+ result = result.shiftRight(1);
+ return result;
+ }
+
+ public enum FixedBitSizes {
+ ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
+ THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
+ TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
+ TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR;
+ }
+
+ /**
+ * Count the number of bits required to encode the given value, rounded up
+ * to the closest supported fixed bit width
+ * @param value the value to measure
+ * @return number of bits required to store the value
+ */
+ public int findClosestNumBits(long value) {
+ int count = 0;
+ while (value != 0) {
+ count++;
+ value = value >>> 1;
+ }
+ return getClosestFixedBits(count);
+ }
+
+ /**
+ * zigzag encode the given value
+ * @param val
+ * @return zigzag encoded value
+ */
+ public long zigzagEncode(long val) {
+ return (val << 1) ^ (val >> 63);
+ }
+
+ /**
+ * zigzag decode the given value
+ * @param val
+ * @return zigzag decoded value
+ */
+ public long zigzagDecode(long val) {
+ return (val >>> 1) ^ -(val & 1);
+ }
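+
+ // zigzag mapping: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...;
+ // zigzagDecode inverts it, so small magnitudes stay small when varint encoded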
+
+ /**
+ * Compute the number of bits required to represent the pth percentile value
+ * @param data - array
+ * @param p - percentile value (> 0.0 and <= 1.0)
+ * @return pth percentile bits
+ */
+ public int percentileBits(long[] data, int offset, int length,
+ double p) {
+ if ((p > 1.0) || (p <= 0.0)) {
+ return -1;
+ }
+
+ // histogram that store the encoded bit requirement for each values.
+ // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
+ int[] hist = new int[32];
+
+ // compute the histogram
+ for(int i = offset; i < (offset + length); i++) {
+ int idx = encodeBitWidth(findClosestNumBits(data[i]));
+ hist[idx] += 1;
+ }
+
+ int perLen = (int) (length * (1.0 - p));
+
+ // return the bits required by pth percentile length
+ for(int i = hist.length - 1; i >= 0; i--) {
+ perLen -= hist[i];
+ if (perLen < 0) {
+ return decodeBitWidth(i);
+ }
+ }
+
+ return 0;
+ }
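+
+ // example: percentileBits(new long[]{1, 2, 3, 200}, 0, 4, 0.75) returns 2,
+ // the bit width that covers 75% of the values (200 alone needs 8 bits)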
+
+ /**
+ * Read n bytes in big endian order and convert to long
+ * @return long value
+ */
+ public long bytesToLongBE(InStream input, int n) throws IOException {
+ long out = 0;
+ long val = 0;
+ while (n > 0) {
+ n--;
+ // store it in a long and then shift else integer overflow will occur
+ val = input.read();
+ out |= (val << (n * 8));
+ }
+ return out;
+ }
+
+ /**
+ * Calculate the number of bytes required
+ * @param n - number of values
+ * @param numBits - bit width
+ * @return number of bytes required
+ */
+ int getTotalBytesRequired(int n, int numBits) {
+ return (n * numBits + 7) / 8;
+ }
+
+ /**
+ * For a given bit width, return the closest supported fixed bit width
+ * @param n - requested bit width
+ * @return closest valid fixed bit width
+ */
+ public int getClosestFixedBits(int n) {
+ if (n == 0) {
+ return 1;
+ }
+
+ if (n >= 1 && n <= 24) {
+ return n;
+ } else if (n > 24 && n <= 26) {
+ return 26;
+ } else if (n > 26 && n <= 28) {
+ return 28;
+ } else if (n > 28 && n <= 30) {
+ return 30;
+ } else if (n > 30 && n <= 32) {
+ return 32;
+ } else if (n > 32 && n <= 40) {
+ return 40;
+ } else if (n > 40 && n <= 48) {
+ return 48;
+ } else if (n > 48 && n <= 56) {
+ return 56;
+ } else {
+ return 64;
+ }
+ }
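+
+ // any width up to 24 is kept as-is; above that it is rounded up to the next
+ // supported size, e.g. getClosestFixedBits(25) -> 26, getClosestFixedBits(33) -> 40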
+
+ public int getClosestAlignedFixedBits(int n) {
+ if (n == 0 || n == 1) {
+ return 1;
+ } else if (n > 1 && n <= 2) {
+ return 2;
+ } else if (n > 2 && n <= 4) {
+ return 4;
+ } else if (n > 4 && n <= 8) {
+ return 8;
+ } else if (n > 8 && n <= 16) {
+ return 16;
+ } else if (n > 16 && n <= 24) {
+ return 24;
+ } else if (n > 24 && n <= 32) {
+ return 32;
+ } else if (n > 32 && n <= 40) {
+ return 40;
+ } else if (n > 40 && n <= 48) {
+ return 48;
+ } else if (n > 48 && n <= 56) {
+ return 56;
+ } else {
+ return 64;
+ }
+ }
+
+ /**
+ * Finds the closest available fixed bit width match and returns its encoded
+ * value (ordinal)
+ * @param n - fixed bit width to encode
+ * @return encoded fixed bit width
+ */
+ public int encodeBitWidth(int n) {
+ n = getClosestFixedBits(n);
+
+ if (n >= 1 && n <= 24) {
+ return n - 1;
+ } else if (n > 24 && n <= 26) {
+ return FixedBitSizes.TWENTYSIX.ordinal();
+ } else if (n > 26 && n <= 28) {
+ return FixedBitSizes.TWENTYEIGHT.ordinal();
+ } else if (n > 28 && n <= 30) {
+ return FixedBitSizes.THIRTY.ordinal();
+ } else if (n > 30 && n <= 32) {
+ return FixedBitSizes.THIRTYTWO.ordinal();
+ } else if (n > 32 && n <= 40) {
+ return FixedBitSizes.FORTY.ordinal();
+ } else if (n > 40 && n <= 48) {
+ return FixedBitSizes.FORTYEIGHT.ordinal();
+ } else if (n > 48 && n <= 56) {
+ return FixedBitSizes.FIFTYSIX.ordinal();
+ } else {
+ return FixedBitSizes.SIXTYFOUR.ordinal();
+ }
+ }
+
+ /**
+ * Decodes the ordinal fixed bit value to actual fixed bit width value
+ * @param n - encoded fixed bit width
+ * @return decoded fixed bit width
+ */
+ public int decodeBitWidth(int n) {
+ if (n >= FixedBitSizes.ONE.ordinal()
+ && n <= FixedBitSizes.TWENTYFOUR.ordinal()) {
+ return n + 1;
+ } else if (n == FixedBitSizes.TWENTYSIX.ordinal()) {
+ return 26;
+ } else if (n == FixedBitSizes.TWENTYEIGHT.ordinal()) {
+ return 28;
+ } else if (n == FixedBitSizes.THIRTY.ordinal()) {
+ return 30;
+ } else if (n == FixedBitSizes.THIRTYTWO.ordinal()) {
+ return 32;
+ } else if (n == FixedBitSizes.FORTY.ordinal()) {
+ return 40;
+ } else if (n == FixedBitSizes.FORTYEIGHT.ordinal()) {
+ return 48;
+ } else if (n == FixedBitSizes.FIFTYSIX.ordinal()) {
+ return 56;
+ } else {
+ return 64;
+ }
+ }
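+
+ // decodeBitWidth undoes encodeBitWidth for supported widths; unsupported
+ // widths are first rounded up, e.g. encodeBitWidth(27) -> 25 (TWENTYEIGHT)
+ // and decodeBitWidth(25) -> 28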
+
+ /**
+ * Bitpack and write the input values to underlying output stream
+ * @param input - values to write
+ * @param offset - offset
+ * @param len - length
+ * @param bitSize - bit width
+ * @param output - output stream
+ * @throws IOException
+ */
+ public void writeInts(long[] input, int offset, int len, int bitSize,
+ OutputStream output) throws IOException {
+ if (input == null || input.length < 1 || offset < 0 || len < 1
+ || bitSize < 1) {
+ return;
+ }
+
+ switch (bitSize) {
+ case 1:
+ unrolledBitPack1(input, offset, len, output);
+ return;
+ case 2:
+ unrolledBitPack2(input, offset, len, output);
+ return;
+ case 4:
+ unrolledBitPack4(input, offset, len, output);
+ return;
+ case 8:
+ unrolledBitPack8(input, offset, len, output);
+ return;
+ case 16:
+ unrolledBitPack16(input, offset, len, output);
+ return;
+ case 24:
+ unrolledBitPack24(input, offset, len, output);
+ return;
+ case 32:
+ unrolledBitPack32(input, offset, len, output);
+ return;
+ case 40:
+ unrolledBitPack40(input, offset, len, output);
+ return;
+ case 48:
+ unrolledBitPack48(input, offset, len, output);
+ return;
+ case 56:
+ unrolledBitPack56(input, offset, len, output);
+ return;
+ case 64:
+ unrolledBitPack64(input, offset, len, output);
+ return;
+ default:
+ break;
+ }
+
+ int bitsLeft = 8;
+ byte current = 0;
+ for(int i = offset; i < (offset + len); i++) {
+ long value = input[i];
+ int bitsToWrite = bitSize;
+ while (bitsToWrite > bitsLeft) {
+ // add the bits to the bottom of the current word
+ current |= value >>> (bitsToWrite - bitsLeft);
+ // subtract out the bits we just added
+ bitsToWrite -= bitsLeft;
+ // zero out the bits above bitsToWrite
+ value &= (1L << bitsToWrite) - 1;
+ output.write(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+ bitsLeft -= bitsToWrite;
+ current |= value << bitsLeft;
+ if (bitsLeft == 0) {
+ output.write(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+ }
+
+ // flush
+ if (bitsLeft != 8) {
+ output.write(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+ }
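+
+ // example: writeInts(new long[]{5, 1, 7}, 0, 3, 3, output) packs the values
+ // as 101 001 111 plus zero padding, producing the two bytes 0xA7 0x80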
+
+ private void unrolledBitPack1(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ final int numHops = 8;
+ final int remainder = len % numHops;
+ final int endOffset = offset + len;
+ final int endUnroll = endOffset - remainder;
+ int val = 0;
+ for (int i = offset; i < endUnroll; i = i + numHops) {
+ val = (int) (val | ((input[i] & 1) << 7)
+ | ((input[i + 1] & 1) << 6)
+ | ((input[i + 2] & 1) << 5)
+ | ((input[i + 3] & 1) << 4)
+ | ((input[i + 4] & 1) << 3)
+ | ((input[i + 5] & 1) << 2)
+ | ((input[i + 6] & 1) << 1)
+ | (input[i + 7]) & 1);
+ output.write(val);
+ val = 0;
+ }
+
+ if (remainder > 0) {
+ int startShift = 7;
+ for (int i = endUnroll; i < endOffset; i++) {
+ val = (int) (val | (input[i] & 1) << startShift);
+ startShift -= 1;
+ }
+ output.write(val);
+ }
+ }
+
+ private void unrolledBitPack2(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ final int numHops = 4;
+ final int remainder = len % numHops;
+ final int endOffset = offset + len;
+ final int endUnroll = endOffset - remainder;
+ int val = 0;
+ for (int i = offset; i < endUnroll; i = i + numHops) {
+ val = (int) (val | ((input[i] & 3) << 6)
+ | ((input[i + 1] & 3) << 4)
+ | ((input[i + 2] & 3) << 2)
+ | (input[i + 3]) & 3);
+ output.write(val);
+ val = 0;
+ }
+
+ if (remainder > 0) {
+ int startShift = 6;
+ for (int i = endUnroll; i < endOffset; i++) {
+ val = (int) (val | (input[i] & 3) << startShift);
+ startShift -= 2;
+ }
+ output.write(val);
+ }
+ }
+
+ private void unrolledBitPack4(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ final int numHops = 2;
+ final int remainder = len % numHops;
+ final int endOffset = offset + len;
+ final int endUnroll = endOffset - remainder;
+ int val = 0;
+ for (int i = offset; i < endUnroll; i = i + numHops) {
+ val = (int) (val | ((input[i] & 15) << 4) | (input[i + 1]) & 15);
+ output.write(val);
+ val = 0;
+ }
+
+ if (remainder > 0) {
+ int startShift = 4;
+ for (int i = endUnroll; i < endOffset; i++) {
+ val = (int) (val | (input[i] & 15) << startShift);
+ startShift -= 4;
+ }
+ output.write(val);
+ }
+ }
+
+ private void unrolledBitPack8(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ unrolledBitPackBytes(input, offset, len, output, 1);
+ }
+
+ private void unrolledBitPack16(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ unrolledBitPackBytes(input, offset, len, output, 2);
+ }
+
+ private void unrolledBitPack24(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ unrolledBitPackBytes(input, offset, len, output, 3);
+ }
+
+ private void unrolledBitPack32(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ unrolledBitPackBytes(input, offset, len, output, 4);
+ }
+
+ private void unrolledBitPack40(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ unrolledBitPackBytes(input, offset, len, output, 5);
+ }
+
+ private void unrolledBitPack48(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ unrolledBitPackBytes(input, offset, len, output, 6);
+ }
+
+ private void unrolledBitPack56(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ unrolledBitPackBytes(input, offset, len, output, 7);
+ }
+
+ private void unrolledBitPack64(long[] input, int offset, int len,
+ OutputStream output) throws IOException {
+ unrolledBitPackBytes(input, offset, len, output, 8);
+ }
+
+ private void unrolledBitPackBytes(long[] input, int offset, int len, OutputStream output, int numBytes) throws IOException {
+ final int numHops = 8;
+ final int remainder = len % numHops;
+ final int endOffset = offset + len;
+ final int endUnroll = endOffset - remainder;
+ int i = offset;
+ for (; i < endUnroll; i = i + numHops) {
+ writeLongBE(output, input, i, numHops, numBytes);
+ }
+
+ if (remainder > 0) {
+ writeRemainingLongs(output, i, input, remainder, numBytes);
+ }
+ }
+
+ private void writeRemainingLongs(OutputStream output, int offset, long[] input, int remainder,
+ int numBytes) throws IOException {
+ final int numHops = remainder;
+
+ int idx = 0;
+ switch (numBytes) {
+ case 1:
+ while (remainder > 0) {
+ writeBuffer[idx] = (byte) (input[offset + idx] & 255);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 2:
+ while (remainder > 0) {
+ writeLongBE2(output, input[offset + idx], idx * 2);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 3:
+ while (remainder > 0) {
+ writeLongBE3(output, input[offset + idx], idx * 3);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 4:
+ while (remainder > 0) {
+ writeLongBE4(output, input[offset + idx], idx * 4);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 5:
+ while (remainder > 0) {
+ writeLongBE5(output, input[offset + idx], idx * 5);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 6:
+ while (remainder > 0) {
+ writeLongBE6(output, input[offset + idx], idx * 6);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 7:
+ while (remainder > 0) {
+ writeLongBE7(output, input[offset + idx], idx * 7);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 8:
+ while (remainder > 0) {
+ writeLongBE8(output, input[offset + idx], idx * 8);
+ remainder--;
+ idx++;
+ }
+ break;
+ default:
+ break;
+ }
+
+ final int toWrite = numHops * numBytes;
+ output.write(writeBuffer, 0, toWrite);
+ }
+
+ private void writeLongBE(OutputStream output, long[] input, int offset, int numHops, int numBytes) throws IOException {
+
+ switch (numBytes) {
+ case 1:
+ writeBuffer[0] = (byte) (input[offset + 0] & 255);
+ writeBuffer[1] = (byte) (input[offset + 1] & 255);
+ writeBuffer[2] = (byte) (input[offset + 2] & 255);
+ writeBuffer[3] = (byte) (input[offset + 3] & 255);
+ writeBuffer[4] = (byte) (input[offset + 4] & 255);
+ writeBuffer[5] = (byte) (input[offset + 5] & 255);
+ writeBuffer[6] = (byte) (input[offset + 6] & 255);
+ writeBuffer[7] = (byte) (input[offset + 7] & 255);
+ break;
+ case 2:
+ writeLongBE2(output, input[offset + 0], 0);
+ writeLongBE2(output, input[offset + 1], 2);
+ writeLongBE2(output, input[offset + 2], 4);
+ writeLongBE2(output, input[offset + 3], 6);
+ writeLongBE2(output, input[offset + 4], 8);
+ writeLongBE2(output, input[offset + 5], 10);
+ writeLongBE2(output, input[offset + 6], 12);
+ writeLongBE2(output, input[offset + 7], 14);
+ break;
+ case 3:
+ writeLongBE3(output, input[offset + 0], 0);
+ writeLongBE3(output, input[offset + 1], 3);
+ writeLongBE3(output, input[offset + 2], 6);
+ writeLongBE3(output, input[offset + 3], 9);
+ writeLongBE3(output, input[offset + 4], 12);
+ writeLongBE3(output, input[offset + 5], 15);
+ writeLongBE3(output, input[offset + 6], 18);
+ writeLongBE3(output, input[offset + 7], 21);
+ break;
+ case 4:
+ writeLongBE4(output, input[offset + 0], 0);
+ writeLongBE4(output, input[offset + 1], 4);
+ writeLongBE4(output, input[offset + 2], 8);
+ writeLongBE4(output, input[offset + 3], 12);
+ writeLongBE4(output, input[offset + 4], 16);
+ writeLongBE4(output, input[offset + 5], 20);
+ writeLongBE4(output, input[offset + 6], 24);
+ writeLongBE4(output, input[offset + 7], 28);
+ break;
+ case 5:
+ writeLongBE5(output, input[offset + 0], 0);
+ writeLongBE5(output, input[offset + 1], 5);
+ writeLongBE5(output, input[offset + 2], 10);
+ writeLongBE5(output, input[offset + 3], 15);
+ writeLongBE5(output, input[offset + 4], 20);
+ writeLongBE5(output, input[offset + 5], 25);
+ writeLongBE5(output, input[offset + 6], 30);
+ writeLongBE5(output, input[offset + 7], 35);
+ break;
+ case 6:
+ writeLongBE6(output, input[offset + 0], 0);
+ writeLongBE6(output, input[offset + 1], 6);
+ writeLongBE6(output, input[offset + 2], 12);
+ writeLongBE6(output, input[offset + 3], 18);
+ writeLongBE6(output, input[offset + 4], 24);
+ writeLongBE6(output, input[offset + 5], 30);
+ writeLongBE6(output, input[offset + 6], 36);
+ writeLongBE6(output, input[offset + 7], 42);
+ break;
+ case 7:
+ writeLongBE7(output, input[offset + 0], 0);
+ writeLongBE7(output, input[offset + 1], 7);
+ writeLongBE7(output, input[offset + 2], 14);
+ writeLongBE7(output, input[offset + 3], 21);
+ writeLongBE7(output, input[offset + 4], 28);
+ writeLongBE7(output, input[offset + 5], 35);
+ writeLongBE7(output, input[offset + 6], 42);
+ writeLongBE7(output, input[offset + 7], 49);
+ break;
+ case 8:
+ writeLongBE8(output, input[offset + 0], 0);
+ writeLongBE8(output, input[offset + 1], 8);
+ writeLongBE8(output, input[offset + 2], 16);
+ writeLongBE8(output, input[offset + 3], 24);
+ writeLongBE8(output, input[offset + 4], 32);
+ writeLongBE8(output, input[offset + 5], 40);
+ writeLongBE8(output, input[offset + 6], 48);
+ writeLongBE8(output, input[offset + 7], 56);
+ break;
+ default:
+ break;
+ }
+
+ final int toWrite = numHops * numBytes;
+ output.write(writeBuffer, 0, toWrite);
+ }
+
+ private void writeLongBE2(OutputStream output, long val, int wbOffset) {
+ writeBuffer[wbOffset + 0] = (byte) (val >>> 8);
+ writeBuffer[wbOffset + 1] = (byte) (val >>> 0);
+ }
+
+ private void writeLongBE3(OutputStream output, long val, int wbOffset) {
+ writeBuffer[wbOffset + 0] = (byte) (val >>> 16);
+ writeBuffer[wbOffset + 1] = (byte) (val >>> 8);
+ writeBuffer[wbOffset + 2] = (byte) (val >>> 0);
+ }
+
+ private void writeLongBE4(OutputStream output, long val, int wbOffset) {
+ writeBuffer[wbOffset + 0] = (byte) (val >>> 24);
+ writeBuffer[wbOffset + 1] = (byte) (val >>> 16);
+ writeBuffer[wbOffset + 2] = (byte) (val >>> 8);
+ writeBuffer[wbOffset + 3] = (byte) (val >>> 0);
+ }
+
+ private void writeLongBE5(OutputStream output, long val, int wbOffset) {
+ writeBuffer[wbOffset + 0] = (byte) (val >>> 32);
+ writeBuffer[wbOffset + 1] = (byte) (val >>> 24);
+ writeBuffer[wbOffset + 2] = (byte) (val >>> 16);
+ writeBuffer[wbOffset + 3] = (byte) (val >>> 8);
+ writeBuffer[wbOffset + 4] = (byte) (val >>> 0);
+ }
+
+ private void writeLongBE6(OutputStream output, long val, int wbOffset) {
+ writeBuffer[wbOffset + 0] = (byte) (val >>> 40);
+ writeBuffer[wbOffset + 1] = (byte) (val >>> 32);
+ writeBuffer[wbOffset + 2] = (byte) (val >>> 24);
+ writeBuffer[wbOffset + 3] = (byte) (val >>> 16);
+ writeBuffer[wbOffset + 4] = (byte) (val >>> 8);
+ writeBuffer[wbOffset + 5] = (byte) (val >>> 0);
+ }
+
+ private void writeLongBE7(OutputStream output, long val, int wbOffset) {
+ writeBuffer[wbOffset + 0] = (byte) (val >>> 48);
+ writeBuffer[wbOffset + 1] = (byte) (val >>> 40);
+ writeBuffer[wbOffset + 2] = (byte) (val >>> 32);
+ writeBuffer[wbOffset + 3] = (byte) (val >>> 24);
+ writeBuffer[wbOffset + 4] = (byte) (val >>> 16);
+ writeBuffer[wbOffset + 5] = (byte) (val >>> 8);
+ writeBuffer[wbOffset + 6] = (byte) (val >>> 0);
+ }
+
+ private void writeLongBE8(OutputStream output, long val, int wbOffset) {
+ writeBuffer[wbOffset + 0] = (byte) (val >>> 56);
+ writeBuffer[wbOffset + 1] = (byte) (val >>> 48);
+ writeBuffer[wbOffset + 2] = (byte) (val >>> 40);
+ writeBuffer[wbOffset + 3] = (byte) (val >>> 32);
+ writeBuffer[wbOffset + 4] = (byte) (val >>> 24);
+ writeBuffer[wbOffset + 5] = (byte) (val >>> 16);
+ writeBuffer[wbOffset + 6] = (byte) (val >>> 8);
+ writeBuffer[wbOffset + 7] = (byte) (val >>> 0);
+ }
+
+ /**
+ * Read bitpacked integers from input stream
+ * @param buffer - input buffer
+ * @param offset - offset
+ * @param len - length
+ * @param bitSize - bit width
+ * @param input - input stream
+ * @throws IOException
+ */
+ public void readInts(long[] buffer, int offset, int len, int bitSize,
+ InStream input) throws IOException {
+ int bitsLeft = 0;
+ int current = 0;
+
+ switch (bitSize) {
+ case 1:
+ unrolledUnPack1(buffer, offset, len, input);
+ return;
+ case 2:
+ unrolledUnPack2(buffer, offset, len, input);
+ return;
+ case 4:
+ unrolledUnPack4(buffer, offset, len, input);
+ return;
+ case 8:
+ unrolledUnPack8(buffer, offset, len, input);
+ return;
+ case 16:
+ unrolledUnPack16(buffer, offset, len, input);
+ return;
+ case 24:
+ unrolledUnPack24(buffer, offset, len, input);
+ return;
+ case 32:
+ unrolledUnPack32(buffer, offset, len, input);
+ return;
+ case 40:
+ unrolledUnPack40(buffer, offset, len, input);
+ return;
+ case 48:
+ unrolledUnPack48(buffer, offset, len, input);
+ return;
+ case 56:
+ unrolledUnPack56(buffer, offset, len, input);
+ return;
+ case 64:
+ unrolledUnPack64(buffer, offset, len, input);
+ return;
+ default:
+ break;
+ }
+
+ for(int i = offset; i < (offset + len); i++) {
+ long result = 0;
+ int bitsLeftToRead = bitSize;
+ while (bitsLeftToRead > bitsLeft) {
+ result <<= bitsLeft;
+ result |= current & ((1 << bitsLeft) - 1);
+ bitsLeftToRead -= bitsLeft;
+ current = input.read();
+ bitsLeft = 8;
+ }
+
+ // handle the left over bits
+ if (bitsLeftToRead > 0) {
+ result <<= bitsLeftToRead;
+ bitsLeft -= bitsLeftToRead;
+ result |= (current >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+ }
+ buffer[i] = result;
+ }
+ }
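+
+ // example: reading the bytes 0xA7 0x80 with readInts(buffer, 0, 3, 3, input)
+ // restores the original values 5, 1, 7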
+
+
+ private void unrolledUnPack1(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ final int numHops = 8;
+ final int remainder = len % numHops;
+ final int endOffset = offset + len;
+ final int endUnroll = endOffset - remainder;
+ int val = 0;
+ for (int i = offset; i < endUnroll; i = i + numHops) {
+ val = input.read();
+ buffer[i] = (val >>> 7) & 1;
+ buffer[i + 1] = (val >>> 6) & 1;
+ buffer[i + 2] = (val >>> 5) & 1;
+ buffer[i + 3] = (val >>> 4) & 1;
+ buffer[i + 4] = (val >>> 3) & 1;
+ buffer[i + 5] = (val >>> 2) & 1;
+ buffer[i + 6] = (val >>> 1) & 1;
+ buffer[i + 7] = val & 1;
+ }
+
+ if (remainder > 0) {
+ int startShift = 7;
+ val = input.read();
+ for (int i = endUnroll; i < endOffset; i++) {
+ buffer[i] = (val >>> startShift) & 1;
+ startShift -= 1;
+ }
+ }
+ }
+
+ private void unrolledUnPack2(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ final int numHops = 4;
+ final int remainder = len % numHops;
+ final int endOffset = offset + len;
+ final int endUnroll = endOffset - remainder;
+ int val = 0;
+ for (int i = offset; i < endUnroll; i = i + numHops) {
+ val = input.read();
+ buffer[i] = (val >>> 6) & 3;
+ buffer[i + 1] = (val >>> 4) & 3;
+ buffer[i + 2] = (val >>> 2) & 3;
+ buffer[i + 3] = val & 3;
+ }
+
+ if (remainder > 0) {
+ int startShift = 6;
+ val = input.read();
+ for (int i = endUnroll; i < endOffset; i++) {
+ buffer[i] = (val >>> startShift) & 3;
+ startShift -= 2;
+ }
+ }
+ }
+
+ private void unrolledUnPack4(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ final int numHops = 2;
+ final int remainder = len % numHops;
+ final int endOffset = offset + len;
+ final int endUnroll = endOffset - remainder;
+ int val = 0;
+ for (int i = offset; i < endUnroll; i = i + numHops) {
+ val = input.read();
+ buffer[i] = (val >>> 4) & 15;
+ buffer[i + 1] = val & 15;
+ }
+
+ if (remainder > 0) {
+ int startShift = 4;
+ val = input.read();
+ for (int i = endUnroll; i < endOffset; i++) {
+ buffer[i] = (val >>> startShift) & 15;
+ startShift -= 4;
+ }
+ }
+ }
+
+ private void unrolledUnPack8(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ unrolledUnPackBytes(buffer, offset, len, input, 1);
+ }
+
+ private void unrolledUnPack16(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ unrolledUnPackBytes(buffer, offset, len, input, 2);
+ }
+
+ private void unrolledUnPack24(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ unrolledUnPackBytes(buffer, offset, len, input, 3);
+ }
+
+ private void unrolledUnPack32(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ unrolledUnPackBytes(buffer, offset, len, input, 4);
+ }
+
+ private void unrolledUnPack40(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ unrolledUnPackBytes(buffer, offset, len, input, 5);
+ }
+
+ private void unrolledUnPack48(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ unrolledUnPackBytes(buffer, offset, len, input, 6);
+ }
+
+ private void unrolledUnPack56(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ unrolledUnPackBytes(buffer, offset, len, input, 7);
+ }
+
+ private void unrolledUnPack64(long[] buffer, int offset, int len,
+ InStream input) throws IOException {
+ unrolledUnPackBytes(buffer, offset, len, input, 8);
+ }
+
+ private void unrolledUnPackBytes(long[] buffer, int offset, int len, InStream input, int numBytes)
+ throws IOException {
+ final int numHops = 8;
+ final int remainder = len % numHops;
+ final int endOffset = offset + len;
+ final int endUnroll = endOffset - remainder;
+ int i = offset;
+ for (; i < endUnroll; i = i + numHops) {
+ readLongBE(input, buffer, i, numHops, numBytes);
+ }
+
+ if (remainder > 0) {
+ readRemainingLongs(buffer, i, input, remainder, numBytes);
+ }
+ }
+
+ private void readRemainingLongs(long[] buffer, int offset, InStream input, int remainder,
+ int numBytes) throws IOException {
+ final int toRead = remainder * numBytes;
+ // bulk read to buffer
+ int bytesRead = input.read(readBuffer, 0, toRead);
+ while (bytesRead != toRead) {
+ bytesRead += input.read(readBuffer, bytesRead, toRead - bytesRead);
+ }
+
+ int idx = 0;
+ switch (numBytes) {
+ case 1:
+ while (remainder > 0) {
+ buffer[offset++] = readBuffer[idx] & 255;
+ remainder--;
+ idx++;
+ }
+ break;
+ case 2:
+ while (remainder > 0) {
+ buffer[offset++] = readLongBE2(input, idx * 2);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 3:
+ while (remainder > 0) {
+ buffer[offset++] = readLongBE3(input, idx * 3);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 4:
+ while (remainder > 0) {
+ buffer[offset++] = readLongBE4(input, idx * 4);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 5:
+ while (remainder > 0) {
+ buffer[offset++] = readLongBE5(input, idx * 5);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 6:
+ while (remainder > 0) {
+ buffer[offset++] = readLongBE6(input, idx * 6);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 7:
+ while (remainder > 0) {
+ buffer[offset++] = readLongBE7(input, idx * 7);
+ remainder--;
+ idx++;
+ }
+ break;
+ case 8:
+ while (remainder > 0) {
+ buffer[offset++] = readLongBE8(input, idx * 8);
+ remainder--;
+ idx++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ private void readLongBE(InStream in, long[] buffer, int start, int numHops, int numBytes)
+ throws IOException {
+ final int toRead = numHops * numBytes;
+ // bulk read to buffer
+ int bytesRead = in.read(readBuffer, 0, toRead);
+ while (bytesRead != toRead) {
+ bytesRead += in.read(readBuffer, bytesRead, toRead - bytesRead);
+ }
+
+ switch (numBytes) {
+ case 1:
+ buffer[start + 0] = readBuffer[0] & 255;
+ buffer[start + 1] = readBuffer[1] & 255;
+ buffer[start + 2] = readBuffer[2] & 255;
+ buffer[start + 3] = readBuffer[3] & 255;
+ buffer[start + 4] = readBuffer[4] & 255;
+ buffer[start + 5] = readBuffer[5] & 255;
+ buffer[start + 6] = readBuffer[6] & 255;
+ buffer[start + 7] = readBuffer[7] & 255;
+ break;
+ case 2:
+ buffer[start + 0] = readLongBE2(in, 0);
+ buffer[start + 1] = readLongBE2(in, 2);
+ buffer[start + 2] = readLongBE2(in, 4);
+ buffer[start + 3] = readLongBE2(in, 6);
+ buffer[start + 4] = readLongBE2(in, 8);
+ buffer[start + 5] = readLongBE2(in, 10);
+ buffer[start + 6] = readLongBE2(in, 12);
+ buffer[start + 7] = readLongBE2(in, 14);
+ break;
+ case 3:
+ buffer[start + 0] = readLongBE3(in, 0);
+ buffer[start + 1] = readLongBE3(in, 3);
+ buffer[start + 2] = readLongBE3(in, 6);
+ buffer[start + 3] = readLongBE3(in, 9);
+ buffer[start + 4] = readLongBE3(in, 12);
+ buffer[start + 5] = readLongBE3(in, 15);
+ buffer[start + 6] = readLongBE3(in, 18);
+ buffer[start + 7] = readLongBE3(in, 21);
+ break;
+ case 4:
+ buffer[start + 0] = readLongBE4(in, 0);
+ buffer[start + 1] = readLongBE4(in, 4);
+ buffer[start + 2] = readLongBE4(in, 8);
+ buffer[start + 3] = readLongBE4(in, 12);
+ buffer[start + 4] = readLongBE4(in, 16);
+ buffer[start + 5] = readLongBE4(in, 20);
+ buffer[start + 6] = readLongBE4(in, 24);
+ buffer[start + 7] = readLongBE4(in, 28);
+ break;
+ case 5:
+ buffer[start + 0] = readLongBE5(in, 0);
+ buffer[start + 1] = readLongBE5(in, 5);
+ buffer[start + 2] = readLongBE5(in, 10);
+ buffer[start + 3] = readLongBE5(in, 15);
+ buffer[start + 4] = readLongBE5(in, 20);
+ buffer[start + 5] = readLongBE5(in, 25);
+ buffer[start + 6] = readLongBE5(in, 30);
+ buffer[start + 7] = readLongBE5(in, 35);
+ break;
+ case 6:
+ buffer[start + 0] = readLongBE6(in, 0);
+ buffer[start + 1] = readLongBE6(in, 6);
+ buffer[start + 2] = readLongBE6(in, 12);
+ buffer[start + 3] = readLongBE6(in, 18);
+ buffer[start + 4] = readLongBE6(in, 24);
+ buffer[start + 5] = readLongBE6(in, 30);
+ buffer[start + 6] = readLongBE6(in, 36);
+ buffer[start + 7] = readLongBE6(in, 42);
+ break;
+ case 7:
+ buffer[start + 0] = readLongBE7(in, 0);
+ buffer[start + 1] = readLongBE7(in, 7);
+ buffer[start + 2] = readLongBE7(in, 14);
+ buffer[start + 3] = readLongBE7(in, 21);
+ buffer[start + 4] = readLongBE7(in, 28);
+ buffer[start + 5] = readLongBE7(in, 35);
+ buffer[start + 6] = readLongBE7(in, 42);
+ buffer[start + 7] = readLongBE7(in, 49);
+ break;
+ case 8:
+ buffer[start + 0] = readLongBE8(in, 0);
+ buffer[start + 1] = readLongBE8(in, 8);
+ buffer[start + 2] = readLongBE8(in, 16);
+ buffer[start + 3] = readLongBE8(in, 24);
+ buffer[start + 4] = readLongBE8(in, 32);
+ buffer[start + 5] = readLongBE8(in, 40);
+ buffer[start + 6] = readLongBE8(in, 48);
+ buffer[start + 7] = readLongBE8(in, 56);
+ break;
+ default:
+ break;
+ }
+ }
+
+ private long readLongBE2(InStream in, int rbOffset) {
+ return (((readBuffer[rbOffset] & 255) << 8)
+ + ((readBuffer[rbOffset + 1] & 255) << 0));
+ }
+
+ private long readLongBE3(InStream in, int rbOffset) {
+ return (((readBuffer[rbOffset] & 255) << 16)
+ + ((readBuffer[rbOffset + 1] & 255) << 8)
+ + ((readBuffer[rbOffset + 2] & 255) << 0));
+ }
+
+ private long readLongBE4(InStream in, int rbOffset) {
+ return (((long) (readBuffer[rbOffset] & 255) << 24)
+ + ((readBuffer[rbOffset + 1] & 255) << 16)
+ + ((readBuffer[rbOffset + 2] & 255) << 8)
+ + ((readBuffer[rbOffset + 3] & 255) << 0));
+ }
+
+ private long readLongBE5(InStream in, int rbOffset) {
+ return (((long) (readBuffer[rbOffset] & 255) << 32)
+ + ((long) (readBuffer[rbOffset + 1] & 255) << 24)
+ + ((readBuffer[rbOffset + 2] & 255) << 16)
+ + ((readBuffer[rbOffset + 3] & 255) << 8)
+ + ((readBuffer[rbOffset + 4] & 255) << 0));
+ }
+
+ private long readLongBE6(InStream in, int rbOffset) {
+ return (((long) (readBuffer[rbOffset] & 255) << 40)
+ + ((long) (readBuffer[rbOffset + 1] & 255) << 32)
+ + ((long) (readBuffer[rbOffset + 2] & 255) << 24)
+ + ((readBuffer[rbOffset + 3] & 255) << 16)
+ + ((readBuffer[rbOffset + 4] & 255) << 8)
+ + ((readBuffer[rbOffset + 5] & 255) << 0));
+ }
+
+ private long readLongBE7(InStream in, int rbOffset) {
+ return (((long) (readBuffer[rbOffset] & 255) << 48)
+ + ((long) (readBuffer[rbOffset + 1] & 255) << 40)
+ + ((long) (readBuffer[rbOffset + 2] & 255) << 32)
+ + ((long) (readBuffer[rbOffset + 3] & 255) << 24)
+ + ((readBuffer[rbOffset + 4] & 255) << 16)
+ + ((readBuffer[rbOffset + 5] & 255) << 8)
+ + ((readBuffer[rbOffset + 6] & 255) << 0));
+ }
+
+ private long readLongBE8(InStream in, int rbOffset) {
+ return (((long) (readBuffer[rbOffset] & 255) << 56)
+ + ((long) (readBuffer[rbOffset + 1] & 255) << 48)
+ + ((long) (readBuffer[rbOffset + 2] & 255) << 40)
+ + ((long) (readBuffer[rbOffset + 3] & 255) << 32)
+ + ((long) (readBuffer[rbOffset + 4] & 255) << 24)
+ + ((readBuffer[rbOffset + 5] & 255) << 16)
+ + ((readBuffer[rbOffset + 6] & 255) << 8)
+ + ((readBuffer[rbOffset + 7] & 255) << 0));
+ }
+
+ // Do not want to use Guava LongMath.checkedSubtract() here as it will throw
+ // ArithmeticException in case of overflow
+ public boolean isSafeSubtract(long left, long right) {
+ return (left ^ right) >= 0 | (left ^ (left - right)) >= 0;
+ }
+}
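
For context on the unrolled readers above: the unrolledUnPack1/2/4 variants shift sub-byte fields out of a single input byte (MSB first), while unrolledUnPack8..64 delegate to unrolledUnPackBytes, which assembles big-endian values from 1..8 whole bytes per value via the readLongBE* helpers. The sketch below is a self-contained illustration of both ideas against a plain byte array; it is not the class's own code path (the real helpers read from an internal readBuffer), and the sample inputs are invented.

    import java.util.Arrays;

    public class BitUnpackSketch {
      // Unpack four 2-bit values (MSB first) from one byte, as unrolledUnPack2 does per input byte.
      static long[] unpack2(int b) {
        return new long[] {(b >>> 6) & 3, (b >>> 4) & 3, (b >>> 2) & 3, b & 3};
      }

      // Assemble one big-endian value from numBytes bytes, as the readLongBE* helpers do.
      static long readBE(byte[] buf, int off, int numBytes) {
        long v = 0;
        for (int i = 0; i < numBytes; i++) {
          v = (v << 8) | (buf[off + i] & 255);
        }
        return v;
      }

      public static void main(String[] args) {
        System.out.println(Arrays.toString(unpack2(0b11_01_00_10)));  // [3, 1, 0, 2]
        byte[] bytes = {0x01, 0x02, 0x03};
        System.out.println(readBE(bytes, 0, 3));                      // 0x010203 = 66051
      }
    }
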
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/SettableUncompressedStream.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/SettableUncompressedStream.java b/orc/src/java/org/apache/hive/orc/impl/SettableUncompressedStream.java
new file mode 100644
index 0000000..5ce0b24
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/SettableUncompressedStream.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.util.List;
+
+import org.apache.hadoop.hive.common.DiskRangeInfo;
+import org.apache.hadoop.hive.common.io.DiskRange;
+
+/**
+ * An uncompressed stream whose underlying byte buffer can be set.
+ */
+public class SettableUncompressedStream extends InStream.UncompressedStream {
+
+ public SettableUncompressedStream(String name, List<DiskRange> input, long length) {
+ super(name, input, length);
+ setOffset(input);
+ }
+
+ public void setBuffers(DiskRangeInfo diskRangeInfo) {
+ reset(diskRangeInfo.getDiskRanges(), diskRangeInfo.getTotalLength());
+ setOffset(diskRangeInfo.getDiskRanges());
+ }
+
+ private void setOffset(List<DiskRange> list) {
+ currentOffset = list.isEmpty() ? 0 : list.get(0).getOffset();
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/SnappyCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/SnappyCodec.java b/orc/src/java/org/apache/hive/orc/impl/SnappyCodec.java
new file mode 100644
index 0000000..c6c0358
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/SnappyCodec.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.iq80.snappy.Snappy;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.EnumSet;
+
+public class SnappyCodec implements CompressionCodec, DirectDecompressionCodec {
+ private static final HadoopShims SHIMS = HadoopShims.Factory.get();
+
+ Boolean direct = null;
+
+ @Override
+ public boolean compress(ByteBuffer in, ByteBuffer out,
+ ByteBuffer overflow) throws IOException {
+ int inBytes = in.remaining();
+ // I should work on a patch for Snappy to support an overflow buffer
+ // to prevent the extra buffer copy.
+ byte[] compressed = new byte[Snappy.maxCompressedLength(inBytes)];
+ int outBytes =
+ Snappy.compress(in.array(), in.arrayOffset() + in.position(), inBytes,
+ compressed, 0);
+ if (outBytes < inBytes) {
+ int remaining = out.remaining();
+ if (remaining >= outBytes) {
+ System.arraycopy(compressed, 0, out.array(), out.arrayOffset() +
+ out.position(), outBytes);
+ out.position(out.position() + outBytes);
+ } else {
+ System.arraycopy(compressed, 0, out.array(), out.arrayOffset() +
+ out.position(), remaining);
+ out.position(out.limit());
+ System.arraycopy(compressed, remaining, overflow.array(),
+ overflow.arrayOffset(), outBytes - remaining);
+ overflow.position(outBytes - remaining);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void decompress(ByteBuffer in, ByteBuffer out) throws IOException {
+ if(in.isDirect() && out.isDirect()) {
+ directDecompress(in, out);
+ return;
+ }
+ int inOffset = in.position();
+ int uncompressLen =
+ Snappy.uncompress(in.array(), in.arrayOffset() + inOffset,
+ in.limit() - inOffset, out.array(), out.arrayOffset() + out.position());
+ out.position(uncompressLen + out.position());
+ out.flip();
+ }
+
+ @Override
+ public boolean isAvailable() {
+ if (direct == null) {
+ try {
+ if (SHIMS.getDirectDecompressor(
+ HadoopShims.DirectCompressionType.SNAPPY) != null) {
+ direct = Boolean.valueOf(true);
+ } else {
+ direct = Boolean.valueOf(false);
+ }
+ } catch (UnsatisfiedLinkError ule) {
+ direct = Boolean.valueOf(false);
+ }
+ }
+ return direct.booleanValue();
+ }
+
+ @Override
+ public void directDecompress(ByteBuffer in, ByteBuffer out)
+ throws IOException {
+ HadoopShims.DirectDecompressor decompressShim =
+ SHIMS.getDirectDecompressor(HadoopShims.DirectCompressionType.SNAPPY);
+ decompressShim.decompress(in, out);
+ out.flip(); // flip for read
+ }
+
+ @Override
+ public CompressionCodec modify(EnumSet<Modifier> modifiers) {
+ // snappy allows no modifications
+ return this;
+ }
+}
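
A minimal usage sketch for the codec above, assuming heap (array-backed) ByteBuffers, since compress() and decompress() go through ByteBuffer.array(); the buffer sizes and sample payload are invented. Note that compress() returns false when the Snappy output would not be smaller than the input, in which case the caller keeps the original bytes.

    import java.nio.ByteBuffer;
    import java.util.Arrays;
    import org.apache.hive.orc.impl.SnappyCodec;

    public class SnappyCodecSketch {
      public static void main(String[] args) throws Exception {
        SnappyCodec codec = new SnappyCodec();

        byte[] payload = new byte[200];
        Arrays.fill(payload, (byte) 'a');                          // highly compressible sample data
        ByteBuffer in = ByteBuffer.wrap(payload);
        ByteBuffer compressed = ByteBuffer.allocate(payload.length);
        ByteBuffer overflow = ByteBuffer.allocate(payload.length); // only used if 'compressed' fills up

        if (codec.compress(in, compressed, overflow)) {
          compressed.flip();                                       // expose the compressed bytes for reading
          ByteBuffer out = ByteBuffer.allocate(payload.length);
          codec.decompress(compressed, out);                       // decompress() flips 'out' itself
          System.out.println("round-tripped " + out.remaining() + " bytes");
        } else {
          System.out.println("data did not shrink; keep the original buffer");
        }
      }
    }
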
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/StreamName.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/StreamName.java b/orc/src/java/org/apache/hive/orc/impl/StreamName.java
new file mode 100644
index 0000000..dd99665
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/StreamName.java
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hive.orc.OrcProto;
+
+/**
+ * The name of a stream within a stripe.
+ */
+public class StreamName implements Comparable<StreamName> {
+ private final int column;
+ private final OrcProto.Stream.Kind kind;
+
+ public static enum Area {
+ DATA, INDEX
+ }
+
+ public StreamName(int column, OrcProto.Stream.Kind kind) {
+ this.column = column;
+ this.kind = kind;
+ }
+
+ public boolean equals(Object obj) {
+ if (obj != null && obj instanceof StreamName) {
+ StreamName other = (StreamName) obj;
+ return other.column == column && other.kind == kind;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public int compareTo(StreamName streamName) {
+ if (streamName == null) {
+ return -1;
+ }
+ Area area = getArea(kind);
+ Area otherArea = streamName.getArea(streamName.kind);
+ if (area != otherArea) {
+ return -area.compareTo(otherArea);
+ }
+ if (column != streamName.column) {
+ return column < streamName.column ? -1 : 1;
+ }
+ return kind.compareTo(streamName.kind);
+ }
+
+ public int getColumn() {
+ return column;
+ }
+
+ public OrcProto.Stream.Kind getKind() {
+ return kind;
+ }
+
+ public Area getArea() {
+ return getArea(kind);
+ }
+
+ public static Area getArea(OrcProto.Stream.Kind kind) {
+ switch (kind) {
+ case ROW_INDEX:
+ case DICTIONARY_COUNT:
+ case BLOOM_FILTER:
+ return Area.INDEX;
+ default:
+ return Area.DATA;
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "Stream for column " + column + " kind " + kind;
+ }
+
+ @Override
+ public int hashCode() {
+ return column * 101 + kind.getNumber();
+ }
+}
+
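
To make the ordering defined by compareTo() above concrete: because the area comparison is negated, index-area streams (ROW_INDEX, DICTIONARY_COUNT, BLOOM_FILTER) sort ahead of data-area streams, and within an area streams order by column and then by kind. A small sketch, assuming the OrcProto.Stream.Kind constants ROW_INDEX and DATA:

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import org.apache.hive.orc.OrcProto;
    import org.apache.hive.orc.impl.StreamName;

    public class StreamNameOrderSketch {
      public static void main(String[] args) {
        List<StreamName> streams = new ArrayList<>();
        streams.add(new StreamName(2, OrcProto.Stream.Kind.DATA));
        streams.add(new StreamName(1, OrcProto.Stream.Kind.DATA));
        streams.add(new StreamName(2, OrcProto.Stream.Kind.ROW_INDEX));

        Collections.sort(streams);
        // Prints the index stream first, then the data streams by column:
        //   Stream for column 2 kind ROW_INDEX (area INDEX)
        //   Stream for column 1 kind DATA (area DATA)
        //   Stream for column 2 kind DATA (area DATA)
        for (StreamName s : streams) {
          System.out.println(s + " (area " + s.getArea() + ")");
        }
      }
    }
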
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/StringRedBlackTree.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/StringRedBlackTree.java b/orc/src/java/org/apache/hive/orc/impl/StringRedBlackTree.java
new file mode 100644
index 0000000..f9113d0
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/StringRedBlackTree.java
@@ -0,0 +1,207 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * A red-black tree that stores strings. The strings are stored as UTF-8 bytes,
+ * with an offset recorded for each entry.

+ */
+public class StringRedBlackTree extends RedBlackTree {
+ private final DynamicByteArray byteArray = new DynamicByteArray();
+ private final DynamicIntArray keyOffsets;
+ private final Text newKey = new Text();
+
+ public StringRedBlackTree(int initialCapacity) {
+ super(initialCapacity);
+ keyOffsets = new DynamicIntArray(initialCapacity);
+ }
+
+ public int add(String value) {
+ newKey.set(value);
+ return addNewKey();
+ }
+
+ private int addNewKey() {
+ // if the newKey is actually new, add it to our byteArray and store the offset & length
+ if (add()) {
+ int len = newKey.getLength();
+ keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len));
+ }
+ return lastAdd;
+ }
+
+ public int add(Text value) {
+ newKey.set(value);
+ return addNewKey();
+ }
+
+ public int add(byte[] bytes, int offset, int length) {
+ newKey.set(bytes, offset, length);
+ return addNewKey();
+ }
+
+ @Override
+ protected int compareValue(int position) {
+ int start = keyOffsets.get(position);
+ int end;
+ if (position + 1 == keyOffsets.size()) {
+ end = byteArray.size();
+ } else {
+ end = keyOffsets.get(position+1);
+ }
+ return byteArray.compare(newKey.getBytes(), 0, newKey.getLength(),
+ start, end - start);
+ }
+
+ /**
+ * The information about each node.
+ */
+ public interface VisitorContext {
+ /**
+ * Get the position where the key was originally added.
+ * @return the number returned by add.
+ */
+ int getOriginalPosition();
+
+ /**
+ * Write the bytes for the string to the given output stream.
+ * @param out the stream to write to.
+ * @throws IOException
+ */
+ void writeBytes(OutputStream out) throws IOException;
+
+ /**
+ * Get the original string.
+ * @return the string
+ */
+ Text getText();
+
+ /**
+ * Get the number of bytes.
+ * @return the string's length in bytes
+ */
+ int getLength();
+ }
+
+ /**
+ * The interface for visitors.
+ */
+ public interface Visitor {
+ /**
+ * Called once for each node of the tree in sort order.
+ * @param context the information about each node
+ * @throws IOException
+ */
+ void visit(VisitorContext context) throws IOException;
+ }
+
+ private class VisitorContextImpl implements VisitorContext {
+ private int originalPosition;
+ private int start;
+ private int end;
+ private final Text text = new Text();
+
+ public int getOriginalPosition() {
+ return originalPosition;
+ }
+
+ public Text getText() {
+ byteArray.setText(text, start, end - start);
+ return text;
+ }
+
+ public void writeBytes(OutputStream out) throws IOException {
+ byteArray.write(out, start, end - start);
+ }
+
+ public int getLength() {
+ return end - start;
+ }
+
+ void setPosition(int position) {
+ originalPosition = position;
+ start = keyOffsets.get(originalPosition);
+ if (position + 1 == keyOffsets.size()) {
+ end = byteArray.size();
+ } else {
+ end = keyOffsets.get(originalPosition + 1);
+ }
+ }
+ }
+
+ private void recurse(int node, Visitor visitor, VisitorContextImpl context
+ ) throws IOException {
+ if (node != NULL) {
+ recurse(getLeft(node), visitor, context);
+ context.setPosition(node);
+ visitor.visit(context);
+ recurse(getRight(node), visitor, context);
+ }
+ }
+
+ /**
+ * Visit all of the nodes in the tree in sorted order.
+ * @param visitor the action to be applied to each node
+ * @throws IOException
+ */
+ public void visit(Visitor visitor) throws IOException {
+ recurse(root, visitor, new VisitorContextImpl());
+ }
+
+ /**
+ * Reset the table to empty.
+ */
+ public void clear() {
+ super.clear();
+ byteArray.clear();
+ keyOffsets.clear();
+ }
+
+ public void getText(Text result, int originalPosition) {
+ int offset = keyOffsets.get(originalPosition);
+ int length;
+ if (originalPosition + 1 == keyOffsets.size()) {
+ length = byteArray.size() - offset;
+ } else {
+ length = keyOffsets.get(originalPosition + 1) - offset;
+ }
+ byteArray.setText(result, offset, length);
+ }
+
+ /**
+ * Get the size of the character data in the table.
+ * @return the bytes used by the table
+ */
+ public int getCharacterSize() {
+ return byteArray.size();
+ }
+
+ /**
+ * Calculate the approximate size in memory.
+ * @return the number of bytes used in storing the tree.
+ */
+ public long getSizeInBytes() {
+ return byteArray.getSizeInBytes() + keyOffsets.getSizeInBytes() +
+ super.getSizeInBytes();
+ }
+}
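
A short usage sketch for the dictionary above: add() deduplicates repeated keys and returns the position assigned at first insertion, while visit() walks the keys in sorted order. The sample strings are arbitrary.

    import java.io.IOException;
    import org.apache.hive.orc.impl.StringRedBlackTree;

    public class DictionarySketch {
      public static void main(String[] args) throws IOException {
        StringRedBlackTree dictionary = new StringRedBlackTree(4);
        System.out.println(dictionary.add("orange"));   // 0
        System.out.println(dictionary.add("apple"));    // 1
        System.out.println(dictionary.add("orange"));   // 0 again: duplicate key

        // Walk the dictionary in sorted order; getOriginalPosition() is the id add() returned.
        dictionary.visit(new StringRedBlackTree.Visitor() {
          @Override
          public void visit(StringRedBlackTree.VisitorContext context) throws IOException {
            System.out.println(context.getOriginalPosition() + " -> " + context.getText());
          }
        });
        // prints: 1 -> apple, then 0 -> orange
      }
    }
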
[22/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/TypeDescription.java b/orc/src/java/org/apache/orc/TypeDescription.java
deleted file mode 100644
index 2e9328b..0000000
--- a/orc/src/java/org/apache/orc/TypeDescription.java
+++ /dev/null
@@ -1,870 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * This is the description of the types in an ORC file.
- */
-public class TypeDescription
- implements Comparable<TypeDescription>, Serializable {
- private static final int MAX_PRECISION = 38;
- private static final int MAX_SCALE = 38;
- private static final int DEFAULT_PRECISION = 38;
- private static final int DEFAULT_SCALE = 10;
- private static final int DEFAULT_LENGTH = 256;
-
- @Override
- public int compareTo(TypeDescription other) {
- if (this == other) {
- return 0;
- } else if (other == null) {
- return -1;
- } else {
- int result = category.compareTo(other.category);
- if (result == 0) {
- switch (category) {
- case CHAR:
- case VARCHAR:
- return maxLength - other.maxLength;
- case DECIMAL:
- if (precision != other.precision) {
- return precision - other.precision;
- }
- return scale - other.scale;
- case UNION:
- case LIST:
- case MAP:
- if (children.size() != other.children.size()) {
- return children.size() - other.children.size();
- }
- for(int c=0; result == 0 && c < children.size(); ++c) {
- result = children.get(c).compareTo(other.children.get(c));
- }
- break;
- case STRUCT:
- if (children.size() != other.children.size()) {
- return children.size() - other.children.size();
- }
- for(int c=0; result == 0 && c < children.size(); ++c) {
- result = fieldNames.get(c).compareTo(other.fieldNames.get(c));
- if (result == 0) {
- result = children.get(c).compareTo(other.children.get(c));
- }
- }
- break;
- default:
- // PASS
- }
- }
- return result;
- }
- }
-
- public enum Category {
- BOOLEAN("boolean", true),
- BYTE("tinyint", true),
- SHORT("smallint", true),
- INT("int", true),
- LONG("bigint", true),
- FLOAT("float", true),
- DOUBLE("double", true),
- STRING("string", true),
- DATE("date", true),
- TIMESTAMP("timestamp", true),
- BINARY("binary", true),
- DECIMAL("decimal", true),
- VARCHAR("varchar", true),
- CHAR("char", true),
- LIST("array", false),
- MAP("map", false),
- STRUCT("struct", false),
- UNION("uniontype", false);
-
- Category(String name, boolean isPrimitive) {
- this.name = name;
- this.isPrimitive = isPrimitive;
- }
-
- final boolean isPrimitive;
- final String name;
-
- public boolean isPrimitive() {
- return isPrimitive;
- }
-
- public String getName() {
- return name;
- }
- }
-
- public static TypeDescription createBoolean() {
- return new TypeDescription(Category.BOOLEAN);
- }
-
- public static TypeDescription createByte() {
- return new TypeDescription(Category.BYTE);
- }
-
- public static TypeDescription createShort() {
- return new TypeDescription(Category.SHORT);
- }
-
- public static TypeDescription createInt() {
- return new TypeDescription(Category.INT);
- }
-
- public static TypeDescription createLong() {
- return new TypeDescription(Category.LONG);
- }
-
- public static TypeDescription createFloat() {
- return new TypeDescription(Category.FLOAT);
- }
-
- public static TypeDescription createDouble() {
- return new TypeDescription(Category.DOUBLE);
- }
-
- public static TypeDescription createString() {
- return new TypeDescription(Category.STRING);
- }
-
- public static TypeDescription createDate() {
- return new TypeDescription(Category.DATE);
- }
-
- public static TypeDescription createTimestamp() {
- return new TypeDescription(Category.TIMESTAMP);
- }
-
- public static TypeDescription createBinary() {
- return new TypeDescription(Category.BINARY);
- }
-
- public static TypeDescription createDecimal() {
- return new TypeDescription(Category.DECIMAL);
- }
-
- static class StringPosition {
- final String value;
- int position;
- final int length;
-
- StringPosition(String value) {
- this.value = value;
- position = 0;
- length = value.length();
- }
-
- @Override
- public String toString() {
- StringBuilder buffer = new StringBuilder();
- buffer.append('\'');
- buffer.append(value.substring(0, position));
- buffer.append('^');
- buffer.append(value.substring(position));
- buffer.append('\'');
- return buffer.toString();
- }
- }
-
- static Category parseCategory(StringPosition source) {
- int start = source.position;
- while (source.position < source.length) {
- char ch = source.value.charAt(source.position);
- if (!Character.isLetter(ch)) {
- break;
- }
- source.position += 1;
- }
- if (source.position != start) {
- String word = source.value.substring(start, source.position).toLowerCase();
- for (Category cat : Category.values()) {
- if (cat.getName().equals(word)) {
- return cat;
- }
- }
- }
- throw new IllegalArgumentException("Can't parse category at " + source);
- }
-
- static int parseInt(StringPosition source) {
- int start = source.position;
- int result = 0;
- while (source.position < source.length) {
- char ch = source.value.charAt(source.position);
- if (!Character.isDigit(ch)) {
- break;
- }
- result = result * 10 + (ch - '0');
- source.position += 1;
- }
- if (source.position == start) {
- throw new IllegalArgumentException("Missing integer at " + source);
- }
- return result;
- }
-
- static String parseName(StringPosition source) {
- int start = source.position;
- while (source.position < source.length) {
- char ch = source.value.charAt(source.position);
- if (!Character.isLetterOrDigit(ch) && ch != '.' && ch != '_') {
- break;
- }
- source.position += 1;
- }
- if (source.position == start) {
- throw new IllegalArgumentException("Missing name at " + source);
- }
- return source.value.substring(start, source.position);
- }
-
- static void requireChar(StringPosition source, char required) {
- if (source.position >= source.length ||
- source.value.charAt(source.position) != required) {
- throw new IllegalArgumentException("Missing required char '" +
- required + "' at " + source);
- }
- source.position += 1;
- }
-
- static boolean consumeChar(StringPosition source, char ch) {
- boolean result = source.position < source.length &&
- source.value.charAt(source.position) == ch;
- if (result) {
- source.position += 1;
- }
- return result;
- }
-
- static void parseUnion(TypeDescription type, StringPosition source) {
- requireChar(source, '<');
- do {
- type.addUnionChild(parseType(source));
- } while (consumeChar(source, ','));
- requireChar(source, '>');
- }
-
- static void parseStruct(TypeDescription type, StringPosition source) {
- requireChar(source, '<');
- do {
- String fieldName = parseName(source);
- requireChar(source, ':');
- type.addField(fieldName, parseType(source));
- } while (consumeChar(source, ','));
- requireChar(source, '>');
- }
-
- static TypeDescription parseType(StringPosition source) {
- TypeDescription result = new TypeDescription(parseCategory(source));
- switch (result.getCategory()) {
- case BINARY:
- case BOOLEAN:
- case BYTE:
- case DATE:
- case DOUBLE:
- case FLOAT:
- case INT:
- case LONG:
- case SHORT:
- case STRING:
- case TIMESTAMP:
- break;
- case CHAR:
- case VARCHAR:
- requireChar(source, '(');
- result.withMaxLength(parseInt(source));
- requireChar(source, ')');
- break;
- case DECIMAL: {
- requireChar(source, '(');
- int precision = parseInt(source);
- requireChar(source, ',');
- result.withScale(parseInt(source));
- result.withPrecision(precision);
- requireChar(source, ')');
- break;
- }
- case LIST:
- requireChar(source, '<');
- result.children.add(parseType(source));
- requireChar(source, '>');
- break;
- case MAP:
- requireChar(source, '<');
- result.children.add(parseType(source));
- requireChar(source, ',');
- result.children.add(parseType(source));
- requireChar(source, '>');
- break;
- case UNION:
- parseUnion(result, source);
- break;
- case STRUCT:
- parseStruct(result, source);
- break;
- default:
- throw new IllegalArgumentException("Unknown type " +
- result.getCategory() + " at " + source);
- }
- return result;
- }
-
- /**
- * Parse TypeDescription from the Hive type names. This is the inverse
- * of TypeDescription.toString()
- * @param typeName the name of the type
- * @return a new TypeDescription or null if typeName was null
- * @throws IllegalArgumentException if the string is badly formed
- */
- public static TypeDescription fromString(String typeName) {
- if (typeName == null) {
- return null;
- }
- StringPosition source = new StringPosition(typeName);
- TypeDescription result = parseType(source);
- if (source.position != source.length) {
- throw new IllegalArgumentException("Extra characters at " + source);
- }
- return result;
- }
-
- /**
- * For decimal types, set the precision.
- * @param precision the new precision
- * @return this
- */
- public TypeDescription withPrecision(int precision) {
- if (category != Category.DECIMAL) {
- throw new IllegalArgumentException("precision is only allowed on decimal"+
- " and not " + category.name);
- } else if (precision < 1 || precision > MAX_PRECISION || scale > precision){
- throw new IllegalArgumentException("precision " + precision +
- " is out of range 1 .. " + scale);
- }
- this.precision = precision;
- return this;
- }
-
- /**
- * For decimal types, set the scale.
- * @param scale the new scale
- * @return this
- */
- public TypeDescription withScale(int scale) {
- if (category != Category.DECIMAL) {
- throw new IllegalArgumentException("scale is only allowed on decimal"+
- " and not " + category.name);
- } else if (scale < 0 || scale > MAX_SCALE || scale > precision) {
- throw new IllegalArgumentException("scale is out of range at " + scale);
- }
- this.scale = scale;
- return this;
- }
-
- public static TypeDescription createVarchar() {
- return new TypeDescription(Category.VARCHAR);
- }
-
- public static TypeDescription createChar() {
- return new TypeDescription(Category.CHAR);
- }
-
- /**
- * Set the maximum length for char and varchar types.
- * @param maxLength the maximum value
- * @return this
- */
- public TypeDescription withMaxLength(int maxLength) {
- if (category != Category.VARCHAR && category != Category.CHAR) {
- throw new IllegalArgumentException("maxLength is only allowed on char" +
- " and varchar and not " + category.name);
- }
- this.maxLength = maxLength;
- return this;
- }
-
- public static TypeDescription createList(TypeDescription childType) {
- TypeDescription result = new TypeDescription(Category.LIST);
- result.children.add(childType);
- childType.parent = result;
- return result;
- }
-
- public static TypeDescription createMap(TypeDescription keyType,
- TypeDescription valueType) {
- TypeDescription result = new TypeDescription(Category.MAP);
- result.children.add(keyType);
- result.children.add(valueType);
- keyType.parent = result;
- valueType.parent = result;
- return result;
- }
-
- public static TypeDescription createUnion() {
- return new TypeDescription(Category.UNION);
- }
-
- public static TypeDescription createStruct() {
- return new TypeDescription(Category.STRUCT);
- }
-
- /**
- * Add a child to a union type.
- * @param child a new child type to add
- * @return the union type.
- */
- public TypeDescription addUnionChild(TypeDescription child) {
- if (category != Category.UNION) {
- throw new IllegalArgumentException("Can only add types to union type" +
- " and not " + category);
- }
- children.add(child);
- child.parent = this;
- return this;
- }
-
- /**
- * Add a field to a struct type as it is built.
- * @param field the field name
- * @param fieldType the type of the field
- * @return the struct type
- */
- public TypeDescription addField(String field, TypeDescription fieldType) {
- if (category != Category.STRUCT) {
- throw new IllegalArgumentException("Can only add fields to struct type" +
- " and not " + category);
- }
- fieldNames.add(field);
- children.add(fieldType);
- fieldType.parent = this;
- return this;
- }
-
- /**
- * Get the id for this type.
- * The first call will cause all of the ids in the tree to be assigned, so
- * it should not be called before the type is completely built.
- * @return the sequential id
- */
- public int getId() {
- // if the id hasn't been assigned, assign all of the ids from the root
- if (id == -1) {
- TypeDescription root = this;
- while (root.parent != null) {
- root = root.parent;
- }
- root.assignIds(0);
- }
- return id;
- }
-
- public TypeDescription clone() {
- TypeDescription result = new TypeDescription(category);
- result.maxLength = maxLength;
- result.precision = precision;
- result.scale = scale;
- if (fieldNames != null) {
- result.fieldNames.addAll(fieldNames);
- }
- if (children != null) {
- for(TypeDescription child: children) {
- TypeDescription clone = child.clone();
- clone.parent = result;
- result.children.add(clone);
- }
- }
- return result;
- }
-
- @Override
- public int hashCode() {
- long result = category.ordinal() * 4241 + maxLength + precision * 13 + scale;
- if (children != null) {
- for(TypeDescription child: children) {
- result = result * 6959 + child.hashCode();
- }
- }
- return (int) result;
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof TypeDescription)) {
- return false;
- }
- if (other == this) {
- return true;
- }
- TypeDescription castOther = (TypeDescription) other;
- if (category != castOther.category ||
- maxLength != castOther.maxLength ||
- scale != castOther.scale ||
- precision != castOther.precision) {
- return false;
- }
- if (children != null) {
- if (children.size() != castOther.children.size()) {
- return false;
- }
- for (int i = 0; i < children.size(); ++i) {
- if (!children.get(i).equals(castOther.children.get(i))) {
- return false;
- }
- }
- }
- if (category == Category.STRUCT) {
- for(int i=0; i < fieldNames.size(); ++i) {
- if (!fieldNames.get(i).equals(castOther.fieldNames.get(i))) {
- return false;
- }
- }
- }
- return true;
- }
-
- /**
- * Get the maximum id assigned to this type or its children.
- * The first call will cause all of the ids in the tree to be assigned, so
- * it should not be called before the type is completely built.
- * @return the maximum id assigned under this type
- */
- public int getMaximumId() {
- // if the id hasn't been assigned, assign all of the ids from the root
- if (maxId == -1) {
- TypeDescription root = this;
- while (root.parent != null) {
- root = root.parent;
- }
- root.assignIds(0);
- }
- return maxId;
- }
-
- private ColumnVector createColumn(int maxSize) {
- switch (category) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- case DATE:
- return new LongColumnVector(maxSize);
- case TIMESTAMP:
- return new TimestampColumnVector(maxSize);
- case FLOAT:
- case DOUBLE:
- return new DoubleColumnVector(maxSize);
- case DECIMAL:
- return new DecimalColumnVector(maxSize, precision, scale);
- case STRING:
- case BINARY:
- case CHAR:
- case VARCHAR:
- return new BytesColumnVector(maxSize);
- case STRUCT: {
- ColumnVector[] fieldVector = new ColumnVector[children.size()];
- for(int i=0; i < fieldVector.length; ++i) {
- fieldVector[i] = children.get(i).createColumn(maxSize);
- }
- return new StructColumnVector(maxSize,
- fieldVector);
- }
- case UNION: {
- ColumnVector[] fieldVector = new ColumnVector[children.size()];
- for(int i=0; i < fieldVector.length; ++i) {
- fieldVector[i] = children.get(i).createColumn(maxSize);
- }
- return new UnionColumnVector(maxSize,
- fieldVector);
- }
- case LIST:
- return new ListColumnVector(maxSize,
- children.get(0).createColumn(maxSize));
- case MAP:
- return new MapColumnVector(maxSize,
- children.get(0).createColumn(maxSize),
- children.get(1).createColumn(maxSize));
- default:
- throw new IllegalArgumentException("Unknown type " + category);
- }
- }
-
- public VectorizedRowBatch createRowBatch(int maxSize) {
- VectorizedRowBatch result;
- if (category == Category.STRUCT) {
- result = new VectorizedRowBatch(children.size(), maxSize);
- for(int i=0; i < result.cols.length; ++i) {
- result.cols[i] = children.get(i).createColumn(maxSize);
- }
- } else {
- result = new VectorizedRowBatch(1, maxSize);
- result.cols[0] = createColumn(maxSize);
- }
- result.reset();
- return result;
- }
-
- public VectorizedRowBatch createRowBatch() {
- return createRowBatch(VectorizedRowBatch.DEFAULT_SIZE);
- }
-
- /**
- * Get the kind of this type.
- * @return the category for this type.
- */
- public Category getCategory() {
- return category;
- }
-
- /**
- * Get the maximum length of the type. Only used for char and varchar types.
- * @return the maximum length of the string type
- */
- public int getMaxLength() {
- return maxLength;
- }
-
- /**
- * Get the precision of the decimal type.
- * @return the number of digits for the precision.
- */
- public int getPrecision() {
- return precision;
- }
-
- /**
- * Get the scale of the decimal type.
- * @return the number of digits for the scale.
- */
- public int getScale() {
- return scale;
- }
-
- /**
- * For struct types, get the list of field names.
- * @return the list of field names.
- */
- public List<String> getFieldNames() {
- return Collections.unmodifiableList(fieldNames);
- }
-
- /**
- * Get the subtypes of this type.
- * @return the list of children types
- */
- public List<TypeDescription> getChildren() {
- return children == null ? null : Collections.unmodifiableList(children);
- }
-
- /**
- * Assign ids to all of the nodes under this one.
- * @param startId the lowest id to assign
- * @return the next available id
- */
- private int assignIds(int startId) {
- id = startId++;
- if (children != null) {
- for (TypeDescription child : children) {
- startId = child.assignIds(startId);
- }
- }
- maxId = startId - 1;
- return startId;
- }
-
- private TypeDescription(Category category) {
- this.category = category;
- if (category.isPrimitive) {
- children = null;
- } else {
- children = new ArrayList<>();
- }
- if (category == Category.STRUCT) {
- fieldNames = new ArrayList<>();
- } else {
- fieldNames = null;
- }
- }
-
- private int id = -1;
- private int maxId = -1;
- private TypeDescription parent;
- private final Category category;
- private final List<TypeDescription> children;
- private final List<String> fieldNames;
- private int maxLength = DEFAULT_LENGTH;
- private int precision = DEFAULT_PRECISION;
- private int scale = DEFAULT_SCALE;
-
- public void printToBuffer(StringBuilder buffer) {
- buffer.append(category.name);
- switch (category) {
- case DECIMAL:
- buffer.append('(');
- buffer.append(precision);
- buffer.append(',');
- buffer.append(scale);
- buffer.append(')');
- break;
- case CHAR:
- case VARCHAR:
- buffer.append('(');
- buffer.append(maxLength);
- buffer.append(')');
- break;
- case LIST:
- case MAP:
- case UNION:
- buffer.append('<');
- for(int i=0; i < children.size(); ++i) {
- if (i != 0) {
- buffer.append(',');
- }
- children.get(i).printToBuffer(buffer);
- }
- buffer.append('>');
- break;
- case STRUCT:
- buffer.append('<');
- for(int i=0; i < children.size(); ++i) {
- if (i != 0) {
- buffer.append(',');
- }
- buffer.append(fieldNames.get(i));
- buffer.append(':');
- children.get(i).printToBuffer(buffer);
- }
- buffer.append('>');
- break;
- default:
- break;
- }
- }
-
- public String toString() {
- StringBuilder buffer = new StringBuilder();
- printToBuffer(buffer);
- return buffer.toString();
- }
-
- private void printJsonToBuffer(String prefix, StringBuilder buffer,
- int indent) {
- for(int i=0; i < indent; ++i) {
- buffer.append(' ');
- }
- buffer.append(prefix);
- buffer.append("{\"category\": \"");
- buffer.append(category.name);
- buffer.append("\", \"id\": ");
- buffer.append(getId());
- buffer.append(", \"max\": ");
- buffer.append(maxId);
- switch (category) {
- case DECIMAL:
- buffer.append(", \"precision\": ");
- buffer.append(precision);
- buffer.append(", \"scale\": ");
- buffer.append(scale);
- break;
- case CHAR:
- case VARCHAR:
- buffer.append(", \"length\": ");
- buffer.append(maxLength);
- break;
- case LIST:
- case MAP:
- case UNION:
- buffer.append(", \"children\": [");
- for(int i=0; i < children.size(); ++i) {
- buffer.append('\n');
- children.get(i).printJsonToBuffer("", buffer, indent + 2);
- if (i != children.size() - 1) {
- buffer.append(',');
- }
- }
- buffer.append("]");
- break;
- case STRUCT:
- buffer.append(", \"fields\": [");
- for(int i=0; i < children.size(); ++i) {
- buffer.append('\n');
- children.get(i).printJsonToBuffer("\"" + fieldNames.get(i) + "\": ",
- buffer, indent + 2);
- if (i != children.size() - 1) {
- buffer.append(',');
- }
- }
- buffer.append(']');
- break;
- default:
- break;
- }
- buffer.append('}');
- }
-
- public String toJson() {
- StringBuilder buffer = new StringBuilder();
- printJsonToBuffer("", buffer, 0);
- return buffer.toString();
- }
-
- /**
- * Locate a subtype by its id.
- * @param goal the column id to look for
- * @return the subtype
- */
- public TypeDescription findSubtype(int goal) {
- // call getId method to make sure the ids are assigned
- int id = getId();
- if (goal < id || goal > maxId) {
- throw new IllegalArgumentException("Unknown type id " + id + " in " +
- toJson());
- }
- if (goal == id) {
- return this;
- } else {
- TypeDescription prev = null;
- for(TypeDescription next: children) {
- if (next.id > goal) {
- return prev.findSubtype(goal);
- }
- prev = next;
- }
- return prev.findSubtype(goal);
- }
- }}
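
For reference, the schema class removed here (and re-added under org.apache.hive.orc as part of this move) is normally driven either through the create*/addField builders or through fromString(), and createRowBatch() produces a VectorizedRowBatch with one column vector per field. A minimal sketch using an invented example schema:

    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.TypeDescription;

    public class TypeDescriptionSketch {
      public static void main(String[] args) {
        // Builder style: struct<id:bigint,name:varchar(64)>
        TypeDescription schema = TypeDescription.createStruct()
            .addField("id", TypeDescription.createLong())
            .addField("name", TypeDescription.createVarchar().withMaxLength(64));
        System.out.println(schema);                  // struct<id:bigint,name:varchar(64)>

        // String style: fromString() is the inverse of toString().
        TypeDescription parsed =
            TypeDescription.fromString("struct<id:bigint,name:varchar(64)>");
        System.out.println(parsed.equals(schema));   // true

        // One column vector per struct field, sized for the default batch size.
        VectorizedRowBatch batch = schema.createRowBatch();
        System.out.println(batch.cols.length);       // 2
      }
    }
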
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/Writer.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/Writer.java b/orc/src/java/org/apache/orc/Writer.java
deleted file mode 100644
index 4492062..0000000
--- a/orc/src/java/org/apache/orc/Writer.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import org.apache.orc.OrcProto;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.TypeDescription;
-
-/**
- * The interface for writing ORC files.
- */
-public interface Writer {
-
- /**
- * Get the schema for this writer
- * @return the file schema
- */
- TypeDescription getSchema();
-
- /**
- * Add arbitrary meta-data to the ORC file. This may be called at any point
- * until the Writer is closed. If the same key is passed a second time, the
- * second value will replace the first.
- * @param key a key to label the data with.
- * @param value the contents of the metadata.
- */
- void addUserMetadata(String key, ByteBuffer value);
-
- /**
- * Add a row batch to the ORC file.
- * @param batch the rows to add
- */
- void addRowBatch(VectorizedRowBatch batch) throws IOException;
-
- /**
- * Flush all of the buffers and close the file. No methods on this writer
- * should be called afterwards.
- * @throws IOException
- */
- void close() throws IOException;
-
- /**
- * Return the deserialized data size. Raw data size will be computed when
- * writing the file footer. Hence the raw data size value will be available
- * only after closing the writer.
- *
- * @return raw data size
- */
- long getRawDataSize();
-
- /**
- * Return the number of rows in the file. The row count gets updated when
- * flushing the stripes. To get an accurate row count this method should be
- * called after closing the writer.
- *
- * @return row count
- */
- long getNumberOfRows();
-
- /**
- * Write an intermediate footer on the file such that if the file is
- * truncated to the returned offset, it would be a valid ORC file.
- * @return the offset that would be a valid end location for an ORC file
- */
- long writeIntermediateFooter() throws IOException;
-
- /**
- * Fast stripe append to an ORC file. This interface is used for fast merging
- * of ORC files. When merging, the file to be merged should pass each stripe
- * in binary form along with its stripe information and stripe statistics.
- * After appending the last stripe of a file, use appendUserMetadata() to
- * append any user metadata.
- * @param stripe - stripe as byte array
- * @param offset - offset within byte array
- * @param length - length of stripe within byte array
- * @param stripeInfo - stripe information
- * @param stripeStatistics - stripe statistics (Protobuf objects can be
- * merged directly)
- * @throws IOException
- */
- public void appendStripe(byte[] stripe, int offset, int length,
- StripeInformation stripeInfo,
- OrcProto.StripeStatistics stripeStatistics) throws IOException;
-
- /**
- * When fast stripe append is used for merging ORC stripes, after appending
- * the last stripe from a file, this interface must be used to merge any
- * user metadata.
- * @param userMetadata - user metadata
- */
- public void appendUserMetadata(List<OrcProto.UserMetadataItem> userMetadata);
-}
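
A sketch of how the Writer interface above is driven once a concrete writer has been obtained (typically from the OrcFile factory, which is outside this diff): rows are staged in a VectorizedRowBatch built from the writer's schema and handed over batch by batch. The struct<x:bigint> schema and the values are invented for the example.

    import java.io.IOException;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.Writer;

    public class WriterSketch {
      // Assumes 'writer' was created elsewhere with a struct<x:bigint> schema.
      static void writeRows(Writer writer, long[] values) throws IOException {
        VectorizedRowBatch batch = writer.getSchema().createRowBatch();
        LongColumnVector x = (LongColumnVector) batch.cols[0];
        for (long v : values) {
          x.vector[batch.size++] = v;
          if (batch.size == batch.getMaxSize()) {    // flush a full batch
            writer.addRowBatch(batch);
            batch.reset();
          }
        }
        if (batch.size > 0) {                        // flush the remainder
          writer.addRowBatch(batch);
        }
        writer.close();                              // writes the footer; no further calls allowed
      }
    }
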
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/AcidStats.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/AcidStats.java b/orc/src/java/org/apache/orc/impl/AcidStats.java
deleted file mode 100644
index 6657fe9..0000000
--- a/orc/src/java/org/apache/orc/impl/AcidStats.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-/**
- * Statistics about the ACID operations in an ORC file
- */
-public class AcidStats {
- public long inserts;
- public long updates;
- public long deletes;
-
- public AcidStats() {
- inserts = 0;
- updates = 0;
- deletes = 0;
- }
-
- public AcidStats(String serialized) {
- String[] parts = serialized.split(",");
- inserts = Long.parseLong(parts[0]);
- updates = Long.parseLong(parts[1]);
- deletes = Long.parseLong(parts[2]);
- }
-
- public String serialize() {
- StringBuilder builder = new StringBuilder();
- builder.append(inserts);
- builder.append(",");
- builder.append(updates);
- builder.append(",");
- builder.append(deletes);
- return builder.toString();
- }
-
- @Override
- public String toString() {
- StringBuilder builder = new StringBuilder();
- builder.append(" inserts: ").append(inserts);
- builder.append(" updates: ").append(updates);
- builder.append(" deletes: ").append(deletes);
- return builder.toString();
- }
-}
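
The AcidStats class removed here round-trips through a simple comma-separated form ("inserts,updates,deletes"), which is exactly what the String constructor parses back. A tiny sketch with invented counts:

    import org.apache.orc.impl.AcidStats;

    public class AcidStatsSketch {
      public static void main(String[] args) {
        AcidStats stats = new AcidStats();
        stats.inserts = 100;
        stats.updates = 5;
        stats.deletes = 2;

        String serialized = stats.serialize();        // "100,5,2"
        AcidStats parsed = new AcidStats(serialized); // parses the three counters back
        System.out.println(parsed);                   // " inserts: 100 updates: 5 deletes: 2"
      }
    }
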
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/BitFieldReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/BitFieldReader.java b/orc/src/java/org/apache/orc/impl/BitFieldReader.java
deleted file mode 100644
index dda7355..0000000
--- a/orc/src/java/org/apache/orc/impl/BitFieldReader.java
+++ /dev/null
@@ -1,217 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.EOFException;
-import java.io.IOException;
-
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.orc.impl.InStream;
-import org.apache.orc.impl.PositionProvider;
-import org.apache.orc.impl.RunLengthByteReader;
-
-public class BitFieldReader {
- private final RunLengthByteReader input;
- /** The number of bits in one item. Non-test code always uses 1. */
- private final int bitSize;
- private int current;
- private int bitsLeft;
- private final int mask;
-
- public BitFieldReader(InStream input,
- int bitSize) throws IOException {
- this.input = new RunLengthByteReader(input);
- this.bitSize = bitSize;
- mask = (1 << bitSize) - 1;
- }
-
- public void setInStream(InStream inStream) {
- this.input.setInStream(inStream);
- }
-
- private void readByte() throws IOException {
- if (input.hasNext()) {
- current = 0xff & input.next();
- bitsLeft = 8;
- } else {
- throw new EOFException("Read past end of bit field from " + this);
- }
- }
-
- public int next() throws IOException {
- int result = 0;
- int bitsLeftToRead = bitSize;
- while (bitsLeftToRead > bitsLeft) {
- result <<= bitsLeft;
- result |= current & ((1 << bitsLeft) - 1);
- bitsLeftToRead -= bitsLeft;
- readByte();
- }
- if (bitsLeftToRead > 0) {
- result <<= bitsLeftToRead;
- bitsLeft -= bitsLeftToRead;
- result |= (current >>> bitsLeft) & ((1 << bitsLeftToRead) - 1);
- }
- return result & mask;
- }
-
- /**
- * Unlike integer readers, where runs are encoded explicitly, in this one we have to read ahead
- * to figure out whether we have a run. Given that runs in booleans are likely, it's worth it.
- * However, it means we'd need to keep track of how many bytes we read, and next/nextVector won't
- * work anymore once this is called. This is trivial to fix, but these are never interspersed.
- */
- private boolean lastRunValue;
- private int lastRunLength = -1;
- private void readNextRun(int maxRunLength) throws IOException {
- assert bitSize == 1;
- if (lastRunLength > 0) return; // last run is not exhausted yet
- if (bitsLeft == 0) {
- readByte();
- }
- // First take care of the partial bits.
- boolean hasVal = false;
- int runLength = 0;
- if (bitsLeft != 8) {
- int partialBitsMask = (1 << bitsLeft) - 1;
- int partialBits = current & partialBitsMask;
- if (partialBits == partialBitsMask || partialBits == 0) {
- lastRunValue = (partialBits == partialBitsMask);
- if (maxRunLength <= bitsLeft) {
- lastRunLength = maxRunLength;
- return;
- }
- maxRunLength -= bitsLeft;
- hasVal = true;
- runLength = bitsLeft;
- bitsLeft = 0;
- } else {
- // There's no run in partial bits. Return whatever we have.
- int prefixBitsCount = 32 - bitsLeft;
- runLength = Integer.numberOfLeadingZeros(partialBits) - prefixBitsCount;
- lastRunValue = (runLength > 0);
- lastRunLength = Math.min(maxRunLength, lastRunValue ? runLength :
- (Integer.numberOfLeadingZeros(~(partialBits | ~partialBitsMask)) - prefixBitsCount));
- return;
- }
- assert bitsLeft == 0;
- readByte();
- }
- if (!hasVal) {
- lastRunValue = ((current >> 7) == 1);
- hasVal = true;
- }
- // Read full bytes until the run ends.
- assert bitsLeft == 8;
- while (maxRunLength >= 8
- && ((lastRunValue && (current == 0xff)) || (!lastRunValue && (current == 0)))) {
- runLength += 8;
- maxRunLength -= 8;
- readByte();
- }
- if (maxRunLength > 0) {
- int extraBits = Integer.numberOfLeadingZeros(
- lastRunValue ? (~(current | ~255)) : current) - 24;
- bitsLeft -= extraBits;
- runLength += extraBits;
- }
- lastRunLength = runLength;
- }
-
- public void nextVector(LongColumnVector previous,
- long previousLen) throws IOException {
- previous.isRepeating = true;
- for (int i = 0; i < previousLen; i++) {
- if (previous.noNulls || !previous.isNull[i]) {
- previous.vector[i] = next();
- } else {
- // The placeholder value for a null int in vectorized
- // processing is 1, so store that when the value is null.
- previous.vector[i] = 1;
- }
-
- // Because the null placeholder for int types is 1 and a genuine non-null
- // value can also be 1, we must check isNull as well when determining
- // the isRepeating flag.
- if (previous.isRepeating
- && i > 0
- && ((previous.vector[0] != previous.vector[i]) ||
- (previous.isNull[0] != previous.isNull[i]))) {
- previous.isRepeating = false;
- }
- }
- }
-
- public void seek(PositionProvider index) throws IOException {
- input.seek(index);
- int consumed = (int) index.getNext();
- if (consumed > 8) {
- throw new IllegalArgumentException("Seek past end of byte at " +
- consumed + " in " + input);
- } else if (consumed != 0) {
- readByte();
- bitsLeft = 8 - consumed;
- } else {
- bitsLeft = 0;
- }
- }
-
- public void skip(long items) throws IOException {
- long totalBits = bitSize * items;
- if (bitsLeft >= totalBits) {
- bitsLeft -= totalBits;
- } else {
- totalBits -= bitsLeft;
- input.skip(totalBits / 8);
- current = input.next();
- bitsLeft = (int) (8 - (totalBits % 8));
- }
- }
-
- @Override
- public String toString() {
- return "bit reader current: " + current + " bits left: " + bitsLeft +
- " bit size: " + bitSize + " from " + input;
- }
-
- boolean hasFullByte() {
- return bitsLeft == 8 || bitsLeft == 0;
- }
-
- int peekOneBit() throws IOException {
- assert bitSize == 1;
- if (bitsLeft == 0) {
- readByte();
- }
- return (current >>> (bitsLeft - 1)) & 1;
- }
-
- int peekFullByte() throws IOException {
- assert bitSize == 1;
- assert bitsLeft == 8 || bitsLeft == 0;
- if (bitsLeft == 0) {
- readByte();
- }
- return current;
- }
-
- void skipInCurrentByte(int bits) throws IOException {
- assert bitsLeft >= bits;
- bitsLeft -= bits;
- }
-}
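For orientation, here is a minimal round-trip sketch of the reader together
with its writer counterpart. It is a sketch only: it assumes the OutStream,
InStream.create, and DynamicByteArray signatures that appear elsewhere in this
patch (see TestBitFieldReader further down), and the stream name "sketch" and
inline receiver are illustrative.

final DynamicByteArray bytes = new DynamicByteArray();
OutStream.OutputReceiver receiver = new OutStream.OutputReceiver() {
  @Override
  public void output(ByteBuffer buffer) throws IOException {
    // gather every flushed buffer into one byte array
    bytes.add(buffer.array(), buffer.arrayOffset() + buffer.position(),
        buffer.remaining());
  }
};
BitFieldWriter writer =
    new BitFieldWriter(new OutStream("sketch", 500, null, receiver), 1);
for (int i = 0; i < 100; ++i) {
  writer.write(i & 1);                          // one bit per value
}
writer.flush();
ByteBuffer buf = ByteBuffer.allocate(bytes.size());
bytes.setByteBuffer(buf, 0, bytes.size());
buf.flip();
BitFieldReader reader = new BitFieldReader(
    InStream.create("sketch", new ByteBuffer[]{buf}, new long[]{0},
        buf.remaining(), null, 500), 1);
for (int i = 0; i < 100; ++i) {
  assert reader.next() == (i & 1);              // values come back in order
}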
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/BitFieldWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/BitFieldWriter.java b/orc/src/java/org/apache/orc/impl/BitFieldWriter.java
deleted file mode 100644
index aa5f886..0000000
--- a/orc/src/java/org/apache/orc/impl/BitFieldWriter.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import org.apache.orc.impl.PositionRecorder;
-import org.apache.orc.impl.PositionedOutputStream;
-import org.apache.orc.impl.RunLengthByteWriter;
-
-import java.io.IOException;
-
-public class BitFieldWriter {
- private RunLengthByteWriter output;
- private final int bitSize;
- private byte current = 0;
- private int bitsLeft = 8;
-
- public BitFieldWriter(PositionedOutputStream output,
- int bitSize) throws IOException {
- this.output = new RunLengthByteWriter(output);
- this.bitSize = bitSize;
- }
-
- private void writeByte() throws IOException {
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
-
- public void flush() throws IOException {
- if (bitsLeft != 8) {
- writeByte();
- }
- output.flush();
- }
-
- public void write(int value) throws IOException {
- int bitsToWrite = bitSize;
- while (bitsToWrite > bitsLeft) {
- // add the bits to the bottom of the current word
- current |= value >>> (bitsToWrite - bitsLeft);
- // subtract out the bits we just added
- bitsToWrite -= bitsLeft;
- // zero out the bits above bitsToWrite
- value &= (1 << bitsToWrite) - 1;
- writeByte();
- }
- bitsLeft -= bitsToWrite;
- current |= value << bitsLeft;
- if (bitsLeft == 0) {
- writeByte();
- }
- }
-
- public void getPosition(PositionRecorder recorder) throws IOException {
- output.getPosition(recorder);
- recorder.addPosition(8 - bitsLeft);
- }
-}
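To make the bit arithmetic in write() concrete, here is a hand-worked trace
(a sketch, not part of the patch) for bitSize = 3, starting from a fresh
writer with current = 0 and bitsLeft = 8:

// write(5 = 0b101): bitsLeft 8 -> 5, current = 0b101_00000 (0xA0)
// write(6 = 0b110): bitsLeft 5 -> 2, current = 0b101_110_00 (0xB8)
// write(7 = 0b111): bitsToWrite 3 > bitsLeft 2, so the top two bits (0b11)
//   complete the byte: 0b101_110_11 (0xBB) is emitted via writeByte(); the
//   remaining low bit starts the next byte, leaving current = 0x80,
//   bitsLeft = 7.
// getPosition() records 8 - bitsLeft, which is how a reader can later seek
//   back into the middle of a byte.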
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/BufferChunk.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/BufferChunk.java b/orc/src/java/org/apache/orc/impl/BufferChunk.java
deleted file mode 100644
index da43b96..0000000
--- a/orc/src/java/org/apache/orc/impl/BufferChunk.java
+++ /dev/null
@@ -1,85 +0,0 @@
-package org.apache.orc.impl;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.hadoop.hive.common.io.DiskRange;
-import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.nio.ByteBuffer;
-
-/**
- * The sections of a stripe that we have read.
- * This might not match diskRange: one disk range can map to multiple buffer chunks,
- * depending on DFS block boundaries.
- */
-public class BufferChunk extends DiskRangeList {
-
- private static final Logger LOG =
- LoggerFactory.getLogger(BufferChunk.class);
- final ByteBuffer chunk;
-
- public BufferChunk(ByteBuffer chunk, long offset) {
- super(offset, offset + chunk.remaining());
- this.chunk = chunk;
- }
-
- public ByteBuffer getChunk() {
- return chunk;
- }
-
- @Override
- public boolean hasData() {
- return chunk != null;
- }
-
- @Override
- public final String toString() {
- boolean makesSense = chunk.remaining() == (end - offset);
- return "data range [" + offset + ", " + end + "), size: " + chunk.remaining()
- + (makesSense ? "" : "(!)") + " type: " +
- (chunk.isDirect() ? "direct" : "array-backed");
- }
-
- @Override
- public DiskRange sliceAndShift(long offset, long end, long shiftBy) {
- assert offset <= end && offset >= this.offset && end <= this.end;
- assert offset + shiftBy >= 0;
- ByteBuffer sliceBuf = chunk.slice();
- int newPos = (int) (offset - this.offset);
- int newLimit = newPos + (int) (end - offset);
- try {
- sliceBuf.position(newPos);
- sliceBuf.limit(newLimit);
- } catch (Throwable t) {
- LOG.error("Failed to slice buffer chunk with range" + " [" + this.offset + ", " + this.end
- + "), position: " + chunk.position() + " limit: " + chunk.limit() + ", "
- + (chunk.isDirect() ? "direct" : "array") + "; to [" + offset + ", " + end + ") "
- + t.getClass());
- throw new RuntimeException(t);
- }
- return new BufferChunk(sliceBuf, offset + shiftBy);
- }
-
- @Override
- public ByteBuffer getData() {
- return chunk;
- }
-}
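A small usage sketch of sliceAndShift (the buffer contents and offsets are
illustrative): a chunk covering file range [100, 110) is narrowed to
[102, 106) and rebased to start at offset 2.

ByteBuffer data = ByteBuffer.allocate(10);       // ten bytes of stripe data
BufferChunk chunk = new BufferChunk(data, 100);  // covers [100, 110)
DiskRange slice = chunk.sliceAndShift(102, 106, -100);
// slice now reports range [2, 6) and shares the backing storage of "data";
// no bytes are copied, only position and limit are set on a slice.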
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/orc/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
deleted file mode 100644
index 76a466d..0000000
--- a/orc/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ /dev/null
@@ -1,1101 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.sql.Date;
-import java.sql.Timestamp;
-
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparator;
-import org.apache.orc.BinaryColumnStatistics;
-import org.apache.orc.BooleanColumnStatistics;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.DateColumnStatistics;
-import org.apache.orc.DecimalColumnStatistics;
-import org.apache.orc.DoubleColumnStatistics;
-import org.apache.orc.IntegerColumnStatistics;
-import org.apache.orc.OrcProto;
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.TimestampColumnStatistics;
-import org.apache.orc.TypeDescription;
-
-public class ColumnStatisticsImpl implements ColumnStatistics {
-
- private static final class BooleanStatisticsImpl extends ColumnStatisticsImpl
- implements BooleanColumnStatistics {
- private long trueCount = 0;
-
- BooleanStatisticsImpl(OrcProto.ColumnStatistics stats) {
- super(stats);
- OrcProto.BucketStatistics bkt = stats.getBucketStatistics();
- trueCount = bkt.getCount(0);
- }
-
- BooleanStatisticsImpl() {
- }
-
- @Override
- public void reset() {
- super.reset();
- trueCount = 0;
- }
-
- @Override
- public void updateBoolean(boolean value, int repetitions) {
- if (value) {
- trueCount += repetitions;
- }
- }
-
- @Override
- public void merge(ColumnStatisticsImpl other) {
- if (other instanceof BooleanStatisticsImpl) {
- BooleanStatisticsImpl bkt = (BooleanStatisticsImpl) other;
- trueCount += bkt.trueCount;
- } else {
- if (isStatsExists() && trueCount != 0) {
- throw new IllegalArgumentException("Incompatible merging of boolean column statistics");
- }
- }
- super.merge(other);
- }
-
- @Override
- public OrcProto.ColumnStatistics.Builder serialize() {
- OrcProto.ColumnStatistics.Builder builder = super.serialize();
- OrcProto.BucketStatistics.Builder bucket =
- OrcProto.BucketStatistics.newBuilder();
- bucket.addCount(trueCount);
- builder.setBucketStatistics(bucket);
- return builder;
- }
-
- @Override
- public long getFalseCount() {
- return getNumberOfValues() - trueCount;
- }
-
- @Override
- public long getTrueCount() {
- return trueCount;
- }
-
- @Override
- public String toString() {
- return super.toString() + " true: " + trueCount;
- }
- }
-
- private static final class IntegerStatisticsImpl extends ColumnStatisticsImpl
- implements IntegerColumnStatistics {
-
- private long minimum = Long.MAX_VALUE;
- private long maximum = Long.MIN_VALUE;
- private long sum = 0;
- private boolean hasMinimum = false;
- private boolean overflow = false;
-
- IntegerStatisticsImpl() {
- }
-
- IntegerStatisticsImpl(OrcProto.ColumnStatistics stats) {
- super(stats);
- OrcProto.IntegerStatistics intStat = stats.getIntStatistics();
- if (intStat.hasMinimum()) {
- hasMinimum = true;
- minimum = intStat.getMinimum();
- }
- if (intStat.hasMaximum()) {
- maximum = intStat.getMaximum();
- }
- if (intStat.hasSum()) {
- sum = intStat.getSum();
- } else {
- overflow = true;
- }
- }
-
- @Override
- public void reset() {
- super.reset();
- hasMinimum = false;
- minimum = Long.MAX_VALUE;
- maximum = Long.MIN_VALUE;
- sum = 0;
- overflow = false;
- }
-
- @Override
- public void updateInteger(long value, int repetitions) {
- if (!hasMinimum) {
- hasMinimum = true;
- minimum = value;
- maximum = value;
- } else if (value < minimum) {
- minimum = value;
- } else if (value > maximum) {
- maximum = value;
- }
- if (!overflow) {
- boolean wasPositive = sum >= 0;
- sum += value * repetitions;
- if ((value >= 0) == wasPositive) {
- overflow = (sum >= 0) != wasPositive;
- }
- }
- }
-
- @Override
- public void merge(ColumnStatisticsImpl other) {
- if (other instanceof IntegerStatisticsImpl) {
- IntegerStatisticsImpl otherInt = (IntegerStatisticsImpl) other;
- if (!hasMinimum) {
- hasMinimum = otherInt.hasMinimum;
- minimum = otherInt.minimum;
- maximum = otherInt.maximum;
- } else if (otherInt.hasMinimum) {
- if (otherInt.minimum < minimum) {
- minimum = otherInt.minimum;
- }
- if (otherInt.maximum > maximum) {
- maximum = otherInt.maximum;
- }
- }
-
- overflow |= otherInt.overflow;
- if (!overflow) {
- boolean wasPositive = sum >= 0;
- sum += otherInt.sum;
- if ((otherInt.sum >= 0) == wasPositive) {
- overflow = (sum >= 0) != wasPositive;
- }
- }
- } else {
- if (isStatsExists() && hasMinimum) {
- throw new IllegalArgumentException("Incompatible merging of integer column statistics");
- }
- }
- super.merge(other);
- }
-
- @Override
- public OrcProto.ColumnStatistics.Builder serialize() {
- OrcProto.ColumnStatistics.Builder builder = super.serialize();
- OrcProto.IntegerStatistics.Builder intb =
- OrcProto.IntegerStatistics.newBuilder();
- if (hasMinimum) {
- intb.setMinimum(minimum);
- intb.setMaximum(maximum);
- }
- if (!overflow) {
- intb.setSum(sum);
- }
- builder.setIntStatistics(intb);
- return builder;
- }
-
- @Override
- public long getMinimum() {
- return minimum;
- }
-
- @Override
- public long getMaximum() {
- return maximum;
- }
-
- @Override
- public boolean isSumDefined() {
- return !overflow;
- }
-
- @Override
- public long getSum() {
- return sum;
- }
-
- @Override
- public String toString() {
- StringBuilder buf = new StringBuilder(super.toString());
- if (hasMinimum) {
- buf.append(" min: ");
- buf.append(minimum);
- buf.append(" max: ");
- buf.append(maximum);
- }
- if (!overflow) {
- buf.append(" sum: ");
- buf.append(sum);
- }
- return buf.toString();
- }
- }
-
- private static final class DoubleStatisticsImpl extends ColumnStatisticsImpl
- implements DoubleColumnStatistics {
- private boolean hasMinimum = false;
- private double minimum = Double.MAX_VALUE;
- private double maximum = Double.MIN_VALUE;
- private double sum = 0;
-
- DoubleStatisticsImpl() {
- }
-
- DoubleStatisticsImpl(OrcProto.ColumnStatistics stats) {
- super(stats);
- OrcProto.DoubleStatistics dbl = stats.getDoubleStatistics();
- if (dbl.hasMinimum()) {
- hasMinimum = true;
- minimum = dbl.getMinimum();
- }
- if (dbl.hasMaximum()) {
- maximum = dbl.getMaximum();
- }
- if (dbl.hasSum()) {
- sum = dbl.getSum();
- }
- }
-
- @Override
- public void reset() {
- super.reset();
- hasMinimum = false;
- minimum = Double.MAX_VALUE;
- maximum = Double.MIN_VALUE;
- sum = 0;
- }
-
- @Override
- public void updateDouble(double value) {
- if (!hasMinimum) {
- hasMinimum = true;
- minimum = value;
- maximum = value;
- } else if (value < minimum) {
- minimum = value;
- } else if (value > maximum) {
- maximum = value;
- }
- sum += value;
- }
-
- @Override
- public void merge(ColumnStatisticsImpl other) {
- if (other instanceof DoubleStatisticsImpl) {
- DoubleStatisticsImpl dbl = (DoubleStatisticsImpl) other;
- if (!hasMinimum) {
- hasMinimum = dbl.hasMinimum;
- minimum = dbl.minimum;
- maximum = dbl.maximum;
- } else if (dbl.hasMinimum) {
- if (dbl.minimum < minimum) {
- minimum = dbl.minimum;
- }
- if (dbl.maximum > maximum) {
- maximum = dbl.maximum;
- }
- }
- sum += dbl.sum;
- } else {
- if (isStatsExists() && hasMinimum) {
- throw new IllegalArgumentException("Incompatible merging of double column statistics");
- }
- }
- super.merge(other);
- }
-
- @Override
- public OrcProto.ColumnStatistics.Builder serialize() {
- OrcProto.ColumnStatistics.Builder builder = super.serialize();
- OrcProto.DoubleStatistics.Builder dbl =
- OrcProto.DoubleStatistics.newBuilder();
- if (hasMinimum) {
- dbl.setMinimum(minimum);
- dbl.setMaximum(maximum);
- }
- dbl.setSum(sum);
- builder.setDoubleStatistics(dbl);
- return builder;
- }
-
- @Override
- public double getMinimum() {
- return minimum;
- }
-
- @Override
- public double getMaximum() {
- return maximum;
- }
-
- @Override
- public double getSum() {
- return sum;
- }
-
- @Override
- public String toString() {
- StringBuilder buf = new StringBuilder(super.toString());
- if (hasMinimum) {
- buf.append(" min: ");
- buf.append(minimum);
- buf.append(" max: ");
- buf.append(maximum);
- }
- buf.append(" sum: ");
- buf.append(sum);
- return buf.toString();
- }
- }
-
- protected static final class StringStatisticsImpl extends ColumnStatisticsImpl
- implements StringColumnStatistics {
- private Text minimum = null;
- private Text maximum = null;
- private long sum = 0;
-
- StringStatisticsImpl() {
- }
-
- StringStatisticsImpl(OrcProto.ColumnStatistics stats) {
- super(stats);
- OrcProto.StringStatistics str = stats.getStringStatistics();
- if (str.hasMaximum()) {
- maximum = new Text(str.getMaximum());
- }
- if (str.hasMinimum()) {
- minimum = new Text(str.getMinimum());
- }
- if(str.hasSum()) {
- sum = str.getSum();
- }
- }
-
- @Override
- public void reset() {
- super.reset();
- minimum = null;
- maximum = null;
- sum = 0;
- }
-
- @Override
- public void updateString(Text value) {
- if (minimum == null) {
- maximum = minimum = new Text(value);
- } else if (minimum.compareTo(value) > 0) {
- minimum = new Text(value);
- } else if (maximum.compareTo(value) < 0) {
- maximum = new Text(value);
- }
- sum += value.getLength();
- }
-
- @Override
- public void updateString(byte[] bytes, int offset, int length,
- int repetitions) {
- if (minimum == null) {
- maximum = minimum = new Text();
- maximum.set(bytes, offset, length);
- } else if (WritableComparator.compareBytes(minimum.getBytes(), 0,
- minimum.getLength(), bytes, offset, length) > 0) {
- minimum = new Text();
- minimum.set(bytes, offset, length);
- } else if (WritableComparator.compareBytes(maximum.getBytes(), 0,
- maximum.getLength(), bytes, offset, length) < 0) {
- maximum = new Text();
- maximum.set(bytes, offset, length);
- }
- sum += (long) length * repetitions; // avoid int overflow
- }
-
- @Override
- public void merge(ColumnStatisticsImpl other) {
- if (other instanceof StringStatisticsImpl) {
- StringStatisticsImpl str = (StringStatisticsImpl) other;
- if (minimum == null) {
- if (str.minimum != null) {
- maximum = new Text(str.getMaximum());
- minimum = new Text(str.getMinimum());
- } else {
- /* both are empty */
- maximum = minimum = null;
- }
- } else if (str.minimum != null) {
- if (minimum.compareTo(str.minimum) > 0) {
- minimum = new Text(str.getMinimum());
- }
- if (maximum.compareTo(str.maximum) < 0) {
- maximum = new Text(str.getMaximum());
- }
- }
- sum += str.sum;
- } else {
- if (isStatsExists() && minimum != null) {
- throw new IllegalArgumentException("Incompatible merging of string column statistics");
- }
- }
- super.merge(other);
- }
-
- @Override
- public OrcProto.ColumnStatistics.Builder serialize() {
- OrcProto.ColumnStatistics.Builder result = super.serialize();
- OrcProto.StringStatistics.Builder str =
- OrcProto.StringStatistics.newBuilder();
- if (getNumberOfValues() != 0) {
- str.setMinimum(getMinimum());
- str.setMaximum(getMaximum());
- str.setSum(sum);
- }
- result.setStringStatistics(str);
- return result;
- }
-
- @Override
- public String getMinimum() {
- return minimum == null ? null : minimum.toString();
- }
-
- @Override
- public String getMaximum() {
- return maximum == null ? null : maximum.toString();
- }
-
- @Override
- public long getSum() {
- return sum;
- }
-
- @Override
- public String toString() {
- StringBuilder buf = new StringBuilder(super.toString());
- if (getNumberOfValues() != 0) {
- buf.append(" min: ");
- buf.append(getMinimum());
- buf.append(" max: ");
- buf.append(getMaximum());
- buf.append(" sum: ");
- buf.append(sum);
- }
- return buf.toString();
- }
- }
-
- protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements
- BinaryColumnStatistics {
-
- private long sum = 0;
-
- BinaryStatisticsImpl() {
- }
-
- BinaryStatisticsImpl(OrcProto.ColumnStatistics stats) {
- super(stats);
- OrcProto.BinaryStatistics binStats = stats.getBinaryStatistics();
- if (binStats.hasSum()) {
- sum = binStats.getSum();
- }
- }
-
- @Override
- public void reset() {
- super.reset();
- sum = 0;
- }
-
- @Override
- public void updateBinary(BytesWritable value) {
- sum += value.getLength();
- }
-
- @Override
- public void updateBinary(byte[] bytes, int offset, int length,
- int repetitions) {
- sum += (long) length * repetitions; // avoid int overflow
- }
-
- @Override
- public void merge(ColumnStatisticsImpl other) {
- if (other instanceof BinaryStatisticsImpl) {
- BinaryStatisticsImpl bin = (BinaryStatisticsImpl) other;
- sum += bin.sum;
- } else {
- if (isStatsExists() && sum != 0) {
- throw new IllegalArgumentException("Incompatible merging of binary column statistics");
- }
- }
- super.merge(other);
- }
-
- @Override
- public long getSum() {
- return sum;
- }
-
- @Override
- public OrcProto.ColumnStatistics.Builder serialize() {
- OrcProto.ColumnStatistics.Builder result = super.serialize();
- OrcProto.BinaryStatistics.Builder bin = OrcProto.BinaryStatistics.newBuilder();
- bin.setSum(sum);
- result.setBinaryStatistics(bin);
- return result;
- }
-
- @Override
- public String toString() {
- StringBuilder buf = new StringBuilder(super.toString());
- if (getNumberOfValues() != 0) {
- buf.append(" sum: ");
- buf.append(sum);
- }
- return buf.toString();
- }
- }
-
- private static final class DecimalStatisticsImpl extends ColumnStatisticsImpl
- implements DecimalColumnStatistics {
-
- // These objects are mutable for better performance.
- private HiveDecimalWritable minimum = null;
- private HiveDecimalWritable maximum = null;
- private HiveDecimalWritable sum = new HiveDecimalWritable(0);
-
- DecimalStatisticsImpl() {
- }
-
- DecimalStatisticsImpl(OrcProto.ColumnStatistics stats) {
- super(stats);
- OrcProto.DecimalStatistics dec = stats.getDecimalStatistics();
- if (dec.hasMaximum()) {
- maximum = new HiveDecimalWritable(dec.getMaximum());
- }
- if (dec.hasMinimum()) {
- minimum = new HiveDecimalWritable(dec.getMinimum());
- }
- if (dec.hasSum()) {
- sum = new HiveDecimalWritable(dec.getSum());
- } else {
- sum = null;
- }
- }
-
- @Override
- public void reset() {
- super.reset();
- minimum = null;
- maximum = null;
- sum = new HiveDecimalWritable(0);
- }
-
- @Override
- public void updateDecimal(HiveDecimalWritable value) {
- if (minimum == null) {
- minimum = new HiveDecimalWritable(value);
- maximum = new HiveDecimalWritable(value);
- } else if (minimum.compareTo(value) > 0) {
- minimum.set(value);
- } else if (maximum.compareTo(value) < 0) {
- maximum.set(value);
- }
- if (sum != null) {
- sum.mutateAdd(value);
- }
- }
-
- @Override
- public void merge(ColumnStatisticsImpl other) {
- if (other instanceof DecimalStatisticsImpl) {
- DecimalStatisticsImpl dec = (DecimalStatisticsImpl) other;
- if (minimum == null) {
- minimum = (dec.minimum != null ? new HiveDecimalWritable(dec.minimum) : null);
- maximum = (dec.maximum != null ? new HiveDecimalWritable(dec.maximum) : null);
- sum = dec.sum;
- } else if (dec.minimum != null) {
- if (minimum.compareTo(dec.minimum) > 0) {
- minimum.set(dec.minimum);
- }
- if (maximum.compareTo(dec.maximum) < 0) {
- maximum.set(dec.maximum);
- }
- if (sum == null || dec.sum == null) {
- sum = null;
- } else {
- sum.mutateAdd(dec.sum);
- }
- }
- } else {
- if (isStatsExists() && minimum != null) {
- throw new IllegalArgumentException("Incompatible merging of decimal column statistics");
- }
- }
- super.merge(other);
- }
-
- @Override
- public OrcProto.ColumnStatistics.Builder serialize() {
- OrcProto.ColumnStatistics.Builder result = super.serialize();
- OrcProto.DecimalStatistics.Builder dec =
- OrcProto.DecimalStatistics.newBuilder();
- if (getNumberOfValues() != 0 && minimum != null) {
- dec.setMinimum(minimum.toString());
- dec.setMaximum(maximum.toString());
- }
- // Check isSet for overflow.
- if (sum != null && sum.isSet()) {
- dec.setSum(sum.toString());
- }
- result.setDecimalStatistics(dec);
- return result;
- }
-
- @Override
- public HiveDecimal getMinimum() {
- return minimum == null ? null : minimum.getHiveDecimal();
- }
-
- @Override
- public HiveDecimal getMaximum() {
- return maximum == null ? null : maximum.getHiveDecimal();
- }
-
- @Override
- public HiveDecimal getSum() {
- return sum == null ? null : sum.getHiveDecimal();
- }
-
- @Override
- public String toString() {
- StringBuilder buf = new StringBuilder(super.toString());
- if (getNumberOfValues() != 0) {
- buf.append(" min: ");
- buf.append(minimum);
- buf.append(" max: ");
- buf.append(maximum);
- if (sum != null) {
- buf.append(" sum: ");
- buf.append(sum);
- }
- }
- return buf.toString();
- }
- }
-
- private static final class DateStatisticsImpl extends ColumnStatisticsImpl
- implements DateColumnStatistics {
- private Integer minimum = null;
- private Integer maximum = null;
-
- DateStatisticsImpl() {
- }
-
- DateStatisticsImpl(OrcProto.ColumnStatistics stats) {
- super(stats);
- OrcProto.DateStatistics dateStats = stats.getDateStatistics();
- // min,max values serialized/deserialized as int (days since epoch)
- if (dateStats.hasMaximum()) {
- maximum = dateStats.getMaximum();
- }
- if (dateStats.hasMinimum()) {
- minimum = dateStats.getMinimum();
- }
- }
-
- @Override
- public void reset() {
- super.reset();
- minimum = null;
- maximum = null;
- }
-
- @Override
- public void updateDate(DateWritable value) {
- if (minimum == null) {
- minimum = value.getDays();
- maximum = value.getDays();
- } else if (minimum > value.getDays()) {
- minimum = value.getDays();
- } else if (maximum < value.getDays()) {
- maximum = value.getDays();
- }
- }
-
- @Override
- public void updateDate(int value) {
- if (minimum == null) {
- minimum = value;
- maximum = value;
- } else if (minimum > value) {
- minimum = value;
- } else if (maximum < value) {
- maximum = value;
- }
- }
-
- @Override
- public void merge(ColumnStatisticsImpl other) {
- if (other instanceof DateStatisticsImpl) {
- DateStatisticsImpl dateStats = (DateStatisticsImpl) other;
- if (minimum == null) {
- minimum = dateStats.minimum;
- maximum = dateStats.maximum;
- } else if (dateStats.minimum != null) {
- if (minimum > dateStats.minimum) {
- minimum = dateStats.minimum;
- }
- if (maximum < dateStats.maximum) {
- maximum = dateStats.maximum;
- }
- }
- } else {
- if (isStatsExists() && minimum != null) {
- throw new IllegalArgumentException("Incompatible merging of date column statistics");
- }
- }
- super.merge(other);
- }
-
- @Override
- public OrcProto.ColumnStatistics.Builder serialize() {
- OrcProto.ColumnStatistics.Builder result = super.serialize();
- OrcProto.DateStatistics.Builder dateStats =
- OrcProto.DateStatistics.newBuilder();
- if (getNumberOfValues() != 0 && minimum != null) {
- dateStats.setMinimum(minimum);
- dateStats.setMaximum(maximum);
- }
- result.setDateStatistics(dateStats);
- return result;
- }
-
- private transient final DateWritable minDate = new DateWritable();
- private transient final DateWritable maxDate = new DateWritable();
-
- @Override
- public Date getMinimum() {
- if (minimum == null) {
- return null;
- }
- minDate.set(minimum);
- return minDate.get();
- }
-
- @Override
- public Date getMaximum() {
- if (maximum == null) {
- return null;
- }
- maxDate.set(maximum);
- return maxDate.get();
- }
-
- @Override
- public String toString() {
- StringBuilder buf = new StringBuilder(super.toString());
- if (getNumberOfValues() != 0) {
- buf.append(" min: ");
- buf.append(getMinimum());
- buf.append(" max: ");
- buf.append(getMaximum());
- }
- return buf.toString();
- }
- }
-
- private static final class TimestampStatisticsImpl extends ColumnStatisticsImpl
- implements TimestampColumnStatistics {
- private Long minimum = null;
- private Long maximum = null;
-
- TimestampStatisticsImpl() {
- }
-
- TimestampStatisticsImpl(OrcProto.ColumnStatistics stats) {
- super(stats);
- OrcProto.TimestampStatistics timestampStats = stats.getTimestampStatistics();
- // min,max values serialized/deserialized as long (milliseconds since epoch)
- if (timestampStats.hasMaximum()) {
- maximum = timestampStats.getMaximum();
- }
- if (timestampStats.hasMinimum()) {
- minimum = timestampStats.getMinimum();
- }
- }
-
- @Override
- public void reset() {
- super.reset();
- minimum = null;
- maximum = null;
- }
-
- @Override
- public void updateTimestamp(Timestamp value) {
- if (minimum == null) {
- minimum = value.getTime();
- maximum = value.getTime();
- } else if (minimum > value.getTime()) {
- minimum = value.getTime();
- } else if (maximum < value.getTime()) {
- maximum = value.getTime();
- }
- }
-
- @Override
- public void updateTimestamp(long value) {
- if (minimum == null) {
- minimum = value;
- maximum = value;
- } else if (minimum > value) {
- minimum = value;
- } else if (maximum < value) {
- maximum = value;
- }
- }
-
- @Override
- public void merge(ColumnStatisticsImpl other) {
- if (other instanceof TimestampStatisticsImpl) {
- TimestampStatisticsImpl timestampStats = (TimestampStatisticsImpl) other;
- if (minimum == null) {
- minimum = timestampStats.minimum;
- maximum = timestampStats.maximum;
- } else if (timestampStats.minimum != null) {
- if (minimum > timestampStats.minimum) {
- minimum = timestampStats.minimum;
- }
- if (maximum < timestampStats.maximum) {
- maximum = timestampStats.maximum;
- }
- }
- } else {
- if (isStatsExists() && minimum != null) {
- throw new IllegalArgumentException("Incompatible merging of timestamp column statistics");
- }
- }
- super.merge(other);
- }
-
- @Override
- public OrcProto.ColumnStatistics.Builder serialize() {
- OrcProto.ColumnStatistics.Builder result = super.serialize();
- OrcProto.TimestampStatistics.Builder timestampStats = OrcProto.TimestampStatistics
- .newBuilder();
- if (getNumberOfValues() != 0 && minimum != null) {
- timestampStats.setMinimum(minimum);
- timestampStats.setMaximum(maximum);
- }
- result.setTimestampStatistics(timestampStats);
- return result;
- }
-
- @Override
- public Timestamp getMinimum() {
- return minimum == null ? null : new Timestamp(minimum);
- }
-
- @Override
- public Timestamp getMaximum() {
- return maximum == null ? null : new Timestamp(maximum);
- }
-
- @Override
- public String toString() {
- StringBuilder buf = new StringBuilder(super.toString());
- if (getNumberOfValues() != 0) {
- buf.append(" min: ");
- buf.append(getMinimum());
- buf.append(" max: ");
- buf.append(getMaximum());
- }
- return buf.toString();
- }
- }
-
- private long count = 0;
- private boolean hasNull = false;
-
- ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) {
- if (stats.hasNumberOfValues()) {
- count = stats.getNumberOfValues();
- }
-
- if (stats.hasHasNull()) {
- hasNull = stats.getHasNull();
- } else {
- hasNull = true;
- }
- }
-
- ColumnStatisticsImpl() {
- }
-
- public void increment() {
- count += 1;
- }
-
- public void increment(int count) {
- this.count += count;
- }
-
- public void setNull() {
- hasNull = true;
- }
-
- public void updateBoolean(boolean value, int repetitions) {
- throw new UnsupportedOperationException("Can't update boolean");
- }
-
- public void updateInteger(long value, int repetitions) {
- throw new UnsupportedOperationException("Can't update integer");
- }
-
- public void updateDouble(double value) {
- throw new UnsupportedOperationException("Can't update double");
- }
-
- public void updateString(Text value) {
- throw new UnsupportedOperationException("Can't update string");
- }
-
- public void updateString(byte[] bytes, int offset, int length,
- int repetitions) {
- throw new UnsupportedOperationException("Can't update string");
- }
-
- public void updateBinary(BytesWritable value) {
- throw new UnsupportedOperationException("Can't update binary");
- }
-
- public void updateBinary(byte[] bytes, int offset, int length,
- int repetitions) {
- throw new UnsupportedOperationException("Can't update string");
- }
-
- public void updateDecimal(HiveDecimalWritable value) {
- throw new UnsupportedOperationException("Can't update decimal");
- }
-
- public void updateDate(DateWritable value) {
- throw new UnsupportedOperationException("Can't update date");
- }
-
- public void updateDate(int value) {
- throw new UnsupportedOperationException("Can't update date");
- }
-
- public void updateTimestamp(Timestamp value) {
- throw new UnsupportedOperationException("Can't update timestamp");
- }
-
- public void updateTimestamp(long value) {
- throw new UnsupportedOperationException("Can't update timestamp");
- }
-
- public boolean isStatsExists() {
- return (count > 0 || hasNull);
- }
-
- public void merge(ColumnStatisticsImpl stats) {
- count += stats.count;
- hasNull |= stats.hasNull;
- }
-
- public void reset() {
- count = 0;
- hasNull = false;
- }
-
- @Override
- public long getNumberOfValues() {
- return count;
- }
-
- @Override
- public boolean hasNull() {
- return hasNull;
- }
-
- @Override
- public String toString() {
- return "count: " + count + " hasNull: " + hasNull;
- }
-
- public OrcProto.ColumnStatistics.Builder serialize() {
- OrcProto.ColumnStatistics.Builder builder =
- OrcProto.ColumnStatistics.newBuilder();
- builder.setNumberOfValues(count);
- builder.setHasNull(hasNull);
- return builder;
- }
-
- public static ColumnStatisticsImpl create(TypeDescription schema) {
- switch (schema.getCategory()) {
- case BOOLEAN:
- return new BooleanStatisticsImpl();
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- return new IntegerStatisticsImpl();
- case FLOAT:
- case DOUBLE:
- return new DoubleStatisticsImpl();
- case STRING:
- case CHAR:
- case VARCHAR:
- return new StringStatisticsImpl();
- case DECIMAL:
- return new DecimalStatisticsImpl();
- case DATE:
- return new DateStatisticsImpl();
- case TIMESTAMP:
- return new TimestampStatisticsImpl();
- case BINARY:
- return new BinaryStatisticsImpl();
- default:
- return new ColumnStatisticsImpl();
- }
- }
-
- public static ColumnStatisticsImpl deserialize(OrcProto.ColumnStatistics stats) {
- if (stats.hasBucketStatistics()) {
- return new BooleanStatisticsImpl(stats);
- } else if (stats.hasIntStatistics()) {
- return new IntegerStatisticsImpl(stats);
- } else if (stats.hasDoubleStatistics()) {
- return new DoubleStatisticsImpl(stats);
- } else if (stats.hasStringStatistics()) {
- return new StringStatisticsImpl(stats);
- } else if (stats.hasDecimalStatistics()) {
- return new DecimalStatisticsImpl(stats);
- } else if (stats.hasDateStatistics()) {
- return new DateStatisticsImpl(stats);
- } else if (stats.hasTimestampStatistics()) {
- return new TimestampStatisticsImpl(stats);
- } else if(stats.hasBinaryStatistics()) {
- return new BinaryStatisticsImpl(stats);
- } else {
- return new ColumnStatisticsImpl(stats);
- }
- }
-}
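A minimal usage sketch of the statistics API above: gather statistics for a
LONG column, merge two partial results, and round-trip them through protobuf.
TypeDescription.createLong() is assumed here by analogy with the
TypeDescription.createDate() call used in the tests below.

ColumnStatisticsImpl a = ColumnStatisticsImpl.create(TypeDescription.createLong());
a.increment();
a.updateInteger(42, 1);
ColumnStatisticsImpl b = ColumnStatisticsImpl.create(TypeDescription.createLong());
b.increment();
b.updateInteger(-7, 1);
a.merge(b);                     // count 2, min -7, max 42, sum 35
OrcProto.ColumnStatistics proto = a.serialize().build();
ColumnStatisticsImpl restored = ColumnStatisticsImpl.deserialize(proto);
// restored is an IntegerStatisticsImpl with the same count, min, max, and sum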
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestBitFieldReader.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestBitFieldReader.java b/orc/src/test/org/apache/hive/orc/impl/TestBitFieldReader.java
new file mode 100644
index 0000000..dbdded5
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestBitFieldReader.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.nio.ByteBuffer;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.junit.Test;
+
+public class TestBitFieldReader {
+
+ public void runSeekTest(CompressionCodec codec) throws Exception {
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ final int COUNT = 16384;
+ BitFieldWriter out = new BitFieldWriter(
+ new OutStream("test", 500, codec, collect), 1);
+ TestInStream.PositionCollector[] positions =
+ new TestInStream.PositionCollector[COUNT];
+ for(int i=0; i < COUNT; ++i) {
+ positions[i] = new TestInStream.PositionCollector();
+ out.getPosition(positions[i]);
+ // test runs, non-runs
+ if (i < COUNT / 2) {
+ out.write(i & 1);
+ } else {
+ out.write((i/3) & 1);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ BitFieldReader in = new BitFieldReader(InStream.create("test",
+ new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(),
+ codec, 500), 1);
+ for(int i=0; i < COUNT; ++i) {
+ int x = in.next();
+ if (i < COUNT / 2) {
+ assertEquals(i & 1, x);
+ } else {
+ assertEquals((i/3) & 1, x);
+ }
+ }
+ for(int i=COUNT-1; i >= 0; --i) {
+ in.seek(positions[i]);
+ int x = in.next();
+ if (i < COUNT / 2) {
+ assertEquals(i & 1, x);
+ } else {
+ assertEquals((i/3) & 1, x);
+ }
+ }
+ }
+
+ @Test
+ public void testUncompressedSeek() throws Exception {
+ runSeekTest(null);
+ }
+
+ @Test
+ public void testCompressedSeek() throws Exception {
+ runSeekTest(new ZlibCodec());
+ }
+
+ @Test
+ public void testBiggerItems() throws Exception {
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ final int COUNT = 16384;
+ BitFieldWriter out = new BitFieldWriter(
+ new OutStream("test", 500, null, collect), 3);
+ for(int i=0; i < COUNT; ++i) {
+ // test runs, non-runs
+ if (i < COUNT / 2) {
+ out.write(i & 7);
+ } else {
+ out.write((i/3) & 7);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ BitFieldReader in = new BitFieldReader(InStream.create("test",
+ new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(),
+ null, 500), 3);
+ for(int i=0; i < COUNT; ++i) {
+ int x = in.next();
+ if (i < COUNT / 2) {
+ assertEquals(i & 7, x);
+ } else {
+ assertEquals((i/3) & 7, x);
+ }
+ }
+ }
+
+ @Test
+ public void testSkips() throws Exception {
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ BitFieldWriter out = new BitFieldWriter(
+ new OutStream("test", 100, null, collect), 1);
+ final int COUNT = 16384;
+ for(int i=0; i < COUNT; ++i) {
+ if (i < COUNT/2) {
+ out.write(i & 1);
+ } else {
+ out.write((i/3) & 1);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ BitFieldReader in = new BitFieldReader(InStream.create("test", new ByteBuffer[]{inBuf},
+ new long[]{0}, inBuf.remaining(), null, 100), 1);
+ for(int i=0; i < COUNT; i += 5) {
+ int x = (int) in.next();
+ if (i < COUNT/2) {
+ assertEquals(i & 1, x);
+ } else {
+ assertEquals((i/3) & 1, x);
+ }
+ if (i < COUNT - 5) {
+ in.skip(4);
+ }
+ in.skip(0);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestBitPack.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestBitPack.java b/orc/src/test/org/apache/hive/orc/impl/TestBitPack.java
new file mode 100644
index 0000000..1790c35
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestBitPack.java
@@ -0,0 +1,279 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import com.google.common.primitives.Longs;
+
+public class TestBitPack {
+
+ private static final int SIZE = 100;
+ private static Random rand = new Random(100);
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
+ + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ private long[] deltaEncode(long[] inp) {
+ long[] output = new long[inp.length];
+ SerializationUtils utils = new SerializationUtils();
+ for (int i = 0; i < inp.length; i++) {
+ output[i] = utils.zigzagEncode(inp[i]);
+ }
+ return output;
+ }
+
+ private long nextLong(Random rng, long n) {
+ long bits, val;
+ do {
+ bits = (rng.nextLong() << 1) >>> 1;
+ val = bits % n;
+ } while (bits - val + (n - 1) < 0L);
+ return val;
+ }
+
+ private void runTest(int numBits) throws IOException {
+ long[] inp = new long[SIZE];
+ for (int i = 0; i < SIZE; i++) {
+ long val = 0;
+ if (numBits <= 32) {
+ if (numBits == 1) {
+ val = -1 * rand.nextInt(2);
+ } else {
+ val = rand.nextInt((int) Math.pow(2, numBits - 1));
+ }
+ } else {
+ val = nextLong(rand, (long) Math.pow(2, numBits - 2));
+ }
+ if (val % 2 == 0) {
+ val = -val;
+ }
+ inp[i] = val;
+ }
+ long[] deltaEncoded = deltaEncode(inp);
+ long minInput = Collections.min(Longs.asList(deltaEncoded));
+ long maxInput = Collections.max(Longs.asList(deltaEncoded));
+ long rangeInput = maxInput - minInput;
+ SerializationUtils utils = new SerializationUtils();
+ int fixedWidth = utils.findClosestNumBits(rangeInput);
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ OutStream output = new OutStream("test", SIZE, null, collect);
+ utils.writeInts(deltaEncoded, 0, deltaEncoded.length, fixedWidth, output);
+ output.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ long[] buff = new long[SIZE];
+ utils.readInts(buff, 0, SIZE, fixedWidth, InStream.create("test", new ByteBuffer[] { inBuf },
+ new long[] { 0 }, inBuf.remaining(), null, SIZE));
+ for (int i = 0; i < SIZE; i++) {
+ buff[i] = utils.zigzagDecode(buff[i]);
+ }
+ assertEquals(numBits, fixedWidth);
+ assertArrayEquals(inp, buff);
+ }
+
+ @Test
+ public void test01BitPacking1Bit() throws IOException {
+ runTest(1);
+ }
+
+ @Test
+ public void test02BitPacking2Bit() throws IOException {
+ runTest(2);
+ }
+
+ @Test
+ public void test03BitPacking3Bit() throws IOException {
+ runTest(3);
+ }
+
+ @Test
+ public void test04BitPacking4Bit() throws IOException {
+ runTest(4);
+ }
+
+ @Test
+ public void test05BitPacking5Bit() throws IOException {
+ runTest(5);
+ }
+
+ @Test
+ public void test06BitPacking6Bit() throws IOException {
+ runTest(6);
+ }
+
+ @Test
+ public void test07BitPacking7Bit() throws IOException {
+ runTest(7);
+ }
+
+ @Test
+ public void test08BitPacking8Bit() throws IOException {
+ runTest(8);
+ }
+
+ @Test
+ public void test09BitPacking9Bit() throws IOException {
+ runTest(9);
+ }
+
+ @Test
+ public void test10BitPacking10Bit() throws IOException {
+ runTest(10);
+ }
+
+ @Test
+ public void test11BitPacking11Bit() throws IOException {
+ runTest(11);
+ }
+
+ @Test
+ public void test12BitPacking12Bit() throws IOException {
+ runTest(12);
+ }
+
+ @Test
+ public void test13BitPacking13Bit() throws IOException {
+ runTest(13);
+ }
+
+ @Test
+ public void test14BitPacking14Bit() throws IOException {
+ runTest(14);
+ }
+
+ @Test
+ public void test15BitPacking15Bit() throws IOException {
+ runTest(15);
+ }
+
+ @Test
+ public void test16BitPacking16Bit() throws IOException {
+ runTest(16);
+ }
+
+ @Test
+ public void test17BitPacking17Bit() throws IOException {
+ runTest(17);
+ }
+
+ @Test
+ public void test18BitPacking18Bit() throws IOException {
+ runTest(18);
+ }
+
+ @Test
+ public void test19BitPacking19Bit() throws IOException {
+ runTest(19);
+ }
+
+ @Test
+ public void test20BitPacking20Bit() throws IOException {
+ runTest(20);
+ }
+
+ @Test
+ public void test21BitPacking21Bit() throws IOException {
+ runTest(21);
+ }
+
+ @Test
+ public void test22BitPacking22Bit() throws IOException {
+ runTest(22);
+ }
+
+ @Test
+ public void test23BitPacking23Bit() throws IOException {
+ runTest(23);
+ }
+
+ @Test
+ public void test24BitPacking24Bit() throws IOException {
+ runTest(24);
+ }
+
+ @Test
+ public void test26BitPacking26Bit() throws IOException {
+ runTest(26);
+ }
+
+ @Test
+ public void test28BitPacking28Bit() throws IOException {
+ runTest(28);
+ }
+
+ @Test
+ public void test30BitPacking30Bit() throws IOException {
+ runTest(30);
+ }
+
+ @Test
+ public void test32BitPacking32Bit() throws IOException {
+ runTest(32);
+ }
+
+ @Test
+ public void test40BitPacking40Bit() throws IOException {
+ runTest(40);
+ }
+
+ @Test
+ public void test48BitPacking48Bit() throws IOException {
+ runTest(48);
+ }
+
+ @Test
+ public void test56BitPacking56Bit() throws IOException {
+ runTest(56);
+ }
+
+ @Test
+ public void test64BitPacking64Bit() throws IOException {
+ runTest(64);
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestColumnStatisticsImpl.java b/orc/src/test/org/apache/hive/orc/impl/TestColumnStatisticsImpl.java
new file mode 100644
index 0000000..4708043
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestColumnStatisticsImpl.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hive.orc.OrcProto;
+import org.apache.hive.orc.TypeDescription;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class TestColumnStatisticsImpl {
+
+ @Test
+ public void testUpdateDate() throws Exception {
+ ColumnStatisticsImpl stat = ColumnStatisticsImpl.create(TypeDescription.createDate());
+ DateWritable date = new DateWritable(16400);
+ stat.increment();
+ stat.updateDate(date);
+ assertDateStatistics(stat, 1, 16400, 16400);
+
+ date.set(16410);
+ stat.increment();
+ stat.updateDate(date);
+ assertDateStatistics(stat, 2, 16400, 16410);
+
+ date.set(16420);
+ stat.increment();
+ stat.updateDate(date);
+ assertDateStatistics(stat, 3, 16400, 16420);
+ }
+
+ private void assertDateStatistics(ColumnStatisticsImpl stat, int count, int minimum, int maximum) {
+ OrcProto.ColumnStatistics.Builder builder = stat.serialize();
+
+ assertEquals(count, builder.getNumberOfValues());
+ assertTrue(builder.hasDateStatistics());
+ assertFalse(builder.hasStringStatistics());
+
+ OrcProto.DateStatistics protoStat = builder.getDateStatistics();
+ assertTrue(protoStat.hasMinimum());
+ assertEquals(minimum, protoStat.getMinimum());
+ assertTrue(protoStat.hasMaximum());
+ assertEquals(maximum, protoStat.getMaximum());
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestDataReaderProperties.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestDataReaderProperties.java b/orc/src/test/org/apache/hive/orc/impl/TestDataReaderProperties.java
new file mode 100644
index 0000000..1e53f55
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestDataReaderProperties.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hive.orc.CompressionKind;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.mockito.Mockito.mock;
+
+public class TestDataReaderProperties {
+
+ private FileSystem mockedFileSystem = mock(FileSystem.class);
+ private Path mockedPath = mock(Path.class);
+ private boolean mockedZeroCopy = false;
+
+ @Test
+ public void testCompleteBuild() {
+ DataReaderProperties properties = DataReaderProperties.builder()
+ .withFileSystem(mockedFileSystem)
+ .withPath(mockedPath)
+ .withCompression(CompressionKind.ZLIB)
+ .withZeroCopy(mockedZeroCopy)
+ .build();
+ assertEquals(mockedFileSystem, properties.getFileSystem());
+ assertEquals(mockedPath, properties.getPath());
+ assertEquals(CompressionKind.ZLIB, properties.getCompression());
+ assertEquals(mockedZeroCopy, properties.getZeroCopy());
+ }
+
+ @Test
+ public void testMissingNonRequiredArgs() {
+ DataReaderProperties properties = DataReaderProperties.builder()
+ .withFileSystem(mockedFileSystem)
+ .withPath(mockedPath)
+ .build();
+ assertEquals(mockedFileSystem, properties.getFileSystem());
+ assertEquals(mockedPath, properties.getPath());
+ assertNull(properties.getCompression());
+ assertFalse(properties.getZeroCopy());
+ }
+
+ @Test(expected = java.lang.NullPointerException.class)
+ public void testEmptyBuild() {
+ DataReaderProperties.builder().build();
+ }
+
+ @Test(expected = java.lang.NullPointerException.class)
+ public void testMissingPath() {
+ DataReaderProperties.builder()
+ .withFileSystem(mockedFileSystem)
+ .withCompression(CompressionKind.NONE)
+ .withZeroCopy(mockedZeroCopy)
+ .build();
+ }
+
+ @Test(expected = java.lang.NullPointerException.class)
+ public void testMissingFileSystem() {
+ DataReaderProperties.builder()
+ .withPath(mockedPath)
+ .withCompression(CompressionKind.NONE)
+ .withZeroCopy(mockedZeroCopy)
+ .build();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestDynamicArray.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestDynamicArray.java b/orc/src/test/org/apache/hive/orc/impl/TestDynamicArray.java
new file mode 100644
index 0000000..408be12
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestDynamicArray.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.util.Random;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestDynamicArray {
+
+ @Test
+ public void testByteArray() throws Exception {
+ DynamicByteArray dba = new DynamicByteArray(3, 10);
+ dba.add((byte) 0);
+ dba.add((byte) 1);
+ dba.set(3, (byte) 3);
+ dba.set(2, (byte) 2);
+ dba.add((byte) 4);
+ assertEquals("{0,1,2,3,4}", dba.toString());
+ assertEquals(5, dba.size());
+ byte[] val;
+ val = new byte[0];
+ assertEquals(0, dba.compare(val, 0, 0, 2, 0));
+ assertEquals(-1, dba.compare(val, 0, 0, 2, 1));
+ val = new byte[]{3,42};
+ assertEquals(1, dba.compare(val, 0, 1, 2, 0));
+ assertEquals(1, dba.compare(val, 0, 1, 2, 1));
+ assertEquals(0, dba.compare(val, 0, 1, 3, 1));
+ assertEquals(-1, dba.compare(val, 0, 1, 3, 2));
+ assertEquals(1, dba.compare(val, 0, 2, 3, 1));
+ val = new byte[256];
+ for(int b=-128; b < 128; ++b) {
+ dba.add((byte) b);
+ val[b+128] = (byte) b;
+ }
+ assertEquals(0, dba.compare(val, 0, 256, 5, 256));
+ assertEquals(1, dba.compare(val, 0, 1, 0, 1));
+ assertEquals(1, dba.compare(val, 254, 1, 0, 1));
+ assertEquals(1, dba.compare(val, 120, 1, 64, 1));
+ val = new byte[1024];
+ Random rand = new Random(1701);
+ for(int i = 0; i < val.length; ++i) {
+ rand.nextBytes(val);
+ }
+ dba.add(val, 0, 1024);
+ assertEquals(1285, dba.size());
+ assertEquals(0, dba.compare(val, 0, 1024, 261, 1024));
+ }
+
+ @Test
+ public void testIntArray() throws Exception {
+ DynamicIntArray dia = new DynamicIntArray(10);
+ for(int i=0; i < 10000; ++i) {
+ dia.add(2*i);
+ }
+ assertEquals(10000, dia.size());
+ for(int i=0; i < 10000; ++i) {
+ assertEquals(2*i, dia.get(i));
+ }
+ dia.clear();
+ assertEquals(0, dia.size());
+ dia.add(3);
+ dia.add(12);
+ dia.add(65);
+ assertEquals("{3,12,65}", dia.toString());
+ for(int i=0; i < 5; ++i) {
+ dia.increment(i, 3);
+ }
+ assertEquals("{6,15,68,3,3}", dia.toString());
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestInStream.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestInStream.java b/orc/src/test/org/apache/hive/orc/impl/TestInStream.java
new file mode 100644
index 0000000..ffaef54
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestInStream.java
@@ -0,0 +1,314 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.fail;
+
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.junit.Test;
+
+public class TestInStream {
+
+ static class OutputCollector implements OutStream.OutputReceiver {
+ DynamicByteArray buffer = new DynamicByteArray();
+
+ @Override
+ public void output(ByteBuffer buffer) throws IOException {
+ this.buffer.add(buffer.array(), buffer.arrayOffset() + buffer.position(),
+ buffer.remaining());
+ }
+ }
+
+ static class PositionCollector
+ implements PositionProvider, PositionRecorder {
+ private List<Long> positions = new ArrayList<Long>();
+ private int index = 0;
+
+ @Override
+ public long getNext() {
+ return positions.get(index++);
+ }
+
+ @Override
+ public void addPosition(long offset) {
+ positions.add(offset);
+ }
+
+ public void reset() {
+ index = 0;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder("position: ");
+ for(int i=0; i < positions.size(); ++i) {
+ if (i != 0) {
+ builder.append(", ");
+ }
+ builder.append(positions.get(i));
+ }
+ return builder.toString();
+ }
+ }
+
+ @Test
+ public void testUncompressed() throws Exception {
+ OutputCollector collect = new OutputCollector();
+ OutStream out = new OutStream("test", 100, null, collect);
+ PositionCollector[] positions = new PositionCollector[1024];
+ for(int i=0; i < 1024; ++i) {
+ positions[i] = new PositionCollector();
+ out.getPosition(positions[i]);
+ out.write(i);
+ }
+ out.flush();
+ assertEquals(1024, collect.buffer.size());
+ for(int i=0; i < 1024; ++i) {
+ assertEquals((byte) i, collect.buffer.get(i));
+ }
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ InStream in = InStream.create("test", new ByteBuffer[]{inBuf},
+ new long[]{0}, inBuf.remaining(), null, 100);
+ assertEquals("uncompressed stream test position: 0 length: 1024" +
+ " range: 0 offset: 0 limit: 0",
+ in.toString());
+ for(int i=0; i < 1024; ++i) {
+ int x = in.read();
+ assertEquals(i & 0xff, x);
+ }
+ for(int i=1023; i >= 0; --i) {
+ in.seek(positions[i]);
+ assertEquals(i & 0xff, in.read());
+ }
+ }
+
+ @Test
+ public void testCompressed() throws Exception {
+ OutputCollector collect = new OutputCollector();
+ CompressionCodec codec = new ZlibCodec();
+ OutStream out = new OutStream("test", 300, codec, collect);
+ PositionCollector[] positions = new PositionCollector[1024];
+ for(int i=0; i < 1024; ++i) {
+ positions[i] = new PositionCollector();
+ out.getPosition(positions[i]);
+ out.write(i);
+ }
+ out.flush();
+ assertEquals("test", out.toString());
+ assertEquals(961, collect.buffer.size());
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ InStream in = InStream.create("test", new ByteBuffer[]{inBuf},
+ new long[]{0}, inBuf.remaining(), codec, 300);
+ assertEquals("compressed stream test position: 0 length: 961 range: 0" +
+ " offset: 0 limit: 0 range 0 = 0 to 961",
+ in.toString());
+ for(int i=0; i < 1024; ++i) {
+ int x = in.read();
+ assertEquals(i & 0xff, x);
+ }
+ assertEquals(0, in.available());
+ for(int i=1023; i >= 0; --i) {
+ in.seek(positions[i]);
+ assertEquals(i & 0xff, in.read());
+ }
+ }
+
+ @Test
+ public void testCorruptStream() throws Exception {
+ OutputCollector collect = new OutputCollector();
+ CompressionCodec codec = new ZlibCodec();
+ OutStream out = new OutStream("test", 500, codec, collect);
+ PositionCollector[] positions = new PositionCollector[1024];
+ for(int i=0; i < 1024; ++i) {
+ positions[i] = new PositionCollector();
+ out.getPosition(positions[i]);
+ out.write(i);
+ }
+ out.flush();
+
+ // now try to read the stream with a buffer that is too small
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ InStream in = InStream.create("test", new ByteBuffer[]{inBuf},
+ new long[]{0}, inBuf.remaining(), codec, 100);
+ byte[] contents = new byte[1024];
+ try {
+ in.read(contents);
+ fail();
+ } catch(IllegalArgumentException iae) {
+ // EXPECTED
+ }
+
+ // make a corrupted header
+ inBuf.clear();
+ inBuf.put((byte) 32);
+ inBuf.put((byte) 0);
+ inBuf.flip();
+ in = InStream.create("test2", new ByteBuffer[]{inBuf}, new long[]{0},
+ inBuf.remaining(), codec, 300);
+ try {
+ in.read();
+ fail();
+ } catch (IllegalStateException ise) {
+ // EXPECTED
+ }
+ }
+
+ @Test
+ public void testDisjointBuffers() throws Exception {
+ OutputCollector collect = new OutputCollector();
+ CompressionCodec codec = new ZlibCodec();
+ OutStream out = new OutStream("test", 400, codec, collect);
+ PositionCollector[] positions = new PositionCollector[1024];
+ DataOutput stream = new DataOutputStream(out);
+ for(int i=0; i < 1024; ++i) {
+ positions[i] = new PositionCollector();
+ out.getPosition(positions[i]);
+ stream.writeInt(i);
+ }
+ out.flush();
+ assertEquals("test", out.toString());
+ assertEquals(1674, collect.buffer.size());
+ ByteBuffer[] inBuf = new ByteBuffer[3];
+ inBuf[0] = ByteBuffer.allocate(500);
+ inBuf[1] = ByteBuffer.allocate(1200);
+ inBuf[2] = ByteBuffer.allocate(500);
+ collect.buffer.setByteBuffer(inBuf[0], 0, 483);
+ collect.buffer.setByteBuffer(inBuf[1], 483, 1625 - 483);
+ collect.buffer.setByteBuffer(inBuf[2], 1625, 1674 - 1625);
+
+ for(int i=0; i < inBuf.length; ++i) {
+ inBuf[i].flip();
+ }
+ InStream in = InStream.create("test", inBuf,
+ new long[]{0,483, 1625}, 1674, codec, 400);
+ assertEquals("compressed stream test position: 0 length: 1674 range: 0" +
+ " offset: 0 limit: 0 range 0 = 0 to 483;" +
+ " range 1 = 483 to 1142; range 2 = 1625 to 49",
+ in.toString());
+ DataInputStream inStream = new DataInputStream(in);
+ for(int i=0; i < 1024; ++i) {
+ int x = inStream.readInt();
+ assertEquals(i, x);
+ }
+ assertEquals(0, in.available());
+ for(int i=1023; i >= 0; --i) {
+ in.seek(positions[i]);
+ assertEquals(i, inStream.readInt());
+ }
+
+ in = InStream.create("test", new ByteBuffer[]{inBuf[1], inBuf[2]},
+ new long[]{483, 1625}, 1674, codec, 400);
+ inStream = new DataInputStream(in);
+ positions[303].reset();
+ in.seek(positions[303]);
+ for(int i=303; i < 1024; ++i) {
+ assertEquals(i, inStream.readInt());
+ }
+
+ in = InStream.create("test", new ByteBuffer[]{inBuf[0], inBuf[2]},
+ new long[]{0, 1625}, 1674, codec, 400);
+ inStream = new DataInputStream(in);
+ positions[1001].reset();
+ for(int i=0; i < 300; ++i) {
+ assertEquals(i, inStream.readInt());
+ }
+ in.seek(positions[1001]);
+ for(int i=1001; i < 1024; ++i) {
+ assertEquals(i, inStream.readInt());
+ }
+ }
+
+ @Test
+ public void testUncompressedDisjointBuffers() throws Exception {
+ OutputCollector collect = new OutputCollector();
+ OutStream out = new OutStream("test", 400, null, collect);
+ PositionCollector[] positions = new PositionCollector[1024];
+ DataOutput stream = new DataOutputStream(out);
+ for(int i=0; i < 1024; ++i) {
+ positions[i] = new PositionCollector();
+ out.getPosition(positions[i]);
+ stream.writeInt(i);
+ }
+ out.flush();
+ assertEquals("test", out.toString());
+ assertEquals(4096, collect.buffer.size());
+ ByteBuffer[] inBuf = new ByteBuffer[3];
+ inBuf[0] = ByteBuffer.allocate(1100);
+ inBuf[1] = ByteBuffer.allocate(2200);
+ inBuf[2] = ByteBuffer.allocate(1100);
+ collect.buffer.setByteBuffer(inBuf[0], 0, 1024);
+ collect.buffer.setByteBuffer(inBuf[1], 1024, 2048);
+ collect.buffer.setByteBuffer(inBuf[2], 3072, 1024);
+
+ for(int i=0; i < inBuf.length; ++i) {
+ inBuf[i].flip();
+ }
+ InStream in = InStream.create("test", inBuf,
+ new long[]{0, 1024, 3072}, 4096, null, 400);
+ assertEquals("uncompressed stream test position: 0 length: 4096" +
+ " range: 0 offset: 0 limit: 0",
+ in.toString());
+ DataInputStream inStream = new DataInputStream(in);
+ for(int i=0; i < 1024; ++i) {
+ int x = inStream.readInt();
+ assertEquals(i, x);
+ }
+ assertEquals(0, in.available());
+ for(int i=1023; i >= 0; --i) {
+ in.seek(positions[i]);
+ assertEquals(i, inStream.readInt());
+ }
+
+ in = InStream.create("test", new ByteBuffer[]{inBuf[1], inBuf[2]},
+ new long[]{1024, 3072}, 4096, null, 400);
+ inStream = new DataInputStream(in);
+ positions[256].reset();
+ in.seek(positions[256]);
+ for(int i=256; i < 1024; ++i) {
+ assertEquals(i, inStream.readInt());
+ }
+
+ in = InStream.create("test", new ByteBuffer[]{inBuf[0], inBuf[2]},
+ new long[]{0, 3072}, 4096, null, 400);
+ inStream = new DataInputStream(in);
+ positions[768].reset();
+ for(int i=0; i < 256; ++i) {
+ assertEquals(i, inStream.readInt());
+ }
+ in.seek(positions[768]);
+ for(int i=768; i < 1024; ++i) {
+ assertEquals(i, inStream.readInt());
+ }
+ }
+}
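
The corrupted-header part of testCorruptStream is easier to follow with the chunk header layout in mind: in the ORC format each compressed chunk is preceded by a 3-byte little-endian header whose value is (chunkLength << 1) | isOriginal. The small decoder below illustrates that layout only; it is not the InStream implementation, and the third header byte is assumed to be zero since the test buffer stops after two bytes.

    /** Decodes a 3-byte ORC compression chunk header (illustrative sketch). */
    public class ChunkHeaderSketch {
      static long headerValue(byte b0, byte b1, byte b2) {
        // little-endian: the low-order byte comes first
        return (b0 & 0xffL) | ((b1 & 0xffL) << 8) | ((b2 & 0xffL) << 16);
      }

      public static void main(String[] args) {
        long value = headerValue((byte) 32, (byte) 0, (byte) 0);
        long chunkLength = value >> 1;              // 16: the header promises a 16-byte chunk
        boolean isOriginal = (value & 1) == 1;      // false: the chunk claims to be compressed
        System.out.println(chunkLength + " original=" + isOriginal);
        // The test buffer contains no chunk body at all, so a reader following this
        // header runs off the end of the stream and reports it as corrupt.
      }
    }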
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestIntegerCompressionReader.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestIntegerCompressionReader.java b/orc/src/test/org/apache/hive/orc/impl/TestIntegerCompressionReader.java
new file mode 100644
index 0000000..55d6893
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestIntegerCompressionReader.java
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.junit.Test;
+
+public class TestIntegerCompressionReader {
+
+ public void runSeekTest(CompressionCodec codec) throws Exception {
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ RunLengthIntegerWriterV2 out = new RunLengthIntegerWriterV2(
+ new OutStream("test", 1000, codec, collect), true);
+ TestInStream.PositionCollector[] positions =
+ new TestInStream.PositionCollector[4096];
+ Random random = new Random(99);
+ int[] junk = new int[2048];
+ for(int i=0; i < junk.length; ++i) {
+ junk[i] = random.nextInt();
+ }
+ for(int i=0; i < 4096; ++i) {
+ positions[i] = new TestInStream.PositionCollector();
+ out.getPosition(positions[i]);
+ // test runs, incrementing runs, non-runs
+ if (i < 1024) {
+ out.write(i/4);
+ } else if (i < 2048) {
+ out.write(2*i);
+ } else {
+ out.write(junk[i-2048]);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ RunLengthIntegerReaderV2 in =
+ new RunLengthIntegerReaderV2(InStream.create
+ ("test", new ByteBuffer[]{inBuf},
+ new long[]{0}, inBuf.remaining(),
+ codec, 1000), true, false);
+ for(int i=0; i < 2048; ++i) {
+ int x = (int) in.next();
+ if (i < 1024) {
+ assertEquals(i/4, x);
+ } else if (i < 2048) {
+ assertEquals(2*i, x);
+ } else {
+ assertEquals(junk[i-2048], x);
+ }
+ }
+ for(int i=2047; i >= 0; --i) {
+ in.seek(positions[i]);
+ int x = (int) in.next();
+ if (i < 1024) {
+ assertEquals(i/4, x);
+ } else if (i < 2048) {
+ assertEquals(2*i, x);
+ } else {
+ assertEquals(junk[i-2048], x);
+ }
+ }
+ }
+
+ @Test
+ public void testUncompressedSeek() throws Exception {
+ runSeekTest(null);
+ }
+
+ @Test
+ public void testCompressedSeek() throws Exception {
+ runSeekTest(new ZlibCodec());
+ }
+
+ @Test
+ public void testSkips() throws Exception {
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ RunLengthIntegerWriterV2 out = new RunLengthIntegerWriterV2(
+ new OutStream("test", 100, null, collect), true);
+ for(int i=0; i < 2048; ++i) {
+ if (i < 1024) {
+ out.write(i);
+ } else {
+ out.write(256 * i);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ RunLengthIntegerReaderV2 in =
+ new RunLengthIntegerReaderV2(InStream.create("test",
+ new ByteBuffer[]{inBuf},
+ new long[]{0},
+ inBuf.remaining(),
+ null, 100), true, false);
+ for(int i=0; i < 2048; i += 10) {
+ int x = (int) in.next();
+ if (i < 1024) {
+ assertEquals(i, x);
+ } else {
+ assertEquals(256 * i, x);
+ }
+ if (i < 2038) {
+ in.skip(9);
+ }
+ in.skip(0);
+ }
+ }
+}
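
Both this test and TestInStream follow the same record-and-replay pattern: the writer records its current position into a PositionCollector before each value, and the reader later restores any of those positions through seek(). Stripped of the ORC classes, the idea reduces to remembering byte offsets and skipping back to them, as in the self-contained example below (plain java.io streams, not the ORC position interfaces, which also track compressed-chunk offsets).

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    public class RecordAndReplay {
      public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        List<Integer> positions = new ArrayList<>();
        for (int i = 0; i < 100; ++i) {
          positions.add(out.size());                // record the offset before writing value i
          out.writeInt(i);
        }
        out.flush();
        byte[] data = bytes.toByteArray();

        // Replay: jump straight to value 42 using its recorded offset.
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(data));
        in.skipBytes(positions.get(42));
        System.out.println(in.readInt());           // prints 42
      }
    }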
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestMemoryManager.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestMemoryManager.java b/orc/src/test/org/apache/hive/orc/impl/TestMemoryManager.java
new file mode 100644
index 0000000..290208c
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestMemoryManager.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.hamcrest.BaseMatcher;
+import org.hamcrest.Description;
+import org.junit.Test;
+import org.mockito.Matchers;
+import org.mockito.Mockito;
+
+import java.lang.management.ManagementFactory;
+
+import static junit.framework.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+
+/**
+ * Test the ORC memory manager.
+ */
+public class TestMemoryManager {
+ private static final double ERROR = 0.000001;
+
+ private static class NullCallback implements MemoryManager.Callback {
+ public boolean checkMemory(double newScale) {
+ return false;
+ }
+ }
+
+ @Test
+ public void testBasics() throws Exception {
+ Configuration conf = new Configuration();
+ MemoryManager mgr = new MemoryManager(conf);
+ NullCallback callback = new NullCallback();
+ long poolSize = mgr.getTotalMemoryPool();
+ assertEquals(Math.round(ManagementFactory.getMemoryMXBean().
+ getHeapMemoryUsage().getMax() * 0.5d), poolSize);
+ assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
+ mgr.addWriter(new Path("p1"), 1000, callback);
+ assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
+ mgr.addWriter(new Path("p1"), poolSize / 2, callback);
+ assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
+ mgr.addWriter(new Path("p2"), poolSize / 2, callback);
+ assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
+ mgr.addWriter(new Path("p3"), poolSize / 2, callback);
+ assertEquals(0.6666667, mgr.getAllocationScale(), 0.00001);
+ mgr.addWriter(new Path("p4"), poolSize / 2, callback);
+ assertEquals(0.5, mgr.getAllocationScale(), 0.000001);
+ mgr.addWriter(new Path("p4"), 3 * poolSize / 2, callback);
+ assertEquals(0.3333333, mgr.getAllocationScale(), 0.000001);
+ mgr.removeWriter(new Path("p1"));
+ mgr.removeWriter(new Path("p2"));
+ assertEquals(0.5, mgr.getAllocationScale(), 0.00001);
+ mgr.removeWriter(new Path("p4"));
+ assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
+ }
+
+ @Test
+ public void testConfig() throws Exception {
+ Configuration conf = new Configuration();
+ conf.set("hive.exec.orc.memory.pool", "0.9");
+ MemoryManager mgr = new MemoryManager(conf);
+ long mem =
+ ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getMax();
+ System.err.print("Memory = " + mem);
+ long pool = mgr.getTotalMemoryPool();
+ assertTrue("Pool too small: " + pool, mem * 0.899 < pool);
+ assertTrue("Pool too big: " + pool, pool < mem * 0.901);
+ }
+
+ private static class DoubleMatcher extends BaseMatcher<Double> {
+ final double expected;
+ final double error;
+ DoubleMatcher(double expected, double error) {
+ this.expected = expected;
+ this.error = error;
+ }
+
+ @Override
+ public boolean matches(Object val) {
+ double dbl = (Double) val;
+ return Math.abs(dbl - expected) <= error;
+ }
+
+ @Override
+ public void describeTo(Description description) {
+ description.appendText("not sufficiently close to ");
+ description.appendText(Double.toString(expected));
+ }
+ }
+
+ private static DoubleMatcher closeTo(double value, double error) {
+ return new DoubleMatcher(value, error);
+ }
+
+ @Test
+ public void testCallback() throws Exception {
+ Configuration conf = new Configuration();
+ MemoryManager mgr = new MemoryManager(conf);
+ long pool = mgr.getTotalMemoryPool();
+ MemoryManager.Callback[] calls = new MemoryManager.Callback[20];
+ for(int i=0; i < calls.length; ++i) {
+ calls[i] = Mockito.mock(MemoryManager.Callback.class);
+ mgr.addWriter(new Path(Integer.toString(i)), pool/4, calls[i]);
+ }
+ // add enough rows to get the memory manager to check the limits
+ for(int i=0; i < 10000; ++i) {
+ mgr.addedRow(1);
+ }
+ for(int call=0; call < calls.length; ++call) {
+ Mockito.verify(calls[call], Mockito.times(2))
+ .checkMemory(Matchers.doubleThat(closeTo(0.2, ERROR)));
+ }
+ }
+}
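
The scale values asserted in testBasics follow from a simple invariant: the manager keeps one outstanding request per writer path (adding the same path again replaces its earlier request) and scales every writer by min(1.0, totalPool / totalRequested). The model below reproduces those numbers; it is a deliberately simplified sketch, and the real MemoryManager also drives the checkMemory callbacks from addedRow(), as testCallback exercises.

    import java.util.HashMap;
    import java.util.Map;

    /** Simplified model of the allocation-scale arithmetic (illustrative only). */
    public class ScaleModel {
      private final long pool;
      private final Map<String, Long> requests = new HashMap<>();

      ScaleModel(long pool) { this.pool = pool; }

      void addWriter(String path, long request) { requests.put(path, request); }  // re-adding replaces
      void removeWriter(String path) { requests.remove(path); }

      double scale() {
        long total = requests.values().stream().mapToLong(Long::longValue).sum();
        return total <= pool ? 1.0 : (double) pool / total;
      }

      public static void main(String[] args) {
        long pool = 1_000_000;
        ScaleModel m = new ScaleModel(pool);
        m.addWriter("p1", 1000);          System.out.println(m.scale());  // 1.0
        m.addWriter("p1", pool / 2);      System.out.println(m.scale());  // 1.0 (p1's request replaced)
        m.addWriter("p2", pool / 2);      System.out.println(m.scale());  // 1.0
        m.addWriter("p3", pool / 2);      System.out.println(m.scale());  // 0.666...
        m.addWriter("p4", pool / 2);      System.out.println(m.scale());  // 0.5
        m.addWriter("p4", 3 * pool / 2);  System.out.println(m.scale());  // 0.333...
        m.removeWriter("p1");
        m.removeWriter("p2");             System.out.println(m.scale());  // 0.5
        m.removeWriter("p4");             System.out.println(m.scale());  // 1.0
      }
    }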
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestOrcWideTable.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestOrcWideTable.java b/orc/src/test/org/apache/hive/orc/impl/TestOrcWideTable.java
new file mode 100644
index 0000000..e377a24
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestOrcWideTable.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+public class TestOrcWideTable {
+
+ @Test
+ public void testBufferSizeFor1Col() throws IOException {
+ assertEquals(128 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
+ 1, 128*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor50Col() throws IOException {
+ assertEquals(256 * 1024, PhysicalFsWriter.getEstimatedBufferSize(256 * 1024 * 1024,
+ 50, 256*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor1000Col() throws IOException {
+ assertEquals(32 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
+ 1000, 128*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor2000Col() throws IOException {
+ assertEquals(16 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
+ 2000, 256*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor4000Col() throws IOException {
+ assertEquals(8 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
+ 4000, 256*1024));
+ }
+
+ @Test
+ public void testBufferSizeFor25000Col() throws IOException {
+ assertEquals(4 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
+ 25000, 256*1024));
+ }
+}
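
All six expectations above are consistent with one reading of the estimate: divide the stripe size by roughly 20 times the column count (about two streams per column and ten buffers per stream), snap to the closest power-of-two buffer size between 4 KB and 256 KB, and never exceed the requested buffer size. The sketch below reproduces the asserted values under that assumption; the constants in the real PhysicalFsWriter.getEstimatedBufferSize may differ.

    /** Reproduces the buffer-size expectations above under an assumed formula. */
    public class BufferSizeSketch {
      static int estimate(long stripeSize, int numColumns, int maxBufferSize) {
        long raw = stripeSize / (20L * numColumns);       // ~2 streams/column, ~10 buffers/stream
        int best = 4 * 1024;
        for (int candidate = 4 * 1024; candidate <= 256 * 1024; candidate *= 2) {
          if (Math.abs(candidate - raw) < Math.abs(best - raw)) {
            best = candidate;                             // closest power-of-two bucket
          }
        }
        return Math.min(best, maxBufferSize);             // cap at the configured buffer size
      }

      public static void main(String[] args) {
        long halfGb = 512L * 1024 * 1024;
        System.out.println(estimate(halfGb, 1, 128 * 1024));              // 131072 (128K)
        System.out.println(estimate(256L * 1024 * 1024, 50, 256 * 1024)); // 262144 (256K)
        System.out.println(estimate(halfGb, 1000, 128 * 1024));           // 32768
        System.out.println(estimate(halfGb, 2000, 256 * 1024));           // 16384
        System.out.println(estimate(halfGb, 4000, 256 * 1024));           // 8192
        System.out.println(estimate(halfGb, 25000, 256 * 1024));          // 4096
      }
    }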
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestOutStream.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestOutStream.java b/orc/src/test/org/apache/hive/orc/impl/TestOutStream.java
new file mode 100644
index 0000000..23c13f4
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestOutStream.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import java.nio.ByteBuffer;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestOutStream {
+
+ @Test
+ public void testFlush() throws Exception {
+ OutStream.OutputReceiver receiver =
+ Mockito.mock(OutStream.OutputReceiver.class);
+ CompressionCodec codec = new ZlibCodec();
+ OutStream stream = new OutStream("test", 128*1024, codec, receiver);
+ assertEquals(0L, stream.getBufferSize());
+ stream.write(new byte[]{0, 1, 2});
+ stream.flush();
+ Mockito.verify(receiver).output(Mockito.any(ByteBuffer.class));
+ assertEquals(0L, stream.getBufferSize());
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestRLEv2.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestRLEv2.java b/orc/src/test/org/apache/hive/orc/impl/TestRLEv2.java
new file mode 100644
index 0000000..441a3a2
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestRLEv2.java
@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.Writer;
+import org.apache.hive.orc.tools.FileDump;
+import org.apache.hive.orc.CompressionKind;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+public class TestRLEv2 {
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+ Path testFilePath;
+ Configuration conf;
+ FileSystem fs;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem () throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestRLEv2." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ void appendInt(VectorizedRowBatch batch, int i) {
+ ((LongColumnVector) batch.cols[0]).vector[batch.size++] = i;
+ }
+
+ @Test
+ public void testFixedDeltaZero() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, 123);
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 123,
+ // zigzag encoded varint) and 1 byte delta (delta = 0). In total, 5 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testFixedDeltaOne() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, i % 512);
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
+ // and 1 byte delta (delta = 1). In total, 4 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 40"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testFixedDeltaOneDescending() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, 512 - (i % 512));
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 512, zigzag + varint)
+ // and 1 byte delta (delta = 1). In total, 5 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testFixedDeltaLarge() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, i % 512 + ((i % 512) * 100));
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
+ // and 2 bytes delta (delta = 100, zigzag encoded varint). In total, 5 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testFixedDeltaLargeDescending() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5120; ++i) {
+ appendInt(batch, (512 - i % 512) + ((i % 512) * 100));
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 512, zigzag + varint)
+ // and 2 bytes delta (delta = 100, zigzag encoded varint). In total, 6 bytes per run.
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 60"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testShortRepeat() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (int i = 0; i < 5; ++i) {
+ appendInt(batch, 10);
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // 1 byte header + 1 byte value
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 2"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testDeltaUnknownSign() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ appendInt(batch, 0);
+ for (int i = 0; i < 511; ++i) {
+ appendInt(batch, i);
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // monotonicity will be undetermined for this sequence 0,0,1,2,3,...510. Hence DIRECT encoding
+ // will be used. 2 bytes for header and 640 bytes for data (512 values at a fixed width of 10 bits
+ // each: 512 * 10 = 5120 bits, 5120/8 = 640 bytes). Total bytes 642
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 642"));
+ System.setOut(origOut);
+ }
+
+ @Test
+ public void testPatchedBase() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+ Writer w = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .compress(CompressionKind.NONE)
+ .setSchema(schema)
+ .rowIndexStride(0)
+ .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+ .version(OrcFile.Version.V_0_12)
+ );
+
+ Random rand = new Random(123);
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ appendInt(batch, 10000000);
+ for (int i = 0; i < 511; ++i) {
+ appendInt(batch, rand.nextInt(i+1));
+ }
+ w.addRowBatch(batch);
+ w.close();
+
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toUri().toString()});
+ System.out.flush();
+ String outDump = new String(myOut.toByteArray());
+ // use PATCHED_BASE encoding
+ assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 583"));
+ System.setOut(origOut);
+ }
+}
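
The fixed-delta DATA lengths asserted above (50, 40, 50, 50 and 60 bytes) all come from the arithmetic spelled out in the in-test comments: 5120 values split into 10 runs of 512, and each run costs a 2-byte header plus zigzag-varint encodings of the first value and of the constant delta. The snippet below only checks that arithmetic; it assumes the layout as the comments describe it and does not re-implement the RLEv2 writer.

    /** Checks the fixed-delta run sizes described in the comments above. */
    public class FixedDeltaSizes {
      static long zigzag(long n) { return (n << 1) ^ (n >> 63); }

      static int varintLen(long unsigned) {
        int len = 1;
        while (Long.compareUnsigned(unsigned, 0x80L) >= 0) { unsigned >>>= 7; ++len; }
        return len;
      }

      static int streamLength(long firstValue, long delta) {
        int perRun = 2 + varintLen(zigzag(firstValue)) + varintLen(zigzag(delta));
        return 10 * perRun;                         // 5120 values = 10 runs of 512
      }

      public static void main(String[] args) {
        System.out.println(streamLength(123, 0));   // 50: constant run of 123
        System.out.println(streamLength(0, 1));     // 40: 0,1,2,...,511
        System.out.println(streamLength(512, -1));  // 50: 512,511,...,1
        System.out.println(streamLength(0, 101));   // 50: the large ascending case
        System.out.println(streamLength(512, 99));  // 60: the large "descending" case
      }
    }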
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestReaderImpl.java b/orc/src/test/org/apache/hive/orc/impl/TestReaderImpl.java
new file mode 100644
index 0000000..c414e17
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestReaderImpl.java
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2016 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.fs.Seekable;
+import org.apache.hive.orc.FileFormatException;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.rules.ExpectedException;
+
+public class TestReaderImpl {
+
+ @Rule
+ public ExpectedException thrown = ExpectedException.none();
+
+ private final Path path = new Path("test-file.orc");
+ private FSDataInputStream in;
+ private int psLen;
+ private ByteBuffer buffer;
+
+ @Before
+ public void setup() {
+ in = null;
+ }
+
+ @Test
+ public void testEnsureOrcFooterSmallTextFile() throws IOException {
+ prepareTestCase("1".getBytes());
+ thrown.expect(FileFormatException.class);
+ ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+ }
+
+ @Test
+ public void testEnsureOrcFooterLargeTextFile() throws IOException {
+ prepareTestCase("This is Some Text File".getBytes());
+ thrown.expect(FileFormatException.class);
+ ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+ }
+
+ @Test
+ public void testEnsureOrcFooter011ORCFile() throws IOException {
+ prepareTestCase(composeContent(OrcFile.MAGIC, "FOOTER"));
+ ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+ }
+
+ @Test
+ public void testEnsureOrcFooterCorrectORCFooter() throws IOException {
+ prepareTestCase(composeContent("", OrcFile.MAGIC));
+ ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+ }
+
+ private void prepareTestCase(byte[] bytes) {
+ buffer = ByteBuffer.wrap(bytes);
+ psLen = buffer.get(bytes.length - 1) & 0xff;
+ in = new FSDataInputStream(new SeekableByteArrayInputStream(bytes));
+ }
+
+ private byte[] composeContent(String headerStr, String footerStr) throws CharacterCodingException {
+ ByteBuffer header = Text.encode(headerStr);
+ ByteBuffer footer = Text.encode(footerStr);
+ int headerLen = header.remaining();
+ int footerLen = footer.remaining() + 1;
+
+ ByteBuffer buf = ByteBuffer.allocate(headerLen + footerLen);
+
+ buf.put(header);
+ buf.put(footer);
+ buf.put((byte) footerLen);
+ return buf.array();
+ }
+
+ private static final class SeekableByteArrayInputStream extends ByteArrayInputStream
+ implements Seekable, PositionedReadable {
+
+ public SeekableByteArrayInputStream(byte[] buf) {
+ super(buf);
+ }
+
+ @Override
+ public void seek(long pos) throws IOException {
+ this.reset();
+ this.skip(pos);
+ }
+
+ @Override
+ public long getPos() throws IOException {
+ return pos;
+ }
+
+ @Override
+ public boolean seekToNewSource(long targetPos) throws IOException {
+ return false;
+ }
+
+ @Override
+ public int read(long position, byte[] buffer, int offset, int length)
+ throws IOException {
+ long oldPos = getPos();
+ int nread = -1;
+ try {
+ seek(position);
+ nread = read(buffer, offset, length);
+ } finally {
+ seek(oldPos);
+ }
+ return nread;
+ }
+
+ @Override
+ public void readFully(long position, byte[] buffer, int offset, int length)
+ throws IOException {
+ int nread = 0;
+ while (nread < length) {
+ int nbytes = read(position + nread, buffer, offset + nread, length - nread);
+ if (nbytes < 0) {
+ throw new EOFException("End of file reached before reading fully.");
+ }
+ nread += nbytes;
+ }
+ }
+
+ @Override
+ public void readFully(long position, byte[] buffer)
+ throws IOException {
+ readFully(position, buffer, 0, buffer.length);
+ }
+ }
+}
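
composeContent above builds a tail of the form [header bytes][postscript bytes][one trailing byte holding the postscript length], and the four ensureOrcFooter tests feed it files shaped that way. The sketch below is a simplified model of such a check, under the assumption that a valid file carries the ORC magic immediately before the trailing length byte, with a fallback to the magic at the start of the file for very old (0.11-era) files; it is not the ReaderImpl code.

    import java.nio.charset.StandardCharsets;

    /** Simplified model of an ORC footer/magic check (illustrative only). */
    public class TailCheckSketch {
      private static final byte[] MAGIC = "ORC".getBytes(StandardCharsets.UTF_8);

      static boolean looksLikeOrc(byte[] file) {
        int psLen = file[file.length - 1] & 0xff;   // last byte = postscript length
        if (psLen >= MAGIC.length && file.length > MAGIC.length
            && matches(file, file.length - 1 - MAGIC.length)) {
          return true;                              // magic sits just before the length byte
        }
        return file.length >= MAGIC.length && matches(file, 0);  // 0.11-style: magic at file start
      }

      private static boolean matches(byte[] file, int offset) {
        for (int i = 0; i < MAGIC.length; ++i) {
          if (file[offset + i] != MAGIC[i]) {
            return false;
          }
        }
        return true;
      }

      public static void main(String[] args) {
        System.out.println(looksLikeOrc("1".getBytes(StandardCharsets.UTF_8)));                       // false
        System.out.println(looksLikeOrc("This is Some Text File".getBytes(StandardCharsets.UTF_8)));  // false
        System.out.println(looksLikeOrc(new byte[]{'O', 'R', 'C', 4}));                               // true
        System.out.println(looksLikeOrc(new byte[]{'O', 'R', 'C', 'F', 'O', 'O', 'T', 'E', 'R', 7})); // true
      }
    }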
[36/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/gen/protobuf-java/org/apache/hive/orc/OrcProto.java
----------------------------------------------------------------------
diff --git a/orc/src/gen/protobuf-java/org/apache/hive/orc/OrcProto.java b/orc/src/gen/protobuf-java/org/apache/hive/orc/OrcProto.java
new file mode 100644
index 0000000..963471b
--- /dev/null
+++ b/orc/src/gen/protobuf-java/org/apache/hive/orc/OrcProto.java
@@ -0,0 +1,20179 @@
+// Generated by the protocol buffer compiler. DO NOT EDIT!
+// source: orc_proto.proto
+
+package org.apache.hive.orc;
+
+public final class OrcProto {
+ private OrcProto() {}
+ public static void registerAllExtensions(
+ com.google.protobuf.ExtensionRegistry registry) {
+ }
+ /**
+ * Protobuf enum {@code orc.proto.CompressionKind}
+ */
+ public enum CompressionKind
+ implements com.google.protobuf.ProtocolMessageEnum {
+ /**
+ * <code>NONE = 0;</code>
+ */
+ NONE(0, 0),
+ /**
+ * <code>ZLIB = 1;</code>
+ */
+ ZLIB(1, 1),
+ /**
+ * <code>SNAPPY = 2;</code>
+ */
+ SNAPPY(2, 2),
+ /**
+ * <code>LZO = 3;</code>
+ */
+ LZO(3, 3),
+ ;
+
+ /**
+ * <code>NONE = 0;</code>
+ */
+ public static final int NONE_VALUE = 0;
+ /**
+ * <code>ZLIB = 1;</code>
+ */
+ public static final int ZLIB_VALUE = 1;
+ /**
+ * <code>SNAPPY = 2;</code>
+ */
+ public static final int SNAPPY_VALUE = 2;
+ /**
+ * <code>LZO = 3;</code>
+ */
+ public static final int LZO_VALUE = 3;
+
+
+ public final int getNumber() { return value; }
+
+ public static CompressionKind valueOf(int value) {
+ switch (value) {
+ case 0: return NONE;
+ case 1: return ZLIB;
+ case 2: return SNAPPY;
+ case 3: return LZO;
+ default: return null;
+ }
+ }
+
+ public static com.google.protobuf.Internal.EnumLiteMap<CompressionKind>
+ internalGetValueMap() {
+ return internalValueMap;
+ }
+ private static com.google.protobuf.Internal.EnumLiteMap<CompressionKind>
+ internalValueMap =
+ new com.google.protobuf.Internal.EnumLiteMap<CompressionKind>() {
+ public CompressionKind findValueByNumber(int number) {
+ return CompressionKind.valueOf(number);
+ }
+ };
+
+ public final com.google.protobuf.Descriptors.EnumValueDescriptor
+ getValueDescriptor() {
+ return getDescriptor().getValues().get(index);
+ }
+ public final com.google.protobuf.Descriptors.EnumDescriptor
+ getDescriptorForType() {
+ return getDescriptor();
+ }
+ public static final com.google.protobuf.Descriptors.EnumDescriptor
+ getDescriptor() {
+ return OrcProto.getDescriptor().getEnumTypes().get(0);
+ }
+
+ private static final CompressionKind[] VALUES = values();
+
+ public static CompressionKind valueOf(
+ com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
+ if (desc.getType() != getDescriptor()) {
+ throw new java.lang.IllegalArgumentException(
+ "EnumValueDescriptor is not for this type.");
+ }
+ return VALUES[desc.getIndex()];
+ }
+
+ private final int index;
+ private final int value;
+
+ private CompressionKind(int index, int value) {
+ this.index = index;
+ this.value = value;
+ }
+
+ // @@protoc_insertion_point(enum_scope:orc.proto.CompressionKind)
+ }
+
+ public interface IntegerStatisticsOrBuilder
+ extends com.google.protobuf.MessageOrBuilder {
+
+ // optional sint64 minimum = 1;
+ /**
+ * <code>optional sint64 minimum = 1;</code>
+ */
+ boolean hasMinimum();
+ /**
+ * <code>optional sint64 minimum = 1;</code>
+ */
+ long getMinimum();
+
+ // optional sint64 maximum = 2;
+ /**
+ * <code>optional sint64 maximum = 2;</code>
+ */
+ boolean hasMaximum();
+ /**
+ * <code>optional sint64 maximum = 2;</code>
+ */
+ long getMaximum();
+
+ // optional sint64 sum = 3;
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ */
+ boolean hasSum();
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ */
+ long getSum();
+ }
+ /**
+ * Protobuf type {@code orc.proto.IntegerStatistics}
+ */
+ public static final class IntegerStatistics extends
+ com.google.protobuf.GeneratedMessage
+ implements IntegerStatisticsOrBuilder {
+ // Use IntegerStatistics.newBuilder() to construct.
+ private IntegerStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
+ super(builder);
+ this.unknownFields = builder.getUnknownFields();
+ }
+ private IntegerStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
+
+ private static final IntegerStatistics defaultInstance;
+ public static IntegerStatistics getDefaultInstance() {
+ return defaultInstance;
+ }
+
+ public IntegerStatistics getDefaultInstanceForType() {
+ return defaultInstance;
+ }
+
+ private final com.google.protobuf.UnknownFieldSet unknownFields;
+ @java.lang.Override
+ public final com.google.protobuf.UnknownFieldSet
+ getUnknownFields() {
+ return this.unknownFields;
+ }
+ private IntegerStatistics(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ initFields();
+ int mutable_bitField0_ = 0;
+ com.google.protobuf.UnknownFieldSet.Builder unknownFields =
+ com.google.protobuf.UnknownFieldSet.newBuilder();
+ try {
+ boolean done = false;
+ while (!done) {
+ int tag = input.readTag();
+ switch (tag) {
+ case 0:
+ done = true;
+ break;
+ default: {
+ if (!parseUnknownField(input, unknownFields,
+ extensionRegistry, tag)) {
+ done = true;
+ }
+ break;
+ }
+ case 8: {
+ bitField0_ |= 0x00000001;
+ minimum_ = input.readSInt64();
+ break;
+ }
+ case 16: {
+ bitField0_ |= 0x00000002;
+ maximum_ = input.readSInt64();
+ break;
+ }
+ case 24: {
+ bitField0_ |= 0x00000004;
+ sum_ = input.readSInt64();
+ break;
+ }
+ }
+ }
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ throw e.setUnfinishedMessage(this);
+ } catch (java.io.IOException e) {
+ throw new com.google.protobuf.InvalidProtocolBufferException(
+ e.getMessage()).setUnfinishedMessage(this);
+ } finally {
+ this.unknownFields = unknownFields.build();
+ makeExtensionsImmutable();
+ }
+ }
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return OrcProto.internal_static_orc_proto_IntegerStatistics_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return OrcProto.internal_static_orc_proto_IntegerStatistics_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ OrcProto.IntegerStatistics.class, OrcProto.IntegerStatistics.Builder.class);
+ }
+
+ public static com.google.protobuf.Parser<IntegerStatistics> PARSER =
+ new com.google.protobuf.AbstractParser<IntegerStatistics>() {
+ public IntegerStatistics parsePartialFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return new IntegerStatistics(input, extensionRegistry);
+ }
+ };
+
+ @java.lang.Override
+ public com.google.protobuf.Parser<IntegerStatistics> getParserForType() {
+ return PARSER;
+ }
+
+ private int bitField0_;
+ // optional sint64 minimum = 1;
+ public static final int MINIMUM_FIELD_NUMBER = 1;
+ private long minimum_;
+ /**
+ * <code>optional sint64 minimum = 1;</code>
+ */
+ public boolean hasMinimum() {
+ return ((bitField0_ & 0x00000001) == 0x00000001);
+ }
+ /**
+ * <code>optional sint64 minimum = 1;</code>
+ */
+ public long getMinimum() {
+ return minimum_;
+ }
+
+ // optional sint64 maximum = 2;
+ public static final int MAXIMUM_FIELD_NUMBER = 2;
+ private long maximum_;
+ /**
+ * <code>optional sint64 maximum = 2;</code>
+ */
+ public boolean hasMaximum() {
+ return ((bitField0_ & 0x00000002) == 0x00000002);
+ }
+ /**
+ * <code>optional sint64 maximum = 2;</code>
+ */
+ public long getMaximum() {
+ return maximum_;
+ }
+
+ // optional sint64 sum = 3;
+ public static final int SUM_FIELD_NUMBER = 3;
+ private long sum_;
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ */
+ public boolean hasSum() {
+ return ((bitField0_ & 0x00000004) == 0x00000004);
+ }
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ */
+ public long getSum() {
+ return sum_;
+ }
+
+ private void initFields() {
+ minimum_ = 0L;
+ maximum_ = 0L;
+ sum_ = 0L;
+ }
+ private byte memoizedIsInitialized = -1;
+ public final boolean isInitialized() {
+ byte isInitialized = memoizedIsInitialized;
+ if (isInitialized != -1) return isInitialized == 1;
+
+ memoizedIsInitialized = 1;
+ return true;
+ }
+
+ public void writeTo(com.google.protobuf.CodedOutputStream output)
+ throws java.io.IOException {
+ getSerializedSize();
+ if (((bitField0_ & 0x00000001) == 0x00000001)) {
+ output.writeSInt64(1, minimum_);
+ }
+ if (((bitField0_ & 0x00000002) == 0x00000002)) {
+ output.writeSInt64(2, maximum_);
+ }
+ if (((bitField0_ & 0x00000004) == 0x00000004)) {
+ output.writeSInt64(3, sum_);
+ }
+ getUnknownFields().writeTo(output);
+ }
+
+ private int memoizedSerializedSize = -1;
+ public int getSerializedSize() {
+ int size = memoizedSerializedSize;
+ if (size != -1) return size;
+
+ size = 0;
+ if (((bitField0_ & 0x00000001) == 0x00000001)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeSInt64Size(1, minimum_);
+ }
+ if (((bitField0_ & 0x00000002) == 0x00000002)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeSInt64Size(2, maximum_);
+ }
+ if (((bitField0_ & 0x00000004) == 0x00000004)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeSInt64Size(3, sum_);
+ }
+ size += getUnknownFields().getSerializedSize();
+ memoizedSerializedSize = size;
+ return size;
+ }
+
+ private static final long serialVersionUID = 0L;
+ @java.lang.Override
+ protected java.lang.Object writeReplace()
+ throws java.io.ObjectStreamException {
+ return super.writeReplace();
+ }
+
+ public static OrcProto.IntegerStatistics parseFrom(
+ com.google.protobuf.ByteString data)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data);
+ }
+ public static OrcProto.IntegerStatistics parseFrom(
+ com.google.protobuf.ByteString data,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data, extensionRegistry);
+ }
+ public static OrcProto.IntegerStatistics parseFrom(byte[] data)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data);
+ }
+ public static OrcProto.IntegerStatistics parseFrom(
+ byte[] data,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data, extensionRegistry);
+ }
+ public static OrcProto.IntegerStatistics parseFrom(java.io.InputStream input)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input);
+ }
+ public static OrcProto.IntegerStatistics parseFrom(
+ java.io.InputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input, extensionRegistry);
+ }
+ public static OrcProto.IntegerStatistics parseDelimitedFrom(java.io.InputStream input)
+ throws java.io.IOException {
+ return PARSER.parseDelimitedFrom(input);
+ }
+ public static OrcProto.IntegerStatistics parseDelimitedFrom(
+ java.io.InputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseDelimitedFrom(input, extensionRegistry);
+ }
+ public static OrcProto.IntegerStatistics parseFrom(
+ com.google.protobuf.CodedInputStream input)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input);
+ }
+ public static OrcProto.IntegerStatistics parseFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input, extensionRegistry);
+ }
+
+ public static Builder newBuilder() { return Builder.create(); }
+ public Builder newBuilderForType() { return newBuilder(); }
+ public static Builder newBuilder(OrcProto.IntegerStatistics prototype) {
+ return newBuilder().mergeFrom(prototype);
+ }
+ public Builder toBuilder() { return newBuilder(this); }
+
+ @java.lang.Override
+ protected Builder newBuilderForType(
+ com.google.protobuf.GeneratedMessage.BuilderParent parent) {
+ Builder builder = new Builder(parent);
+ return builder;
+ }
+ /**
+ * Protobuf type {@code orc.proto.IntegerStatistics}
+ */
+ public static final class Builder extends
+ com.google.protobuf.GeneratedMessage.Builder<Builder>
+ implements OrcProto.IntegerStatisticsOrBuilder {
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return OrcProto.internal_static_orc_proto_IntegerStatistics_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return OrcProto.internal_static_orc_proto_IntegerStatistics_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ OrcProto.IntegerStatistics.class, OrcProto.IntegerStatistics.Builder.class);
+ }
+
+ // Construct using OrcProto.IntegerStatistics.newBuilder()
+ private Builder() {
+ maybeForceBuilderInitialization();
+ }
+
+ private Builder(
+ com.google.protobuf.GeneratedMessage.BuilderParent parent) {
+ super(parent);
+ maybeForceBuilderInitialization();
+ }
+ private void maybeForceBuilderInitialization() {
+ if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
+ }
+ }
+ private static Builder create() {
+ return new Builder();
+ }
+
+ public Builder clear() {
+ super.clear();
+ minimum_ = 0L;
+ bitField0_ = (bitField0_ & ~0x00000001);
+ maximum_ = 0L;
+ bitField0_ = (bitField0_ & ~0x00000002);
+ sum_ = 0L;
+ bitField0_ = (bitField0_ & ~0x00000004);
+ return this;
+ }
+
+ public Builder clone() {
+ return create().mergeFrom(buildPartial());
+ }
+
+ public com.google.protobuf.Descriptors.Descriptor
+ getDescriptorForType() {
+ return OrcProto.internal_static_orc_proto_IntegerStatistics_descriptor;
+ }
+
+ public OrcProto.IntegerStatistics getDefaultInstanceForType() {
+ return OrcProto.IntegerStatistics.getDefaultInstance();
+ }
+
+ public OrcProto.IntegerStatistics build() {
+ OrcProto.IntegerStatistics result = buildPartial();
+ if (!result.isInitialized()) {
+ throw newUninitializedMessageException(result);
+ }
+ return result;
+ }
+
+ public OrcProto.IntegerStatistics buildPartial() {
+ OrcProto.IntegerStatistics result = new OrcProto.IntegerStatistics(this);
+ int from_bitField0_ = bitField0_;
+ int to_bitField0_ = 0;
+ if (((from_bitField0_ & 0x00000001) == 0x00000001)) {
+ to_bitField0_ |= 0x00000001;
+ }
+ result.minimum_ = minimum_;
+ if (((from_bitField0_ & 0x00000002) == 0x00000002)) {
+ to_bitField0_ |= 0x00000002;
+ }
+ result.maximum_ = maximum_;
+ if (((from_bitField0_ & 0x00000004) == 0x00000004)) {
+ to_bitField0_ |= 0x00000004;
+ }
+ result.sum_ = sum_;
+ result.bitField0_ = to_bitField0_;
+ onBuilt();
+ return result;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.Message other) {
+ if (other instanceof OrcProto.IntegerStatistics) {
+ return mergeFrom((OrcProto.IntegerStatistics)other);
+ } else {
+ super.mergeFrom(other);
+ return this;
+ }
+ }
+
+ public Builder mergeFrom(OrcProto.IntegerStatistics other) {
+ if (other == OrcProto.IntegerStatistics.getDefaultInstance()) return this;
+ if (other.hasMinimum()) {
+ setMinimum(other.getMinimum());
+ }
+ if (other.hasMaximum()) {
+ setMaximum(other.getMaximum());
+ }
+ if (other.hasSum()) {
+ setSum(other.getSum());
+ }
+ this.mergeUnknownFields(other.getUnknownFields());
+ return this;
+ }
+
+ public final boolean isInitialized() {
+ return true;
+ }
+
+ public Builder mergeFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ OrcProto.IntegerStatistics parsedMessage = null;
+ try {
+ parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry);
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ parsedMessage = (OrcProto.IntegerStatistics) e.getUnfinishedMessage();
+ throw e;
+ } finally {
+ if (parsedMessage != null) {
+ mergeFrom(parsedMessage);
+ }
+ }
+ return this;
+ }
+ private int bitField0_;
+
+ // optional sint64 minimum = 1;
+ private long minimum_ ;
+ /**
+ * <code>optional sint64 minimum = 1;</code>
+ */
+ public boolean hasMinimum() {
+ return ((bitField0_ & 0x00000001) == 0x00000001);
+ }
+ /**
+ * <code>optional sint64 minimum = 1;</code>
+ */
+ public long getMinimum() {
+ return minimum_;
+ }
+ /**
+ * <code>optional sint64 minimum = 1;</code>
+ */
+ public Builder setMinimum(long value) {
+ bitField0_ |= 0x00000001;
+ minimum_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional sint64 minimum = 1;</code>
+ */
+ public Builder clearMinimum() {
+ bitField0_ = (bitField0_ & ~0x00000001);
+ minimum_ = 0L;
+ onChanged();
+ return this;
+ }
+
+ // optional sint64 maximum = 2;
+ private long maximum_ ;
+ /**
+ * <code>optional sint64 maximum = 2;</code>
+ */
+ public boolean hasMaximum() {
+ return ((bitField0_ & 0x00000002) == 0x00000002);
+ }
+ /**
+ * <code>optional sint64 maximum = 2;</code>
+ */
+ public long getMaximum() {
+ return maximum_;
+ }
+ /**
+ * <code>optional sint64 maximum = 2;</code>
+ */
+ public Builder setMaximum(long value) {
+ bitField0_ |= 0x00000002;
+ maximum_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional sint64 maximum = 2;</code>
+ */
+ public Builder clearMaximum() {
+ bitField0_ = (bitField0_ & ~0x00000002);
+ maximum_ = 0L;
+ onChanged();
+ return this;
+ }
+
+ // optional sint64 sum = 3;
+ private long sum_ ;
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ */
+ public boolean hasSum() {
+ return ((bitField0_ & 0x00000004) == 0x00000004);
+ }
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ */
+ public long getSum() {
+ return sum_;
+ }
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ */
+ public Builder setSum(long value) {
+ bitField0_ |= 0x00000004;
+ sum_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ */
+ public Builder clearSum() {
+ bitField0_ = (bitField0_ & ~0x00000004);
+ sum_ = 0L;
+ onChanged();
+ return this;
+ }
+
+ // @@protoc_insertion_point(builder_scope:orc.proto.IntegerStatistics)
+ }
+
+ static {
+ defaultInstance = new IntegerStatistics(true);
+ defaultInstance.initFields();
+ }
+
+ // @@protoc_insertion_point(class_scope:orc.proto.IntegerStatistics)
+ }
+
+ public interface DoubleStatisticsOrBuilder
+ extends com.google.protobuf.MessageOrBuilder {
+
+ // optional double minimum = 1;
+ /**
+ * <code>optional double minimum = 1;</code>
+ */
+ boolean hasMinimum();
+ /**
+ * <code>optional double minimum = 1;</code>
+ */
+ double getMinimum();
+
+ // optional double maximum = 2;
+ /**
+ * <code>optional double maximum = 2;</code>
+ */
+ boolean hasMaximum();
+ /**
+ * <code>optional double maximum = 2;</code>
+ */
+ double getMaximum();
+
+ // optional double sum = 3;
+ /**
+ * <code>optional double sum = 3;</code>
+ */
+ boolean hasSum();
+ /**
+ * <code>optional double sum = 3;</code>
+ */
+ double getSum();
+ }
+ /**
+ * Protobuf type {@code orc.proto.DoubleStatistics}
+ */
+ public static final class DoubleStatistics extends
+ com.google.protobuf.GeneratedMessage
+ implements DoubleStatisticsOrBuilder {
+ // Use DoubleStatistics.newBuilder() to construct.
+ private DoubleStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
+ super(builder);
+ this.unknownFields = builder.getUnknownFields();
+ }
+ private DoubleStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
+
+ private static final DoubleStatistics defaultInstance;
+ public static DoubleStatistics getDefaultInstance() {
+ return defaultInstance;
+ }
+
+ public DoubleStatistics getDefaultInstanceForType() {
+ return defaultInstance;
+ }
+
+ private final com.google.protobuf.UnknownFieldSet unknownFields;
+ @java.lang.Override
+ public final com.google.protobuf.UnknownFieldSet
+ getUnknownFields() {
+ return this.unknownFields;
+ }
+ private DoubleStatistics(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ initFields();
+ int mutable_bitField0_ = 0;
+ com.google.protobuf.UnknownFieldSet.Builder unknownFields =
+ com.google.protobuf.UnknownFieldSet.newBuilder();
+ try {
+ boolean done = false;
+ while (!done) {
+ int tag = input.readTag();
+ switch (tag) {
+ case 0:
+ done = true;
+ break;
+ default: {
+ if (!parseUnknownField(input, unknownFields,
+ extensionRegistry, tag)) {
+ done = true;
+ }
+ break;
+ }
+ case 9: {
+ bitField0_ |= 0x00000001;
+ minimum_ = input.readDouble();
+ break;
+ }
+ case 17: {
+ bitField0_ |= 0x00000002;
+ maximum_ = input.readDouble();
+ break;
+ }
+ case 25: {
+ bitField0_ |= 0x00000004;
+ sum_ = input.readDouble();
+ break;
+ }
+ }
+ }
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ throw e.setUnfinishedMessage(this);
+ } catch (java.io.IOException e) {
+ throw new com.google.protobuf.InvalidProtocolBufferException(
+ e.getMessage()).setUnfinishedMessage(this);
+ } finally {
+ this.unknownFields = unknownFields.build();
+ makeExtensionsImmutable();
+ }
+ }
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return OrcProto.internal_static_orc_proto_DoubleStatistics_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return OrcProto.internal_static_orc_proto_DoubleStatistics_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ OrcProto.DoubleStatistics.class, OrcProto.DoubleStatistics.Builder.class);
+ }
+
+ public static com.google.protobuf.Parser<DoubleStatistics> PARSER =
+ new com.google.protobuf.AbstractParser<DoubleStatistics>() {
+ public DoubleStatistics parsePartialFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return new DoubleStatistics(input, extensionRegistry);
+ }
+ };
+
+ @java.lang.Override
+ public com.google.protobuf.Parser<DoubleStatistics> getParserForType() {
+ return PARSER;
+ }
+
+ private int bitField0_;
+ // optional double minimum = 1;
+ public static final int MINIMUM_FIELD_NUMBER = 1;
+ private double minimum_;
+ /**
+ * <code>optional double minimum = 1;</code>
+ */
+ public boolean hasMinimum() {
+ return ((bitField0_ & 0x00000001) == 0x00000001);
+ }
+ /**
+ * <code>optional double minimum = 1;</code>
+ */
+ public double getMinimum() {
+ return minimum_;
+ }
+
+ // optional double maximum = 2;
+ public static final int MAXIMUM_FIELD_NUMBER = 2;
+ private double maximum_;
+ /**
+ * <code>optional double maximum = 2;</code>
+ */
+ public boolean hasMaximum() {
+ return ((bitField0_ & 0x00000002) == 0x00000002);
+ }
+ /**
+ * <code>optional double maximum = 2;</code>
+ */
+ public double getMaximum() {
+ return maximum_;
+ }
+
+ // optional double sum = 3;
+ public static final int SUM_FIELD_NUMBER = 3;
+ private double sum_;
+ /**
+ * <code>optional double sum = 3;</code>
+ */
+ public boolean hasSum() {
+ return ((bitField0_ & 0x00000004) == 0x00000004);
+ }
+ /**
+ * <code>optional double sum = 3;</code>
+ */
+ public double getSum() {
+ return sum_;
+ }
+
+ private void initFields() {
+ minimum_ = 0D;
+ maximum_ = 0D;
+ sum_ = 0D;
+ }
+ private byte memoizedIsInitialized = -1;
+ public final boolean isInitialized() {
+ byte isInitialized = memoizedIsInitialized;
+ if (isInitialized != -1) return isInitialized == 1;
+
+ memoizedIsInitialized = 1;
+ return true;
+ }
+
+ public void writeTo(com.google.protobuf.CodedOutputStream output)
+ throws java.io.IOException {
+ getSerializedSize();
+ if (((bitField0_ & 0x00000001) == 0x00000001)) {
+ output.writeDouble(1, minimum_);
+ }
+ if (((bitField0_ & 0x00000002) == 0x00000002)) {
+ output.writeDouble(2, maximum_);
+ }
+ if (((bitField0_ & 0x00000004) == 0x00000004)) {
+ output.writeDouble(3, sum_);
+ }
+ getUnknownFields().writeTo(output);
+ }
+
+ private int memoizedSerializedSize = -1;
+ public int getSerializedSize() {
+ int size = memoizedSerializedSize;
+ if (size != -1) return size;
+
+ size = 0;
+ if (((bitField0_ & 0x00000001) == 0x00000001)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeDoubleSize(1, minimum_);
+ }
+ if (((bitField0_ & 0x00000002) == 0x00000002)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeDoubleSize(2, maximum_);
+ }
+ if (((bitField0_ & 0x00000004) == 0x00000004)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeDoubleSize(3, sum_);
+ }
+ size += getUnknownFields().getSerializedSize();
+ memoizedSerializedSize = size;
+ return size;
+ }
+
+ private static final long serialVersionUID = 0L;
+ @java.lang.Override
+ protected java.lang.Object writeReplace()
+ throws java.io.ObjectStreamException {
+ return super.writeReplace();
+ }
+
+ public static OrcProto.DoubleStatistics parseFrom(
+ com.google.protobuf.ByteString data)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data);
+ }
+ public static OrcProto.DoubleStatistics parseFrom(
+ com.google.protobuf.ByteString data,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data, extensionRegistry);
+ }
+ public static OrcProto.DoubleStatistics parseFrom(byte[] data)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data);
+ }
+ public static OrcProto.DoubleStatistics parseFrom(
+ byte[] data,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data, extensionRegistry);
+ }
+ public static OrcProto.DoubleStatistics parseFrom(java.io.InputStream input)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input);
+ }
+ public static OrcProto.DoubleStatistics parseFrom(
+ java.io.InputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input, extensionRegistry);
+ }
+ public static OrcProto.DoubleStatistics parseDelimitedFrom(java.io.InputStream input)
+ throws java.io.IOException {
+ return PARSER.parseDelimitedFrom(input);
+ }
+ public static OrcProto.DoubleStatistics parseDelimitedFrom(
+ java.io.InputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseDelimitedFrom(input, extensionRegistry);
+ }
+ public static OrcProto.DoubleStatistics parseFrom(
+ com.google.protobuf.CodedInputStream input)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input);
+ }
+ public static OrcProto.DoubleStatistics parseFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input, extensionRegistry);
+ }
+
+ public static Builder newBuilder() { return Builder.create(); }
+ public Builder newBuilderForType() { return newBuilder(); }
+ public static Builder newBuilder(OrcProto.DoubleStatistics prototype) {
+ return newBuilder().mergeFrom(prototype);
+ }
+ public Builder toBuilder() { return newBuilder(this); }
+
+ @java.lang.Override
+ protected Builder newBuilderForType(
+ com.google.protobuf.GeneratedMessage.BuilderParent parent) {
+ Builder builder = new Builder(parent);
+ return builder;
+ }
+ /**
+ * Protobuf type {@code orc.proto.DoubleStatistics}
+ */
+ public static final class Builder extends
+ com.google.protobuf.GeneratedMessage.Builder<Builder>
+ implements OrcProto.DoubleStatisticsOrBuilder {
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return OrcProto.internal_static_orc_proto_DoubleStatistics_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return OrcProto.internal_static_orc_proto_DoubleStatistics_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ OrcProto.DoubleStatistics.class, OrcProto.DoubleStatistics.Builder.class);
+ }
+
+ // Construct using OrcProto.DoubleStatistics.newBuilder()
+ private Builder() {
+ maybeForceBuilderInitialization();
+ }
+
+ private Builder(
+ com.google.protobuf.GeneratedMessage.BuilderParent parent) {
+ super(parent);
+ maybeForceBuilderInitialization();
+ }
+ private void maybeForceBuilderInitialization() {
+ if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
+ }
+ }
+ private static Builder create() {
+ return new Builder();
+ }
+
+ public Builder clear() {
+ super.clear();
+ minimum_ = 0D;
+ bitField0_ = (bitField0_ & ~0x00000001);
+ maximum_ = 0D;
+ bitField0_ = (bitField0_ & ~0x00000002);
+ sum_ = 0D;
+ bitField0_ = (bitField0_ & ~0x00000004);
+ return this;
+ }
+
+ public Builder clone() {
+ return create().mergeFrom(buildPartial());
+ }
+
+ public com.google.protobuf.Descriptors.Descriptor
+ getDescriptorForType() {
+ return OrcProto.internal_static_orc_proto_DoubleStatistics_descriptor;
+ }
+
+ public OrcProto.DoubleStatistics getDefaultInstanceForType() {
+ return OrcProto.DoubleStatistics.getDefaultInstance();
+ }
+
+ public OrcProto.DoubleStatistics build() {
+ OrcProto.DoubleStatistics result = buildPartial();
+ if (!result.isInitialized()) {
+ throw newUninitializedMessageException(result);
+ }
+ return result;
+ }
+
+ public OrcProto.DoubleStatistics buildPartial() {
+ OrcProto.DoubleStatistics result = new OrcProto.DoubleStatistics(this);
+ int from_bitField0_ = bitField0_;
+ int to_bitField0_ = 0;
+ if (((from_bitField0_ & 0x00000001) == 0x00000001)) {
+ to_bitField0_ |= 0x00000001;
+ }
+ result.minimum_ = minimum_;
+ if (((from_bitField0_ & 0x00000002) == 0x00000002)) {
+ to_bitField0_ |= 0x00000002;
+ }
+ result.maximum_ = maximum_;
+ if (((from_bitField0_ & 0x00000004) == 0x00000004)) {
+ to_bitField0_ |= 0x00000004;
+ }
+ result.sum_ = sum_;
+ result.bitField0_ = to_bitField0_;
+ onBuilt();
+ return result;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.Message other) {
+ if (other instanceof OrcProto.DoubleStatistics) {
+ return mergeFrom((OrcProto.DoubleStatistics)other);
+ } else {
+ super.mergeFrom(other);
+ return this;
+ }
+ }
+
+ public Builder mergeFrom(OrcProto.DoubleStatistics other) {
+ if (other == OrcProto.DoubleStatistics.getDefaultInstance()) return this;
+ if (other.hasMinimum()) {
+ setMinimum(other.getMinimum());
+ }
+ if (other.hasMaximum()) {
+ setMaximum(other.getMaximum());
+ }
+ if (other.hasSum()) {
+ setSum(other.getSum());
+ }
+ this.mergeUnknownFields(other.getUnknownFields());
+ return this;
+ }
+
+ public final boolean isInitialized() {
+ return true;
+ }
+
+ public Builder mergeFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ OrcProto.DoubleStatistics parsedMessage = null;
+ try {
+ parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry);
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ parsedMessage = (OrcProto.DoubleStatistics) e.getUnfinishedMessage();
+ throw e;
+ } finally {
+ if (parsedMessage != null) {
+ mergeFrom(parsedMessage);
+ }
+ }
+ return this;
+ }
+ private int bitField0_;
+
+ // optional double minimum = 1;
+ private double minimum_ ;
+ /**
+ * <code>optional double minimum = 1;</code>
+ */
+ public boolean hasMinimum() {
+ return ((bitField0_ & 0x00000001) == 0x00000001);
+ }
+ /**
+ * <code>optional double minimum = 1;</code>
+ */
+ public double getMinimum() {
+ return minimum_;
+ }
+ /**
+ * <code>optional double minimum = 1;</code>
+ */
+ public Builder setMinimum(double value) {
+ bitField0_ |= 0x00000001;
+ minimum_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional double minimum = 1;</code>
+ */
+ public Builder clearMinimum() {
+ bitField0_ = (bitField0_ & ~0x00000001);
+ minimum_ = 0D;
+ onChanged();
+ return this;
+ }
+
+ // optional double maximum = 2;
+ private double maximum_ ;
+ /**
+ * <code>optional double maximum = 2;</code>
+ */
+ public boolean hasMaximum() {
+ return ((bitField0_ & 0x00000002) == 0x00000002);
+ }
+ /**
+ * <code>optional double maximum = 2;</code>
+ */
+ public double getMaximum() {
+ return maximum_;
+ }
+ /**
+ * <code>optional double maximum = 2;</code>
+ */
+ public Builder setMaximum(double value) {
+ bitField0_ |= 0x00000002;
+ maximum_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional double maximum = 2;</code>
+ */
+ public Builder clearMaximum() {
+ bitField0_ = (bitField0_ & ~0x00000002);
+ maximum_ = 0D;
+ onChanged();
+ return this;
+ }
+
+ // optional double sum = 3;
+ private double sum_ ;
+ /**
+ * <code>optional double sum = 3;</code>
+ */
+ public boolean hasSum() {
+ return ((bitField0_ & 0x00000004) == 0x00000004);
+ }
+ /**
+ * <code>optional double sum = 3;</code>
+ */
+ public double getSum() {
+ return sum_;
+ }
+ /**
+ * <code>optional double sum = 3;</code>
+ */
+ public Builder setSum(double value) {
+ bitField0_ |= 0x00000004;
+ sum_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional double sum = 3;</code>
+ */
+ public Builder clearSum() {
+ bitField0_ = (bitField0_ & ~0x00000004);
+ sum_ = 0D;
+ onChanged();
+ return this;
+ }
+
+ // @@protoc_insertion_point(builder_scope:orc.proto.DoubleStatistics)
+ }
+
+ static {
+ defaultInstance = new DoubleStatistics(true);
+ defaultInstance.initFields();
+ }
+
+ // @@protoc_insertion_point(class_scope:orc.proto.DoubleStatistics)
+ }
+
+ public interface StringStatisticsOrBuilder
+ extends com.google.protobuf.MessageOrBuilder {
+
+ // optional string minimum = 1;
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ boolean hasMinimum();
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ java.lang.String getMinimum();
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ com.google.protobuf.ByteString
+ getMinimumBytes();
+
+ // optional string maximum = 2;
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ boolean hasMaximum();
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ java.lang.String getMaximum();
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ com.google.protobuf.ByteString
+ getMaximumBytes();
+
+ // optional sint64 sum = 3;
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ *
+ * <pre>
+ * sum will store the total length of all strings in a stripe
+ * </pre>
+ */
+ boolean hasSum();
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ *
+ * <pre>
+ * sum will store the total length of all strings in a stripe
+ * </pre>
+ */
+ long getSum();
+ }
+ /**
+ * Protobuf type {@code orc.proto.StringStatistics}
+ */
+ public static final class StringStatistics extends
+ com.google.protobuf.GeneratedMessage
+ implements StringStatisticsOrBuilder {
+ // Use StringStatistics.newBuilder() to construct.
+ private StringStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
+ super(builder);
+ this.unknownFields = builder.getUnknownFields();
+ }
+ private StringStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
+
+ private static final StringStatistics defaultInstance;
+ public static StringStatistics getDefaultInstance() {
+ return defaultInstance;
+ }
+
+ public StringStatistics getDefaultInstanceForType() {
+ return defaultInstance;
+ }
+
+ private final com.google.protobuf.UnknownFieldSet unknownFields;
+ @java.lang.Override
+ public final com.google.protobuf.UnknownFieldSet
+ getUnknownFields() {
+ return this.unknownFields;
+ }
+ private StringStatistics(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ initFields();
+ int mutable_bitField0_ = 0;
+ com.google.protobuf.UnknownFieldSet.Builder unknownFields =
+ com.google.protobuf.UnknownFieldSet.newBuilder();
+ try {
+ boolean done = false;
+ while (!done) {
+ int tag = input.readTag();
+ switch (tag) {
+ case 0:
+ done = true;
+ break;
+ default: {
+ if (!parseUnknownField(input, unknownFields,
+ extensionRegistry, tag)) {
+ done = true;
+ }
+ break;
+ }
+ case 10: {
+ bitField0_ |= 0x00000001;
+ minimum_ = input.readBytes();
+ break;
+ }
+ case 18: {
+ bitField0_ |= 0x00000002;
+ maximum_ = input.readBytes();
+ break;
+ }
+ case 24: {
+ bitField0_ |= 0x00000004;
+ sum_ = input.readSInt64();
+ break;
+ }
+ }
+ }
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ throw e.setUnfinishedMessage(this);
+ } catch (java.io.IOException e) {
+ throw new com.google.protobuf.InvalidProtocolBufferException(
+ e.getMessage()).setUnfinishedMessage(this);
+ } finally {
+ this.unknownFields = unknownFields.build();
+ makeExtensionsImmutable();
+ }
+ }
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return OrcProto.internal_static_orc_proto_StringStatistics_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return OrcProto.internal_static_orc_proto_StringStatistics_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ OrcProto.StringStatistics.class, OrcProto.StringStatistics.Builder.class);
+ }
+
+ public static com.google.protobuf.Parser<StringStatistics> PARSER =
+ new com.google.protobuf.AbstractParser<StringStatistics>() {
+ public StringStatistics parsePartialFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return new StringStatistics(input, extensionRegistry);
+ }
+ };
+
+ @java.lang.Override
+ public com.google.protobuf.Parser<StringStatistics> getParserForType() {
+ return PARSER;
+ }
+
+ private int bitField0_;
+ // optional string minimum = 1;
+ public static final int MINIMUM_FIELD_NUMBER = 1;
+ private java.lang.Object minimum_;
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public boolean hasMinimum() {
+ return ((bitField0_ & 0x00000001) == 0x00000001);
+ }
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public java.lang.String getMinimum() {
+ java.lang.Object ref = minimum_;
+ if (ref instanceof java.lang.String) {
+ return (java.lang.String) ref;
+ } else {
+ com.google.protobuf.ByteString bs =
+ (com.google.protobuf.ByteString) ref;
+ java.lang.String s = bs.toStringUtf8();
+ if (bs.isValidUtf8()) {
+ minimum_ = s;
+ }
+ return s;
+ }
+ }
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public com.google.protobuf.ByteString
+ getMinimumBytes() {
+ java.lang.Object ref = minimum_;
+ if (ref instanceof java.lang.String) {
+ com.google.protobuf.ByteString b =
+ com.google.protobuf.ByteString.copyFromUtf8(
+ (java.lang.String) ref);
+ minimum_ = b;
+ return b;
+ } else {
+ return (com.google.protobuf.ByteString) ref;
+ }
+ }
+
+ // optional string maximum = 2;
+ public static final int MAXIMUM_FIELD_NUMBER = 2;
+ private java.lang.Object maximum_;
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public boolean hasMaximum() {
+ return ((bitField0_ & 0x00000002) == 0x00000002);
+ }
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public java.lang.String getMaximum() {
+ java.lang.Object ref = maximum_;
+ if (ref instanceof java.lang.String) {
+ return (java.lang.String) ref;
+ } else {
+ com.google.protobuf.ByteString bs =
+ (com.google.protobuf.ByteString) ref;
+ java.lang.String s = bs.toStringUtf8();
+ if (bs.isValidUtf8()) {
+ maximum_ = s;
+ }
+ return s;
+ }
+ }
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public com.google.protobuf.ByteString
+ getMaximumBytes() {
+ java.lang.Object ref = maximum_;
+ if (ref instanceof java.lang.String) {
+ com.google.protobuf.ByteString b =
+ com.google.protobuf.ByteString.copyFromUtf8(
+ (java.lang.String) ref);
+ maximum_ = b;
+ return b;
+ } else {
+ return (com.google.protobuf.ByteString) ref;
+ }
+ }
+
+ // optional sint64 sum = 3;
+ public static final int SUM_FIELD_NUMBER = 3;
+ private long sum_;
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ *
+ * <pre>
+ * sum will store the total length of all strings in a stripe
+ * </pre>
+ */
+ public boolean hasSum() {
+ return ((bitField0_ & 0x00000004) == 0x00000004);
+ }
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ *
+ * <pre>
+ * sum will store the total length of all strings in a stripe
+ * </pre>
+ */
+ public long getSum() {
+ return sum_;
+ }
+
+ private void initFields() {
+ minimum_ = "";
+ maximum_ = "";
+ sum_ = 0L;
+ }
+ private byte memoizedIsInitialized = -1;
+ public final boolean isInitialized() {
+ byte isInitialized = memoizedIsInitialized;
+ if (isInitialized != -1) return isInitialized == 1;
+
+ memoizedIsInitialized = 1;
+ return true;
+ }
+
+ public void writeTo(com.google.protobuf.CodedOutputStream output)
+ throws java.io.IOException {
+ getSerializedSize();
+ if (((bitField0_ & 0x00000001) == 0x00000001)) {
+ output.writeBytes(1, getMinimumBytes());
+ }
+ if (((bitField0_ & 0x00000002) == 0x00000002)) {
+ output.writeBytes(2, getMaximumBytes());
+ }
+ if (((bitField0_ & 0x00000004) == 0x00000004)) {
+ output.writeSInt64(3, sum_);
+ }
+ getUnknownFields().writeTo(output);
+ }
+
+ private int memoizedSerializedSize = -1;
+ public int getSerializedSize() {
+ int size = memoizedSerializedSize;
+ if (size != -1) return size;
+
+ size = 0;
+ if (((bitField0_ & 0x00000001) == 0x00000001)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeBytesSize(1, getMinimumBytes());
+ }
+ if (((bitField0_ & 0x00000002) == 0x00000002)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeBytesSize(2, getMaximumBytes());
+ }
+ if (((bitField0_ & 0x00000004) == 0x00000004)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeSInt64Size(3, sum_);
+ }
+ size += getUnknownFields().getSerializedSize();
+ memoizedSerializedSize = size;
+ return size;
+ }
+
+ private static final long serialVersionUID = 0L;
+ @java.lang.Override
+ protected java.lang.Object writeReplace()
+ throws java.io.ObjectStreamException {
+ return super.writeReplace();
+ }
+
+ public static OrcProto.StringStatistics parseFrom(
+ com.google.protobuf.ByteString data)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data);
+ }
+ public static OrcProto.StringStatistics parseFrom(
+ com.google.protobuf.ByteString data,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data, extensionRegistry);
+ }
+ public static OrcProto.StringStatistics parseFrom(byte[] data)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data);
+ }
+ public static OrcProto.StringStatistics parseFrom(
+ byte[] data,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data, extensionRegistry);
+ }
+ public static OrcProto.StringStatistics parseFrom(java.io.InputStream input)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input);
+ }
+ public static OrcProto.StringStatistics parseFrom(
+ java.io.InputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input, extensionRegistry);
+ }
+ public static OrcProto.StringStatistics parseDelimitedFrom(java.io.InputStream input)
+ throws java.io.IOException {
+ return PARSER.parseDelimitedFrom(input);
+ }
+ public static OrcProto.StringStatistics parseDelimitedFrom(
+ java.io.InputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseDelimitedFrom(input, extensionRegistry);
+ }
+ public static OrcProto.StringStatistics parseFrom(
+ com.google.protobuf.CodedInputStream input)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input);
+ }
+ public static OrcProto.StringStatistics parseFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input, extensionRegistry);
+ }
+
+ public static Builder newBuilder() { return Builder.create(); }
+ public Builder newBuilderForType() { return newBuilder(); }
+ public static Builder newBuilder(OrcProto.StringStatistics prototype) {
+ return newBuilder().mergeFrom(prototype);
+ }
+ public Builder toBuilder() { return newBuilder(this); }
+
+ @java.lang.Override
+ protected Builder newBuilderForType(
+ com.google.protobuf.GeneratedMessage.BuilderParent parent) {
+ Builder builder = new Builder(parent);
+ return builder;
+ }
+ /**
+ * Protobuf type {@code orc.proto.StringStatistics}
+ */
+ public static final class Builder extends
+ com.google.protobuf.GeneratedMessage.Builder<Builder>
+ implements OrcProto.StringStatisticsOrBuilder {
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return OrcProto.internal_static_orc_proto_StringStatistics_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return OrcProto.internal_static_orc_proto_StringStatistics_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ OrcProto.StringStatistics.class, OrcProto.StringStatistics.Builder.class);
+ }
+
+ // Construct using OrcProto.StringStatistics.newBuilder()
+ private Builder() {
+ maybeForceBuilderInitialization();
+ }
+
+ private Builder(
+ com.google.protobuf.GeneratedMessage.BuilderParent parent) {
+ super(parent);
+ maybeForceBuilderInitialization();
+ }
+ private void maybeForceBuilderInitialization() {
+ if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
+ }
+ }
+ private static Builder create() {
+ return new Builder();
+ }
+
+ public Builder clear() {
+ super.clear();
+ minimum_ = "";
+ bitField0_ = (bitField0_ & ~0x00000001);
+ maximum_ = "";
+ bitField0_ = (bitField0_ & ~0x00000002);
+ sum_ = 0L;
+ bitField0_ = (bitField0_ & ~0x00000004);
+ return this;
+ }
+
+ public Builder clone() {
+ return create().mergeFrom(buildPartial());
+ }
+
+ public com.google.protobuf.Descriptors.Descriptor
+ getDescriptorForType() {
+ return OrcProto.internal_static_orc_proto_StringStatistics_descriptor;
+ }
+
+ public OrcProto.StringStatistics getDefaultInstanceForType() {
+ return OrcProto.StringStatistics.getDefaultInstance();
+ }
+
+ public OrcProto.StringStatistics build() {
+ OrcProto.StringStatistics result = buildPartial();
+ if (!result.isInitialized()) {
+ throw newUninitializedMessageException(result);
+ }
+ return result;
+ }
+
+ public OrcProto.StringStatistics buildPartial() {
+ OrcProto.StringStatistics result = new OrcProto.StringStatistics(this);
+ int from_bitField0_ = bitField0_;
+ int to_bitField0_ = 0;
+ if (((from_bitField0_ & 0x00000001) == 0x00000001)) {
+ to_bitField0_ |= 0x00000001;
+ }
+ result.minimum_ = minimum_;
+ if (((from_bitField0_ & 0x00000002) == 0x00000002)) {
+ to_bitField0_ |= 0x00000002;
+ }
+ result.maximum_ = maximum_;
+ if (((from_bitField0_ & 0x00000004) == 0x00000004)) {
+ to_bitField0_ |= 0x00000004;
+ }
+ result.sum_ = sum_;
+ result.bitField0_ = to_bitField0_;
+ onBuilt();
+ return result;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.Message other) {
+ if (other instanceof OrcProto.StringStatistics) {
+ return mergeFrom((OrcProto.StringStatistics)other);
+ } else {
+ super.mergeFrom(other);
+ return this;
+ }
+ }
+
+ public Builder mergeFrom(OrcProto.StringStatistics other) {
+ if (other == OrcProto.StringStatistics.getDefaultInstance()) return this;
+ if (other.hasMinimum()) {
+ bitField0_ |= 0x00000001;
+ minimum_ = other.minimum_;
+ onChanged();
+ }
+ if (other.hasMaximum()) {
+ bitField0_ |= 0x00000002;
+ maximum_ = other.maximum_;
+ onChanged();
+ }
+ if (other.hasSum()) {
+ setSum(other.getSum());
+ }
+ this.mergeUnknownFields(other.getUnknownFields());
+ return this;
+ }
+
+ public final boolean isInitialized() {
+ return true;
+ }
+
+ public Builder mergeFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ OrcProto.StringStatistics parsedMessage = null;
+ try {
+ parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry);
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ parsedMessage = (OrcProto.StringStatistics) e.getUnfinishedMessage();
+ throw e;
+ } finally {
+ if (parsedMessage != null) {
+ mergeFrom(parsedMessage);
+ }
+ }
+ return this;
+ }
+ private int bitField0_;
+
+ // optional string minimum = 1;
+ private java.lang.Object minimum_ = "";
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public boolean hasMinimum() {
+ return ((bitField0_ & 0x00000001) == 0x00000001);
+ }
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public java.lang.String getMinimum() {
+ java.lang.Object ref = minimum_;
+ if (!(ref instanceof java.lang.String)) {
+ java.lang.String s = ((com.google.protobuf.ByteString) ref)
+ .toStringUtf8();
+ minimum_ = s;
+ return s;
+ } else {
+ return (java.lang.String) ref;
+ }
+ }
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public com.google.protobuf.ByteString
+ getMinimumBytes() {
+ java.lang.Object ref = minimum_;
+ if (ref instanceof String) {
+ com.google.protobuf.ByteString b =
+ com.google.protobuf.ByteString.copyFromUtf8(
+ (java.lang.String) ref);
+ minimum_ = b;
+ return b;
+ } else {
+ return (com.google.protobuf.ByteString) ref;
+ }
+ }
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public Builder setMinimum(
+ java.lang.String value) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ bitField0_ |= 0x00000001;
+ minimum_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public Builder clearMinimum() {
+ bitField0_ = (bitField0_ & ~0x00000001);
+ minimum_ = getDefaultInstance().getMinimum();
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public Builder setMinimumBytes(
+ com.google.protobuf.ByteString value) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ bitField0_ |= 0x00000001;
+ minimum_ = value;
+ onChanged();
+ return this;
+ }
+
+ // optional string maximum = 2;
+ private java.lang.Object maximum_ = "";
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public boolean hasMaximum() {
+ return ((bitField0_ & 0x00000002) == 0x00000002);
+ }
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public java.lang.String getMaximum() {
+ java.lang.Object ref = maximum_;
+ if (!(ref instanceof java.lang.String)) {
+ java.lang.String s = ((com.google.protobuf.ByteString) ref)
+ .toStringUtf8();
+ maximum_ = s;
+ return s;
+ } else {
+ return (java.lang.String) ref;
+ }
+ }
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public com.google.protobuf.ByteString
+ getMaximumBytes() {
+ java.lang.Object ref = maximum_;
+ if (ref instanceof String) {
+ com.google.protobuf.ByteString b =
+ com.google.protobuf.ByteString.copyFromUtf8(
+ (java.lang.String) ref);
+ maximum_ = b;
+ return b;
+ } else {
+ return (com.google.protobuf.ByteString) ref;
+ }
+ }
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public Builder setMaximum(
+ java.lang.String value) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ bitField0_ |= 0x00000002;
+ maximum_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public Builder clearMaximum() {
+ bitField0_ = (bitField0_ & ~0x00000002);
+ maximum_ = getDefaultInstance().getMaximum();
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public Builder setMaximumBytes(
+ com.google.protobuf.ByteString value) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ bitField0_ |= 0x00000002;
+ maximum_ = value;
+ onChanged();
+ return this;
+ }
+
+ // optional sint64 sum = 3;
+ private long sum_ ;
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ *
+ * <pre>
+ * sum will store the total length of all strings in a stripe
+ * </pre>
+ */
+ public boolean hasSum() {
+ return ((bitField0_ & 0x00000004) == 0x00000004);
+ }
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ *
+ * <pre>
+ * sum will store the total length of all strings in a stripe
+ * </pre>
+ */
+ public long getSum() {
+ return sum_;
+ }
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ *
+ * <pre>
+ * sum will store the total length of all strings in a stripe
+ * </pre>
+ */
+ public Builder setSum(long value) {
+ bitField0_ |= 0x00000004;
+ sum_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional sint64 sum = 3;</code>
+ *
+ * <pre>
+ * sum will store the total length of all strings in a stripe
+ * </pre>
+ */
+ public Builder clearSum() {
+ bitField0_ = (bitField0_ & ~0x00000004);
+ sum_ = 0L;
+ onChanged();
+ return this;
+ }
+
+ // @@protoc_insertion_point(builder_scope:orc.proto.StringStatistics)
+ }
+
+ static {
+ defaultInstance = new StringStatistics(true);
+ defaultInstance.initFields();
+ }
+
+ // @@protoc_insertion_point(class_scope:orc.proto.StringStatistics)
+ }
+
+ public interface BucketStatisticsOrBuilder
+ extends com.google.protobuf.MessageOrBuilder {
+
+ // repeated uint64 count = 1 [packed = true];
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ java.util.List<java.lang.Long> getCountList();
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ int getCountCount();
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ long getCount(int index);
+ }
+ /**
+ * Protobuf type {@code orc.proto.BucketStatistics}
+ */
+ public static final class BucketStatistics extends
+ com.google.protobuf.GeneratedMessage
+ implements BucketStatisticsOrBuilder {
+ // Use BucketStatistics.newBuilder() to construct.
+ private BucketStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
+ super(builder);
+ this.unknownFields = builder.getUnknownFields();
+ }
+ private BucketStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
+
+ private static final BucketStatistics defaultInstance;
+ public static BucketStatistics getDefaultInstance() {
+ return defaultInstance;
+ }
+
+ public BucketStatistics getDefaultInstanceForType() {
+ return defaultInstance;
+ }
+
+ private final com.google.protobuf.UnknownFieldSet unknownFields;
+ @java.lang.Override
+ public final com.google.protobuf.UnknownFieldSet
+ getUnknownFields() {
+ return this.unknownFields;
+ }
+ private BucketStatistics(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ initFields();
+ int mutable_bitField0_ = 0;
+ com.google.protobuf.UnknownFieldSet.Builder unknownFields =
+ com.google.protobuf.UnknownFieldSet.newBuilder();
+ try {
+ boolean done = false;
+ while (!done) {
+ int tag = input.readTag();
+ switch (tag) {
+ case 0:
+ done = true;
+ break;
+ default: {
+ if (!parseUnknownField(input, unknownFields,
+ extensionRegistry, tag)) {
+ done = true;
+ }
+ break;
+ }
+ case 8: {
+ if (!((mutable_bitField0_ & 0x00000001) == 0x00000001)) {
+ count_ = new java.util.ArrayList<java.lang.Long>();
+ mutable_bitField0_ |= 0x00000001;
+ }
+ count_.add(input.readUInt64());
+ break;
+ }
+ case 10: {
+ int length = input.readRawVarint32();
+ int limit = input.pushLimit(length);
+ if (!((mutable_bitField0_ & 0x00000001) == 0x00000001) && input.getBytesUntilLimit() > 0) {
+ count_ = new java.util.ArrayList<java.lang.Long>();
+ mutable_bitField0_ |= 0x00000001;
+ }
+ while (input.getBytesUntilLimit() > 0) {
+ count_.add(input.readUInt64());
+ }
+ input.popLimit(limit);
+ break;
+ }
+ }
+ }
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ throw e.setUnfinishedMessage(this);
+ } catch (java.io.IOException e) {
+ throw new com.google.protobuf.InvalidProtocolBufferException(
+ e.getMessage()).setUnfinishedMessage(this);
+ } finally {
+ if (((mutable_bitField0_ & 0x00000001) == 0x00000001)) {
+ count_ = java.util.Collections.unmodifiableList(count_);
+ }
+ this.unknownFields = unknownFields.build();
+ makeExtensionsImmutable();
+ }
+ }
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return OrcProto.internal_static_orc_proto_BucketStatistics_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return OrcProto.internal_static_orc_proto_BucketStatistics_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ OrcProto.BucketStatistics.class, OrcProto.BucketStatistics.Builder.class);
+ }
+
+ public static com.google.protobuf.Parser<BucketStatistics> PARSER =
+ new com.google.protobuf.AbstractParser<BucketStatistics>() {
+ public BucketStatistics parsePartialFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return new BucketStatistics(input, extensionRegistry);
+ }
+ };
+
+ @java.lang.Override
+ public com.google.protobuf.Parser<BucketStatistics> getParserForType() {
+ return PARSER;
+ }
+
+ // repeated uint64 count = 1 [packed = true];
+ public static final int COUNT_FIELD_NUMBER = 1;
+ private java.util.List<java.lang.Long> count_;
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public java.util.List<java.lang.Long>
+ getCountList() {
+ return count_;
+ }
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public int getCountCount() {
+ return count_.size();
+ }
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public long getCount(int index) {
+ return count_.get(index);
+ }
+ private int countMemoizedSerializedSize = -1;
+
+ private void initFields() {
+ count_ = java.util.Collections.emptyList();
+ }
+ private byte memoizedIsInitialized = -1;
+ public final boolean isInitialized() {
+ byte isInitialized = memoizedIsInitialized;
+ if (isInitialized != -1) return isInitialized == 1;
+
+ memoizedIsInitialized = 1;
+ return true;
+ }
+
+ public void writeTo(com.google.protobuf.CodedOutputStream output)
+ throws java.io.IOException {
+ getSerializedSize();
+ if (getCountList().size() > 0) {
+ output.writeRawVarint32(10);
+ output.writeRawVarint32(countMemoizedSerializedSize);
+ }
+ for (int i = 0; i < count_.size(); i++) {
+ output.writeUInt64NoTag(count_.get(i));
+ }
+ getUnknownFields().writeTo(output);
+ }
+
+ private int memoizedSerializedSize = -1;
+ public int getSerializedSize() {
+ int size = memoizedSerializedSize;
+ if (size != -1) return size;
+
+ size = 0;
+ {
+ int dataSize = 0;
+ for (int i = 0; i < count_.size(); i++) {
+ dataSize += com.google.protobuf.CodedOutputStream
+ .computeUInt64SizeNoTag(count_.get(i));
+ }
+ size += dataSize;
+ if (!getCountList().isEmpty()) {
+ size += 1;
+ size += com.google.protobuf.CodedOutputStream
+ .computeInt32SizeNoTag(dataSize);
+ }
+ countMemoizedSerializedSize = dataSize;
+ }
+ size += getUnknownFields().getSerializedSize();
+ memoizedSerializedSize = size;
+ return size;
+ }
+
+ private static final long serialVersionUID = 0L;
+ @java.lang.Override
+ protected java.lang.Object writeReplace()
+ throws java.io.ObjectStreamException {
+ return super.writeReplace();
+ }
+
+ public static OrcProto.BucketStatistics parseFrom(
+ com.google.protobuf.ByteString data)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data);
+ }
+ public static OrcProto.BucketStatistics parseFrom(
+ com.google.protobuf.ByteString data,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data, extensionRegistry);
+ }
+ public static OrcProto.BucketStatistics parseFrom(byte[] data)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data);
+ }
+ public static OrcProto.BucketStatistics parseFrom(
+ byte[] data,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return PARSER.parseFrom(data, extensionRegistry);
+ }
+ public static OrcProto.BucketStatistics parseFrom(java.io.InputStream input)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input);
+ }
+ public static OrcProto.BucketStatistics parseFrom(
+ java.io.InputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input, extensionRegistry);
+ }
+ public static OrcProto.BucketStatistics parseDelimitedFrom(java.io.InputStream input)
+ throws java.io.IOException {
+ return PARSER.parseDelimitedFrom(input);
+ }
+ public static OrcProto.BucketStatistics parseDelimitedFrom(
+ java.io.InputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseDelimitedFrom(input, extensionRegistry);
+ }
+ public static OrcProto.BucketStatistics parseFrom(
+ com.google.protobuf.CodedInputStream input)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input);
+ }
+ public static OrcProto.BucketStatistics parseFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return PARSER.parseFrom(input, extensionRegistry);
+ }
+
+ public static Builder newBuilder() { return Builder.create(); }
+ public Builder newBuilderForType() { return newBuilder(); }
+ public static Builder newBuilder(OrcProto.BucketStatistics prototype) {
+ return newBuilder().mergeFrom(prototype);
+ }
+ public Builder toBuilder() { return newBuilder(this); }
+
+ @java.lang.Override
+ protected Builder newBuilderForType(
+ com.google.protobuf.GeneratedMessage.BuilderParent parent) {
+ Builder builder = new Builder(parent);
+ return builder;
+ }
+ /**
+ * Protobuf type {@code orc.proto.BucketStatistics}
+ */
+ public static final class Builder extends
+ com.google.protobuf.GeneratedMessage.Builder<Builder>
+ implements OrcProto.BucketStatisticsOrBuilder {
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return OrcProto.internal_static_orc_proto_BucketStatistics_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return OrcProto.internal_static_orc_proto_BucketStatistics_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ OrcProto.BucketStatistics.class, OrcProto.BucketStatistics.Builder.class);
+ }
+
+ // Construct using OrcProto.BucketStatistics.newBuilder()
+ private Builder() {
+ maybeForceBuilderInitialization();
+ }
+
+ private Builder(
+ com.google.protobuf.GeneratedMessage.BuilderParent parent) {
+ super(parent);
+ maybeForceBuilderInitialization();
+ }
+ private void maybeForceBuilderInitialization() {
+ if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
+ }
+ }
+ private static Builder create() {
+ return new Builder();
+ }
+
+ public Builder clear() {
+ super.clear();
+ count_ = java.util.Collections.emptyList();
+ bitField0_ = (bitField0_ & ~0x00000001);
+ return this;
+ }
+
+ public Builder clone() {
+ return create().mergeFrom(buildPartial());
+ }
+
+ public com.google.protobuf.Descriptors.Descriptor
+ getDescriptorForType() {
+ return OrcProto.internal_static_orc_proto_BucketStatistics_descriptor;
+ }
+
+ public OrcProto.BucketStatistics getDefaultInstanceForType() {
+ return OrcProto.BucketStatistics.getDefaultInstance();
+ }
+
+ public OrcProto.BucketStatistics build() {
+ OrcProto.BucketStatistics result = buildPartial();
+ if (!result.isInitialized()) {
+ throw newUninitializedMessageException(result);
+ }
+ return result;
+ }
+
+ public OrcProto.BucketStatistics buildPartial() {
+ OrcProto.BucketStatistics result = new OrcProto.BucketStatistics(this);
+ int from_bitField0_ = bitField0_;
+ if (((bitField0_ & 0x00000001) == 0x00000001)) {
+ count_ = java.util.Collections.unmodifiableList(count_);
+ bitField0_ = (bitField0_ & ~0x00000001);
+ }
+ result.count_ = count_;
+ onBuilt();
+ return result;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.Message other) {
+ if (other instanceof OrcProto.BucketStatistics) {
+ return mergeFrom((OrcProto.BucketStatistics)other);
+ } else {
+ super.mergeFrom(other);
+ return this;
+ }
+ }
+
+ public Builder mergeFrom(OrcProto.BucketStatistics other) {
+ if (other == OrcProto.BucketStatistics.getDefaultInstance()) return this;
+ if (!other.count_.isEmpty()) {
+ if (count_.isEmpty()) {
+ count_ = other.count_;
+ bitField0_ = (bitField0_ & ~0x00000001);
+ } else {
+ ensureCountIsMutable();
+ count_.addAll(other.count_);
+ }
+ onChanged();
+ }
+ this.mergeUnknownFields(other.getUnknownFields());
+ return this;
+ }
+
+ public final boolean isInitialized() {
+ return true;
+ }
+
+ public Builder mergeFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ OrcProto.BucketStatistics parsedMessage = null;
+ try {
+ parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry);
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ parsedMessage = (OrcProto.BucketStatistics) e.getUnfinishedMessage();
+ throw e;
+ } finally {
+ if (parsedMessage != null) {
+ mergeFrom(parsedMessage);
+ }
+ }
+ return this;
+ }
+ private int bitField0_;
+
+ // repeated uint64 count = 1 [packed = true];
+ private java.util.List<java.lang.Long> count_ = java.util.Collections.emptyList();
+ private void ensureCountIsMutable() {
+ if (!((bitField0_ & 0x00000001) == 0x00000001)) {
+ count_ = new java.util.ArrayList<java.lang.Long>(count_);
+ bitField0_ |= 0x00000001;
+ }
+ }
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public java.util.List<java.lang.Long>
+ getCountList() {
+ return java.util.Collections.unmodifiableList(count_);
+ }
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public int getCountCount() {
+ return count_.size();
+ }
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public long getCount(int index) {
+ return count_.get(index);
+ }
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public Builder setCount(
+ int index, long value) {
+ ensureCountIsMutable();
+ count_.set(index, value);
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public Builder addCount(long value) {
+ ensureCountIsMutable();
+ count_.add(value);
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public Builder addAllCount(
+ java.lang.Iterable<? extends java.lang.Long> values) {
+ ensureCountIsMutable();
+ super.addAll(values, count_);
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>repeated uint64 count = 1 [packed = true];</code>
+ */
+ public Builder clearCount() {
+ count_ = java.util.Collections.emptyList();
+ bitField0_ = (bitField0_ & ~0x00000001);
+ onChanged();
+ return this;
+ }
+
+ // @@protoc_insertion_point(builder_scope:orc.proto.BucketStatistics)
+ }
+
+ static {
+ defaultInstance = new BucketStatistics(true);
+ defaultInstance.initFields();
+ }
+
+ // @@protoc_insertion_point(class_scope:orc.proto.BucketStatistics)
+ }
+
+ public interface DecimalStatisticsOrBuilder
+ extends com.google.protobuf.MessageOrBuilder {
+
+ // optional string minimum = 1;
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ boolean hasMinimum();
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ java.lang.String getMinimum();
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ com.google.protobuf.ByteString
+ getMinimumBytes();
+
+ // optional string maximum = 2;
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ boolean hasMaximum();
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ java.lang.String getMaximum();
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ com.google.protobuf.ByteString
+ getMaximumBytes();
+
+ // optional string sum = 3;
+ /**
+ * <code>optional string sum = 3;</code>
+ */
+ boolean hasSum();
+ /**
+ * <code>optional string sum = 3;</code>
+ */
+ java.lang.String getSum();
+ /**
+ * <code>optional string sum = 3;</code>
+ */
+ com.google.protobuf.ByteString
+ getSumBytes();
+ }
+ /**
+ * Protobuf type {@code orc.proto.DecimalStatistics}
+ */
+ public static final class DecimalStatistics extends
+ com.google.protobuf.GeneratedMessage
+ implements DecimalStatisticsOrBuilder {
+ // Use DecimalStatistics.newBuilder() to construct.
+ private DecimalStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
+ super(builder);
+ this.unknownFields = builder.getUnknownFields();
+ }
+ private DecimalStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
+
+ private static final DecimalStatistics defaultInstance;
+ public static DecimalStatistics getDefaultInstance() {
+ return defaultInstance;
+ }
+
+ public DecimalStatistics getDefaultInstanceForType() {
+ return defaultInstance;
+ }
+
+ private final com.google.protobuf.UnknownFieldSet unknownFields;
+ @java.lang.Override
+ public final com.google.protobuf.UnknownFieldSet
+ getUnknownFields() {
+ return this.unknownFields;
+ }
+ private DecimalStatistics(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ initFields();
+ int mutable_bitField0_ = 0;
+ com.google.protobuf.UnknownFieldSet.Builder unknownFields =
+ com.google.protobuf.UnknownFieldSet.newBuilder();
+ try {
+ boolean done = false;
+ while (!done) {
+ int tag = input.readTag();
+ switch (tag) {
+ case 0:
+ done = true;
+ break;
+ default: {
+ if (!parseUnknownField(input, unknownFields,
+ extensionRegistry, tag)) {
+ done = true;
+ }
+ break;
+ }
+ case 10: {
+ bitField0_ |= 0x00000001;
+ minimum_ = input.readBytes();
+ break;
+ }
+ case 18: {
+ bitField0_ |= 0x00000002;
+ maximum_ = input.readBytes();
+ break;
+ }
+ case 26: {
+ bitField0_ |= 0x00000004;
+ sum_ = input.readBytes();
+ break;
+ }
+ }
+ }
+ } catch (com.google.protobuf.InvalidProtocolBufferException e) {
+ throw e.setUnfinishedMessage(this);
+ } catch (java.io.IOException e) {
+ throw new com.google.protobuf.InvalidProtocolBufferException(
+ e.getMessage()).setUnfinishedMessage(this);
+ } finally {
+ this.unknownFields = unknownFields.build();
+ makeExtensionsImmutable();
+ }
+ }
+ public static final com.google.protobuf.Descriptors.Descriptor
+ getDescriptor() {
+ return OrcProto.internal_static_orc_proto_DecimalStatistics_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
+ internalGetFieldAccessorTable() {
+ return OrcProto.internal_static_orc_proto_DecimalStatistics_fieldAccessorTable
+ .ensureFieldAccessorsInitialized(
+ OrcProto.DecimalStatistics.class, OrcProto.DecimalStatistics.Builder.class);
+ }
+
+ public static com.google.protobuf.Parser<DecimalStatistics> PARSER =
+ new com.google.protobuf.AbstractParser<DecimalStatistics>() {
+ public DecimalStatistics parsePartialFrom(
+ com.google.protobuf.CodedInputStream input,
+ com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return new DecimalStatistics(input, extensionRegistry);
+ }
+ };
+
+ @java.lang.Override
+ public com.google.protobuf.Parser<DecimalStatistics> getParserForType() {
+ return PARSER;
+ }
+
+ private int bitField0_;
+ // optional string minimum = 1;
+ public static final int MINIMUM_FIELD_NUMBER = 1;
+ private java.lang.Object minimum_;
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public boolean hasMinimum() {
+ return ((bitField0_ & 0x00000001) == 0x00000001);
+ }
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public java.lang.String getMinimum() {
+ java.lang.Object ref = minimum_;
+ if (ref instanceof java.lang.String) {
+ return (java.lang.String) ref;
+ } else {
+ com.google.protobuf.ByteString bs =
+ (com.google.protobuf.ByteString) ref;
+ java.lang.String s = bs.toStringUtf8();
+ if (bs.isValidUtf8()) {
+ minimum_ = s;
+ }
+ return s;
+ }
+ }
+ /**
+ * <code>optional string minimum = 1;</code>
+ */
+ public com.google.protobuf.ByteString
+ getMinimumBytes() {
+ java.lang.Object ref = minimum_;
+ if (ref instanceof java.lang.String) {
+ com.google.protobuf.ByteString b =
+ com.google.protobuf.ByteString.copyFromUtf8(
+ (java.lang.String) ref);
+ minimum_ = b;
+ return b;
+ } else {
+ return (com.google.protobuf.ByteString) ref;
+ }
+ }
+
+ // optional string maximum = 2;
+ public static final int MAXIMUM_FIELD_NUMBER = 2;
+ private java.lang.Object maximum_;
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public boolean hasMaximum() {
+ return ((bitField0_ & 0x00000002) == 0x00000002);
+ }
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public java.lang.String getMaximum() {
+ java.lang.Object ref = maximum_;
+ if (ref instanceof java.lang.String) {
+ return (java.lang.String) ref;
+ } else {
+ com.google.protobuf.ByteString bs =
+ (com.google.protobuf.ByteString) ref;
+ java.lang.String s = bs.toStringUtf8();
+ if (bs.isValidUtf8()) {
+ maximum_ = s;
+ }
+ return s;
+ }
+ }
+ /**
+ * <code>optional string maximum = 2;</code>
+ */
+ public com.google.protobuf.ByteString
+ getMaximumBytes() {
+ java.lang.Object ref = maximum_;
+ if (ref instanceof java.lang.String) {
+ com.google.protobuf.ByteString b =
+ com.google.protobuf.ByteString.copyFromUtf8(
+ (java.lang.String) ref);
+ maximum_ = b;
+ return b;
+ } else {
+ return (com.google.protobuf.ByteString) ref;
+ }
+ }
+
+ // optional string sum = 3;
+ public static final int SUM_FIELD_NUMBER = 3;
+ private java.lang.Object sum_;
+ /**
+ * <code>optional string sum = 3;</code>
+ */
+ public boolean hasSum() {
+ return ((bitField0_ & 0x00000004) == 0x00000004);
+ }
+ /**
+ * <code>optional string sum = 3;</code>
+ */
+ public java.lang.String getSum() {
+ java.lang.Object ref = sum_;
+ if (ref instanceof java.lang.String) {
+ return (java.lang.String) ref;
+ } else {
+ com.google.protobuf.ByteString bs =
+ (com.google.protobuf.ByteString) ref;
+ java.lang.String s = bs.toStringUtf8();
+ if (bs.isValidUtf8()) {
+ sum_ = s;
+ }
+ return s;
+
<TRUNCATED>
[32/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/ConvertTreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/ConvertTreeReaderFactory.java b/orc/src/java/org/apache/hive/orc/impl/ConvertTreeReaderFactory.java
new file mode 100644
index 0000000..21020b8
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/ConvertTreeReaderFactory.java
@@ -0,0 +1,2892 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.EnumMap;
+import java.util.Map;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.OrcProto;
+
+/**
+ * Convert ORC tree readers.
+ */
+public class ConvertTreeReaderFactory extends TreeReaderFactory {
+
+ /**
+ * Override methods like checkEncoding to pass-thru to the convert TreeReader.
+ */
+ public static class ConvertTreeReader extends TreeReader {
+
+ private TreeReader convertTreeReader;
+
+ ConvertTreeReader(int columnId) throws IOException {
+ super(columnId);
+ }
+
+ // The ordering of types here is used to determine which numeric types
+ // are common/convertible to one another. Probably better to rely on the
+ // ordering explicitly defined here than to assume that the enum values
+ // that were arbitrarily assigned in PrimitiveCategory work for our purposes.
+ private static EnumMap<TypeDescription.Category, Integer> numericTypes =
+ new EnumMap<>(TypeDescription.Category.class);
+
+ static {
+ registerNumericType(TypeDescription.Category.BOOLEAN, 1);
+ registerNumericType(TypeDescription.Category.BYTE, 2);
+ registerNumericType(TypeDescription.Category.SHORT, 3);
+ registerNumericType(TypeDescription.Category.INT, 4);
+ registerNumericType(TypeDescription.Category.LONG, 5);
+ registerNumericType(TypeDescription.Category.FLOAT, 6);
+ registerNumericType(TypeDescription.Category.DOUBLE, 7);
+ registerNumericType(TypeDescription.Category.DECIMAL, 8);
+ }
+
+ private static void registerNumericType(TypeDescription.Category kind, int level) {
+ numericTypes.put(kind, level);
+ }
+
+ protected void setConvertTreeReader(TreeReader convertTreeReader) {
+ this.convertTreeReader = convertTreeReader;
+ }
+
+ protected TreeReader getStringGroupTreeReader(int columnId,
+ TypeDescription fileType) throws IOException {
+ switch (fileType.getCategory()) {
+ case STRING:
+ return new StringTreeReader(columnId);
+ case CHAR:
+ return new CharTreeReader(columnId, fileType.getMaxLength());
+ case VARCHAR:
+ return new VarcharTreeReader(columnId, fileType.getMaxLength());
+ default:
+ throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
+ }
+ }
+
+ protected void assignStringGroupVectorEntry(BytesColumnVector bytesColVector,
+ int elementNum, TypeDescription readerType, byte[] bytes) {
+ assignStringGroupVectorEntry(bytesColVector,
+ elementNum, readerType, bytes, 0, bytes.length);
+ }
+
+ /*
+ * Assign a BytesColumnVector entry when we have a byte array, start, and
+ * length for the string group which can be (STRING, CHAR, VARCHAR).
+ */
+ protected void assignStringGroupVectorEntry(BytesColumnVector bytesColVector,
+ int elementNum, TypeDescription readerType, byte[] bytes, int start, int length) {
+ switch (readerType.getCategory()) {
+ case STRING:
+ bytesColVector.setVal(elementNum, bytes, start, length);
+ break;
+ case CHAR:
+ {
+ int adjustedDownLen =
+ StringExpr.rightTrimAndTruncate(bytes, start, length, readerType.getMaxLength());
+ bytesColVector.setVal(elementNum, bytes, start, adjustedDownLen);
+ }
+ break;
+ case VARCHAR:
+ {
+ int adjustedDownLen =
+ StringExpr.truncate(bytes, start, length, readerType.getMaxLength());
+ bytesColVector.setVal(elementNum, bytes, start, adjustedDownLen);
+ }
+ break;
+ default:
+ throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
+ }
+ }
+
+ protected void convertStringGroupVectorElement(BytesColumnVector bytesColVector,
+ int elementNum, TypeDescription readerType) {
+ switch (readerType.getCategory()) {
+ case STRING:
+ // No conversion needed.
+ break;
+ case CHAR:
+ {
+ int length = bytesColVector.length[elementNum];
+ int adjustedDownLen = StringExpr
+ .rightTrimAndTruncate(bytesColVector.vector[elementNum],
+ bytesColVector.start[elementNum], length,
+ readerType.getMaxLength());
+ if (adjustedDownLen < length) {
+ bytesColVector.length[elementNum] = adjustedDownLen;
+ }
+ }
+ break;
+ case VARCHAR:
+ {
+ int length = bytesColVector.length[elementNum];
+ int adjustedDownLen = StringExpr
+ .truncate(bytesColVector.vector[elementNum],
+ bytesColVector.start[elementNum], length,
+ readerType.getMaxLength());
+ if (adjustedDownLen < length) {
+ bytesColVector.length[elementNum] = adjustedDownLen;
+ }
+ }
+ break;
+ default:
+ throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
+ }
+ }
+
+ private boolean isParseError;
+
+ /*
+ * We do this because we want the various parse methods to return a primitive.
+ *
+ * @return true if there was a parse error in the last call to
+ * parseLongFromString, etc.
+ */
+ protected boolean getIsParseError() {
+ return isParseError;
+ }
+
+ protected long parseLongFromString(String string) {
+ try {
+ long longValue = Long.parseLong(string);
+ isParseError = false;
+ return longValue;
+ } catch (NumberFormatException e) {
+ isParseError = true;
+ return 0;
+ }
+ }
+
+ protected float parseFloatFromString(String string) {
+ try {
+ float floatValue = Float.parseFloat(string);
+ isParseError = false;
+ return floatValue;
+ } catch (NumberFormatException e) {
+ isParseError = true;
+ return Float.NaN;
+ }
+ }
+
+ protected double parseDoubleFromString(String string) {
+ try {
+ double value = Double.parseDouble(string);
+ isParseError = false;
+ return value;
+ } catch (NumberFormatException e) {
+ isParseError = true;
+ return Double.NaN;
+ }
+ }
+
+ /**
+ * @param string
+ * @return the HiveDecimal parsed, or null if there was a parse error.
+ */
+ protected HiveDecimal parseDecimalFromString(String string) {
+ try {
+ HiveDecimal value = HiveDecimal.create(string);
+ return value;
+ } catch (NumberFormatException e) {
+ return null;
+ }
+ }
+
+ /**
+ * @param string
+ * @return the Timestamp parsed, or null if there was a parse error.
+ */
+ protected Timestamp parseTimestampFromString(String string) {
+ try {
+ Timestamp value = Timestamp.valueOf(string);
+ return value;
+ } catch (IllegalArgumentException e) {
+ return null;
+ }
+ }
+
+ /**
+ * @param string
+ * @return the Date parsed, or null if there was a parse error.
+ */
+ protected Date parseDateFromString(String string) {
+ try {
+ Date value = Date.valueOf(string);
+ return value;
+ } catch (IllegalArgumentException e) {
+ return null;
+ }
+ }
+
+ protected String stringFromBytesColumnVectorEntry(
+ BytesColumnVector bytesColVector, int elementNum) {
+ String string;
+
+ string = new String(
+ bytesColVector.vector[elementNum],
+ bytesColVector.start[elementNum], bytesColVector.length[elementNum],
+ StandardCharsets.UTF_8);
+
+ return string;
+ }
+
+ private static final double MIN_LONG_AS_DOUBLE = -0x1p63;
+ /*
+ * We cannot store Long.MAX_VALUE as a double without losing precision. Instead, we store
+ * Long.MAX_VALUE + 1 == -Long.MIN_VALUE, and then offset all comparisons by 1.
+ */
+ private static final double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+
+ public boolean doubleCanFitInLong(double doubleValue) {
+
+ // Borrowed from Guava DoubleMath.roundToLong except do not want dependency on Guava and we
+ // don't want to catch an exception.
+
+ return ((MIN_LONG_AS_DOUBLE - doubleValue < 1.0) &&
+ (doubleValue < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+ }
+
+ @Override
+ void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
+ // Pass-thru.
+ convertTreeReader.checkEncoding(encoding);
+ }
+
+ @Override
+ void startStripe(Map<StreamName, InStream> streams,
+ OrcProto.StripeFooter stripeFooter
+ ) throws IOException {
+ // Pass-thru.
+ convertTreeReader.startStripe(streams, stripeFooter);
+ }
+
+ @Override
+ public void seek(PositionProvider[] index) throws IOException {
+ // Pass-thru.
+ convertTreeReader.seek(index);
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ // Pass-thru.
+ convertTreeReader.seek(index);
+ }
+
+ @Override
+ void skipRows(long items) throws IOException {
+ // Pass-thru.
+ convertTreeReader.skipRows(items);
+ }
+
+ /**
+ * Override this to use convertVector.
+ * Source and result are member variables in the subclass with the right
+ * type.
+ * @param elementNum
+ * @throws IOException
+ */
+ // Override this to use convertVector.
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ throw new RuntimeException("Expected this method to be overridden");
+ }
+
+ // Common code used by the conversion.
+ public void convertVector(ColumnVector fromColVector,
+ ColumnVector resultColVector, final int batchSize) throws IOException {
+
+ resultColVector.reset();
+ if (fromColVector.isRepeating) {
+ resultColVector.isRepeating = true;
+ if (fromColVector.noNulls || !fromColVector.isNull[0]) {
+ setConvertVectorElement(0);
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[0] = true;
+ }
+ } else if (fromColVector.noNulls){
+ for (int i = 0; i < batchSize; i++) {
+ setConvertVectorElement(i);
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!fromColVector.isNull[i]) {
+ setConvertVectorElement(i);
+ } else {
+ resultColVector.noNulls = false;
+ resultColVector.isNull[i] = true;
+ }
+ }
+ }
+ }
+
+ public void downCastAnyInteger(LongColumnVector longColVector, int elementNum,
+ TypeDescription readerType) {
+ downCastAnyInteger(longColVector, elementNum, longColVector.vector[elementNum], readerType);
+ }
+
+ public void downCastAnyInteger(LongColumnVector longColVector, int elementNum, long inputLong,
+ TypeDescription readerType) {
+ long[] vector = longColVector.vector;
+ long outputLong;
+ TypeDescription.Category readerCategory = readerType.getCategory();
+ switch (readerCategory) {
+ case BOOLEAN:
+ // No data loss for boolean.
+ vector[elementNum] = inputLong == 0 ? 0 : 1;
+ return;
+ case BYTE:
+ outputLong = (byte) inputLong;
+ break;
+ case SHORT:
+ outputLong = (short) inputLong;
+ break;
+ case INT:
+ outputLong = (int) inputLong;
+ break;
+ case LONG:
+ // No data loss for long.
+ vector[elementNum] = inputLong;
+ return;
+ default:
+ throw new RuntimeException("Unexpected type kind " + readerCategory.name());
+ }
+
+ if (outputLong != inputLong) {
+ // Data loss.
+ longColVector.isNull[elementNum] = true;
+ longColVector.noNulls = false;
+ } else {
+ vector[elementNum] = outputLong;
+ }
+ }
+
+ protected boolean integerDownCastNeeded(TypeDescription fileType, TypeDescription readerType) {
+ Integer fileLevel = numericTypes.get(fileType.getCategory());
+ Integer schemaLevel = numericTypes.get(readerType.getCategory());
+ return (schemaLevel.intValue() < fileLevel.intValue());
+ }
+ }
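
A down-cast that loses data is turned into a null entry rather than a silently wrapped value. A minimal standalone sketch of that idea, assuming nothing beyond plain Java (illustrative only, not part of this patch; a long[] and boolean[] stand in for LongColumnVector):

public class DownCastSketch {
  public static void main(String[] args) {
    long[] vector = {127, 300, -42};          // hypothetical values read from the file
    boolean[] isNull = new boolean[vector.length];
    for (int i = 0; i < vector.length; i++) {
      long input = vector[i];
      long output = (byte) input;             // down-cast to the reader type (BYTE here)
      if (output != input) {
        isNull[i] = true;                     // data loss: mark the entry null
      } else {
        vector[i] = output;
      }
      System.out.println(input + " -> " + (isNull[i] ? "null" : Long.toString(vector[i])));
    }
  }
}

Running it prints 127 and -42 unchanged and maps 300 to null, which mirrors how downCastAnyInteger handles a BYTE reader type.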
+
+ public static class AnyIntegerTreeReader extends ConvertTreeReader {
+
+ private TypeDescription.Category fileTypeCategory;
+ private TreeReader anyIntegerTreeReader;
+
+ private long longValue;
+
+ AnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.fileTypeCategory = fileType.getCategory();
+ switch (fileTypeCategory) {
+ case BOOLEAN:
+ anyIntegerTreeReader = new BooleanTreeReader(columnId);
+ break;
+ case BYTE:
+ anyIntegerTreeReader = new ByteTreeReader(columnId);
+ break;
+ case SHORT:
+ anyIntegerTreeReader = new ShortTreeReader(columnId);
+ break;
+ case INT:
+ anyIntegerTreeReader = new IntTreeReader(columnId);
+ break;
+ case LONG:
+ anyIntegerTreeReader = new LongTreeReader(columnId, skipCorrupt);
+ break;
+ default:
+ throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
+ }
+ setConvertTreeReader(anyIntegerTreeReader);
+ }
+
+ protected long getLong() throws IOException {
+ return longValue;
+ }
+
+ protected String getString(long longValue) {
+ if (fileTypeCategory == TypeDescription.Category.BOOLEAN) {
+ return longValue == 0 ? "FALSE" : "TRUE";
+ } else {
+ return Long.toString(longValue);
+ }
+ }
+
+ protected String getString() {
+ return getString(longValue);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ anyIntegerTreeReader.nextVector(previousVector, isNull, batchSize);
+ }
+ }
+
+ public static class AnyIntegerFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private final TypeDescription readerType;
+ private final boolean downCastNeeded;
+
+ AnyIntegerFromAnyIntegerTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ anyIntegerAsLongTreeReader = new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ downCastNeeded = integerDownCastNeeded(fileType, readerType);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ anyIntegerAsLongTreeReader.nextVector(previousVector, isNull, batchSize);
+ LongColumnVector resultColVector = (LongColumnVector) previousVector;
+ if (downCastNeeded) {
+ if (resultColVector.isRepeating) {
+ if (resultColVector.noNulls || !resultColVector.isNull[0]) {
+ downCastAnyInteger(resultColVector, 0, readerType);
+ } else {
+ // Result remains null.
+ }
+ } else if (resultColVector.noNulls){
+ for (int i = 0; i < batchSize; i++) {
+ downCastAnyInteger(resultColVector, i, readerType);
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!resultColVector.isNull[i]) {
+ downCastAnyInteger(resultColVector, i, readerType);
+ } else {
+ // Result remains null.
+ }
+ }
+ }
+ }
+ }
+ }
+
+ public static class AnyIntegerFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ private final TypeDescription readerType;
+ private DoubleColumnVector doubleColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromFloatTreeReader(int columnId, TypeDescription readerType)
+ throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ double doubleValue = doubleColVector.vector[elementNum];
+ if (!doubleCanFitInLong(doubleValue)) {
+ longColVector.isNull[elementNum] = true;
+ longColVector.noNulls = false;
+ } else {
+ // UNDONE: Does the overflow check above using double really work here for float?
+ float floatValue = (float) doubleValue;
+ downCastAnyInteger(longColVector, elementNum, (long) floatValue, readerType);
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, longColVector, batchSize);
+ }
+ }
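
The doubleCanFitInLong check above rejects values outside the long range before casting, so out-of-range floating-point values become nulls instead of wrapped longs. A self-contained sketch of the same range test (illustrative only, not from this patch):

public class DoubleFitsLongSketch {
  private static final double MIN_LONG_AS_DOUBLE = -0x1p63;
  private static final double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;

  static boolean doubleCanFitInLong(double d) {
    // Same comparison as in ConvertTreeReader.doubleCanFitInLong above.
    return (MIN_LONG_AS_DOUBLE - d < 1.0) && (d < MAX_LONG_AS_DOUBLE_PLUS_ONE);
  }

  public static void main(String[] args) {
    // In range, above Long.MAX_VALUE, below Long.MIN_VALUE.
    double[] samples = {1234.5, 9.3e18, -9.3e18};
    for (double d : samples) {
      System.out.println(d + " fits in long: " + doubleCanFitInLong(d));
    }
  }
}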
+
+ public static class AnyIntegerFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ private final TypeDescription readerType;
+ private DoubleColumnVector doubleColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromDoubleTreeReader(int columnId, TypeDescription readerType)
+ throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ double doubleValue = doubleColVector.vector[elementNum];
+ if (!doubleCanFitInLong(doubleValue)) {
+ longColVector.isNull[elementNum] = true;
+ longColVector.noNulls = false;
+ } else {
+ downCastAnyInteger(longColVector, elementNum, (long) doubleValue, readerType);
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class AnyIntegerFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private final int precision;
+ private final int scale;
+ private final TypeDescription readerType;
+ private DecimalColumnVector decimalColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromDecimalTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ this.readerType = readerType;
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ }
+
+ private static HiveDecimal DECIMAL_MAX_LONG = HiveDecimal.create(Long.MAX_VALUE);
+ private static HiveDecimal DECIMAL_MIN_LONG = HiveDecimal.create(Long.MIN_VALUE);
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ HiveDecimal decimalValue = decimalColVector.vector[elementNum].getHiveDecimal();
+ if (decimalValue.compareTo(DECIMAL_MAX_LONG) > 0 ||
+ decimalValue.compareTo(DECIMAL_MIN_LONG) < 0) {
+ longColVector.isNull[elementNum] = true;
+ longColVector.noNulls = false;
+ } else {
+ downCastAnyInteger(longColVector, elementNum, decimalValue.longValue(), readerType);
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class AnyIntegerFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription readerType;
+ private BytesColumnVector bytesColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromStringGroupTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ long longValue = parseLongFromString(string);
+ if (!getIsParseError()) {
+ downCastAnyInteger(longColVector, elementNum, longValue, readerType);
+ } else {
+ longColVector.noNulls = false;
+ longColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, longColVector, batchSize);
+ }
+ }
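
Strings that cannot be parsed are likewise mapped to null entries. A standalone sketch of that parse-or-null pattern, with plain arrays standing in for the column vectors (illustrative only, not part of this patch):

public class ParseLongOrNullSketch {
  public static void main(String[] args) {
    String[] input = {"42", "not-a-number", "-7"};   // hypothetical file values
    long[] output = new long[input.length];
    boolean[] isNull = new boolean[input.length];
    for (int i = 0; i < input.length; i++) {
      try {
        output[i] = Long.parseLong(input[i]);
      } catch (NumberFormatException e) {
        isNull[i] = true;                            // unparseable string becomes null
      }
      System.out.println(input[i] + " -> " + (isNull[i] ? "null" : Long.toString(output[i])));
    }
  }
}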
+
+ public static class AnyIntegerFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private final TypeDescription readerType;
+ private TimestampColumnVector timestampColVector;
+ private LongColumnVector longColVector;
+
+ AnyIntegerFromTimestampTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ // Use TimestampWritable's getSeconds.
+ long longValue = TimestampUtils.millisToSeconds(
+ timestampColVector.asScratchTimestamp(elementNum).getTime());
+ downCastAnyInteger(longColVector, elementNum, longValue, readerType);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class FloatFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private LongColumnVector longColVector;
+ private DoubleColumnVector doubleColVector;
+
+ FloatFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ float floatValue = (float) longColVector.vector[elementNum];
+ if (!Float.isNaN(floatValue)) {
+ doubleColVector.vector[elementNum] = floatValue;
+ } else {
+ doubleColVector.vector[elementNum] = Double.NaN;
+ doubleColVector.noNulls = false;
+ doubleColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class FloatFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ FloatFromDoubleTreeReader(int columnId) throws IOException {
+ super(columnId);
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ doubleTreeReader.nextVector(previousVector, isNull, batchSize);
+
+ DoubleColumnVector resultColVector = (DoubleColumnVector) previousVector;
+ double[] resultVector = resultColVector.vector;
+ if (resultColVector.isRepeating) {
+ if (resultColVector.noNulls || !resultColVector.isNull[0]) {
+ resultVector[0] = (float) resultVector[0];
+ } else {
+ // Remains null.
+ }
+ } else if (resultColVector.noNulls){
+ for (int i = 0; i < batchSize; i++) {
+ resultVector[i] = (float) resultVector[i];
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!resultColVector.isNull[i]) {
+ resultVector[i] = (float) resultVector[i];
+ } else {
+ // Remains null.
+ }
+ }
+ }
+ }
+ }
+
+ public static class FloatFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private final int precision;
+ private final int scale;
+ private DecimalColumnVector decimalColVector;
+ private DoubleColumnVector doubleColVector;
+
+ FloatFromDecimalTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ doubleColVector.vector[elementNum] =
+ (float) decimalColVector.vector[elementNum].doubleValue();
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class FloatFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private BytesColumnVector bytesColVector;
+ private DoubleColumnVector doubleColVector;
+
+ FloatFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ float floatValue = parseFloatFromString(string);
+ if (!getIsParseError()) {
+ doubleColVector.vector[elementNum] = floatValue;
+ } else {
+ doubleColVector.vector[elementNum] = Double.NaN;
+ doubleColVector.noNulls = false;
+ doubleColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class FloatFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private TimestampColumnVector timestampColVector;
+ private DoubleColumnVector doubleColVector;
+
+ FloatFromTimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ doubleColVector.vector[elementNum] = (float) TimestampUtils.getDouble(
+ timestampColVector.asScratchTimestamp(elementNum));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DoubleFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private LongColumnVector longColVector;
+ private DoubleColumnVector doubleColVector;
+
+ DoubleFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+
+ double doubleValue = (double) longColVector.vector[elementNum];
+ if (!Double.isNaN(doubleValue)) {
+ doubleColVector.vector[elementNum] = doubleValue;
+ } else {
+ doubleColVector.vector[elementNum] = Double.NaN;
+ doubleColVector.noNulls = false;
+ doubleColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DoubleFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ DoubleFromFloatTreeReader(int columnId) throws IOException {
+ super(columnId);
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ // We get the DoubleColumnVector produced by the float tree reader first, then iterate through
+ // the elements and do a double -> float -> string -> double conversion to preserve the
+ // precision. When the float tree reader reads a float and assigns it to a double, Java's
+ // widening conversion adds more precision, which will break all comparisons.
+ // Example: float f = 74.72
+ // double d = f ---> 74.72000122070312
+ // Double.parseDouble(String.valueOf(f)) ---> 74.72
+ floatTreeReader.nextVector(previousVector, isNull, batchSize);
+
+ DoubleColumnVector doubleColumnVector = (DoubleColumnVector) previousVector;
+ if (doubleColumnVector.isRepeating) {
+ if (doubleColumnVector.noNulls || !doubleColumnVector.isNull[0]) {
+ final float f = (float) doubleColumnVector.vector[0];
+ doubleColumnVector.vector[0] = Double.parseDouble(String.valueOf(f));
+ }
+ } else if (doubleColumnVector.noNulls){
+ for (int i = 0; i < batchSize; i++) {
+ final float f = (float) doubleColumnVector.vector[i];
+ doubleColumnVector.vector[i] = Double.parseDouble(String.valueOf(f));
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!doubleColumnVector.isNull[i]) {
+ final float f = (float) doubleColumnVector.vector[i];
+ doubleColumnVector.vector[i] = Double.parseDouble(String.valueOf(f));
+ }
+ }
+ }
+ }
+ }
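
The round trip described in the comment above is easy to reproduce in plain Java (illustrative only, not part of this patch), using the same value the comment mentions:

public class FloatWideningSketch {
  public static void main(String[] args) {
    float f = 74.72f;
    double widened = f;                                     // plain widening adds spurious digits
    double roundTripped = Double.parseDouble(String.valueOf(f));
    System.out.println("widened       = " + widened);       // 74.72000122070312
    System.out.println("round tripped = " + roundTripped);  // 74.72
  }
}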
+
+ public static class DoubleFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private final int precision;
+ private final int scale;
+ private DecimalColumnVector decimalColVector;
+ private DoubleColumnVector doubleColVector;
+
+ DoubleFromDecimalTreeReader(int columnId, TypeDescription fileType) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ doubleColVector.vector[elementNum] =
+ decimalColVector.vector[elementNum].doubleValue();
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DoubleFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private BytesColumnVector bytesColVector;
+ private DoubleColumnVector doubleColVector;
+
+ DoubleFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ double doubleValue = parseDoubleFromString(string);
+ if (!getIsParseError()) {
+ doubleColVector.vector[elementNum] = doubleValue;
+ } else {
+ doubleColVector.noNulls = false;
+ doubleColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DoubleFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private TimestampColumnVector timestampColVector;
+ private DoubleColumnVector doubleColVector;
+
+ DoubleFromTimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ doubleColVector.vector[elementNum] = TimestampUtils.getDouble(
+ timestampColVector.asScratchTimestamp(elementNum));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ doubleColVector = (DoubleColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, doubleColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private LongColumnVector longColVector;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromAnyIntegerTreeReader(int columnId, TypeDescription fileType, boolean skipCorrupt)
+ throws IOException {
+ super(columnId);
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ long longValue = longColVector.vector[elementNum];
+ HiveDecimalWritable hiveDecimalWritable = new HiveDecimalWritable(longValue);
+ // The DecimalColumnVector will enforce precision and scale and set the entry to null when out of bounds.
+ decimalColVector.set(elementNum, hiveDecimalWritable);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ private DoubleColumnVector doubleColVector;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromFloatTreeReader(int columnId, TypeDescription readerType)
+ throws IOException {
+ super(columnId);
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ float floatValue = (float) doubleColVector.vector[elementNum];
+ if (!Float.isNaN(floatValue)) {
+ HiveDecimal decimalValue =
+ HiveDecimal.create(Float.toString(floatValue));
+ if (decimalValue != null) {
+ // The DecimalColumnVector will enforce precision and scale and set the entry to null when out of bounds.
+ decimalColVector.set(elementNum, decimalValue);
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ private DoubleColumnVector doubleColVector;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromDoubleTreeReader(int columnId, TypeDescription readerType)
+ throws IOException {
+ super(columnId);
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ HiveDecimal value =
+ HiveDecimal.create(Double.toString(doubleColVector.vector[elementNum]));
+ if (value != null) {
+ decimalColVector.set(elementNum, value);
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private BytesColumnVector bytesColVector;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromStringGroupTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ HiveDecimal value = parseDecimalFromString(string);
+ if (value != null) {
+ // The DecimalColumnVector will enforce precision and scale and set the entry to null when out of bounds.
+ decimalColVector.set(elementNum, value);
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private TimestampColumnVector timestampColVector;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromTimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ double doubleValue = TimestampUtils.getDouble(
+ timestampColVector.asScratchTimestamp(elementNum));
+ HiveDecimal value = HiveDecimal.create(Double.toString(doubleValue));
+ if (value != null) {
+ // The DecimalColumnVector will enforce precision and scale and set the entry to null when out of bounds.
+ decimalColVector.set(elementNum, value);
+ } else {
+ decimalColVector.noNulls = false;
+ decimalColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class DecimalFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private DecimalColumnVector fileDecimalColVector;
+ private int filePrecision;
+ private int fileScale;
+ private int readerPrecision;
+ private int readerScale;
+ private DecimalColumnVector decimalColVector;
+
+ DecimalFromDecimalTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType)
+ throws IOException {
+ super(columnId);
+ filePrecision = fileType.getPrecision();
+ fileScale = fileType.getScale();
+ readerPrecision = readerType.getPrecision();
+ readerScale = readerType.getScale();
+ decimalTreeReader = new DecimalTreeReader(columnId, filePrecision, fileScale);
+ setConvertTreeReader(decimalTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+
+ decimalColVector.set(elementNum, fileDecimalColVector.vector[elementNum]);
+
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (fileDecimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ fileDecimalColVector = new DecimalColumnVector(filePrecision, fileScale);
+ decimalColVector = (DecimalColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(fileDecimalColVector, isNull, batchSize);
+
+ convertVector(fileDecimalColVector, decimalColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private final TypeDescription readerType;
+ private LongColumnVector longColVector;
+ private BytesColumnVector bytesColVector;
+
+ StringGroupFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ long longValue = longColVector.vector[elementNum];
+ String string = anyIntegerAsLongTreeReader.getString(longValue);
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ private final TypeDescription readerType;
+ private DoubleColumnVector doubleColVector;
+ private BytesColumnVector bytesColVector;
+
+
+ StringGroupFromFloatTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ float floatValue = (float) doubleColVector.vector[elementNum];
+ if (!Float.isNaN(floatValue)) {
+ String string = String.valueOf(floatValue);
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ } else {
+ bytesColVector.noNulls = false;
+ bytesColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ private final TypeDescription readerType;
+ private DoubleColumnVector doubleColVector;
+ private BytesColumnVector bytesColVector;
+
+ StringGroupFromDoubleTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ double doubleValue = doubleColVector.vector[elementNum];
+ if (!Double.isNaN(doubleValue)) {
+ String string = String.valueOf(doubleValue);
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ } else {
+ bytesColVector.noNulls = false;
+ bytesColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private int precision;
+ private int scale;
+ private final TypeDescription readerType;
+ private DecimalColumnVector decimalColVector;
+ private BytesColumnVector bytesColVector;
+ private byte[] scratchBuffer;
+
+ StringGroupFromDecimalTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ this.readerType = readerType;
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES];
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ HiveDecimalWritable decWritable = decimalColVector.vector[elementNum];
+
+ // Convert decimal into bytes instead of a String for better performance.
+ final int byteIndex = decWritable.toBytes(scratchBuffer);
+
+ assignStringGroupVectorEntry(
+ bytesColVector, elementNum, readerType,
+ scratchBuffer, byteIndex, HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES - byteIndex);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private final TypeDescription readerType;
+ private TimestampColumnVector timestampColVector;
+ private BytesColumnVector bytesColVector;
+
+ StringGroupFromTimestampTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String string =
+ timestampColVector.asScratchTimestamp(elementNum).toString();
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromDateTreeReader extends ConvertTreeReader {
+
+ private DateTreeReader dateTreeReader;
+
+ private final TypeDescription readerType;
+ private LongColumnVector longColVector;
+ private BytesColumnVector bytesColVector;
+ private Date date;
+
+ StringGroupFromDateTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ dateTreeReader = new DateTreeReader(columnId);
+ setConvertTreeReader(dateTreeReader);
+ date = new Date(0);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ date.setTime(DateWritable.daysToMillis((int) longColVector.vector[elementNum]));
+ String string = date.toString();
+ byte[] bytes = string.getBytes();
+ assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ bytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ dateTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, bytesColVector, batchSize);
+ }
+ }
+
+ public static class StringGroupFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private final TypeDescription readerType;
+
+ StringGroupFromStringGroupTreeReader(int columnId, TypeDescription fileType,
+ TypeDescription readerType) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ stringGroupTreeReader.nextVector(previousVector, isNull, batchSize);
+
+ BytesColumnVector resultColVector = (BytesColumnVector) previousVector;
+
+ if (resultColVector.isRepeating) {
+ if (resultColVector.noNulls || !resultColVector.isNull[0]) {
+ convertStringGroupVectorElement(resultColVector, 0, readerType);
+ } else {
+ // Remains null.
+ }
+ } else if (resultColVector.noNulls) {
+ for (int i = 0; i < batchSize; i++) {
+ convertStringGroupVectorElement(resultColVector, i, readerType);
+ }
+ } else {
+ for (int i = 0; i < batchSize; i++) {
+ if (!resultColVector.isNull[i]) {
+ convertStringGroupVectorElement(resultColVector, i, readerType);
+ } else {
+ // Remains null.
+ }
+ }
+ }
+ }
+ }
+
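+ // Editorial note: STRING, CHAR and VARCHAR all share BytesColumnVector,
+ // so this converter works in place on the reader's vector instead of
+ // using a scratch vector; convertStringGroupVectorElement is assumed to
+ // enforce the target type's declared maximum length (truncating
+ // over-long CHAR/VARCHAR values).
+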
+ public static class StringGroupFromBinaryTreeReader extends ConvertTreeReader {
+
+ private BinaryTreeReader binaryTreeReader;
+
+ private final TypeDescription readerType;
+ private BytesColumnVector inBytesColVector;
+ private BytesColumnVector outBytesColVector;
+
+ StringGroupFromBinaryTreeReader(int columnId, TypeDescription readerType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.readerType = readerType;
+ binaryTreeReader = new BinaryTreeReader(columnId);
+ setConvertTreeReader(binaryTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ byte[] bytes = inBytesColVector.vector[elementNum];
+ int start = inBytesColVector.start[elementNum];
+ int length = inBytesColVector.length[elementNum];
+ byte[] string = new byte[length == 0 ? 0 : 3 * length - 1];
+ for(int p = 0; p < string.length; p += 2) {
+ if (p != 0) {
+ string[p++] = ' ';
+ }
+ int num = 0xff & bytes[start++];
+ int digit = num / 16;
+ string[p] = (byte)((digit) + (digit < 10 ? '0' : 'a' - 10));
+ digit = num % 16;
+ string[p + 1] = (byte)((digit) + (digit < 10 ? '0' : 'a' - 10));
+ }
+ assignStringGroupVectorEntry(outBytesColVector, elementNum, readerType,
+ string, 0, string.length);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (inBytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ inBytesColVector = new BytesColumnVector();
+ outBytesColVector = (BytesColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ binaryTreeReader.nextVector(inBytesColVector, isNull, batchSize);
+
+ convertVector(inBytesColVector, outBytesColVector, batchSize);
+ }
+ }
+
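+ // Editorial note: the loop above renders BINARY as space-separated
+ // lowercase hex, two digits per input byte, hence the 3 * length - 1
+ // output size. For example, the two input bytes {0x1f, 0xa0} become the
+ // five output bytes "1f a0".
+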
+ public static class TimestampFromAnyIntegerTreeReader extends ConvertTreeReader {
+
+ private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
+
+ private LongColumnVector longColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ anyIntegerAsLongTreeReader =
+ new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+ setConvertTreeReader(anyIntegerAsLongTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ long longValue = longColVector.vector[elementNum];
+ // The integer value is interpreted as milliseconds since the epoch.
+ timestampColVector.set(elementNum, new Timestamp(longValue));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, timestampColVector, batchSize);
+ }
+ }
+
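+ // Editorial note: java.sql.Timestamp(long) interprets its argument as
+ // milliseconds since the epoch, so an integer column value of
+ // 1500000000000L converts to 2017-07-14 02:40:00 UTC.
+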
+ public static class TimestampFromFloatTreeReader extends ConvertTreeReader {
+
+ private FloatTreeReader floatTreeReader;
+
+ private DoubleColumnVector doubleColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromFloatTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ floatTreeReader = new FloatTreeReader(columnId);
+ setConvertTreeReader(floatTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ float floatValue = (float) doubleColVector.vector[elementNum];
+ Timestamp timestampValue = TimestampUtils.doubleToTimestamp(floatValue);
+ // The TimestampColumnVector will set the entry to null when a null timestamp is passed in.
+ timestampColVector.set(elementNum, timestampValue);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, timestampColVector, batchSize);
+ }
+ }
+
+ public static class TimestampFromDoubleTreeReader extends ConvertTreeReader {
+
+ private DoubleTreeReader doubleTreeReader;
+
+ private DoubleColumnVector doubleColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromDoubleTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ doubleTreeReader = new DoubleTreeReader(columnId);
+ setConvertTreeReader(doubleTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ double doubleValue = doubleColVector.vector[elementNum];
+ Timestamp timestampValue = TimestampUtils.doubleToTimestamp(doubleValue);
+ // The TimestampColumnVector will set the entry to null when a null timestamp is passed in.
+ timestampColVector.set(elementNum, timestampValue);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (doubleColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ doubleColVector = new DoubleColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
+
+ convertVector(doubleColVector, timestampColVector, batchSize);
+ }
+ }
+
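+ // Editorial note: unlike the integer case above, which is in
+ // milliseconds, TimestampUtils.doubleToTimestamp is understood to treat
+ // its argument as seconds since the epoch, with the fractional part
+ // carrying sub-second precision -- e.g. 1.5 would map to
+ // 1970-01-01 00:00:01.5 UTC. The DECIMAL converter below relies on the
+ // same seconds convention via decimalToTimestamp.
+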
+ public static class TimestampFromDecimalTreeReader extends ConvertTreeReader {
+
+ private DecimalTreeReader decimalTreeReader;
+
+ private final int precision;
+ private final int scale;
+ private DecimalColumnVector decimalColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromDecimalTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ this.precision = fileType.getPrecision();
+ this.scale = fileType.getScale();
+ decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
+ setConvertTreeReader(decimalTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ Timestamp timestampValue =
+ TimestampUtils.decimalToTimestamp(
+ decimalColVector.vector[elementNum].getHiveDecimal());
+ // The TimestampColumnVector will set the entry to null when a null timestamp is passed in.
+ timestampColVector.set(elementNum, timestampValue);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (decimalColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ decimalColVector = new DecimalColumnVector(precision, scale);
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
+
+ convertVector(decimalColVector, timestampColVector, batchSize);
+ }
+ }
+
+ public static class TimestampFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private BytesColumnVector bytesColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String stringValue =
+ stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ Timestamp timestampValue = parseTimestampFromString(stringValue);
+ if (timestampValue != null) {
+ timestampColVector.set(elementNum, timestampValue);
+ } else {
+ timestampColVector.noNulls = false;
+ timestampColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, timestampColVector, batchSize);
+ }
+ }
+
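+ // Editorial note: the converter above shows the data-error convention
+ // used throughout -- a string that fails to parse does not raise an
+ // exception; the element is simply marked NULL (noNulls = false,
+ // isNull[i] = true).
+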
+ public static class TimestampFromDateTreeReader extends ConvertTreeReader {
+
+ private DateTreeReader dateTreeReader;
+
+ private LongColumnVector longColVector;
+ private TimestampColumnVector timestampColVector;
+
+ TimestampFromDateTreeReader(int columnId, TypeDescription fileType,
+ boolean skipCorrupt) throws IOException {
+ super(columnId);
+ dateTreeReader = new DateTreeReader(columnId);
+ setConvertTreeReader(dateTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) {
+ long millis =
+ DateWritable.daysToMillis((int) longColVector.vector[elementNum]);
+ timestampColVector.set(elementNum, new Timestamp(millis));
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (longColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ longColVector = new LongColumnVector();
+ timestampColVector = (TimestampColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ dateTreeReader.nextVector(longColVector, isNull, batchSize);
+
+ convertVector(longColVector, timestampColVector, batchSize);
+ }
+ }
+
+ public static class DateFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ private BytesColumnVector bytesColVector;
+ private LongColumnVector longColVector;
+
+ DateFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ String stringValue =
+ stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
+ Date dateValue = parseDateFromString(stringValue);
+ if (dateValue != null) {
+ longColVector.vector[elementNum] = DateWritable.dateToDays(dateValue);
+ } else {
+ longColVector.noNulls = false;
+ longColVector.isNull[elementNum] = true;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (bytesColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ bytesColVector = new BytesColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
+
+ convertVector(bytesColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class DateFromTimestampTreeReader extends ConvertTreeReader {
+
+ private TimestampTreeReader timestampTreeReader;
+
+ private TimestampColumnVector timestampColVector;
+ private LongColumnVector longColVector;
+
+ DateFromTimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
+ super(columnId);
+ timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
+ setConvertTreeReader(timestampTreeReader);
+ }
+
+ @Override
+ public void setConvertVectorElement(int elementNum) throws IOException {
+ Date dateValue =
+ DateWritable.timeToDate(TimestampUtils.millisToSeconds(
+ timestampColVector.asScratchTimestamp(elementNum).getTime()));
+ longColVector.vector[elementNum] = DateWritable.dateToDays(dateValue);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ if (timestampColVector == null) {
+ // Allocate column vector for file; cast column vector for reader.
+ timestampColVector = new TimestampColumnVector();
+ longColVector = (LongColumnVector) previousVector;
+ }
+ // Read present/isNull stream
+ timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
+
+ convertVector(timestampColVector, longColVector, batchSize);
+ }
+ }
+
+ public static class BinaryFromStringGroupTreeReader extends ConvertTreeReader {
+
+ private TreeReader stringGroupTreeReader;
+
+ BinaryFromStringGroupTreeReader(int columnId, TypeDescription fileType)
+ throws IOException {
+ super(columnId);
+ stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
+ setConvertTreeReader(stringGroupTreeReader);
+ }
+
+ @Override
+ public void nextVector(ColumnVector previousVector,
+ boolean[] isNull,
+ final int batchSize) throws IOException {
+ super.nextVector(previousVector, isNull, batchSize);
+ }
+ }
+
+ private static TreeReader createAnyIntegerConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from (BOOLEAN, BYTE, SHORT, INT, LONG) to schema type.
+ //
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ if (fileType.getCategory() == readerType.getCategory()) {
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+ }
+ return new AnyIntegerFromAnyIntegerTreeReader(columnId, fileType, readerType,
+ skipCorrupt);
+
+ case FLOAT:
+ return new FloatFromAnyIntegerTreeReader(columnId, fileType,
+ skipCorrupt);
+
+ case DOUBLE:
+ return new DoubleFromAnyIntegerTreeReader(columnId, fileType,
+ skipCorrupt);
+
+ case DECIMAL:
+ return new DecimalFromAnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringGroupFromAnyIntegerTreeReader(columnId, fileType, readerType,
+ skipCorrupt);
+
+ case TIMESTAMP:
+ return new TimestampFromAnyIntegerTreeReader(columnId, fileType, skipCorrupt);
+
+ // Not currently supported conversion(s):
+ case BINARY:
+ case DATE:
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
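+ /*
+ * Editorial note, not part of the original patch: these per-file-type
+ * factories are assumed to be driven by a top-level dispatcher that
+ * switches on the FILE type and lets each factory switch on the READER
+ * type, roughly:
+ *
+ *   switch (fileType.getCategory()) {
+ *     case BOOLEAN: case BYTE: case SHORT: case INT: case LONG:
+ *       return createAnyIntegerConvertTreeReader(columnId, fileType,
+ *           readerType, evolution, included, skipCorrupt);
+ *     case FLOAT:
+ *       return createFloatConvertTreeReader(columnId, fileType,
+ *           readerType, evolution, included, skipCorrupt);
+ *     // ... and so on for DOUBLE, DECIMAL, STRING, CHAR, VARCHAR, ...
+ *   }
+ */
+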
+ private static TreeReader createFloatConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from FLOAT to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromFloatTreeReader(columnId, readerType);
+
+ case FLOAT:
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+
+ case DOUBLE:
+ return new DoubleFromFloatTreeReader(columnId);
+
+ case DECIMAL:
+ return new DecimalFromFloatTreeReader(columnId, readerType);
+
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringGroupFromFloatTreeReader(columnId, readerType, skipCorrupt);
+
+ case TIMESTAMP:
+ return new TimestampFromFloatTreeReader(columnId, readerType, skipCorrupt);
+
+ // Not currently supported conversion(s):
+ case BINARY:
+ case DATE:
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createDoubleConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from DOUBLE to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromDoubleTreeReader(columnId, readerType);
+
+ case FLOAT:
+ return new FloatFromDoubleTreeReader(columnId);
+
+ case DOUBLE:
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+
+ case DECIMAL:
+ return new DecimalFromDoubleTreeReader(columnId, readerType);
+
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringGroupFromDoubleTreeReader(columnId, readerType, skipCorrupt);
+
+ case TIMESTAMP:
+ return new TimestampFromDoubleTreeReader(columnId, readerType, skipCorrupt);
+
+ // Not currently supported conversion(s):
+ case BINARY:
+ case DATE:
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createDecimalConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from DECIMAL to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromDecimalTreeReader(columnId, fileType, readerType);
+
+ case FLOAT:
+ return new FloatFromDecimalTreeReader(columnId, fileType, readerType);
+
+ case DOUBLE:
+ return new DoubleFromDecimalTreeReader(columnId, fileType);
+
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringGroupFromDecimalTreeReader(columnId, fileType, readerType, skipCorrupt);
+
+ case TIMESTAMP:
+ return new TimestampFromDecimalTreeReader(columnId, fileType, skipCorrupt);
+
+ case DECIMAL:
+ return new DecimalFromDecimalTreeReader(columnId, fileType, readerType);
+
+ // Not currently supported conversion(s):
+ case BINARY:
+ case DATE:
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createStringConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from STRING to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case FLOAT:
+ return new FloatFromStringGroupTreeReader(columnId, fileType);
+
+ case DOUBLE:
+ return new DoubleFromStringGroupTreeReader(columnId, fileType);
+
+ case DECIMAL:
+ return new DecimalFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case CHAR:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case VARCHAR:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case STRING:
+ throw new IllegalArgumentException("No conversion of type " +
+ readerType.getCategory() + " to self needed");
+
+ case BINARY:
+ return new BinaryFromStringGroupTreeReader(columnId, fileType);
+
+ case TIMESTAMP:
+ return new TimestampFromStringGroupTreeReader(columnId, fileType);
+
+ case DATE:
+ return new DateFromStringGroupTreeReader(columnId, fileType);
+
+ // Not currently supported conversion(s):
+
+ case STRUCT:
+ case LIST:
+ case MAP:
+ case UNION:
+ default:
+ throw new IllegalArgumentException("Unsupported type " +
+ readerType.getCategory());
+ }
+ }
+
+ private static TreeReader createCharConvertTreeReader(int columnId,
+ TypeDescription fileType,
+ TypeDescription readerType,
+ SchemaEvolution evolution,
+ boolean[] included,
+ boolean skipCorrupt) throws IOException {
+
+ // CONVERT from CHAR to schema type.
+ switch (readerType.getCategory()) {
+
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new AnyIntegerFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case FLOAT:
+ return new FloatFromStringGroupTreeReader(columnId, fileType);
+
+ case DOUBLE:
+ return new DoubleFromStringGroupTreeReader(columnId, fileType);
+
+ case DECIMAL:
+ return new DecimalFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case STRING:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case VARCHAR:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case CHAR:
+ return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
+
+ case BINARY:
+ return new BinaryFromStringGroupTreeReader(columnId, fileType);
+
+ case TIMESTAMP:
+ return new TimestampFromStringGroupTreeReader(columnId, fileType);
+
+ case DATE:
+ return new DateFromStringGroupTreeReader(columnId, fileType);
<TRUNCATED>
[35/37] hive git commit: HIVE-17118. Move the hive-orc source files to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/gen/protobuf-java/org/apache/orc/OrcProto.java
----------------------------------------------------------------------
diff --git a/orc/src/gen/protobuf-java/org/apache/orc/OrcProto.java b/orc/src/gen/protobuf-java/org/apache/orc/OrcProto.java
deleted file mode 100644
index 32193ec..0000000
--- a/orc/src/gen/protobuf-java/org/apache/orc/OrcProto.java
+++ /dev/null
@@ -1,20179 +0,0 @@
-// Generated by the protocol buffer compiler. DO NOT EDIT!
-// source: orc_proto.proto
-
-package org.apache.orc;
-
-public final class OrcProto {
- private OrcProto() {}
- public static void registerAllExtensions(
- com.google.protobuf.ExtensionRegistry registry) {
- }
- /**
- * Protobuf enum {@code orc.proto.CompressionKind}
- */
- public enum CompressionKind
- implements com.google.protobuf.ProtocolMessageEnum {
- /**
- * <code>NONE = 0;</code>
- */
- NONE(0, 0),
- /**
- * <code>ZLIB = 1;</code>
- */
- ZLIB(1, 1),
- /**
- * <code>SNAPPY = 2;</code>
- */
- SNAPPY(2, 2),
- /**
- * <code>LZO = 3;</code>
- */
- LZO(3, 3),
- ;
-
- /**
- * <code>NONE = 0;</code>
- */
- public static final int NONE_VALUE = 0;
- /**
- * <code>ZLIB = 1;</code>
- */
- public static final int ZLIB_VALUE = 1;
- /**
- * <code>SNAPPY = 2;</code>
- */
- public static final int SNAPPY_VALUE = 2;
- /**
- * <code>LZO = 3;</code>
- */
- public static final int LZO_VALUE = 3;
-
-
- public final int getNumber() { return value; }
-
- public static CompressionKind valueOf(int value) {
- switch (value) {
- case 0: return NONE;
- case 1: return ZLIB;
- case 2: return SNAPPY;
- case 3: return LZO;
- default: return null;
- }
- }
-
- public static com.google.protobuf.Internal.EnumLiteMap<CompressionKind>
- internalGetValueMap() {
- return internalValueMap;
- }
- private static com.google.protobuf.Internal.EnumLiteMap<CompressionKind>
- internalValueMap =
- new com.google.protobuf.Internal.EnumLiteMap<CompressionKind>() {
- public CompressionKind findValueByNumber(int number) {
- return CompressionKind.valueOf(number);
- }
- };
-
- public final com.google.protobuf.Descriptors.EnumValueDescriptor
- getValueDescriptor() {
- return getDescriptor().getValues().get(index);
- }
- public final com.google.protobuf.Descriptors.EnumDescriptor
- getDescriptorForType() {
- return getDescriptor();
- }
- public static final com.google.protobuf.Descriptors.EnumDescriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.getDescriptor().getEnumTypes().get(0);
- }
-
- private static final CompressionKind[] VALUES = values();
-
- public static CompressionKind valueOf(
- com.google.protobuf.Descriptors.EnumValueDescriptor desc) {
- if (desc.getType() != getDescriptor()) {
- throw new java.lang.IllegalArgumentException(
- "EnumValueDescriptor is not for this type.");
- }
- return VALUES[desc.getIndex()];
- }
-
- private final int index;
- private final int value;
-
- private CompressionKind(int index, int value) {
- this.index = index;
- this.value = value;
- }
-
- // @@protoc_insertion_point(enum_scope:orc.proto.CompressionKind)
- }
-
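- // Editorial note, not part of the generated file: the enum above is the
- // generated form of this definition in orc_proto.proto:
- //
- //   enum CompressionKind {
- //     NONE = 0;
- //     ZLIB = 1;
- //     SNAPPY = 2;
- //     LZO = 3;
- //   }
-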
- public interface IntegerStatisticsOrBuilder
- extends com.google.protobuf.MessageOrBuilder {
-
- // optional sint64 minimum = 1;
- /**
- * <code>optional sint64 minimum = 1;</code>
- */
- boolean hasMinimum();
- /**
- * <code>optional sint64 minimum = 1;</code>
- */
- long getMinimum();
-
- // optional sint64 maximum = 2;
- /**
- * <code>optional sint64 maximum = 2;</code>
- */
- boolean hasMaximum();
- /**
- * <code>optional sint64 maximum = 2;</code>
- */
- long getMaximum();
-
- // optional sint64 sum = 3;
- /**
- * <code>optional sint64 sum = 3;</code>
- */
- boolean hasSum();
- /**
- * <code>optional sint64 sum = 3;</code>
- */
- long getSum();
- }
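-
- // Editorial note: the interface above mirrors this message in
- // orc_proto.proto; DoubleStatistics and StringStatistics further down
- // follow the same three-field shape with double and string fields:
- //
- //   message IntegerStatistics {
- //     optional sint64 minimum = 1;
- //     optional sint64 maximum = 2;
- //     optional sint64 sum = 3;
- //   }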
- /**
- * Protobuf type {@code orc.proto.IntegerStatistics}
- */
- public static final class IntegerStatistics extends
- com.google.protobuf.GeneratedMessage
- implements IntegerStatisticsOrBuilder {
- // Use IntegerStatistics.newBuilder() to construct.
- private IntegerStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
- super(builder);
- this.unknownFields = builder.getUnknownFields();
- }
- private IntegerStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
-
- private static final IntegerStatistics defaultInstance;
- public static IntegerStatistics getDefaultInstance() {
- return defaultInstance;
- }
-
- public IntegerStatistics getDefaultInstanceForType() {
- return defaultInstance;
- }
-
- private final com.google.protobuf.UnknownFieldSet unknownFields;
- @java.lang.Override
- public final com.google.protobuf.UnknownFieldSet
- getUnknownFields() {
- return this.unknownFields;
- }
- private IntegerStatistics(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- initFields();
- int mutable_bitField0_ = 0;
- com.google.protobuf.UnknownFieldSet.Builder unknownFields =
- com.google.protobuf.UnknownFieldSet.newBuilder();
- try {
- boolean done = false;
- while (!done) {
- int tag = input.readTag();
- switch (tag) {
- case 0:
- done = true;
- break;
- default: {
- if (!parseUnknownField(input, unknownFields,
- extensionRegistry, tag)) {
- done = true;
- }
- break;
- }
- case 8: {
- bitField0_ |= 0x00000001;
- minimum_ = input.readSInt64();
- break;
- }
- case 16: {
- bitField0_ |= 0x00000002;
- maximum_ = input.readSInt64();
- break;
- }
- case 24: {
- bitField0_ |= 0x00000004;
- sum_ = input.readSInt64();
- break;
- }
- }
- }
- } catch (com.google.protobuf.InvalidProtocolBufferException e) {
- throw e.setUnfinishedMessage(this);
- } catch (java.io.IOException e) {
- throw new com.google.protobuf.InvalidProtocolBufferException(
- e.getMessage()).setUnfinishedMessage(this);
- } finally {
- this.unknownFields = unknownFields.build();
- makeExtensionsImmutable();
- }
- }
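-
- // Editorial note: the case labels in the parsing loop above are protobuf
- // wire tags, computed as (field_number << 3) | wire_type. Varint-encoded
- // sint64 fields 1..3 give tags 8, 16 and 24 here; the DoubleStatistics
- // parser below sees 9, 17 and 25 (wire type 1, fixed64), and the
- // StringStatistics parser sees 10 and 18 for its length-delimited string
- // fields plus 24 for the varint sum.
-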
- public static final com.google.protobuf.Descriptors.Descriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_IntegerStatistics_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
- internalGetFieldAccessorTable() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_IntegerStatistics_fieldAccessorTable
- .ensureFieldAccessorsInitialized(
- org.apache.orc.OrcProto.IntegerStatistics.class, org.apache.orc.OrcProto.IntegerStatistics.Builder.class);
- }
-
- public static com.google.protobuf.Parser<IntegerStatistics> PARSER =
- new com.google.protobuf.AbstractParser<IntegerStatistics>() {
- public IntegerStatistics parsePartialFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return new IntegerStatistics(input, extensionRegistry);
- }
- };
-
- @java.lang.Override
- public com.google.protobuf.Parser<IntegerStatistics> getParserForType() {
- return PARSER;
- }
-
- private int bitField0_;
- // optional sint64 minimum = 1;
- public static final int MINIMUM_FIELD_NUMBER = 1;
- private long minimum_;
- /**
- * <code>optional sint64 minimum = 1;</code>
- */
- public boolean hasMinimum() {
- return ((bitField0_ & 0x00000001) == 0x00000001);
- }
- /**
- * <code>optional sint64 minimum = 1;</code>
- */
- public long getMinimum() {
- return minimum_;
- }
-
- // optional sint64 maximum = 2;
- public static final int MAXIMUM_FIELD_NUMBER = 2;
- private long maximum_;
- /**
- * <code>optional sint64 maximum = 2;</code>
- */
- public boolean hasMaximum() {
- return ((bitField0_ & 0x00000002) == 0x00000002);
- }
- /**
- * <code>optional sint64 maximum = 2;</code>
- */
- public long getMaximum() {
- return maximum_;
- }
-
- // optional sint64 sum = 3;
- public static final int SUM_FIELD_NUMBER = 3;
- private long sum_;
- /**
- * <code>optional sint64 sum = 3;</code>
- */
- public boolean hasSum() {
- return ((bitField0_ & 0x00000004) == 0x00000004);
- }
- /**
- * <code>optional sint64 sum = 3;</code>
- */
- public long getSum() {
- return sum_;
- }
-
- private void initFields() {
- minimum_ = 0L;
- maximum_ = 0L;
- sum_ = 0L;
- }
- private byte memoizedIsInitialized = -1;
- public final boolean isInitialized() {
- byte isInitialized = memoizedIsInitialized;
- if (isInitialized != -1) return isInitialized == 1;
-
- memoizedIsInitialized = 1;
- return true;
- }
-
- public void writeTo(com.google.protobuf.CodedOutputStream output)
- throws java.io.IOException {
- getSerializedSize();
- if (((bitField0_ & 0x00000001) == 0x00000001)) {
- output.writeSInt64(1, minimum_);
- }
- if (((bitField0_ & 0x00000002) == 0x00000002)) {
- output.writeSInt64(2, maximum_);
- }
- if (((bitField0_ & 0x00000004) == 0x00000004)) {
- output.writeSInt64(3, sum_);
- }
- getUnknownFields().writeTo(output);
- }
-
- private int memoizedSerializedSize = -1;
- public int getSerializedSize() {
- int size = memoizedSerializedSize;
- if (size != -1) return size;
-
- size = 0;
- if (((bitField0_ & 0x00000001) == 0x00000001)) {
- size += com.google.protobuf.CodedOutputStream
- .computeSInt64Size(1, minimum_);
- }
- if (((bitField0_ & 0x00000002) == 0x00000002)) {
- size += com.google.protobuf.CodedOutputStream
- .computeSInt64Size(2, maximum_);
- }
- if (((bitField0_ & 0x00000004) == 0x00000004)) {
- size += com.google.protobuf.CodedOutputStream
- .computeSInt64Size(3, sum_);
- }
- size += getUnknownFields().getSerializedSize();
- memoizedSerializedSize = size;
- return size;
- }
-
- private static final long serialVersionUID = 0L;
- @java.lang.Override
- protected java.lang.Object writeReplace()
- throws java.io.ObjectStreamException {
- return super.writeReplace();
- }
-
- public static org.apache.orc.OrcProto.IntegerStatistics parseFrom(
- com.google.protobuf.ByteString data)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data);
- }
- public static org.apache.orc.OrcProto.IntegerStatistics parseFrom(
- com.google.protobuf.ByteString data,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.IntegerStatistics parseFrom(byte[] data)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data);
- }
- public static org.apache.orc.OrcProto.IntegerStatistics parseFrom(
- byte[] data,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.IntegerStatistics parseFrom(java.io.InputStream input)
- throws java.io.IOException {
- return PARSER.parseFrom(input);
- }
- public static org.apache.orc.OrcProto.IntegerStatistics parseFrom(
- java.io.InputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseFrom(input, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.IntegerStatistics parseDelimitedFrom(java.io.InputStream input)
- throws java.io.IOException {
- return PARSER.parseDelimitedFrom(input);
- }
- public static org.apache.orc.OrcProto.IntegerStatistics parseDelimitedFrom(
- java.io.InputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseDelimitedFrom(input, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.IntegerStatistics parseFrom(
- com.google.protobuf.CodedInputStream input)
- throws java.io.IOException {
- return PARSER.parseFrom(input);
- }
- public static org.apache.orc.OrcProto.IntegerStatistics parseFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseFrom(input, extensionRegistry);
- }
-
- public static Builder newBuilder() { return Builder.create(); }
- public Builder newBuilderForType() { return newBuilder(); }
- public static Builder newBuilder(org.apache.orc.OrcProto.IntegerStatistics prototype) {
- return newBuilder().mergeFrom(prototype);
- }
- public Builder toBuilder() { return newBuilder(this); }
-
- @java.lang.Override
- protected Builder newBuilderForType(
- com.google.protobuf.GeneratedMessage.BuilderParent parent) {
- Builder builder = new Builder(parent);
- return builder;
- }
- /**
- * Protobuf type {@code orc.proto.IntegerStatistics}
- */
- public static final class Builder extends
- com.google.protobuf.GeneratedMessage.Builder<Builder>
- implements org.apache.orc.OrcProto.IntegerStatisticsOrBuilder {
- public static final com.google.protobuf.Descriptors.Descriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_IntegerStatistics_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
- internalGetFieldAccessorTable() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_IntegerStatistics_fieldAccessorTable
- .ensureFieldAccessorsInitialized(
- org.apache.orc.OrcProto.IntegerStatistics.class, org.apache.orc.OrcProto.IntegerStatistics.Builder.class);
- }
-
- // Construct using org.apache.orc.OrcProto.IntegerStatistics.newBuilder()
- private Builder() {
- maybeForceBuilderInitialization();
- }
-
- private Builder(
- com.google.protobuf.GeneratedMessage.BuilderParent parent) {
- super(parent);
- maybeForceBuilderInitialization();
- }
- private void maybeForceBuilderInitialization() {
- if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
- }
- }
- private static Builder create() {
- return new Builder();
- }
-
- public Builder clear() {
- super.clear();
- minimum_ = 0L;
- bitField0_ = (bitField0_ & ~0x00000001);
- maximum_ = 0L;
- bitField0_ = (bitField0_ & ~0x00000002);
- sum_ = 0L;
- bitField0_ = (bitField0_ & ~0x00000004);
- return this;
- }
-
- public Builder clone() {
- return create().mergeFrom(buildPartial());
- }
-
- public com.google.protobuf.Descriptors.Descriptor
- getDescriptorForType() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_IntegerStatistics_descriptor;
- }
-
- public org.apache.orc.OrcProto.IntegerStatistics getDefaultInstanceForType() {
- return org.apache.orc.OrcProto.IntegerStatistics.getDefaultInstance();
- }
-
- public org.apache.orc.OrcProto.IntegerStatistics build() {
- org.apache.orc.OrcProto.IntegerStatistics result = buildPartial();
- if (!result.isInitialized()) {
- throw newUninitializedMessageException(result);
- }
- return result;
- }
-
- public org.apache.orc.OrcProto.IntegerStatistics buildPartial() {
- org.apache.orc.OrcProto.IntegerStatistics result = new org.apache.orc.OrcProto.IntegerStatistics(this);
- int from_bitField0_ = bitField0_;
- int to_bitField0_ = 0;
- if (((from_bitField0_ & 0x00000001) == 0x00000001)) {
- to_bitField0_ |= 0x00000001;
- }
- result.minimum_ = minimum_;
- if (((from_bitField0_ & 0x00000002) == 0x00000002)) {
- to_bitField0_ |= 0x00000002;
- }
- result.maximum_ = maximum_;
- if (((from_bitField0_ & 0x00000004) == 0x00000004)) {
- to_bitField0_ |= 0x00000004;
- }
- result.sum_ = sum_;
- result.bitField0_ = to_bitField0_;
- onBuilt();
- return result;
- }
-
- public Builder mergeFrom(com.google.protobuf.Message other) {
- if (other instanceof org.apache.orc.OrcProto.IntegerStatistics) {
- return mergeFrom((org.apache.orc.OrcProto.IntegerStatistics)other);
- } else {
- super.mergeFrom(other);
- return this;
- }
- }
-
- public Builder mergeFrom(org.apache.orc.OrcProto.IntegerStatistics other) {
- if (other == org.apache.orc.OrcProto.IntegerStatistics.getDefaultInstance()) return this;
- if (other.hasMinimum()) {
- setMinimum(other.getMinimum());
- }
- if (other.hasMaximum()) {
- setMaximum(other.getMaximum());
- }
- if (other.hasSum()) {
- setSum(other.getSum());
- }
- this.mergeUnknownFields(other.getUnknownFields());
- return this;
- }
-
- public final boolean isInitialized() {
- return true;
- }
-
- public Builder mergeFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- org.apache.orc.OrcProto.IntegerStatistics parsedMessage = null;
- try {
- parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry);
- } catch (com.google.protobuf.InvalidProtocolBufferException e) {
- parsedMessage = (org.apache.orc.OrcProto.IntegerStatistics) e.getUnfinishedMessage();
- throw e;
- } finally {
- if (parsedMessage != null) {
- mergeFrom(parsedMessage);
- }
- }
- return this;
- }
- private int bitField0_;
-
- // optional sint64 minimum = 1;
- private long minimum_ ;
- /**
- * <code>optional sint64 minimum = 1;</code>
- */
- public boolean hasMinimum() {
- return ((bitField0_ & 0x00000001) == 0x00000001);
- }
- /**
- * <code>optional sint64 minimum = 1;</code>
- */
- public long getMinimum() {
- return minimum_;
- }
- /**
- * <code>optional sint64 minimum = 1;</code>
- */
- public Builder setMinimum(long value) {
- bitField0_ |= 0x00000001;
- minimum_ = value;
- onChanged();
- return this;
- }
- /**
- * <code>optional sint64 minimum = 1;</code>
- */
- public Builder clearMinimum() {
- bitField0_ = (bitField0_ & ~0x00000001);
- minimum_ = 0L;
- onChanged();
- return this;
- }
-
- // optional sint64 maximum = 2;
- private long maximum_ ;
- /**
- * <code>optional sint64 maximum = 2;</code>
- */
- public boolean hasMaximum() {
- return ((bitField0_ & 0x00000002) == 0x00000002);
- }
- /**
- * <code>optional sint64 maximum = 2;</code>
- */
- public long getMaximum() {
- return maximum_;
- }
- /**
- * <code>optional sint64 maximum = 2;</code>
- */
- public Builder setMaximum(long value) {
- bitField0_ |= 0x00000002;
- maximum_ = value;
- onChanged();
- return this;
- }
- /**
- * <code>optional sint64 maximum = 2;</code>
- */
- public Builder clearMaximum() {
- bitField0_ = (bitField0_ & ~0x00000002);
- maximum_ = 0L;
- onChanged();
- return this;
- }
-
- // optional sint64 sum = 3;
- private long sum_ ;
- /**
- * <code>optional sint64 sum = 3;</code>
- */
- public boolean hasSum() {
- return ((bitField0_ & 0x00000004) == 0x00000004);
- }
- /**
- * <code>optional sint64 sum = 3;</code>
- */
- public long getSum() {
- return sum_;
- }
- /**
- * <code>optional sint64 sum = 3;</code>
- */
- public Builder setSum(long value) {
- bitField0_ |= 0x00000004;
- sum_ = value;
- onChanged();
- return this;
- }
- /**
- * <code>optional sint64 sum = 3;</code>
- */
- public Builder clearSum() {
- bitField0_ = (bitField0_ & ~0x00000004);
- sum_ = 0L;
- onChanged();
- return this;
- }
-
- // @@protoc_insertion_point(builder_scope:orc.proto.IntegerStatistics)
- }
-
- static {
- defaultInstance = new IntegerStatistics(true);
- defaultInstance.initFields();
- }
-
- // @@protoc_insertion_point(class_scope:orc.proto.IntegerStatistics)
- }
-
- public interface DoubleStatisticsOrBuilder
- extends com.google.protobuf.MessageOrBuilder {
-
- // optional double minimum = 1;
- /**
- * <code>optional double minimum = 1;</code>
- */
- boolean hasMinimum();
- /**
- * <code>optional double minimum = 1;</code>
- */
- double getMinimum();
-
- // optional double maximum = 2;
- /**
- * <code>optional double maximum = 2;</code>
- */
- boolean hasMaximum();
- /**
- * <code>optional double maximum = 2;</code>
- */
- double getMaximum();
-
- // optional double sum = 3;
- /**
- * <code>optional double sum = 3;</code>
- */
- boolean hasSum();
- /**
- * <code>optional double sum = 3;</code>
- */
- double getSum();
- }
- /**
- * Protobuf type {@code orc.proto.DoubleStatistics}
- */
- public static final class DoubleStatistics extends
- com.google.protobuf.GeneratedMessage
- implements DoubleStatisticsOrBuilder {
- // Use DoubleStatistics.newBuilder() to construct.
- private DoubleStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
- super(builder);
- this.unknownFields = builder.getUnknownFields();
- }
- private DoubleStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
-
- private static final DoubleStatistics defaultInstance;
- public static DoubleStatistics getDefaultInstance() {
- return defaultInstance;
- }
-
- public DoubleStatistics getDefaultInstanceForType() {
- return defaultInstance;
- }
-
- private final com.google.protobuf.UnknownFieldSet unknownFields;
- @java.lang.Override
- public final com.google.protobuf.UnknownFieldSet
- getUnknownFields() {
- return this.unknownFields;
- }
- private DoubleStatistics(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- initFields();
- int mutable_bitField0_ = 0;
- com.google.protobuf.UnknownFieldSet.Builder unknownFields =
- com.google.protobuf.UnknownFieldSet.newBuilder();
- try {
- boolean done = false;
- while (!done) {
- int tag = input.readTag();
- switch (tag) {
- case 0:
- done = true;
- break;
- default: {
- if (!parseUnknownField(input, unknownFields,
- extensionRegistry, tag)) {
- done = true;
- }
- break;
- }
- case 9: {
- bitField0_ |= 0x00000001;
- minimum_ = input.readDouble();
- break;
- }
- case 17: {
- bitField0_ |= 0x00000002;
- maximum_ = input.readDouble();
- break;
- }
- case 25: {
- bitField0_ |= 0x00000004;
- sum_ = input.readDouble();
- break;
- }
- }
- }
- } catch (com.google.protobuf.InvalidProtocolBufferException e) {
- throw e.setUnfinishedMessage(this);
- } catch (java.io.IOException e) {
- throw new com.google.protobuf.InvalidProtocolBufferException(
- e.getMessage()).setUnfinishedMessage(this);
- } finally {
- this.unknownFields = unknownFields.build();
- makeExtensionsImmutable();
- }
- }
- public static final com.google.protobuf.Descriptors.Descriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_DoubleStatistics_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
- internalGetFieldAccessorTable() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_DoubleStatistics_fieldAccessorTable
- .ensureFieldAccessorsInitialized(
- org.apache.orc.OrcProto.DoubleStatistics.class, org.apache.orc.OrcProto.DoubleStatistics.Builder.class);
- }
-
- public static com.google.protobuf.Parser<DoubleStatistics> PARSER =
- new com.google.protobuf.AbstractParser<DoubleStatistics>() {
- public DoubleStatistics parsePartialFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return new DoubleStatistics(input, extensionRegistry);
- }
- };
-
- @java.lang.Override
- public com.google.protobuf.Parser<DoubleStatistics> getParserForType() {
- return PARSER;
- }
-
- private int bitField0_;
- // optional double minimum = 1;
- public static final int MINIMUM_FIELD_NUMBER = 1;
- private double minimum_;
- /**
- * <code>optional double minimum = 1;</code>
- */
- public boolean hasMinimum() {
- return ((bitField0_ & 0x00000001) == 0x00000001);
- }
- /**
- * <code>optional double minimum = 1;</code>
- */
- public double getMinimum() {
- return minimum_;
- }
-
- // optional double maximum = 2;
- public static final int MAXIMUM_FIELD_NUMBER = 2;
- private double maximum_;
- /**
- * <code>optional double maximum = 2;</code>
- */
- public boolean hasMaximum() {
- return ((bitField0_ & 0x00000002) == 0x00000002);
- }
- /**
- * <code>optional double maximum = 2;</code>
- */
- public double getMaximum() {
- return maximum_;
- }
-
- // optional double sum = 3;
- public static final int SUM_FIELD_NUMBER = 3;
- private double sum_;
- /**
- * <code>optional double sum = 3;</code>
- */
- public boolean hasSum() {
- return ((bitField0_ & 0x00000004) == 0x00000004);
- }
- /**
- * <code>optional double sum = 3;</code>
- */
- public double getSum() {
- return sum_;
- }
-
- private void initFields() {
- minimum_ = 0D;
- maximum_ = 0D;
- sum_ = 0D;
- }
- private byte memoizedIsInitialized = -1;
- public final boolean isInitialized() {
- byte isInitialized = memoizedIsInitialized;
- if (isInitialized != -1) return isInitialized == 1;
-
- memoizedIsInitialized = 1;
- return true;
- }
-
- public void writeTo(com.google.protobuf.CodedOutputStream output)
- throws java.io.IOException {
- getSerializedSize();
- if (((bitField0_ & 0x00000001) == 0x00000001)) {
- output.writeDouble(1, minimum_);
- }
- if (((bitField0_ & 0x00000002) == 0x00000002)) {
- output.writeDouble(2, maximum_);
- }
- if (((bitField0_ & 0x00000004) == 0x00000004)) {
- output.writeDouble(3, sum_);
- }
- getUnknownFields().writeTo(output);
- }
-
- private int memoizedSerializedSize = -1;
- public int getSerializedSize() {
- int size = memoizedSerializedSize;
- if (size != -1) return size;
-
- size = 0;
- if (((bitField0_ & 0x00000001) == 0x00000001)) {
- size += com.google.protobuf.CodedOutputStream
- .computeDoubleSize(1, minimum_);
- }
- if (((bitField0_ & 0x00000002) == 0x00000002)) {
- size += com.google.protobuf.CodedOutputStream
- .computeDoubleSize(2, maximum_);
- }
- if (((bitField0_ & 0x00000004) == 0x00000004)) {
- size += com.google.protobuf.CodedOutputStream
- .computeDoubleSize(3, sum_);
- }
- size += getUnknownFields().getSerializedSize();
- memoizedSerializedSize = size;
- return size;
- }
-
- private static final long serialVersionUID = 0L;
- @java.lang.Override
- protected java.lang.Object writeReplace()
- throws java.io.ObjectStreamException {
- return super.writeReplace();
- }
-
- public static org.apache.orc.OrcProto.DoubleStatistics parseFrom(
- com.google.protobuf.ByteString data)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data);
- }
- public static org.apache.orc.OrcProto.DoubleStatistics parseFrom(
- com.google.protobuf.ByteString data,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.DoubleStatistics parseFrom(byte[] data)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data);
- }
- public static org.apache.orc.OrcProto.DoubleStatistics parseFrom(
- byte[] data,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.DoubleStatistics parseFrom(java.io.InputStream input)
- throws java.io.IOException {
- return PARSER.parseFrom(input);
- }
- public static org.apache.orc.OrcProto.DoubleStatistics parseFrom(
- java.io.InputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseFrom(input, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.DoubleStatistics parseDelimitedFrom(java.io.InputStream input)
- throws java.io.IOException {
- return PARSER.parseDelimitedFrom(input);
- }
- public static org.apache.orc.OrcProto.DoubleStatistics parseDelimitedFrom(
- java.io.InputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseDelimitedFrom(input, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.DoubleStatistics parseFrom(
- com.google.protobuf.CodedInputStream input)
- throws java.io.IOException {
- return PARSER.parseFrom(input);
- }
- public static org.apache.orc.OrcProto.DoubleStatistics parseFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseFrom(input, extensionRegistry);
- }
-
- public static Builder newBuilder() { return Builder.create(); }
- public Builder newBuilderForType() { return newBuilder(); }
- public static Builder newBuilder(org.apache.orc.OrcProto.DoubleStatistics prototype) {
- return newBuilder().mergeFrom(prototype);
- }
- public Builder toBuilder() { return newBuilder(this); }
-
- @java.lang.Override
- protected Builder newBuilderForType(
- com.google.protobuf.GeneratedMessage.BuilderParent parent) {
- Builder builder = new Builder(parent);
- return builder;
- }
- /**
- * Protobuf type {@code orc.proto.DoubleStatistics}
- */
- public static final class Builder extends
- com.google.protobuf.GeneratedMessage.Builder<Builder>
- implements org.apache.orc.OrcProto.DoubleStatisticsOrBuilder {
- public static final com.google.protobuf.Descriptors.Descriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_DoubleStatistics_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
- internalGetFieldAccessorTable() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_DoubleStatistics_fieldAccessorTable
- .ensureFieldAccessorsInitialized(
- org.apache.orc.OrcProto.DoubleStatistics.class, org.apache.orc.OrcProto.DoubleStatistics.Builder.class);
- }
-
- // Construct using org.apache.orc.OrcProto.DoubleStatistics.newBuilder()
- private Builder() {
- maybeForceBuilderInitialization();
- }
-
- private Builder(
- com.google.protobuf.GeneratedMessage.BuilderParent parent) {
- super(parent);
- maybeForceBuilderInitialization();
- }
- private void maybeForceBuilderInitialization() {
- if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
- }
- }
- private static Builder create() {
- return new Builder();
- }
-
- public Builder clear() {
- super.clear();
- minimum_ = 0D;
- bitField0_ = (bitField0_ & ~0x00000001);
- maximum_ = 0D;
- bitField0_ = (bitField0_ & ~0x00000002);
- sum_ = 0D;
- bitField0_ = (bitField0_ & ~0x00000004);
- return this;
- }
-
- public Builder clone() {
- return create().mergeFrom(buildPartial());
- }
-
- public com.google.protobuf.Descriptors.Descriptor
- getDescriptorForType() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_DoubleStatistics_descriptor;
- }
-
- public org.apache.orc.OrcProto.DoubleStatistics getDefaultInstanceForType() {
- return org.apache.orc.OrcProto.DoubleStatistics.getDefaultInstance();
- }
-
- public org.apache.orc.OrcProto.DoubleStatistics build() {
- org.apache.orc.OrcProto.DoubleStatistics result = buildPartial();
- if (!result.isInitialized()) {
- throw newUninitializedMessageException(result);
- }
- return result;
- }
-
- public org.apache.orc.OrcProto.DoubleStatistics buildPartial() {
- org.apache.orc.OrcProto.DoubleStatistics result = new org.apache.orc.OrcProto.DoubleStatistics(this);
- int from_bitField0_ = bitField0_;
- int to_bitField0_ = 0;
- if (((from_bitField0_ & 0x00000001) == 0x00000001)) {
- to_bitField0_ |= 0x00000001;
- }
- result.minimum_ = minimum_;
- if (((from_bitField0_ & 0x00000002) == 0x00000002)) {
- to_bitField0_ |= 0x00000002;
- }
- result.maximum_ = maximum_;
- if (((from_bitField0_ & 0x00000004) == 0x00000004)) {
- to_bitField0_ |= 0x00000004;
- }
- result.sum_ = sum_;
- result.bitField0_ = to_bitField0_;
- onBuilt();
- return result;
- }
-
- public Builder mergeFrom(com.google.protobuf.Message other) {
- if (other instanceof org.apache.orc.OrcProto.DoubleStatistics) {
- return mergeFrom((org.apache.orc.OrcProto.DoubleStatistics)other);
- } else {
- super.mergeFrom(other);
- return this;
- }
- }
-
- public Builder mergeFrom(org.apache.orc.OrcProto.DoubleStatistics other) {
- if (other == org.apache.orc.OrcProto.DoubleStatistics.getDefaultInstance()) return this;
- if (other.hasMinimum()) {
- setMinimum(other.getMinimum());
- }
- if (other.hasMaximum()) {
- setMaximum(other.getMaximum());
- }
- if (other.hasSum()) {
- setSum(other.getSum());
- }
- this.mergeUnknownFields(other.getUnknownFields());
- return this;
- }
-
- public final boolean isInitialized() {
- return true;
- }
-
- public Builder mergeFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- org.apache.orc.OrcProto.DoubleStatistics parsedMessage = null;
- try {
- parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry);
- } catch (com.google.protobuf.InvalidProtocolBufferException e) {
- parsedMessage = (org.apache.orc.OrcProto.DoubleStatistics) e.getUnfinishedMessage();
- throw e;
- } finally {
- if (parsedMessage != null) {
- mergeFrom(parsedMessage);
- }
- }
- return this;
- }
- private int bitField0_;
-
- // optional double minimum = 1;
- private double minimum_ ;
- /**
- * <code>optional double minimum = 1;</code>
- */
- public boolean hasMinimum() {
- return ((bitField0_ & 0x00000001) == 0x00000001);
- }
- /**
- * <code>optional double minimum = 1;</code>
- */
- public double getMinimum() {
- return minimum_;
- }
- /**
- * <code>optional double minimum = 1;</code>
- */
- public Builder setMinimum(double value) {
- bitField0_ |= 0x00000001;
- minimum_ = value;
- onChanged();
- return this;
- }
- /**
- * <code>optional double minimum = 1;</code>
- */
- public Builder clearMinimum() {
- bitField0_ = (bitField0_ & ~0x00000001);
- minimum_ = 0D;
- onChanged();
- return this;
- }
-
- // optional double maximum = 2;
- private double maximum_ ;
- /**
- * <code>optional double maximum = 2;</code>
- */
- public boolean hasMaximum() {
- return ((bitField0_ & 0x00000002) == 0x00000002);
- }
- /**
- * <code>optional double maximum = 2;</code>
- */
- public double getMaximum() {
- return maximum_;
- }
- /**
- * <code>optional double maximum = 2;</code>
- */
- public Builder setMaximum(double value) {
- bitField0_ |= 0x00000002;
- maximum_ = value;
- onChanged();
- return this;
- }
- /**
- * <code>optional double maximum = 2;</code>
- */
- public Builder clearMaximum() {
- bitField0_ = (bitField0_ & ~0x00000002);
- maximum_ = 0D;
- onChanged();
- return this;
- }
-
- // optional double sum = 3;
- private double sum_ ;
- /**
- * <code>optional double sum = 3;</code>
- */
- public boolean hasSum() {
- return ((bitField0_ & 0x00000004) == 0x00000004);
- }
- /**
- * <code>optional double sum = 3;</code>
- */
- public double getSum() {
- return sum_;
- }
- /**
- * <code>optional double sum = 3;</code>
- */
- public Builder setSum(double value) {
- bitField0_ |= 0x00000004;
- sum_ = value;
- onChanged();
- return this;
- }
- /**
- * <code>optional double sum = 3;</code>
- */
- public Builder clearSum() {
- bitField0_ = (bitField0_ & ~0x00000004);
- sum_ = 0D;
- onChanged();
- return this;
- }
-
- // @@protoc_insertion_point(builder_scope:orc.proto.DoubleStatistics)
- }
-
- static {
- defaultInstance = new DoubleStatistics(true);
- defaultInstance.initFields();
- }
-
- // @@protoc_insertion_point(class_scope:orc.proto.DoubleStatistics)
- }
-
- public interface StringStatisticsOrBuilder
- extends com.google.protobuf.MessageOrBuilder {
-
- // optional string minimum = 1;
- /**
- * <code>optional string minimum = 1;</code>
- */
- boolean hasMinimum();
- /**
- * <code>optional string minimum = 1;</code>
- */
- java.lang.String getMinimum();
- /**
- * <code>optional string minimum = 1;</code>
- */
- com.google.protobuf.ByteString
- getMinimumBytes();
-
- // optional string maximum = 2;
- /**
- * <code>optional string maximum = 2;</code>
- */
- boolean hasMaximum();
- /**
- * <code>optional string maximum = 2;</code>
- */
- java.lang.String getMaximum();
- /**
- * <code>optional string maximum = 2;</code>
- */
- com.google.protobuf.ByteString
- getMaximumBytes();
-
- // optional sint64 sum = 3;
- /**
- * <code>optional sint64 sum = 3;</code>
- *
- * <pre>
- * sum will store the total length of all strings in a stripe
- * </pre>
- */
- boolean hasSum();
- /**
- * <code>optional sint64 sum = 3;</code>
- *
- * <pre>
- * sum will store the total length of all strings in a stripe
- * </pre>
- */
- long getSum();
- }
- /**
- * Protobuf type {@code orc.proto.StringStatistics}
- */
- public static final class StringStatistics extends
- com.google.protobuf.GeneratedMessage
- implements StringStatisticsOrBuilder {
- // Use StringStatistics.newBuilder() to construct.
- private StringStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
- super(builder);
- this.unknownFields = builder.getUnknownFields();
- }
- private StringStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
-
- private static final StringStatistics defaultInstance;
- public static StringStatistics getDefaultInstance() {
- return defaultInstance;
- }
-
- public StringStatistics getDefaultInstanceForType() {
- return defaultInstance;
- }
-
- private final com.google.protobuf.UnknownFieldSet unknownFields;
- @java.lang.Override
- public final com.google.protobuf.UnknownFieldSet
- getUnknownFields() {
- return this.unknownFields;
- }
- private StringStatistics(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- initFields();
- int mutable_bitField0_ = 0;
- com.google.protobuf.UnknownFieldSet.Builder unknownFields =
- com.google.protobuf.UnknownFieldSet.newBuilder();
- try {
- boolean done = false;
- while (!done) {
- int tag = input.readTag();
- switch (tag) {
- case 0:
- done = true;
- break;
- default: {
- if (!parseUnknownField(input, unknownFields,
- extensionRegistry, tag)) {
- done = true;
- }
- break;
- }
- case 10: {
- bitField0_ |= 0x00000001;
- minimum_ = input.readBytes();
- break;
- }
- case 18: {
- bitField0_ |= 0x00000002;
- maximum_ = input.readBytes();
- break;
- }
- case 24: {
- bitField0_ |= 0x00000004;
- sum_ = input.readSInt64();
- break;
- }
- }
- }
- } catch (com.google.protobuf.InvalidProtocolBufferException e) {
- throw e.setUnfinishedMessage(this);
- } catch (java.io.IOException e) {
- throw new com.google.protobuf.InvalidProtocolBufferException(
- e.getMessage()).setUnfinishedMessage(this);
- } finally {
- this.unknownFields = unknownFields.build();
- makeExtensionsImmutable();
- }
- }
- public static final com.google.protobuf.Descriptors.Descriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_StringStatistics_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
- internalGetFieldAccessorTable() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_StringStatistics_fieldAccessorTable
- .ensureFieldAccessorsInitialized(
- org.apache.orc.OrcProto.StringStatistics.class, org.apache.orc.OrcProto.StringStatistics.Builder.class);
- }
-
- public static com.google.protobuf.Parser<StringStatistics> PARSER =
- new com.google.protobuf.AbstractParser<StringStatistics>() {
- public StringStatistics parsePartialFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return new StringStatistics(input, extensionRegistry);
- }
- };
-
- @java.lang.Override
- public com.google.protobuf.Parser<StringStatistics> getParserForType() {
- return PARSER;
- }
-
- private int bitField0_;
- // optional string minimum = 1;
- public static final int MINIMUM_FIELD_NUMBER = 1;
- private java.lang.Object minimum_;
- /**
- * <code>optional string minimum = 1;</code>
- */
- public boolean hasMinimum() {
- return ((bitField0_ & 0x00000001) == 0x00000001);
- }
- /**
- * <code>optional string minimum = 1;</code>
- */
- public java.lang.String getMinimum() {
- java.lang.Object ref = minimum_;
- if (ref instanceof java.lang.String) {
- return (java.lang.String) ref;
- } else {
- com.google.protobuf.ByteString bs =
- (com.google.protobuf.ByteString) ref;
- java.lang.String s = bs.toStringUtf8();
- if (bs.isValidUtf8()) {
- minimum_ = s;
- }
- return s;
- }
- }
- /**
- * <code>optional string minimum = 1;</code>
- */
- public com.google.protobuf.ByteString
- getMinimumBytes() {
- java.lang.Object ref = minimum_;
- if (ref instanceof java.lang.String) {
- com.google.protobuf.ByteString b =
- com.google.protobuf.ByteString.copyFromUtf8(
- (java.lang.String) ref);
- minimum_ = b;
- return b;
- } else {
- return (com.google.protobuf.ByteString) ref;
- }
- }
-
- // optional string maximum = 2;
- public static final int MAXIMUM_FIELD_NUMBER = 2;
- private java.lang.Object maximum_;
- /**
- * <code>optional string maximum = 2;</code>
- */
- public boolean hasMaximum() {
- return ((bitField0_ & 0x00000002) == 0x00000002);
- }
- /**
- * <code>optional string maximum = 2;</code>
- */
- public java.lang.String getMaximum() {
- java.lang.Object ref = maximum_;
- if (ref instanceof java.lang.String) {
- return (java.lang.String) ref;
- } else {
- com.google.protobuf.ByteString bs =
- (com.google.protobuf.ByteString) ref;
- java.lang.String s = bs.toStringUtf8();
- if (bs.isValidUtf8()) {
- maximum_ = s;
- }
- return s;
- }
- }
- /**
- * <code>optional string maximum = 2;</code>
- */
- public com.google.protobuf.ByteString
- getMaximumBytes() {
- java.lang.Object ref = maximum_;
- if (ref instanceof java.lang.String) {
- com.google.protobuf.ByteString b =
- com.google.protobuf.ByteString.copyFromUtf8(
- (java.lang.String) ref);
- maximum_ = b;
- return b;
- } else {
- return (com.google.protobuf.ByteString) ref;
- }
- }
-
- // optional sint64 sum = 3;
- public static final int SUM_FIELD_NUMBER = 3;
- private long sum_;
- /**
- * <code>optional sint64 sum = 3;</code>
- *
- * <pre>
- * sum will store the total length of all strings in a stripe
- * </pre>
- */
- public boolean hasSum() {
- return ((bitField0_ & 0x00000004) == 0x00000004);
- }
- /**
- * <code>optional sint64 sum = 3;</code>
- *
- * <pre>
- * sum will store the total length of all strings in a stripe
- * </pre>
- */
- public long getSum() {
- return sum_;
- }
-
- private void initFields() {
- minimum_ = "";
- maximum_ = "";
- sum_ = 0L;
- }
- private byte memoizedIsInitialized = -1;
- public final boolean isInitialized() {
- byte isInitialized = memoizedIsInitialized;
- if (isInitialized != -1) return isInitialized == 1;
-
- memoizedIsInitialized = 1;
- return true;
- }
-
- public void writeTo(com.google.protobuf.CodedOutputStream output)
- throws java.io.IOException {
- getSerializedSize();
- if (((bitField0_ & 0x00000001) == 0x00000001)) {
- output.writeBytes(1, getMinimumBytes());
- }
- if (((bitField0_ & 0x00000002) == 0x00000002)) {
- output.writeBytes(2, getMaximumBytes());
- }
- if (((bitField0_ & 0x00000004) == 0x00000004)) {
- output.writeSInt64(3, sum_);
- }
- getUnknownFields().writeTo(output);
- }
-
- private int memoizedSerializedSize = -1;
- public int getSerializedSize() {
- int size = memoizedSerializedSize;
- if (size != -1) return size;
-
- size = 0;
- if (((bitField0_ & 0x00000001) == 0x00000001)) {
- size += com.google.protobuf.CodedOutputStream
- .computeBytesSize(1, getMinimumBytes());
- }
- if (((bitField0_ & 0x00000002) == 0x00000002)) {
- size += com.google.protobuf.CodedOutputStream
- .computeBytesSize(2, getMaximumBytes());
- }
- if (((bitField0_ & 0x00000004) == 0x00000004)) {
- size += com.google.protobuf.CodedOutputStream
- .computeSInt64Size(3, sum_);
- }
- size += getUnknownFields().getSerializedSize();
- memoizedSerializedSize = size;
- return size;
- }
-
- private static final long serialVersionUID = 0L;
- @java.lang.Override
- protected java.lang.Object writeReplace()
- throws java.io.ObjectStreamException {
- return super.writeReplace();
- }
-
- public static org.apache.orc.OrcProto.StringStatistics parseFrom(
- com.google.protobuf.ByteString data)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data);
- }
- public static org.apache.orc.OrcProto.StringStatistics parseFrom(
- com.google.protobuf.ByteString data,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.StringStatistics parseFrom(byte[] data)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data);
- }
- public static org.apache.orc.OrcProto.StringStatistics parseFrom(
- byte[] data,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.StringStatistics parseFrom(java.io.InputStream input)
- throws java.io.IOException {
- return PARSER.parseFrom(input);
- }
- public static org.apache.orc.OrcProto.StringStatistics parseFrom(
- java.io.InputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseFrom(input, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.StringStatistics parseDelimitedFrom(java.io.InputStream input)
- throws java.io.IOException {
- return PARSER.parseDelimitedFrom(input);
- }
- public static org.apache.orc.OrcProto.StringStatistics parseDelimitedFrom(
- java.io.InputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseDelimitedFrom(input, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.StringStatistics parseFrom(
- com.google.protobuf.CodedInputStream input)
- throws java.io.IOException {
- return PARSER.parseFrom(input);
- }
- public static org.apache.orc.OrcProto.StringStatistics parseFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseFrom(input, extensionRegistry);
- }
-
- public static Builder newBuilder() { return Builder.create(); }
- public Builder newBuilderForType() { return newBuilder(); }
- public static Builder newBuilder(org.apache.orc.OrcProto.StringStatistics prototype) {
- return newBuilder().mergeFrom(prototype);
- }
- public Builder toBuilder() { return newBuilder(this); }
-
- @java.lang.Override
- protected Builder newBuilderForType(
- com.google.protobuf.GeneratedMessage.BuilderParent parent) {
- Builder builder = new Builder(parent);
- return builder;
- }
- /**
- * Protobuf type {@code orc.proto.StringStatistics}
- */
- public static final class Builder extends
- com.google.protobuf.GeneratedMessage.Builder<Builder>
- implements org.apache.orc.OrcProto.StringStatisticsOrBuilder {
- public static final com.google.protobuf.Descriptors.Descriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_StringStatistics_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
- internalGetFieldAccessorTable() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_StringStatistics_fieldAccessorTable
- .ensureFieldAccessorsInitialized(
- org.apache.orc.OrcProto.StringStatistics.class, org.apache.orc.OrcProto.StringStatistics.Builder.class);
- }
-
- // Construct using org.apache.orc.OrcProto.StringStatistics.newBuilder()
- private Builder() {
- maybeForceBuilderInitialization();
- }
-
- private Builder(
- com.google.protobuf.GeneratedMessage.BuilderParent parent) {
- super(parent);
- maybeForceBuilderInitialization();
- }
- private void maybeForceBuilderInitialization() {
- if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
- }
- }
- private static Builder create() {
- return new Builder();
- }
-
- public Builder clear() {
- super.clear();
- minimum_ = "";
- bitField0_ = (bitField0_ & ~0x00000001);
- maximum_ = "";
- bitField0_ = (bitField0_ & ~0x00000002);
- sum_ = 0L;
- bitField0_ = (bitField0_ & ~0x00000004);
- return this;
- }
-
- public Builder clone() {
- return create().mergeFrom(buildPartial());
- }
-
- public com.google.protobuf.Descriptors.Descriptor
- getDescriptorForType() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_StringStatistics_descriptor;
- }
-
- public org.apache.orc.OrcProto.StringStatistics getDefaultInstanceForType() {
- return org.apache.orc.OrcProto.StringStatistics.getDefaultInstance();
- }
-
- public org.apache.orc.OrcProto.StringStatistics build() {
- org.apache.orc.OrcProto.StringStatistics result = buildPartial();
- if (!result.isInitialized()) {
- throw newUninitializedMessageException(result);
- }
- return result;
- }
-
- public org.apache.orc.OrcProto.StringStatistics buildPartial() {
- org.apache.orc.OrcProto.StringStatistics result = new org.apache.orc.OrcProto.StringStatistics(this);
- int from_bitField0_ = bitField0_;
- int to_bitField0_ = 0;
- if (((from_bitField0_ & 0x00000001) == 0x00000001)) {
- to_bitField0_ |= 0x00000001;
- }
- result.minimum_ = minimum_;
- if (((from_bitField0_ & 0x00000002) == 0x00000002)) {
- to_bitField0_ |= 0x00000002;
- }
- result.maximum_ = maximum_;
- if (((from_bitField0_ & 0x00000004) == 0x00000004)) {
- to_bitField0_ |= 0x00000004;
- }
- result.sum_ = sum_;
- result.bitField0_ = to_bitField0_;
- onBuilt();
- return result;
- }
-
- public Builder mergeFrom(com.google.protobuf.Message other) {
- if (other instanceof org.apache.orc.OrcProto.StringStatistics) {
- return mergeFrom((org.apache.orc.OrcProto.StringStatistics)other);
- } else {
- super.mergeFrom(other);
- return this;
- }
- }
-
- public Builder mergeFrom(org.apache.orc.OrcProto.StringStatistics other) {
- if (other == org.apache.orc.OrcProto.StringStatistics.getDefaultInstance()) return this;
- if (other.hasMinimum()) {
- bitField0_ |= 0x00000001;
- minimum_ = other.minimum_;
- onChanged();
- }
- if (other.hasMaximum()) {
- bitField0_ |= 0x00000002;
- maximum_ = other.maximum_;
- onChanged();
- }
- if (other.hasSum()) {
- setSum(other.getSum());
- }
- this.mergeUnknownFields(other.getUnknownFields());
- return this;
- }
-
- public final boolean isInitialized() {
- return true;
- }
-
- public Builder mergeFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- org.apache.orc.OrcProto.StringStatistics parsedMessage = null;
- try {
- parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry);
- } catch (com.google.protobuf.InvalidProtocolBufferException e) {
- parsedMessage = (org.apache.orc.OrcProto.StringStatistics) e.getUnfinishedMessage();
- throw e;
- } finally {
- if (parsedMessage != null) {
- mergeFrom(parsedMessage);
- }
- }
- return this;
- }
- private int bitField0_;
-
- // optional string minimum = 1;
- private java.lang.Object minimum_ = "";
- /**
- * <code>optional string minimum = 1;</code>
- */
- public boolean hasMinimum() {
- return ((bitField0_ & 0x00000001) == 0x00000001);
- }
- /**
- * <code>optional string minimum = 1;</code>
- */
- public java.lang.String getMinimum() {
- java.lang.Object ref = minimum_;
- if (!(ref instanceof java.lang.String)) {
- java.lang.String s = ((com.google.protobuf.ByteString) ref)
- .toStringUtf8();
- minimum_ = s;
- return s;
- } else {
- return (java.lang.String) ref;
- }
- }
- /**
- * <code>optional string minimum = 1;</code>
- */
- public com.google.protobuf.ByteString
- getMinimumBytes() {
- java.lang.Object ref = minimum_;
- if (ref instanceof String) {
- com.google.protobuf.ByteString b =
- com.google.protobuf.ByteString.copyFromUtf8(
- (java.lang.String) ref);
- minimum_ = b;
- return b;
- } else {
- return (com.google.protobuf.ByteString) ref;
- }
- }
- /**
- * <code>optional string minimum = 1;</code>
- */
- public Builder setMinimum(
- java.lang.String value) {
- if (value == null) {
- throw new NullPointerException();
- }
- bitField0_ |= 0x00000001;
- minimum_ = value;
- onChanged();
- return this;
- }
- /**
- * <code>optional string minimum = 1;</code>
- */
- public Builder clearMinimum() {
- bitField0_ = (bitField0_ & ~0x00000001);
- minimum_ = getDefaultInstance().getMinimum();
- onChanged();
- return this;
- }
- /**
- * <code>optional string minimum = 1;</code>
- */
- public Builder setMinimumBytes(
- com.google.protobuf.ByteString value) {
- if (value == null) {
- throw new NullPointerException();
- }
- bitField0_ |= 0x00000001;
- minimum_ = value;
- onChanged();
- return this;
- }
-
- // optional string maximum = 2;
- private java.lang.Object maximum_ = "";
- /**
- * <code>optional string maximum = 2;</code>
- */
- public boolean hasMaximum() {
- return ((bitField0_ & 0x00000002) == 0x00000002);
- }
- /**
- * <code>optional string maximum = 2;</code>
- */
- public java.lang.String getMaximum() {
- java.lang.Object ref = maximum_;
- if (!(ref instanceof java.lang.String)) {
- java.lang.String s = ((com.google.protobuf.ByteString) ref)
- .toStringUtf8();
- maximum_ = s;
- return s;
- } else {
- return (java.lang.String) ref;
- }
- }
- /**
- * <code>optional string maximum = 2;</code>
- */
- public com.google.protobuf.ByteString
- getMaximumBytes() {
- java.lang.Object ref = maximum_;
- if (ref instanceof String) {
- com.google.protobuf.ByteString b =
- com.google.protobuf.ByteString.copyFromUtf8(
- (java.lang.String) ref);
- maximum_ = b;
- return b;
- } else {
- return (com.google.protobuf.ByteString) ref;
- }
- }
- /**
- * <code>optional string maximum = 2;</code>
- */
- public Builder setMaximum(
- java.lang.String value) {
- if (value == null) {
- throw new NullPointerException();
- }
- bitField0_ |= 0x00000002;
- maximum_ = value;
- onChanged();
- return this;
- }
- /**
- * <code>optional string maximum = 2;</code>
- */
- public Builder clearMaximum() {
- bitField0_ = (bitField0_ & ~0x00000002);
- maximum_ = getDefaultInstance().getMaximum();
- onChanged();
- return this;
- }
- /**
- * <code>optional string maximum = 2;</code>
- */
- public Builder setMaximumBytes(
- com.google.protobuf.ByteString value) {
- if (value == null) {
- throw new NullPointerException();
- }
- bitField0_ |= 0x00000002;
- maximum_ = value;
- onChanged();
- return this;
- }
-
- // optional sint64 sum = 3;
- private long sum_ ;
- /**
- * <code>optional sint64 sum = 3;</code>
- *
- * <pre>
- * sum will store the total length of all strings in a stripe
- * </pre>
- */
- public boolean hasSum() {
- return ((bitField0_ & 0x00000004) == 0x00000004);
- }
- /**
- * <code>optional sint64 sum = 3;</code>
- *
- * <pre>
- * sum will store the total length of all strings in a stripe
- * </pre>
- */
- public long getSum() {
- return sum_;
- }
- /**
- * <code>optional sint64 sum = 3;</code>
- *
- * <pre>
- * sum will store the total length of all strings in a stripe
- * </pre>
- */
- public Builder setSum(long value) {
- bitField0_ |= 0x00000004;
- sum_ = value;
- onChanged();
- return this;
- }
- /**
- * <code>optional sint64 sum = 3;</code>
- *
- * <pre>
- * sum will store the total length of all strings in a stripe
- * </pre>
- */
- public Builder clearSum() {
- bitField0_ = (bitField0_ & ~0x00000004);
- sum_ = 0L;
- onChanged();
- return this;
- }
-
- // @@protoc_insertion_point(builder_scope:orc.proto.StringStatistics)
- }
-
- static {
- defaultInstance = new StringStatistics(true);
- defaultInstance.initFields();
- }
-
- // @@protoc_insertion_point(class_scope:orc.proto.StringStatistics)
- }
-
- public interface BucketStatisticsOrBuilder
- extends com.google.protobuf.MessageOrBuilder {
-
- // repeated uint64 count = 1 [packed = true];
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- java.util.List<java.lang.Long> getCountList();
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- int getCountCount();
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- long getCount(int index);
- }
- /**
- * Protobuf type {@code orc.proto.BucketStatistics}
- */
- public static final class BucketStatistics extends
- com.google.protobuf.GeneratedMessage
- implements BucketStatisticsOrBuilder {
- // Use BucketStatistics.newBuilder() to construct.
- private BucketStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
- super(builder);
- this.unknownFields = builder.getUnknownFields();
- }
- private BucketStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
-
- private static final BucketStatistics defaultInstance;
- public static BucketStatistics getDefaultInstance() {
- return defaultInstance;
- }
-
- public BucketStatistics getDefaultInstanceForType() {
- return defaultInstance;
- }
-
- private final com.google.protobuf.UnknownFieldSet unknownFields;
- @java.lang.Override
- public final com.google.protobuf.UnknownFieldSet
- getUnknownFields() {
- return this.unknownFields;
- }
- private BucketStatistics(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- initFields();
- int mutable_bitField0_ = 0;
- com.google.protobuf.UnknownFieldSet.Builder unknownFields =
- com.google.protobuf.UnknownFieldSet.newBuilder();
- try {
- boolean done = false;
- while (!done) {
- int tag = input.readTag();
- switch (tag) {
- case 0:
- done = true;
- break;
- default: {
- if (!parseUnknownField(input, unknownFields,
- extensionRegistry, tag)) {
- done = true;
- }
- break;
- }
- case 8: {
- if (!((mutable_bitField0_ & 0x00000001) == 0x00000001)) {
- count_ = new java.util.ArrayList<java.lang.Long>();
- mutable_bitField0_ |= 0x00000001;
- }
- count_.add(input.readUInt64());
- break;
- }
- case 10: {
- int length = input.readRawVarint32();
- int limit = input.pushLimit(length);
- if (!((mutable_bitField0_ & 0x00000001) == 0x00000001) && input.getBytesUntilLimit() > 0) {
- count_ = new java.util.ArrayList<java.lang.Long>();
- mutable_bitField0_ |= 0x00000001;
- }
- while (input.getBytesUntilLimit() > 0) {
- count_.add(input.readUInt64());
- }
- input.popLimit(limit);
- break;
- }
- }
- }
- } catch (com.google.protobuf.InvalidProtocolBufferException e) {
- throw e.setUnfinishedMessage(this);
- } catch (java.io.IOException e) {
- throw new com.google.protobuf.InvalidProtocolBufferException(
- e.getMessage()).setUnfinishedMessage(this);
- } finally {
- if (((mutable_bitField0_ & 0x00000001) == 0x00000001)) {
- count_ = java.util.Collections.unmodifiableList(count_);
- }
- this.unknownFields = unknownFields.build();
- makeExtensionsImmutable();
- }
- }
- public static final com.google.protobuf.Descriptors.Descriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_BucketStatistics_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
- internalGetFieldAccessorTable() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_BucketStatistics_fieldAccessorTable
- .ensureFieldAccessorsInitialized(
- org.apache.orc.OrcProto.BucketStatistics.class, org.apache.orc.OrcProto.BucketStatistics.Builder.class);
- }
-
- public static com.google.protobuf.Parser<BucketStatistics> PARSER =
- new com.google.protobuf.AbstractParser<BucketStatistics>() {
- public BucketStatistics parsePartialFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return new BucketStatistics(input, extensionRegistry);
- }
- };
-
- @java.lang.Override
- public com.google.protobuf.Parser<BucketStatistics> getParserForType() {
- return PARSER;
- }
-
- // repeated uint64 count = 1 [packed = true];
- public static final int COUNT_FIELD_NUMBER = 1;
- private java.util.List<java.lang.Long> count_;
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public java.util.List<java.lang.Long>
- getCountList() {
- return count_;
- }
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public int getCountCount() {
- return count_.size();
- }
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public long getCount(int index) {
- return count_.get(index);
- }
- private int countMemoizedSerializedSize = -1;
-
- private void initFields() {
- count_ = java.util.Collections.emptyList();
- }
- private byte memoizedIsInitialized = -1;
- public final boolean isInitialized() {
- byte isInitialized = memoizedIsInitialized;
- if (isInitialized != -1) return isInitialized == 1;
-
- memoizedIsInitialized = 1;
- return true;
- }
-
- public void writeTo(com.google.protobuf.CodedOutputStream output)
- throws java.io.IOException {
- getSerializedSize();
- if (getCountList().size() > 0) {
- output.writeRawVarint32(10);
- output.writeRawVarint32(countMemoizedSerializedSize);
- }
- for (int i = 0; i < count_.size(); i++) {
- output.writeUInt64NoTag(count_.get(i));
- }
- getUnknownFields().writeTo(output);
- }
-
- private int memoizedSerializedSize = -1;
- public int getSerializedSize() {
- int size = memoizedSerializedSize;
- if (size != -1) return size;
-
- size = 0;
- {
- int dataSize = 0;
- for (int i = 0; i < count_.size(); i++) {
- dataSize += com.google.protobuf.CodedOutputStream
- .computeUInt64SizeNoTag(count_.get(i));
- }
- size += dataSize;
- if (!getCountList().isEmpty()) {
- size += 1;
- size += com.google.protobuf.CodedOutputStream
- .computeInt32SizeNoTag(dataSize);
- }
- countMemoizedSerializedSize = dataSize;
- }
- size += getUnknownFields().getSerializedSize();
- memoizedSerializedSize = size;
- return size;
- }
-
- private static final long serialVersionUID = 0L;
- @java.lang.Override
- protected java.lang.Object writeReplace()
- throws java.io.ObjectStreamException {
- return super.writeReplace();
- }
-
- public static org.apache.orc.OrcProto.BucketStatistics parseFrom(
- com.google.protobuf.ByteString data)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data);
- }
- public static org.apache.orc.OrcProto.BucketStatistics parseFrom(
- com.google.protobuf.ByteString data,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.BucketStatistics parseFrom(byte[] data)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data);
- }
- public static org.apache.orc.OrcProto.BucketStatistics parseFrom(
- byte[] data,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return PARSER.parseFrom(data, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.BucketStatistics parseFrom(java.io.InputStream input)
- throws java.io.IOException {
- return PARSER.parseFrom(input);
- }
- public static org.apache.orc.OrcProto.BucketStatistics parseFrom(
- java.io.InputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseFrom(input, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.BucketStatistics parseDelimitedFrom(java.io.InputStream input)
- throws java.io.IOException {
- return PARSER.parseDelimitedFrom(input);
- }
- public static org.apache.orc.OrcProto.BucketStatistics parseDelimitedFrom(
- java.io.InputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseDelimitedFrom(input, extensionRegistry);
- }
- public static org.apache.orc.OrcProto.BucketStatistics parseFrom(
- com.google.protobuf.CodedInputStream input)
- throws java.io.IOException {
- return PARSER.parseFrom(input);
- }
- public static org.apache.orc.OrcProto.BucketStatistics parseFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return PARSER.parseFrom(input, extensionRegistry);
- }
-
- public static Builder newBuilder() { return Builder.create(); }
- public Builder newBuilderForType() { return newBuilder(); }
- public static Builder newBuilder(org.apache.orc.OrcProto.BucketStatistics prototype) {
- return newBuilder().mergeFrom(prototype);
- }
- public Builder toBuilder() { return newBuilder(this); }
-
- @java.lang.Override
- protected Builder newBuilderForType(
- com.google.protobuf.GeneratedMessage.BuilderParent parent) {
- Builder builder = new Builder(parent);
- return builder;
- }
- /**
- * Protobuf type {@code orc.proto.BucketStatistics}
- */
- public static final class Builder extends
- com.google.protobuf.GeneratedMessage.Builder<Builder>
- implements org.apache.orc.OrcProto.BucketStatisticsOrBuilder {
- public static final com.google.protobuf.Descriptors.Descriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_BucketStatistics_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
- internalGetFieldAccessorTable() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_BucketStatistics_fieldAccessorTable
- .ensureFieldAccessorsInitialized(
- org.apache.orc.OrcProto.BucketStatistics.class, org.apache.orc.OrcProto.BucketStatistics.Builder.class);
- }
-
- // Construct using org.apache.orc.OrcProto.BucketStatistics.newBuilder()
- private Builder() {
- maybeForceBuilderInitialization();
- }
-
- private Builder(
- com.google.protobuf.GeneratedMessage.BuilderParent parent) {
- super(parent);
- maybeForceBuilderInitialization();
- }
- private void maybeForceBuilderInitialization() {
- if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) {
- }
- }
- private static Builder create() {
- return new Builder();
- }
-
- public Builder clear() {
- super.clear();
- count_ = java.util.Collections.emptyList();
- bitField0_ = (bitField0_ & ~0x00000001);
- return this;
- }
-
- public Builder clone() {
- return create().mergeFrom(buildPartial());
- }
-
- public com.google.protobuf.Descriptors.Descriptor
- getDescriptorForType() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_BucketStatistics_descriptor;
- }
-
- public org.apache.orc.OrcProto.BucketStatistics getDefaultInstanceForType() {
- return org.apache.orc.OrcProto.BucketStatistics.getDefaultInstance();
- }
-
- public org.apache.orc.OrcProto.BucketStatistics build() {
- org.apache.orc.OrcProto.BucketStatistics result = buildPartial();
- if (!result.isInitialized()) {
- throw newUninitializedMessageException(result);
- }
- return result;
- }
-
- public org.apache.orc.OrcProto.BucketStatistics buildPartial() {
- org.apache.orc.OrcProto.BucketStatistics result = new org.apache.orc.OrcProto.BucketStatistics(this);
- int from_bitField0_ = bitField0_;
- if (((bitField0_ & 0x00000001) == 0x00000001)) {
- count_ = java.util.Collections.unmodifiableList(count_);
- bitField0_ = (bitField0_ & ~0x00000001);
- }
- result.count_ = count_;
- onBuilt();
- return result;
- }
-
- public Builder mergeFrom(com.google.protobuf.Message other) {
- if (other instanceof org.apache.orc.OrcProto.BucketStatistics) {
- return mergeFrom((org.apache.orc.OrcProto.BucketStatistics)other);
- } else {
- super.mergeFrom(other);
- return this;
- }
- }
-
- public Builder mergeFrom(org.apache.orc.OrcProto.BucketStatistics other) {
- if (other == org.apache.orc.OrcProto.BucketStatistics.getDefaultInstance()) return this;
- if (!other.count_.isEmpty()) {
- if (count_.isEmpty()) {
- count_ = other.count_;
- bitField0_ = (bitField0_ & ~0x00000001);
- } else {
- ensureCountIsMutable();
- count_.addAll(other.count_);
- }
- onChanged();
- }
- this.mergeUnknownFields(other.getUnknownFields());
- return this;
- }
-
- public final boolean isInitialized() {
- return true;
- }
-
- public Builder mergeFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- org.apache.orc.OrcProto.BucketStatistics parsedMessage = null;
- try {
- parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry);
- } catch (com.google.protobuf.InvalidProtocolBufferException e) {
- parsedMessage = (org.apache.orc.OrcProto.BucketStatistics) e.getUnfinishedMessage();
- throw e;
- } finally {
- if (parsedMessage != null) {
- mergeFrom(parsedMessage);
- }
- }
- return this;
- }
- private int bitField0_;
-
- // repeated uint64 count = 1 [packed = true];
- private java.util.List<java.lang.Long> count_ = java.util.Collections.emptyList();
- private void ensureCountIsMutable() {
- if (!((bitField0_ & 0x00000001) == 0x00000001)) {
- count_ = new java.util.ArrayList<java.lang.Long>(count_);
- bitField0_ |= 0x00000001;
- }
- }
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public java.util.List<java.lang.Long>
- getCountList() {
- return java.util.Collections.unmodifiableList(count_);
- }
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public int getCountCount() {
- return count_.size();
- }
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public long getCount(int index) {
- return count_.get(index);
- }
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public Builder setCount(
- int index, long value) {
- ensureCountIsMutable();
- count_.set(index, value);
- onChanged();
- return this;
- }
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public Builder addCount(long value) {
- ensureCountIsMutable();
- count_.add(value);
- onChanged();
- return this;
- }
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public Builder addAllCount(
- java.lang.Iterable<? extends java.lang.Long> values) {
- ensureCountIsMutable();
- super.addAll(values, count_);
- onChanged();
- return this;
- }
- /**
- * <code>repeated uint64 count = 1 [packed = true];</code>
- */
- public Builder clearCount() {
- count_ = java.util.Collections.emptyList();
- bitField0_ = (bitField0_ & ~0x00000001);
- onChanged();
- return this;
- }
-
- // @@protoc_insertion_point(builder_scope:orc.proto.BucketStatistics)
- }
-
- static {
- defaultInstance = new BucketStatistics(true);
- defaultInstance.initFields();
- }
-
- // @@protoc_insertion_point(class_scope:orc.proto.BucketStatistics)
- }
-
- public interface DecimalStatisticsOrBuilder
- extends com.google.protobuf.MessageOrBuilder {
-
- // optional string minimum = 1;
- /**
- * <code>optional string minimum = 1;</code>
- */
- boolean hasMinimum();
- /**
- * <code>optional string minimum = 1;</code>
- */
- java.lang.String getMinimum();
- /**
- * <code>optional string minimum = 1;</code>
- */
- com.google.protobuf.ByteString
- getMinimumBytes();
-
- // optional string maximum = 2;
- /**
- * <code>optional string maximum = 2;</code>
- */
- boolean hasMaximum();
- /**
- * <code>optional string maximum = 2;</code>
- */
- java.lang.String getMaximum();
- /**
- * <code>optional string maximum = 2;</code>
- */
- com.google.protobuf.ByteString
- getMaximumBytes();
-
- // optional string sum = 3;
- /**
- * <code>optional string sum = 3;</code>
- */
- boolean hasSum();
- /**
- * <code>optional string sum = 3;</code>
- */
- java.lang.String getSum();
- /**
- * <code>optional string sum = 3;</code>
- */
- com.google.protobuf.ByteString
- getSumBytes();
- }
- /**
- * Protobuf type {@code orc.proto.DecimalStatistics}
- */
- public static final class DecimalStatistics extends
- com.google.protobuf.GeneratedMessage
- implements DecimalStatisticsOrBuilder {
- // Use DecimalStatistics.newBuilder() to construct.
- private DecimalStatistics(com.google.protobuf.GeneratedMessage.Builder<?> builder) {
- super(builder);
- this.unknownFields = builder.getUnknownFields();
- }
- private DecimalStatistics(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); }
-
- private static final DecimalStatistics defaultInstance;
- public static DecimalStatistics getDefaultInstance() {
- return defaultInstance;
- }
-
- public DecimalStatistics getDefaultInstanceForType() {
- return defaultInstance;
- }
-
- private final com.google.protobuf.UnknownFieldSet unknownFields;
- @java.lang.Override
- public final com.google.protobuf.UnknownFieldSet
- getUnknownFields() {
- return this.unknownFields;
- }
- private DecimalStatistics(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- initFields();
- int mutable_bitField0_ = 0;
- com.google.protobuf.UnknownFieldSet.Builder unknownFields =
- com.google.protobuf.UnknownFieldSet.newBuilder();
- try {
- boolean done = false;
- while (!done) {
- int tag = input.readTag();
- switch (tag) {
- case 0:
- done = true;
- break;
- default: {
- if (!parseUnknownField(input, unknownFields,
- extensionRegistry, tag)) {
- done = true;
- }
- break;
- }
- case 10: {
- bitField0_ |= 0x00000001;
- minimum_ = input.readBytes();
- break;
- }
- case 18: {
- bitField0_ |= 0x00000002;
- maximum_ = input.readBytes();
- break;
- }
- case 26: {
- bitField0_ |= 0x00000004;
- sum_ = input.readBytes();
- break;
- }
- }
- }
- } catch (com.google.protobuf.InvalidProtocolBufferException e) {
- throw e.setUnfinishedMessage(this);
- } catch (java.io.IOException e) {
- throw new com.google.protobuf.InvalidProtocolBufferException(
- e.getMessage()).setUnfinishedMessage(this);
- } finally {
- this.unknownFields = unknownFields.build();
- makeExtensionsImmutable();
- }
- }
- public static final com.google.protobuf.Descriptors.Descriptor
- getDescriptor() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_DecimalStatistics_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable
- internalGetFieldAccessorTable() {
- return org.apache.orc.OrcProto.internal_static_orc_proto_DecimalStatistics_fieldAccessorTable
- .ensureFieldAccessorsInitialized(
- org.apache.orc.OrcProto.DecimalStatistics.class, org.apache.orc.OrcProto.DecimalStatistics.Builder.class);
- }
-
- public static com.google.protobuf.Parser<DecimalStatistics> PARSER =
- new com.google.protobuf.AbstractParser<DecimalStatistics>() {
- public DecimalStatistics parsePartialFrom(
- com.google.protobuf.CodedInputStream input,
- com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return new DecimalStatistics(input, extensionRegistry);
- }
- };
-
- @java.lang.Override
- public com.google.protobuf.Parser<DecimalStatistics> getParserForType() {
- return PARSER;
- }
-
- private int bitField0_;
- // optional string minimum = 1;
- public static final int MINIMUM_FIELD_NUMBER = 1;
- private java.lang.Object minimum_;
- /**
- * <code>optional string minimum = 1;</code>
- */
- public boolean hasMinimum() {
- return ((bitField0_ & 0x00000001) == 0x00000001);
- }
- /**
- * <code>optional string minimum = 1;</code>
- */
- public java.lang.String getMinimum() {
- java.lang.Object ref = minimum_;
- if (ref instanceof java.lang.String) {
- return (java.lang.String) ref;
- } else {
- com.google.protobuf.ByteString bs =
- (com.google.protobuf.ByteString) ref;
- java.lang.String s = bs.toStringUtf8();
- if (bs.isValidUtf8()) {
- minimum_ = s;
- }
- return s;
- }
- }
- /**
- * <code>optional string minimum = 1;</code>
- */
- public com.google.protobuf.ByteString
- getMinimumBytes() {
- java.lang.Object ref = minimum_;
- if (ref instanceof java.lang.String) {
- com.google.protobuf.ByteString b =
- com.google.protobuf.ByteString.copyFromUtf8(
- (java.lang.String) ref);
- minimum_ = b;
<TRUNCATED>
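The generated OrcProto statistics messages deleted above follow the standard protocol buffers 2.x builder pattern (newBuilder/setX/build plus a PARSER-backed parseFrom). As a rough illustration only -- a minimal sketch assuming the relocated org.apache.orc.OrcProto classes and the protobuf-java runtime are on the classpath -- a StringStatistics message could be built and round-tripped like this:

    import com.google.protobuf.ByteString;
    import org.apache.orc.OrcProto;

    public class StringStatisticsExample {
      public static void main(String[] args) throws Exception {
        // Build a StringStatistics message with the generated builder.
        OrcProto.StringStatistics stats = OrcProto.StringStatistics.newBuilder()
            .setMinimum("apple")
            .setMaximum("zebra")
            .setSum(42)   // per the .proto comment: total length of all strings in the stripe
            .build();

        // Serialize and parse it back, as a reader of a file footer would.
        ByteString bytes = stats.toByteString();
        OrcProto.StringStatistics parsed = OrcProto.StringStatistics.parseFrom(bytes);
        System.out.println(parsed.hasMinimum() + " " + parsed.getMinimum()
            + " sum=" + parsed.getSum());
      }
    }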
[23/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/IntegerColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/IntegerColumnStatistics.java b/orc/src/java/org/apache/orc/IntegerColumnStatistics.java
deleted file mode 100644
index 1a162ff..0000000
--- a/orc/src/java/org/apache/orc/IntegerColumnStatistics.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import org.apache.orc.ColumnStatistics;
-
-/**
- * Statistics for all of the integer columns, such as byte, short, int, and
- * long.
- */
-public interface IntegerColumnStatistics extends ColumnStatistics {
- /**
- * Get the smallest value in the column. Only defined if getNumberOfValues
- * is non-zero.
- * @return the minimum
- */
- long getMinimum();
-
- /**
- * Get the largest value in the column. Only defined if getNumberOfValues
- * is non-zero.
- * @return the maximum
- */
- long getMaximum();
-
- /**
- * Is the sum defined? If the sum overflowed the counter this will be false.
- * @return is the sum available
- */
- boolean isSumDefined();
-
- /**
- * Get the sum of the column. Only valid if isSumDefined returns true.
- * @return the sum of the column
- */
- long getSum();
-}
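The interface removed above makes its contract explicit: getMinimum()/getMaximum() are only defined when getNumberOfValues() is non-zero, and getSum() is only valid when isSumDefined() returns true (the sum may have overflowed). A minimal, hypothetical sketch of consuming these statistics through the public reader API -- the file path below is made up for illustration -- might look like:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.ColumnStatistics;
    import org.apache.orc.IntegerColumnStatistics;
    import org.apache.orc.OrcFile;
    import org.apache.orc.Reader;

    public class IntegerStatsExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical path; substitute a real ORC file.
        Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
            OrcFile.readerOptions(conf));
        for (ColumnStatistics stats : reader.getStatistics()) {
          if (stats instanceof IntegerColumnStatistics) {
            IntegerColumnStatistics ints = (IntegerColumnStatistics) stats;
            if (ints.getNumberOfValues() > 0) {   // min/max defined only for non-empty columns
              System.out.println("min=" + ints.getMinimum()
                  + " max=" + ints.getMaximum());
            }
            if (ints.isSumDefined()) {            // sum valid only if it did not overflow
              System.out.println("sum=" + ints.getSum());
            }
          }
        }
      }
    }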
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/OrcConf.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/OrcConf.java b/orc/src/java/org/apache/orc/OrcConf.java
deleted file mode 100644
index 357318d..0000000
--- a/orc/src/java/org/apache/orc/OrcConf.java
+++ /dev/null
@@ -1,193 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import org.apache.hadoop.conf.Configuration;
-
-import java.util.Properties;
-
-/**
- * Define the configuration properties that Orc understands.
- */
-public enum OrcConf {
- STRIPE_SIZE("orc.stripe.size", "hive.exec.orc.default.stripe.size",
- 64L * 1024 * 1024,
- "Define the default ORC stripe size, in bytes."),
- BLOCK_SIZE("orc.block.size", "hive.exec.orc.default.block.size",
- 256L * 1024 * 1024,
- "Define the default file system block size for ORC files."),
- ENABLE_INDEXES("orc.create.index", "orc.create.index", true,
- "Should the ORC writer create indexes as part of the file."),
- ROW_INDEX_STRIDE("orc.row.index.stride",
- "hive.exec.orc.default.row.index.stride", 10000,
- "Define the default ORC index stride in number of rows. (Stride is the\n"+
- " number of rows n index entry represents.)"),
- BUFFER_SIZE("orc.compress.size", "hive.exec.orc.default.buffer.size",
- 256 * 1024, "Define the default ORC buffer size, in bytes."),
- BASE_DELTA_RATIO("orc.base.delta.ratio", "hive.exec.orc.base.delta.ratio", 8,
- "The ratio of base writer and delta writer in terms of STRIPE_SIZE and BUFFER_SIZE."),
- BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding",
- true,
- "Define whether stripes should be padded to the HDFS block boundaries."),
- COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
- "Define the default compression codec for ORC file"),
- WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
- "Define the version of the file to write. Possible values are 0.11 and\n"+
- " 0.12. If this parameter is not defined, ORC will use the run\n" +
- " length encoding (RLE) introduced in Hive 0.12."),
- ENCODING_STRATEGY("orc.encoding.strategy", "hive.exec.orc.encoding.strategy",
- "SPEED",
- "Define the encoding strategy to use while writing data. Changing this\n"+
- "will only affect the light weight encoding for integers. This\n" +
- "flag will not change the compression level of higher level\n" +
- "compression codec (like ZLIB)."),
- COMPRESSION_STRATEGY("orc.compression.strategy",
- "hive.exec.orc.compression.strategy", "SPEED",
- "Define the compression strategy to use while writing data.\n" +
- "This changes the compression level of higher level compression\n" +
- "codec (like ZLIB)."),
- BLOCK_PADDING_TOLERANCE("orc.block.padding.tolerance",
- "hive.exec.orc.block.padding.tolerance", 0.05,
- "Define the tolerance for block padding as a decimal fraction of\n" +
- "stripe size (for example, the default value 0.05 is 5% of the\n" +
- "stripe size). For the defaults of 64Mb ORC stripe and 256Mb HDFS\n" +
- "blocks, the default block padding tolerance of 5% will\n" +
- "reserve a maximum of 3.2Mb for padding within the 256Mb block.\n" +
- "In that case, if the available size within the block is more than\n"+
- "3.2Mb, a new smaller stripe will be inserted to fit within that\n" +
- "space. This will make sure that no stripe written will block\n" +
- " boundaries and cause remote reads within a node local task."),
- BLOOM_FILTER_FPP("orc.bloom.filter.fpp", "orc.default.bloom.fpp", 0.05,
- "Define the default false positive probability for bloom filters."),
- USE_ZEROCOPY("orc.use.zerocopy", "hive.exec.orc.zerocopy", false,
- "Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)"),
- SKIP_CORRUPT_DATA("orc.skip.corrupt.data", "hive.exec.orc.skip.corrupt.data",
- false,
- "If ORC reader encounters corrupt data, this value will be used to\n" +
- "determine whether to skip the corrupt data or throw exception.\n" +
- "The default behavior is to throw exception."),
- MEMORY_POOL("orc.memory.pool", "hive.exec.orc.memory.pool", 0.5,
- "Maximum fraction of heap that can be used by ORC file writers"),
- DICTIONARY_KEY_SIZE_THRESHOLD("orc.dictionary.key.threshold",
- "hive.exec.orc.dictionary.key.size.threshold",
- 0.8,
- "If the number of distinct keys in a dictionary is greater than this\n" +
- "fraction of the total number of non-null rows, turn off \n" +
- "dictionary encoding. Use 1 to always use dictionary encoding."),
- ROW_INDEX_STRIDE_DICTIONARY_CHECK("orc.dictionary.early.check",
- "hive.orc.row.index.stride.dictionary.check",
- true,
- "If enabled dictionary check will happen after first row index stride\n" +
- "(default 10000 rows) else dictionary check will happen before\n" +
- "writing first stripe. In both cases, the decision to use\n" +
- "dictionary or not will be retained thereafter."),
- BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
- "", "List of columns to create bloom filters for when writing.")
- ;
-
- private final String attribute;
- private final String hiveConfName;
- private final Object defaultValue;
- private final String description;
-
- OrcConf(String attribute,
- String hiveConfName,
- Object defaultValue,
- String description) {
- this.attribute = attribute;
- this.hiveConfName = hiveConfName;
- this.defaultValue = defaultValue;
- this.description = description;
- }
-
- public String getAttribute() {
- return attribute;
- }
-
- public String getHiveConfName() {
- return hiveConfName;
- }
-
- public Object getDefaultValue() {
- return defaultValue;
- }
-
- public String getDescription() {
- return description;
- }
-
- private String lookupValue(Properties tbl, Configuration conf) {
- String result = null;
- if (tbl != null) {
- result = tbl.getProperty(attribute);
- }
- if (result == null && conf != null) {
- result = conf.get(attribute);
- if (result == null) {
- result = conf.get(hiveConfName);
- }
- }
- return result;
- }
-
- public long getLong(Properties tbl, Configuration conf) {
- String value = lookupValue(tbl, conf);
- if (value != null) {
- return Long.parseLong(value);
- }
- return ((Number) defaultValue).longValue();
- }
-
- public long getLong(Configuration conf) {
- return getLong(null, conf);
- }
-
- public String getString(Properties tbl, Configuration conf) {
- String value = lookupValue(tbl, conf);
- return value == null ? (String) defaultValue : value;
- }
-
- public String getString(Configuration conf) {
- return getString(null, conf);
- }
-
- public boolean getBoolean(Properties tbl, Configuration conf) {
- String value = lookupValue(tbl, conf);
- if (value != null) {
- return Boolean.parseBoolean(value);
- }
- return (Boolean) defaultValue;
- }
-
- public boolean getBoolean(Configuration conf) {
- return getBoolean(null, conf);
- }
-
- public double getDouble(Properties tbl, Configuration conf) {
- String value = lookupValue(tbl, conf);
- if (value != null) {
- return Double.parseDouble(value);
- }
- return ((Number) defaultValue).doubleValue();
- }
-
- public double getDouble(Configuration conf) {
- return getDouble(null, conf);
- }
-}
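The lookupValue() logic in the enum removed above defines a clear precedence order: a table property keyed by the ORC attribute name wins, then the Configuration is consulted under the ORC attribute name, then under the legacy Hive name, and finally the built-in default applies. A minimal sketch of that behaviour, assuming the org.apache.orc classes are on the classpath:

    import java.util.Properties;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.orc.OrcConf;

    public class OrcConfExample {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        Properties tableProps = new Properties();

        // Nothing set anywhere: the built-in default (64 MB) is returned.
        System.out.println(OrcConf.STRIPE_SIZE.getLong(tableProps, conf));

        // The legacy Hive name is honoured when the ORC attribute is absent.
        conf.setLong("hive.exec.orc.default.stripe.size", 128L * 1024 * 1024);
        System.out.println(OrcConf.STRIPE_SIZE.getLong(tableProps, conf));

        // A table property under the ORC attribute name takes precedence over both.
        tableProps.setProperty("orc.stripe.size", String.valueOf(32L * 1024 * 1024));
        System.out.println(OrcConf.STRIPE_SIZE.getLong(tableProps, conf));
      }
    }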
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/OrcFile.java b/orc/src/java/org/apache/orc/OrcFile.java
deleted file mode 100644
index 06fb666..0000000
--- a/orc/src/java/org/apache/orc/OrcFile.java
+++ /dev/null
@@ -1,574 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import java.io.IOException;
-import java.util.Properties;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.orc.impl.MemoryManager;
-import org.apache.orc.impl.OrcTail;
-import org.apache.orc.impl.ReaderImpl;
-import org.apache.orc.impl.WriterImpl;
-
-/**
- * Contains factory methods to read or write ORC files.
- */
-public class OrcFile {
- public static final String MAGIC = "ORC";
-
- /**
- * Create a version number for the ORC file format, so that we can add
- * non-forward compatible changes in the future. To make it easier for users
- * to understand the version numbers, we use the Hive release number that
- * first wrote that version of ORC files.
- *
- * Thus, if you add new encodings or other non-forward compatible changes
- * to ORC files, which prevent the old reader from reading the new format,
- * you should change this variable to reflect the next Hive release number.
- * Non-forward compatible changes should never be added in patch releases.
- *
- * Do not make any changes that break backwards compatibility, which would
- * prevent the new reader from reading ORC files generated by any released
- * version of Hive.
- */
- public enum Version {
- V_0_11("0.11", 0, 11),
- V_0_12("0.12", 0, 12);
-
- public static final Version CURRENT = V_0_12;
-
- private final String name;
- private final int major;
- private final int minor;
-
- Version(String name, int major, int minor) {
- this.name = name;
- this.major = major;
- this.minor = minor;
- }
-
- public static Version byName(String name) {
- for(Version version: values()) {
- if (version.name.equals(name)) {
- return version;
- }
- }
- throw new IllegalArgumentException("Unknown ORC version " + name);
- }
-
- /**
- * Get the human readable name for the version.
- */
- public String getName() {
- return name;
- }
-
- /**
- * Get the major version number.
- */
- public int getMajor() {
- return major;
- }
-
- /**
- * Get the minor version number.
- */
- public int getMinor() {
- return minor;
- }
- }
-
- /**
- * Records the version of the writer in terms of which bugs have been fixed.
- * For bugs in the writer where the old readers already read the new data
- * correctly, bump this version instead of the Version.
- */
- public enum WriterVersion {
- ORIGINAL(0),
- HIVE_8732(1), // corrupted stripe/file maximum column statistics
- HIVE_4243(2), // use real column names from Hive tables
- HIVE_12055(3), // vectorized writer
- HIVE_13083(4), // decimal writer updating present stream wrongly
-
- // Don't use any magic numbers here except for the below:
- FUTURE(Integer.MAX_VALUE); // a version from a future writer
-
- private final int id;
-
- public int getId() {
- return id;
- }
-
- WriterVersion(int id) {
- this.id = id;
- }
-
- private static final WriterVersion[] values;
- static {
- // Assumes few non-negative values close to zero.
- int max = Integer.MIN_VALUE;
- for (WriterVersion v : WriterVersion.values()) {
- if (v.id < 0) throw new AssertionError();
- if (v.id > max && FUTURE.id != v.id) {
- max = v.id;
- }
- }
- values = new WriterVersion[max + 1];
- for (WriterVersion v : WriterVersion.values()) {
- if (v.id < values.length) {
- values[v.id] = v;
- }
- }
- }
-
- /**
- * Convert the integer from OrcProto.PostScript.writerVersion
- * to the enumeration with unknown versions being mapped to FUTURE.
- * @param val the serialized writer version
- * @return the corresponding enumeration value
- */
- public static WriterVersion from(int val) {
- if (val >= values.length) {
- return FUTURE;
- }
- return values[val];
- }
- }
- public static final WriterVersion CURRENT_WRITER = WriterVersion.HIVE_13083;
-
- public enum EncodingStrategy {
- SPEED, COMPRESSION
- }
-
- public enum CompressionStrategy {
- SPEED, COMPRESSION
- }
-
- // unused
- protected OrcFile() {}
-
- public static class ReaderOptions {
- private final Configuration conf;
- private FileSystem filesystem;
- private long maxLength = Long.MAX_VALUE;
- private OrcTail orcTail;
-    // TODO: We can generalize the FileMetadata interface. Make OrcTail implement FileMetadata interface
-    // and remove this class altogether. Both footer caching and llap caching just need OrcTail.
- // For now keeping this around to avoid complex surgery
- private FileMetadata fileMetadata;
-
- public ReaderOptions(Configuration conf) {
- this.conf = conf;
- }
-
- public ReaderOptions filesystem(FileSystem fs) {
- this.filesystem = fs;
- return this;
- }
-
- public ReaderOptions maxLength(long val) {
- maxLength = val;
- return this;
- }
-
- public ReaderOptions orcTail(OrcTail tail) {
- this.orcTail = tail;
- return this;
- }
-
- public Configuration getConfiguration() {
- return conf;
- }
-
- public FileSystem getFilesystem() {
- return filesystem;
- }
-
- public long getMaxLength() {
- return maxLength;
- }
-
- public OrcTail getOrcTail() {
- return orcTail;
- }
-
- public ReaderOptions fileMetadata(final FileMetadata metadata) {
- fileMetadata = metadata;
- return this;
- }
-
- public FileMetadata getFileMetadata() {
- return fileMetadata;
- }
- }
-
- public static ReaderOptions readerOptions(Configuration conf) {
- return new ReaderOptions(conf);
- }
-
- public static Reader createReader(Path path,
- ReaderOptions options) throws IOException {
- return new ReaderImpl(path, options);
- }
-
- public interface WriterContext {
- Writer getWriter();
- }
-
- public interface WriterCallback {
- void preStripeWrite(WriterContext context) throws IOException;
- void preFooterWrite(WriterContext context) throws IOException;
- }
-
- /**
- * Options for creating ORC file writers.
- */
- public static class WriterOptions {
- private final Configuration configuration;
- private FileSystem fileSystemValue = null;
- private TypeDescription schema = null;
- private long stripeSizeValue;
- private long blockSizeValue;
- private int rowIndexStrideValue;
- private int bufferSizeValue;
- private boolean enforceBufferSize = false;
- private boolean blockPaddingValue;
- private CompressionKind compressValue;
- private MemoryManager memoryManagerValue;
- private Version versionValue;
- private WriterCallback callback;
- private EncodingStrategy encodingStrategy;
- private CompressionStrategy compressionStrategy;
- private double paddingTolerance;
- private String bloomFilterColumns;
- private double bloomFilterFpp;
-
- protected WriterOptions(Properties tableProperties, Configuration conf) {
- configuration = conf;
- memoryManagerValue = getStaticMemoryManager(conf);
- stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(tableProperties, conf);
- blockSizeValue = OrcConf.BLOCK_SIZE.getLong(tableProperties, conf);
- rowIndexStrideValue =
- (int) OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf);
- bufferSizeValue = (int) OrcConf.BUFFER_SIZE.getLong(tableProperties,
- conf);
- blockPaddingValue =
- OrcConf.BLOCK_PADDING.getBoolean(tableProperties, conf);
- compressValue =
- CompressionKind.valueOf(OrcConf.COMPRESS.getString(tableProperties,
- conf).toUpperCase());
- String versionName = OrcConf.WRITE_FORMAT.getString(tableProperties,
- conf);
- versionValue = Version.byName(versionName);
- String enString = OrcConf.ENCODING_STRATEGY.getString(tableProperties,
- conf);
- encodingStrategy = EncodingStrategy.valueOf(enString);
-
- String compString =
- OrcConf.COMPRESSION_STRATEGY.getString(tableProperties, conf);
- compressionStrategy = CompressionStrategy.valueOf(compString);
-
- paddingTolerance =
- OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf);
-
- bloomFilterColumns = OrcConf.BLOOM_FILTER_COLUMNS.getString(tableProperties,
- conf);
- bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties,
- conf);
- }
-
- /**
- * Provide the filesystem for the path, if the client has it available.
- * If it is not provided, it will be found from the path.
- */
- public WriterOptions fileSystem(FileSystem value) {
- fileSystemValue = value;
- return this;
- }
-
- /**
- * Set the stripe size for the file. The writer stores the contents of the
- * stripe in memory until this memory limit is reached and the stripe
- * is flushed to the HDFS file and the next stripe started.
- */
- public WriterOptions stripeSize(long value) {
- stripeSizeValue = value;
- return this;
- }
-
- /**
- * Set the file system block size for the file. For optimal performance,
- * set the block size to a multiple of the stripe size.
- */
- public WriterOptions blockSize(long value) {
- blockSizeValue = value;
- return this;
- }
-
- /**
- * Set the distance between entries in the row index. The minimum value is
- * 1000 to prevent the index from overwhelming the data. If the stride is
- * set to 0, no indexes will be included in the file.
- */
- public WriterOptions rowIndexStride(int value) {
- rowIndexStrideValue = value;
- return this;
- }
-
- /**
- * The size of the memory buffers used for compressing and storing the
- * stripe in memory. NOTE: the ORC writer may choose a smaller buffer
- * size based on the stripe size and number of columns for efficient
- * stripe writing and memory utilization. To force the writer to use the
- * requested buffer size, call enforceBufferSize().
- */
- public WriterOptions bufferSize(int value) {
- bufferSizeValue = value;
- return this;
- }
-
- /**
- * Force the writer to use the requested buffer size instead of estimating
- * one from the stripe size and number of columns.
- * See the bufferSize() method for more info.
- * Default: false
- */
- public WriterOptions enforceBufferSize() {
- enforceBufferSize = true;
- return this;
- }
-
- /**
- * Sets whether the HDFS blocks are padded to prevent stripes from
- * straddling blocks. Padding improves locality and thus the speed of
- * reading, but costs space.
- */
- public WriterOptions blockPadding(boolean value) {
- blockPaddingValue = value;
- return this;
- }
-
- /**
- * Sets the encoding strategy that is used to encode the data.
- */
- public WriterOptions encodingStrategy(EncodingStrategy strategy) {
- encodingStrategy = strategy;
- return this;
- }
-
- /**
- * Sets the tolerance for block padding as a percentage of stripe size.
- */
- public WriterOptions paddingTolerance(double value) {
- paddingTolerance = value;
- return this;
- }
-
- /**
- * Comma-separated list of column names for which bloom filters should be created.
- */
- public WriterOptions bloomFilterColumns(String columns) {
- bloomFilterColumns = columns;
- return this;
- }
-
- /**
- * Specify the false positive probability for bloom filter.
- * @param fpp - false positive probability
- * @return this
- */
- public WriterOptions bloomFilterFpp(double fpp) {
- bloomFilterFpp = fpp;
- return this;
- }
-
- /**
- * Sets the generic compression that is used to compress the data.
- */
- public WriterOptions compress(CompressionKind value) {
- compressValue = value;
- return this;
- }
-
- /**
- * Set the schema for the file. This is a required parameter.
- * @param schema the schema for the file.
- * @return this
- */
- public WriterOptions setSchema(TypeDescription schema) {
- this.schema = schema;
- return this;
- }
-
- /**
- * Sets the version of the file that will be written.
- */
- public WriterOptions version(Version value) {
- versionValue = value;
- return this;
- }
-
- /**
- * Add a listener for when the stripe and file are about to be closed.
- * @param callback the object to be called when the stripe is closed
- * @return this
- */
- public WriterOptions callback(WriterCallback callback) {
- this.callback = callback;
- return this;
- }
-
- /**
- * A package local option to set the memory manager.
- */
- protected WriterOptions memory(MemoryManager value) {
- memoryManagerValue = value;
- return this;
- }
-
- public boolean getBlockPadding() {
- return blockPaddingValue;
- }
-
- public long getBlockSize() {
- return blockSizeValue;
- }
-
- public String getBloomFilterColumns() {
- return bloomFilterColumns;
- }
-
- public FileSystem getFileSystem() {
- return fileSystemValue;
- }
-
- public Configuration getConfiguration() {
- return configuration;
- }
-
- public TypeDescription getSchema() {
- return schema;
- }
-
- public long getStripeSize() {
- return stripeSizeValue;
- }
-
- public CompressionKind getCompress() {
- return compressValue;
- }
-
- public WriterCallback getCallback() {
- return callback;
- }
-
- public Version getVersion() {
- return versionValue;
- }
-
- public MemoryManager getMemoryManager() {
- return memoryManagerValue;
- }
-
- public int getBufferSize() {
- return bufferSizeValue;
- }
-
- public boolean isEnforceBufferSize() {
- return enforceBufferSize;
- }
-
- public int getRowIndexStride() {
- return rowIndexStrideValue;
- }
-
- public CompressionStrategy getCompressionStrategy() {
- return compressionStrategy;
- }
-
- public EncodingStrategy getEncodingStrategy() {
- return encodingStrategy;
- }
-
- public double getPaddingTolerance() {
- return paddingTolerance;
- }
-
- public double getBloomFilterFpp() {
- return bloomFilterFpp;
- }
- }
-
- /**
- * Create a set of writer options based on a configuration.
- * @param conf the configuration to use for values
- * @return A WriterOptions object that can be modified
- */
- public static WriterOptions writerOptions(Configuration conf) {
- return new WriterOptions(null, conf);
- }
-
- /**
- * Create a set of write options based on a set of table properties and
- * configuration.
- * @param tableProperties the properties of the table
- * @param conf the configuration of the query
- * @return a WriterOptions object that can be modified
- */
- public static WriterOptions writerOptions(Properties tableProperties,
- Configuration conf) {
- return new WriterOptions(tableProperties, conf);
- }
-
- private static ThreadLocal<MemoryManager> memoryManager = null;
-
- private static synchronized MemoryManager getStaticMemoryManager(
- final Configuration conf) {
- if (memoryManager == null) {
- memoryManager = new ThreadLocal<MemoryManager>() {
- @Override
- protected MemoryManager initialValue() {
- return new MemoryManager(conf);
- }
- };
- }
- return memoryManager.get();
- }
-
- /**
- * Create an ORC file writer. This is the public interface for creating
- * writers going forward and new options will only be added to this method.
- * @param path filename to write to
- * @param opts the options
- * @return a new ORC file writer
- * @throws IOException
- */
- public static Writer createWriter(Path path,
- WriterOptions opts
- ) throws IOException {
- FileSystem fs = opts.getFileSystem() == null ?
- path.getFileSystem(opts.getConfiguration()) : opts.getFileSystem();
-
- return new WriterImpl(fs, path, opts);
- }
-
-}
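For reference, a minimal usage sketch of the options builders in the OrcFile class above (not part of this commit). It assumes the pre-move org.apache.orc package names, a writable local path, and Writer#close() from the Writer interface, which is not shown in this diff; row writing is elided.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class OrcFileOptionsSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Schema built with the TypeDescription factory methods referenced later in this commit.
    TypeDescription schema = TypeDescription.createStruct()
        .addField("id", TypeDescription.createLong())
        .addField("name", TypeDescription.createString());

    // Writer side: table properties are optional, so writerOptions(conf) is used here.
    Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
        OrcFile.writerOptions(conf)
            .setSchema(schema)
            .compress(CompressionKind.ZLIB)
            .stripeSize(64L * 1024 * 1024));
    writer.close();   // rows would normally be added before closing; Writer#close is assumed

    // Reader side: the filesystem is looked up from the path when not supplied.
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));
    System.out.println("rows: " + reader.getNumberOfRows());
  }
}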
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/OrcUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/OrcUtils.java b/orc/src/java/org/apache/orc/OrcUtils.java
deleted file mode 100644
index 4f02926..0000000
--- a/orc/src/java/org/apache/orc/OrcUtils.java
+++ /dev/null
@@ -1,624 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.orc.OrcProto.Type.Builder;
-import org.apache.orc.impl.ReaderImpl;
-
-import com.google.common.collect.Lists;
-
-public class OrcUtils {
-
- /**
- * Returns the selected columns as a boolean array, with true set for the specified column names.
- * The result contains one element per flattened column.
- * For example:
- * selectedColumns - a,b,c
- * allColumns - a,b,c,d
- * If column c is a complex type, say list<string>, and the other columns are primitives, then
- * the result will be [false, true, true, true, true, false].
- * Index 0 is the root struct, which is false by default; indexes 1 and 2 correspond to
- * columns a and b; indexes 3 and 4 correspond to column c, which flattens into 2 columns as
- * list<string>; and index 5 corresponds to column d.
- *
- * @param selectedColumns - comma separated list of selected column names
- * @param schema - object schema
- * @return - boolean array with true value set for the specified column names
- */
- public static boolean[] includeColumns(String selectedColumns,
- TypeDescription schema) {
- int numFlattenedCols = schema.getMaximumId();
- boolean[] results = new boolean[numFlattenedCols + 1];
- if ("*".equals(selectedColumns)) {
- Arrays.fill(results, true);
- return results;
- }
- if (selectedColumns != null &&
- schema.getCategory() == TypeDescription.Category.STRUCT) {
- List<String> fieldNames = schema.getFieldNames();
- List<TypeDescription> fields = schema.getChildren();
- for (String column: selectedColumns.split((","))) {
- TypeDescription col = findColumn(column, fieldNames, fields);
- if (col != null) {
- for(int i=col.getId(); i <= col.getMaximumId(); ++i) {
- results[i] = true;
- }
- }
- }
- }
- return results;
- }
-
- private static TypeDescription findColumn(String columnName,
- List<String> fieldNames,
- List<TypeDescription> fields) {
- int i = 0;
- for(String fieldName: fieldNames) {
- if (fieldName.equalsIgnoreCase(columnName)) {
- return fields.get(i);
- } else {
- i += 1;
- }
- }
- return null;
- }
-
- public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr) {
- List<OrcProto.Type> result = Lists.newArrayList();
- appendOrcTypes(result, typeDescr);
- return result;
- }
-
- private static void appendOrcTypes(List<OrcProto.Type> result, TypeDescription typeDescr) {
- OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
- List<TypeDescription> children = typeDescr.getChildren();
- switch (typeDescr.getCategory()) {
- case BOOLEAN:
- type.setKind(OrcProto.Type.Kind.BOOLEAN);
- break;
- case BYTE:
- type.setKind(OrcProto.Type.Kind.BYTE);
- break;
- case SHORT:
- type.setKind(OrcProto.Type.Kind.SHORT);
- break;
- case INT:
- type.setKind(OrcProto.Type.Kind.INT);
- break;
- case LONG:
- type.setKind(OrcProto.Type.Kind.LONG);
- break;
- case FLOAT:
- type.setKind(OrcProto.Type.Kind.FLOAT);
- break;
- case DOUBLE:
- type.setKind(OrcProto.Type.Kind.DOUBLE);
- break;
- case STRING:
- type.setKind(OrcProto.Type.Kind.STRING);
- break;
- case CHAR:
- type.setKind(OrcProto.Type.Kind.CHAR);
- type.setMaximumLength(typeDescr.getMaxLength());
- break;
- case VARCHAR:
- type.setKind(OrcProto.Type.Kind.VARCHAR);
- type.setMaximumLength(typeDescr.getMaxLength());
- break;
- case BINARY:
- type.setKind(OrcProto.Type.Kind.BINARY);
- break;
- case TIMESTAMP:
- type.setKind(OrcProto.Type.Kind.TIMESTAMP);
- break;
- case DATE:
- type.setKind(OrcProto.Type.Kind.DATE);
- break;
- case DECIMAL:
- type.setKind(OrcProto.Type.Kind.DECIMAL);
- type.setPrecision(typeDescr.getPrecision());
- type.setScale(typeDescr.getScale());
- break;
- case LIST:
- type.setKind(OrcProto.Type.Kind.LIST);
- type.addSubtypes(children.get(0).getId());
- break;
- case MAP:
- type.setKind(OrcProto.Type.Kind.MAP);
- for(TypeDescription t: children) {
- type.addSubtypes(t.getId());
- }
- break;
- case STRUCT:
- type.setKind(OrcProto.Type.Kind.STRUCT);
- for(TypeDescription t: children) {
- type.addSubtypes(t.getId());
- }
- for(String field: typeDescr.getFieldNames()) {
- type.addFieldNames(field);
- }
- break;
- case UNION:
- type.setKind(OrcProto.Type.Kind.UNION);
- for(TypeDescription t: children) {
- type.addSubtypes(t.getId());
- }
- break;
- default:
- throw new IllegalArgumentException("Unknown category: " +
- typeDescr.getCategory());
- }
- result.add(type.build());
- if (children != null) {
- for(TypeDescription child: children) {
- appendOrcTypes(result, child);
- }
- }
- }
-
- /**
- * NOTE: This method ignores the subtype numbers in the TypeDescription and rebuilds the
- * subtype numbers based on the length of the result list being appended to.
- *
- * @param result
- * @param typeDescr
- */
- public static void appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
- TypeDescription typeDescr) {
-
- int subtype = result.size();
- OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
- boolean needsAdd = true;
- List<TypeDescription> children = typeDescr.getChildren();
- switch (typeDescr.getCategory()) {
- case BOOLEAN:
- type.setKind(OrcProto.Type.Kind.BOOLEAN);
- break;
- case BYTE:
- type.setKind(OrcProto.Type.Kind.BYTE);
- break;
- case SHORT:
- type.setKind(OrcProto.Type.Kind.SHORT);
- break;
- case INT:
- type.setKind(OrcProto.Type.Kind.INT);
- break;
- case LONG:
- type.setKind(OrcProto.Type.Kind.LONG);
- break;
- case FLOAT:
- type.setKind(OrcProto.Type.Kind.FLOAT);
- break;
- case DOUBLE:
- type.setKind(OrcProto.Type.Kind.DOUBLE);
- break;
- case STRING:
- type.setKind(OrcProto.Type.Kind.STRING);
- break;
- case CHAR:
- type.setKind(OrcProto.Type.Kind.CHAR);
- type.setMaximumLength(typeDescr.getMaxLength());
- break;
- case VARCHAR:
- type.setKind(OrcProto.Type.Kind.VARCHAR);
- type.setMaximumLength(typeDescr.getMaxLength());
- break;
- case BINARY:
- type.setKind(OrcProto.Type.Kind.BINARY);
- break;
- case TIMESTAMP:
- type.setKind(OrcProto.Type.Kind.TIMESTAMP);
- break;
- case DATE:
- type.setKind(OrcProto.Type.Kind.DATE);
- break;
- case DECIMAL:
- type.setKind(OrcProto.Type.Kind.DECIMAL);
- type.setPrecision(typeDescr.getPrecision());
- type.setScale(typeDescr.getScale());
- break;
- case LIST:
- type.setKind(OrcProto.Type.Kind.LIST);
- type.addSubtypes(++subtype);
- result.add(type.build());
- needsAdd = false;
- appendOrcTypesRebuildSubtypes(result, children.get(0));
- break;
- case MAP:
- {
- // Make room for MAP type.
- result.add(null);
-
- // Add MAP type pair in order to determine their subtype values.
- appendOrcTypesRebuildSubtypes(result, children.get(0));
- int subtype2 = result.size();
- appendOrcTypesRebuildSubtypes(result, children.get(1));
- type.setKind(OrcProto.Type.Kind.MAP);
- type.addSubtypes(subtype + 1);
- type.addSubtypes(subtype2);
- result.set(subtype, type.build());
- needsAdd = false;
- }
- break;
- case STRUCT:
- {
- List<String> fieldNames = typeDescr.getFieldNames();
-
- // Make room for STRUCT type.
- result.add(null);
-
- List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
- for(TypeDescription child: children) {
- int fieldSubtype = result.size();
- fieldSubtypes.add(fieldSubtype);
- appendOrcTypesRebuildSubtypes(result, child);
- }
-
- type.setKind(OrcProto.Type.Kind.STRUCT);
-
- for (int i = 0 ; i < fieldNames.size(); i++) {
- type.addSubtypes(fieldSubtypes.get(i));
- type.addFieldNames(fieldNames.get(i));
- }
- result.set(subtype, type.build());
- needsAdd = false;
- }
- break;
- case UNION:
- {
- // Make room for UNION type.
- result.add(null);
-
- List<Integer> unionSubtypes = new ArrayList<Integer>(children.size());
- for(TypeDescription child: children) {
- int unionSubtype = result.size();
- unionSubtypes.add(unionSubtype);
- appendOrcTypesRebuildSubtypes(result, child);
- }
-
- type.setKind(OrcProto.Type.Kind.UNION);
- for (int i = 0 ; i < children.size(); i++) {
- type.addSubtypes(unionSubtypes.get(i));
- }
- result.set(subtype, type.build());
- needsAdd = false;
- }
- break;
- default:
- throw new IllegalArgumentException("Unknown category: " + typeDescr.getCategory());
- }
- if (needsAdd) {
- result.add(type.build());
- }
- }
-
- /**
- * NOTE: This method ignores the subtype numbers in the OrcProto.Type and rebuilds the
- * subtype numbers based on the length of the result list being appended to.
- *
- * @param result
- * @param types
- * @param columnId
- */
- public static int appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
- List<OrcProto.Type> types, int columnId) {
-
- OrcProto.Type oldType = types.get(columnId++);
-
- int subtype = result.size();
- OrcProto.Type.Builder builder = OrcProto.Type.newBuilder();
- boolean needsAdd = true;
- switch (oldType.getKind()) {
- case BOOLEAN:
- builder.setKind(OrcProto.Type.Kind.BOOLEAN);
- break;
- case BYTE:
- builder.setKind(OrcProto.Type.Kind.BYTE);
- break;
- case SHORT:
- builder.setKind(OrcProto.Type.Kind.SHORT);
- break;
- case INT:
- builder.setKind(OrcProto.Type.Kind.INT);
- break;
- case LONG:
- builder.setKind(OrcProto.Type.Kind.LONG);
- break;
- case FLOAT:
- builder.setKind(OrcProto.Type.Kind.FLOAT);
- break;
- case DOUBLE:
- builder.setKind(OrcProto.Type.Kind.DOUBLE);
- break;
- case STRING:
- builder.setKind(OrcProto.Type.Kind.STRING);
- break;
- case CHAR:
- builder.setKind(OrcProto.Type.Kind.CHAR);
- builder.setMaximumLength(oldType.getMaximumLength());
- break;
- case VARCHAR:
- builder.setKind(OrcProto.Type.Kind.VARCHAR);
- builder.setMaximumLength(oldType.getMaximumLength());
- break;
- case BINARY:
- builder.setKind(OrcProto.Type.Kind.BINARY);
- break;
- case TIMESTAMP:
- builder.setKind(OrcProto.Type.Kind.TIMESTAMP);
- break;
- case DATE:
- builder.setKind(OrcProto.Type.Kind.DATE);
- break;
- case DECIMAL:
- builder.setKind(OrcProto.Type.Kind.DECIMAL);
- builder.setPrecision(oldType.getPrecision());
- builder.setScale(oldType.getScale());
- break;
- case LIST:
- builder.setKind(OrcProto.Type.Kind.LIST);
- builder.addSubtypes(++subtype);
- result.add(builder.build());
- needsAdd = false;
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- break;
- case MAP:
- {
- // Make room for MAP type.
- result.add(null);
-
- // Add MAP type pair in order to determine their subtype values.
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- int subtype2 = result.size();
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- builder.setKind(OrcProto.Type.Kind.MAP);
- builder.addSubtypes(subtype + 1);
- builder.addSubtypes(subtype2);
- result.set(subtype, builder.build());
- needsAdd = false;
- }
- break;
- case STRUCT:
- {
- List<String> fieldNames = oldType.getFieldNamesList();
-
- // Make room for STRUCT type.
- result.add(null);
-
- List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
- for(int i = 0 ; i < fieldNames.size(); i++) {
- int fieldSubtype = result.size();
- fieldSubtypes.add(fieldSubtype);
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- }
-
- builder.setKind(OrcProto.Type.Kind.STRUCT);
-
- for (int i = 0 ; i < fieldNames.size(); i++) {
- builder.addSubtypes(fieldSubtypes.get(i));
- builder.addFieldNames(fieldNames.get(i));
- }
- result.set(subtype, builder.build());
- needsAdd = false;
- }
- break;
- case UNION:
- {
- int subtypeCount = oldType.getSubtypesCount();
-
- // Make room for UNION type.
- result.add(null);
-
- List<Integer> unionSubtypes = new ArrayList<Integer>(subtypeCount);
- for(int i = 0 ; i < subtypeCount; i++) {
- int unionSubtype = result.size();
- unionSubtypes.add(unionSubtype);
- columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
- }
-
- builder.setKind(OrcProto.Type.Kind.UNION);
- for (int i = 0 ; i < subtypeCount; i++) {
- builder.addSubtypes(unionSubtypes.get(i));
- }
- result.set(subtype, builder.build());
- needsAdd = false;
- }
- break;
- default:
- throw new IllegalArgumentException("Unknown category: " + oldType.getKind());
- }
- if (needsAdd) {
- result.add(builder.build());
- }
- return columnId;
- }
-
- /**
- * Translate the given rootColumn from the list of types to a TypeDescription.
- * @param types all of the types
- * @param rootColumn translate this type
- * @return a new TypeDescription that matches the given rootColumn
- */
- public static
- TypeDescription convertTypeFromProtobuf(List<OrcProto.Type> types,
- int rootColumn) {
- OrcProto.Type type = types.get(rootColumn);
- switch (type.getKind()) {
- case BOOLEAN:
- return TypeDescription.createBoolean();
- case BYTE:
- return TypeDescription.createByte();
- case SHORT:
- return TypeDescription.createShort();
- case INT:
- return TypeDescription.createInt();
- case LONG:
- return TypeDescription.createLong();
- case FLOAT:
- return TypeDescription.createFloat();
- case DOUBLE:
- return TypeDescription.createDouble();
- case STRING:
- return TypeDescription.createString();
- case CHAR:
- case VARCHAR: {
- TypeDescription result = type.getKind() == OrcProto.Type.Kind.CHAR ?
- TypeDescription.createChar() : TypeDescription.createVarchar();
- if (type.hasMaximumLength()) {
- result.withMaxLength(type.getMaximumLength());
- }
- return result;
- }
- case BINARY:
- return TypeDescription.createBinary();
- case TIMESTAMP:
- return TypeDescription.createTimestamp();
- case DATE:
- return TypeDescription.createDate();
- case DECIMAL: {
- TypeDescription result = TypeDescription.createDecimal();
- if (type.hasScale()) {
- result.withScale(type.getScale());
- }
- if (type.hasPrecision()) {
- result.withPrecision(type.getPrecision());
- }
- return result;
- }
- case LIST:
- return TypeDescription.createList(
- convertTypeFromProtobuf(types, type.getSubtypes(0)));
- case MAP:
- return TypeDescription.createMap(
- convertTypeFromProtobuf(types, type.getSubtypes(0)),
- convertTypeFromProtobuf(types, type.getSubtypes(1)));
- case STRUCT: {
- TypeDescription result = TypeDescription.createStruct();
- for(int f=0; f < type.getSubtypesCount(); ++f) {
- result.addField(type.getFieldNames(f),
- convertTypeFromProtobuf(types, type.getSubtypes(f)));
- }
- return result;
- }
- case UNION: {
- TypeDescription result = TypeDescription.createUnion();
- for(int f=0; f < type.getSubtypesCount(); ++f) {
- result.addUnionChild(
- convertTypeFromProtobuf(types, type.getSubtypes(f)));
- }
- return result;
- }
- }
- throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
- }
-
- public static List<StripeInformation> convertProtoStripesToStripes(
- List<OrcProto.StripeInformation> stripes) {
- List<StripeInformation> result = new ArrayList<StripeInformation>(stripes.size());
- for (OrcProto.StripeInformation info : stripes) {
- result.add(new ReaderImpl.StripeInformationImpl(info));
- }
- return result;
- }
-
- public static List<TypeDescription> setTypeBuilderFromSchema(
- OrcProto.Type.Builder type, TypeDescription schema) {
- List<TypeDescription> children = schema.getChildren();
- switch (schema.getCategory()) {
- case BOOLEAN:
- type.setKind(OrcProto.Type.Kind.BOOLEAN);
- break;
- case BYTE:
- type.setKind(OrcProto.Type.Kind.BYTE);
- break;
- case SHORT:
- type.setKind(OrcProto.Type.Kind.SHORT);
- break;
- case INT:
- type.setKind(OrcProto.Type.Kind.INT);
- break;
- case LONG:
- type.setKind(OrcProto.Type.Kind.LONG);
- break;
- case FLOAT:
- type.setKind(OrcProto.Type.Kind.FLOAT);
- break;
- case DOUBLE:
- type.setKind(OrcProto.Type.Kind.DOUBLE);
- break;
- case STRING:
- type.setKind(OrcProto.Type.Kind.STRING);
- break;
- case CHAR:
- type.setKind(OrcProto.Type.Kind.CHAR);
- type.setMaximumLength(schema.getMaxLength());
- break;
- case VARCHAR:
- type.setKind(OrcProto.Type.Kind.VARCHAR);
- type.setMaximumLength(schema.getMaxLength());
- break;
- case BINARY:
- type.setKind(OrcProto.Type.Kind.BINARY);
- break;
- case TIMESTAMP:
- type.setKind(OrcProto.Type.Kind.TIMESTAMP);
- break;
- case DATE:
- type.setKind(OrcProto.Type.Kind.DATE);
- break;
- case DECIMAL:
- type.setKind(OrcProto.Type.Kind.DECIMAL);
- type.setPrecision(schema.getPrecision());
- type.setScale(schema.getScale());
- break;
- case LIST:
- type.setKind(OrcProto.Type.Kind.LIST);
- type.addSubtypes(children.get(0).getId());
- break;
- case MAP:
- type.setKind(OrcProto.Type.Kind.MAP);
- for(TypeDescription t: children) {
- type.addSubtypes(t.getId());
- }
- break;
- case STRUCT:
- type.setKind(OrcProto.Type.Kind.STRUCT);
- for(TypeDescription t: children) {
- type.addSubtypes(t.getId());
- }
- for(String field: schema.getFieldNames()) {
- type.addFieldNames(field);
- }
- break;
- case UNION:
- type.setKind(OrcProto.Type.Kind.UNION);
- for(TypeDescription t: children) {
- type.addSubtypes(t.getId());
- }
- break;
- default:
- throw new IllegalArgumentException("Unknown category: " +
- schema.getCategory());
- }
- return children;
- }
-}
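A minimal sketch of the column flattening performed by OrcUtils.includeColumns, matching the example in its javadoc; the schema uses only the TypeDescription factory methods referenced elsewhere in this file.

import java.util.Arrays;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;

public class IncludeColumnsSketch {
  public static void main(String[] args) {
    // Columns a:int, b:string, c:array<string>, d:int; ids are assigned in a pre-order walk.
    TypeDescription schema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", TypeDescription.createString())
        .addField("c", TypeDescription.createList(TypeDescription.createString()))
        .addField("d", TypeDescription.createInt());
    boolean[] include = OrcUtils.includeColumns("a,b,c", schema);
    // Expected: [false, true, true, true, true, false]
    //            root   a     b     c     c.elem d
    System.out.println(Arrays.toString(include));
  }
}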
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/Reader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/Reader.java b/orc/src/java/org/apache/orc/Reader.java
deleted file mode 100644
index c2d5235..0000000
--- a/orc/src/java/org/apache/orc/Reader.java
+++ /dev/null
@@ -1,375 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
-
-/**
- * The interface for reading ORC files.
- *
- * One Reader can support multiple concurrent RecordReaders.
- */
-public interface Reader {
-
- /**
- * Get the number of rows in the file.
- * @return the number of rows
- */
- long getNumberOfRows();
-
- /**
- * Get the deserialized data size of the file
- * @return raw data size
- */
- long getRawDataSize();
-
- /**
- * Get the deserialized data size of the specified columns
- * @param colNames
- * @return raw data size of columns
- */
- long getRawDataSizeOfColumns(List<String> colNames);
-
- /**
- * Get the deserialized data size of the specified columns ids
- * @param colIds - internal column id (check orcfiledump for column ids)
- * @return raw data size of columns
- */
- long getRawDataSizeFromColIndices(List<Integer> colIds);
-
- /**
- * Get the user metadata keys.
- * @return the set of metadata keys
- */
- List<String> getMetadataKeys();
-
- /**
- * Get a user metadata value.
- * @param key a key given by the user
- * @return the bytes associated with the given key
- */
- ByteBuffer getMetadataValue(String key);
-
- /**
- * Did the user set the given metadata value.
- * @param key the key to check
- * @return true if the metadata value was set
- */
- boolean hasMetadataValue(String key);
-
- /**
- * Get the compression kind.
- * @return the kind of compression in the file
- */
- CompressionKind getCompressionKind();
-
- /**
- * Get the buffer size for the compression.
- * @return number of bytes to buffer for the compression codec.
- */
- int getCompressionSize();
-
- /**
- * Get the number of rows per entry in the row index.
- * @return the number of rows per entry in the row index, or 0 if there
- * is no row index.
- */
- int getRowIndexStride();
-
- /**
- * Get the list of stripes.
- * @return the information about the stripes in order
- */
- List<StripeInformation> getStripes();
-
- /**
- * Get the length of the file.
- * @return the number of bytes in the file
- */
- long getContentLength();
-
- /**
- * Get the statistics about the columns in the file.
- * @return the information about the column
- */
- ColumnStatistics[] getStatistics();
-
- /**
- * Get the type of rows in this ORC file.
- */
- TypeDescription getSchema();
-
- /**
- * Get the list of types contained in the file. The root type is the first
- * type in the list.
- * @return the list of flattened types
- * @deprecated use getSchema instead
- */
- List<OrcProto.Type> getTypes();
-
- /**
- * Get the file format version.
- */
- OrcFile.Version getFileVersion();
-
- /**
- * Get the version of the writer of this file.
- */
- OrcFile.WriterVersion getWriterVersion();
-
- /**
- * Get the file tail (footer + postscript)
- *
- * @return - file tail
- */
- OrcProto.FileTail getFileTail();
-
- /**
- * Options for creating a RecordReader.
- */
- public static class Options {
- private boolean[] include;
- private long offset = 0;
- private long length = Long.MAX_VALUE;
- private SearchArgument sarg = null;
- private String[] columnNames = null;
- private Boolean useZeroCopy = null;
- private Boolean skipCorruptRecords = null;
- private TypeDescription schema = null;
- private DataReader dataReader = null;
-
- /**
- * Set the list of columns to read.
- * @param include a list of columns to read
- * @return this
- */
- public Options include(boolean[] include) {
- this.include = include;
- return this;
- }
-
- /**
- * Set the range of bytes to read
- * @param offset the starting byte offset
- * @param length the number of bytes to read
- * @return this
- */
- public Options range(long offset, long length) {
- this.offset = offset;
- this.length = length;
- return this;
- }
-
- /**
- * Set the schema on read type description.
- */
- public Options schema(TypeDescription schema) {
- this.schema = schema;
- return this;
- }
-
- /**
- * Set search argument for predicate push down.
- * @param sarg the search argument
- * @param columnNames the column names for
- * @return this
- */
- public Options searchArgument(SearchArgument sarg, String[] columnNames) {
- this.sarg = sarg;
- this.columnNames = columnNames;
- return this;
- }
-
- /**
- * Set whether to use zero copy from HDFS.
- * @param value the new zero copy flag
- * @return this
- */
- public Options useZeroCopy(boolean value) {
- this.useZeroCopy = value;
- return this;
- }
-
- public Options dataReader(DataReader value) {
- this.dataReader = value;
- return this;
- }
-
- /**
- * Set whether to skip corrupt records.
- * @param value the new skip corrupt records flag
- * @return this
- */
- public Options skipCorruptRecords(boolean value) {
- this.skipCorruptRecords = value;
- return this;
- }
-
- public boolean[] getInclude() {
- return include;
- }
-
- public long getOffset() {
- return offset;
- }
-
- public long getLength() {
- return length;
- }
-
- public TypeDescription getSchema() {
- return schema;
- }
-
- public SearchArgument getSearchArgument() {
- return sarg;
- }
-
- public String[] getColumnNames() {
- return columnNames;
- }
-
- public long getMaxOffset() {
- long result = offset + length;
- if (result < 0) {
- result = Long.MAX_VALUE;
- }
- return result;
- }
-
- public Boolean getUseZeroCopy() {
- return useZeroCopy;
- }
-
- public Boolean getSkipCorruptRecords() {
- return skipCorruptRecords;
- }
-
- public DataReader getDataReader() {
- return dataReader;
- }
-
- public Options clone() {
- Options result = new Options();
- result.include = include;
- result.offset = offset;
- result.length = length;
- result.sarg = sarg;
- result.schema = schema;
- result.columnNames = columnNames;
- result.useZeroCopy = useZeroCopy;
- result.skipCorruptRecords = skipCorruptRecords;
- result.dataReader = dataReader == null ? null : dataReader.clone();
- return result;
- }
-
- @Override
- public String toString() {
- StringBuilder buffer = new StringBuilder();
- buffer.append("{include: ");
- if (include == null) {
- buffer.append("null");
- } else {
- buffer.append("[");
- for(int i=0; i < include.length; ++i) {
- if (i != 0) {
- buffer.append(", ");
- }
- buffer.append(include[i]);
- }
- buffer.append("]");
- }
- buffer.append(", offset: ");
- buffer.append(offset);
- buffer.append(", length: ");
- buffer.append(length);
- if (sarg != null) {
- buffer.append(", sarg: ");
- buffer.append(sarg.toString());
- buffer.append(", columns: [");
- for(int i=0; i < columnNames.length; ++i) {
- if (i != 0) {
- buffer.append(", ");
- }
- buffer.append("'");
- buffer.append(columnNames[i]);
- buffer.append("'");
- }
- buffer.append("]");
- }
- if (schema != null) {
- buffer.append(", schema: ");
- schema.printToBuffer(buffer);
- }
- buffer.append("}");
- return buffer.toString();
- }
- }
-
- /**
- * Create a RecordReader that reads everything with the default options.
- * @return a new RecordReader
- * @throws IOException
- */
- RecordReader rows() throws IOException;
-
- /**
- * Create a RecordReader that uses the options given.
- * This method can't be named rows, because many callers used rows(null)
- * before the rows() method was introduced.
- * @param options the options to read with
- * @return a new RecordReader
- * @throws IOException
- */
- RecordReader rows(Options options) throws IOException;
-
- /**
- * @return List of integers representing version of the file, in order from major to minor.
- */
- List<Integer> getVersionList();
-
- /**
- * @return Gets the size of metadata, in bytes.
- */
- int getMetadataSize();
-
- /**
- * @return Stripe statistics, in original protobuf form.
- */
- List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics();
-
- /**
- * @return Stripe statistics.
- */
- List<StripeStatistics> getStripeStatistics() throws IOException;
-
- /**
- * @return File statistics, in original protobuf form.
- */
- List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics();
-
- /**
- * @return Serialized file metadata read from disk for the purposes of caching, etc.
- */
- ByteBuffer getSerializedFileFooter();
-}
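A minimal sketch of opening a Reader and creating a RecordReader with the Options builder above; the file path and column names are hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcUtils;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class ReaderOptionsSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));
    // Build the include array from the file's own schema.
    boolean[] include = OrcUtils.includeColumns("id,name", reader.getSchema());
    RecordReader rows = reader.rows(new Reader.Options()
        .include(include)
        .range(0, Long.MAX_VALUE));   // whole file; a split would pass its offset and length
    rows.close();
  }
}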
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/RecordReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/RecordReader.java b/orc/src/java/org/apache/orc/RecordReader.java
deleted file mode 100644
index 09ba0f0..0000000
--- a/orc/src/java/org/apache/orc/RecordReader.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import java.io.IOException;
-
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-
-/**
- * A row-by-row iterator for ORC files.
- */
-public interface RecordReader {
- /**
- * Read the next row batch. The size of the batch to read cannot be
- * controlled by the callers. Callers need to look at
- * VectorizedRowBatch.size of the returned object to know the batch
- * size read.
- * @param batch a row batch object to read into
- * @return were more rows available to read?
- * @throws java.io.IOException
- */
- boolean nextBatch(VectorizedRowBatch batch) throws IOException;
-
- /**
- * Get the row number of the first row that will be returned by the
- * following call to nextBatch().
- * @return the row number from 0 to the number of rows in the file
- * @throws java.io.IOException
- */
- long getRowNumber() throws IOException;
-
- /**
- * Get the progress of the reader through the rows.
- * @return a fraction between 0.0 and 1.0 of rows read
- * @throws java.io.IOException
- */
- float getProgress() throws IOException;
-
- /**
- * Release the resources associated with the given reader.
- * @throws java.io.IOException
- */
- void close() throws IOException;
-
- /**
- * Seek to a particular row number.
- */
- void seekToRow(long rowCount) throws IOException;
-}
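A minimal sketch of the nextBatch loop described above; TypeDescription.createRowBatch() is assumed here, as it is not part of this diff.

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class BatchLoopSketch {
  // Counts the rows in a file by reading it batch by batch.
  public static long countRows(Reader reader) throws IOException {
    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    long total = 0;
    while (rows.nextBatch(batch)) {
      total += batch.size;   // size is the number of rows actually filled in
    }
    rows.close();
    return total;
  }
}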
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/StringColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/StringColumnStatistics.java b/orc/src/java/org/apache/orc/StringColumnStatistics.java
deleted file mode 100644
index 5a868d0..0000000
--- a/orc/src/java/org/apache/orc/StringColumnStatistics.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import org.apache.orc.ColumnStatistics;
-
-/**
- * Statistics for string columns.
- */
-public interface StringColumnStatistics extends ColumnStatistics {
- /**
- * Get the minimum string.
- * @return the minimum
- */
- String getMinimum();
-
- /**
- * Get the maximum string.
- * @return the maximum
- */
- String getMaximum();
-
- /**
- * Get the total length of all strings
- * @return the sum (total length)
- */
- long getSum();
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/StripeInformation.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/StripeInformation.java b/orc/src/java/org/apache/orc/StripeInformation.java
deleted file mode 100644
index 38f7eba..0000000
--- a/orc/src/java/org/apache/orc/StripeInformation.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-/**
- * Information about the stripes in an ORC file that is provided by the Reader.
- */
-public interface StripeInformation {
- /**
- * Get the byte offset of the start of the stripe.
- * @return the bytes from the start of the file
- */
- long getOffset();
-
- /**
- * Get the total length of the stripe in bytes.
- * @return the number of bytes in the stripe
- */
- long getLength();
-
- /**
- * Get the length of the stripe's indexes.
- * @return the number of bytes in the index
- */
- long getIndexLength();
-
- /**
- * Get the length of the stripe's data.
- * @return the number of bytes in the stripe
- */
- long getDataLength();
-
- /**
- * Get the length of the stripe's tail (footer) section.
- * @return the number of bytes in the tail
- */
- long getFooterLength();
-
- /**
- * Get the number of rows in the stripe.
- * @return a count of the number of rows
- */
- long getNumberOfRows();
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/StripeStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/StripeStatistics.java b/orc/src/java/org/apache/orc/StripeStatistics.java
deleted file mode 100644
index 8fc91cb..0000000
--- a/orc/src/java/org/apache/orc/StripeStatistics.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import org.apache.orc.impl.ColumnStatisticsImpl;
-
-import java.util.List;
-
-public class StripeStatistics {
- private final List<OrcProto.ColumnStatistics> cs;
-
- public StripeStatistics(List<OrcProto.ColumnStatistics> list) {
- this.cs = list;
- }
-
- /**
- * Return list of column statistics
- *
- * @return column stats
- */
- public ColumnStatistics[] getColumnStatistics() {
- ColumnStatistics[] result = new ColumnStatistics[cs.size()];
- for (int i = 0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(cs.get(i));
- }
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/TimestampColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/TimestampColumnStatistics.java b/orc/src/java/org/apache/orc/TimestampColumnStatistics.java
deleted file mode 100644
index 27dc49f..0000000
--- a/orc/src/java/org/apache/orc/TimestampColumnStatistics.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import java.sql.Timestamp;
-
-/**
- * Statistics for Timestamp columns.
- */
-public interface TimestampColumnStatistics extends ColumnStatistics {
- /**
- * Get the minimum value for the column.
- * @return minimum value
- */
- Timestamp getMinimum();
-
- /**
- * Get the maximum value for the column.
- * @return maximum value
- */
- Timestamp getMaximum();
-}
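A minimal sketch of how the column statistics interfaces above are consumed: Reader.getStatistics() returns one ColumnStatistics per flattened column id, and an entry is cast to a narrower interface such as StringColumnStatistics when the column type matches; the helper below is hypothetical.

import org.apache.orc.ColumnStatistics;
import org.apache.orc.Reader;
import org.apache.orc.StringColumnStatistics;

public class ColumnStatsSketch {
  // Prints min/max/total length for a string column, identified by its flattened column id.
  public static void printStringStats(Reader reader, int columnId) {
    ColumnStatistics[] stats = reader.getStatistics();
    if (stats[columnId] instanceof StringColumnStatistics) {
      StringColumnStatistics s = (StringColumnStatistics) stats[columnId];
      System.out.println(s.getMinimum() + " .. " + s.getMaximum()
          + " (total length " + s.getSum() + ")");
    }
  }
}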
[28/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/RunLengthByteWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/RunLengthByteWriter.java b/orc/src/java/org/apache/hive/orc/impl/RunLengthByteWriter.java
new file mode 100644
index 0000000..b94b6a9
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/RunLengthByteWriter.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+
+/**
+ * A writer for a sequence of bytes. A control byte is written before
+ * each run, with positive values 0 to 127 meaning 3 to 130 repetitions. If the
+ * control byte is -1 to -128, 1 to 128 literal byte values follow.
+ */
+public class RunLengthByteWriter {
+ static final int MIN_REPEAT_SIZE = 3;
+ static final int MAX_LITERAL_SIZE = 128;
+ static final int MAX_REPEAT_SIZE= 127 + MIN_REPEAT_SIZE;
+ private final PositionedOutputStream output;
+ private final byte[] literals = new byte[MAX_LITERAL_SIZE];
+ private int numLiterals = 0;
+ private boolean repeat = false;
+ private int tailRunLength = 0;
+
+ public RunLengthByteWriter(PositionedOutputStream output) {
+ this.output = output;
+ }
+
+ private void writeValues() throws IOException {
+ if (numLiterals != 0) {
+ if (repeat) {
+ output.write(numLiterals - MIN_REPEAT_SIZE);
+ output.write(literals, 0, 1);
+ } else {
+ output.write(-numLiterals);
+ output.write(literals, 0, numLiterals);
+ }
+ repeat = false;
+ tailRunLength = 0;
+ numLiterals = 0;
+ }
+ }
+
+ public void flush() throws IOException {
+ writeValues();
+ output.flush();
+ }
+
+ public void write(byte value) throws IOException {
+ if (numLiterals == 0) {
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ } else if (repeat) {
+ if (value == literals[0]) {
+ numLiterals += 1;
+ if (numLiterals == MAX_REPEAT_SIZE) {
+ writeValues();
+ }
+ } else {
+ writeValues();
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ }
+ } else {
+ if (value == literals[numLiterals - 1]) {
+ tailRunLength += 1;
+ } else {
+ tailRunLength = 1;
+ }
+ if (tailRunLength == MIN_REPEAT_SIZE) {
+ if (numLiterals + 1 == MIN_REPEAT_SIZE) {
+ repeat = true;
+ numLiterals += 1;
+ } else {
+ numLiterals -= MIN_REPEAT_SIZE - 1;
+ writeValues();
+ literals[0] = value;
+ repeat = true;
+ numLiterals = MIN_REPEAT_SIZE;
+ }
+ } else {
+ literals[numLiterals++] = value;
+ if (numLiterals == MAX_LITERAL_SIZE) {
+ writeValues();
+ }
+ }
+ }
+ }
+
+ public void getPosition(PositionRecorder recorder) throws IOException {
+ output.getPosition(recorder);
+ recorder.addPosition(numLiterals);
+ }
+}
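A simplified, self-contained sketch of the control-byte scheme implemented above (it does not use the ORC classes themselves): runs of 3 to 130 identical bytes become (runLength - 3) followed by the value, and up to 128 literals become (-count) followed by the bytes.

import java.io.ByteArrayOutputStream;

public class ByteRleSketch {
  public static byte[] encode(byte[] input) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int i = 0;
    while (i < input.length) {
      // measure the run starting at i, capped at the maximum repeat size of 130
      int run = 1;
      while (i + run < input.length && run < 130 && input[i + run] == input[i]) {
        run++;
      }
      if (run >= 3) {
        out.write(run - 3);          // control byte 0..127
        out.write(input[i]);
        i += run;
      } else {
        // collect literals until a run of 3 starts or 128 literals are buffered
        int start = i;
        int count = 0;
        while (i < input.length && count < 128) {
          if (i + 2 < input.length && input[i] == input[i + 1] && input[i] == input[i + 2]) {
            break;
          }
          i++;
          count++;
        }
        out.write(-count);           // control byte -1..-128, stored as a signed byte
        out.write(input, start, count);
      }
    }
    return out.toByteArray();
  }
}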
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerReader.java b/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerReader.java
new file mode 100644
index 0000000..5b613f6
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerReader.java
@@ -0,0 +1,173 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+
+/**
+ * A reader that reads a sequence of integers.
+ */
+public class RunLengthIntegerReader implements IntegerReader {
+ private InStream input;
+ private final boolean signed;
+ private final long[] literals =
+ new long[RunLengthIntegerWriter.MAX_LITERAL_SIZE];
+ private int numLiterals = 0;
+ private int delta = 0;
+ private int used = 0;
+ private boolean repeat = false;
+ private SerializationUtils utils;
+
+ public RunLengthIntegerReader(InStream input, boolean signed) throws IOException {
+ this.input = input;
+ this.signed = signed;
+ this.utils = new SerializationUtils();
+ }
+
+ private void readValues(boolean ignoreEof) throws IOException {
+ int control = input.read();
+ if (control == -1) {
+ if (!ignoreEof) {
+ throw new EOFException("Read past end of RLE integer from " + input);
+ }
+ used = numLiterals = 0;
+ return;
+ } else if (control < 0x80) {
+ numLiterals = control + RunLengthIntegerWriter.MIN_REPEAT_SIZE;
+ used = 0;
+ repeat = true;
+ delta = input.read();
+ if (delta == -1) {
+ throw new EOFException("End of stream in RLE Integer from " + input);
+ }
+ // convert from 0 to 255 to -128 to 127 by converting to a signed byte
+ delta = (byte) (0 + delta);
+ if (signed) {
+ literals[0] = utils.readVslong(input);
+ } else {
+ literals[0] = utils.readVulong(input);
+ }
+ } else {
+ repeat = false;
+ numLiterals = 0x100 - control;
+ used = 0;
+ for(int i=0; i < numLiterals; ++i) {
+ if (signed) {
+ literals[i] = utils.readVslong(input);
+ } else {
+ literals[i] = utils.readVulong(input);
+ }
+ }
+ }
+ }
+
+ @Override
+ public boolean hasNext() throws IOException {
+ return used != numLiterals || input.available() > 0;
+ }
+
+ @Override
+ public long next() throws IOException {
+ long result;
+ if (used == numLiterals) {
+ readValues(false);
+ }
+ if (repeat) {
+ result = literals[0] + (used++) * delta;
+ } else {
+ result = literals[used++];
+ }
+ return result;
+ }
+
+ @Override
+ public void nextVector(ColumnVector previous,
+ long[] data,
+ int previousLen) throws IOException {
+ previous.isRepeating = true;
+ for (int i = 0; i < previousLen; i++) {
+ if (!previous.isNull[i]) {
+ data[i] = next();
+ } else {
+ // The default value of null for int type in vectorized
+ // processing is 1, so set that if the value is null
+ data[i] = 1;
+ }
+
+ // The default value for nulls in Vectorization for int types is 1
+ // and given that non null value can also be 1, we need to check for isNull also
+ // when determining the isRepeating flag.
+ if (previous.isRepeating
+ && i > 0
+ && (data[0] != data[i] || previous.isNull[0] != previous.isNull[i])) {
+ previous.isRepeating = false;
+ }
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector vector,
+ int[] data,
+ int size) throws IOException {
+ if (vector.noNulls) {
+ for(int r=0; r < data.length && r < size; ++r) {
+ data[r] = (int) next();
+ }
+ } else if (!(vector.isRepeating && vector.isNull[0])) {
+ for(int r=0; r < data.length && r < size; ++r) {
+ if (!vector.isNull[r]) {
+ data[r] = (int) next();
+ } else {
+ data[r] = 1;
+ }
+ }
+ }
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ input.seek(index);
+ int consumed = (int) index.getNext();
+ if (consumed != 0) {
+ // a loop is required for cases where we break the run into two parts
+ while (consumed > 0) {
+ readValues(false);
+ used = consumed;
+ consumed -= numLiterals;
+ }
+ } else {
+ used = 0;
+ numLiterals = 0;
+ }
+ }
+
+ @Override
+ public void skip(long numValues) throws IOException {
+ while (numValues > 0) {
+ if (used == numLiterals) {
+ readValues(false);
+ }
+ long consume = Math.min(numValues, numLiterals - used);
+ used += consume;
+ numValues -= consume;
+ }
+ }
+}
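A simplified, self-contained sketch of the version-1 integer RLE layout consumed by the reader above: a control byte below 0x80 introduces a run of (control + 3) values generated from a base varint plus a signed single-byte delta, otherwise (0x100 - control) literal varints follow. Only unsigned varints are handled here; the real reader also supports zigzag-encoded signed values via readVslong.

import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

public class IntRleV1Sketch {
  // Unsigned base-128 varint, matching SerializationUtils.readVulong.
  static long readVulong(InputStream in) throws IOException {
    long result = 0;
    int offset = 0;
    int b;
    do {
      b = in.read();
      if (b == -1) {
        throw new EOFException("truncated varint");
      }
      result |= (long) (b & 0x7f) << offset;
      offset += 7;
    } while (b >= 0x80);
    return result;
  }

  public static List<Long> decode(byte[] encoded) throws IOException {
    InputStream in = new ByteArrayInputStream(encoded);
    List<Long> values = new ArrayList<>();
    int control;
    while ((control = in.read()) != -1) {
      if (control < 0x80) {
        int run = control + 3;                 // MIN_REPEAT_SIZE is 3
        int delta = (byte) in.read();          // signed single-byte delta
        long base = readVulong(in);
        for (int i = 0; i < run; i++) {
          values.add(base + (long) i * delta);
        }
      } else {
        int literals = 0x100 - control;
        for (int i = 0; i < literals; i++) {
          values.add(readVulong(in));
        }
      }
    }
    return values;
  }
}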
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerReaderV2.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerReaderV2.java b/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerReaderV2.java
new file mode 100644
index 0000000..d0c2b54
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerReaderV2.java
@@ -0,0 +1,406 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A reader that reads a sequence of lightweight compressed integers. Refer to
+ * {@link RunLengthIntegerWriterV2} for a description of the various lightweight
+ * compression techniques.
+ */
+public class RunLengthIntegerReaderV2 implements IntegerReader {
+ public static final Logger LOG = LoggerFactory.getLogger(RunLengthIntegerReaderV2.class);
+
+ private InStream input;
+ private final boolean signed;
+ private final long[] literals = new long[RunLengthIntegerWriterV2.MAX_SCOPE];
+ private boolean isRepeating = false;
+ private int numLiterals = 0;
+ private int used = 0;
+ private final boolean skipCorrupt;
+ private final SerializationUtils utils;
+ private RunLengthIntegerWriterV2.EncodingType currentEncoding;
+
+ public RunLengthIntegerReaderV2(InStream input, boolean signed,
+ boolean skipCorrupt) throws IOException {
+ this.input = input;
+ this.signed = signed;
+ this.skipCorrupt = skipCorrupt;
+ this.utils = new SerializationUtils();
+ }
+
+ private final static RunLengthIntegerWriterV2.EncodingType[] encodings = RunLengthIntegerWriterV2.EncodingType.values();
+ private void readValues(boolean ignoreEof) throws IOException {
+ // read the first 2 bits and determine the encoding type
+ isRepeating = false;
+ int firstByte = input.read();
+ if (firstByte < 0) {
+ if (!ignoreEof) {
+ throw new EOFException("Read past end of RLE integer from " + input);
+ }
+ used = numLiterals = 0;
+ return;
+ }
+ currentEncoding = encodings[(firstByte >>> 6) & 0x03];
+ switch (currentEncoding) {
+ case SHORT_REPEAT: readShortRepeatValues(firstByte); break;
+ case DIRECT: readDirectValues(firstByte); break;
+ case PATCHED_BASE: readPatchedBaseValues(firstByte); break;
+ case DELTA: readDeltaValues(firstByte); break;
+ default: throw new IOException("Unknown encoding " + currentEncoding);
+ }
+ }
+
+ private void readDeltaValues(int firstByte) throws IOException {
+
+ // extract the number of fixed bits
+ int fb = (firstByte >>> 1) & 0x1f;
+ if (fb != 0) {
+ fb = utils.decodeBitWidth(fb);
+ }
+
+ // extract the blob run length
+ int len = (firstByte & 0x01) << 8;
+ len |= input.read();
+
+ // read the first value stored as vint
+ long firstVal = 0;
+ if (signed) {
+ firstVal = utils.readVslong(input);
+ } else {
+ firstVal = utils.readVulong(input);
+ }
+
+ // store first value to result buffer
+ long prevVal = firstVal;
+ literals[numLiterals++] = firstVal;
+
+ // if fixed bits is 0 then all values have fixed delta
+ if (fb == 0) {
+      // read the fixed delta value stored as vint (deltas can be negative even
+      // if all numbers are positive)
+ long fd = utils.readVslong(input);
+ if (fd == 0) {
+ isRepeating = true;
+ assert numLiterals == 1;
+ Arrays.fill(literals, numLiterals, numLiterals + len, literals[0]);
+ numLiterals += len;
+ } else {
+ // add fixed deltas to adjacent values
+ for(int i = 0; i < len; i++) {
+ literals[numLiterals++] = literals[numLiterals - 2] + fd;
+ }
+ }
+ } else {
+ long deltaBase = utils.readVslong(input);
+ // add delta base and first value
+ literals[numLiterals++] = firstVal + deltaBase;
+ prevVal = literals[numLiterals - 1];
+ len -= 1;
+
+ // write the unpacked values, add it to previous value and store final
+ // value to result buffer. if the delta base value is negative then it
+ // is a decreasing sequence else an increasing sequence
+ utils.readInts(literals, numLiterals, len, fb, input);
+ while (len > 0) {
+ if (deltaBase < 0) {
+ literals[numLiterals] = prevVal - literals[numLiterals];
+ } else {
+ literals[numLiterals] = prevVal + literals[numLiterals];
+ }
+ prevVal = literals[numLiterals];
+ len--;
+ numLiterals++;
+ }
+ }
+ }
+
+ private void readPatchedBaseValues(int firstByte) throws IOException {
+
+ // extract the number of fixed bits
+ int fbo = (firstByte >>> 1) & 0x1f;
+ int fb = utils.decodeBitWidth(fbo);
+
+ // extract the run length of data blob
+ int len = (firstByte & 0x01) << 8;
+ len |= input.read();
+ // runs are always one off
+ len += 1;
+
+ // extract the number of bytes occupied by base
+ int thirdByte = input.read();
+ int bw = (thirdByte >>> 5) & 0x07;
+ // base width is one off
+ bw += 1;
+
+ // extract patch width
+ int pwo = thirdByte & 0x1f;
+ int pw = utils.decodeBitWidth(pwo);
+
+ // read fourth byte and extract patch gap width
+ int fourthByte = input.read();
+ int pgw = (fourthByte >>> 5) & 0x07;
+ // patch gap width is one off
+ pgw += 1;
+
+ // extract the length of the patch list
+ int pl = fourthByte & 0x1f;
+
+ // read the next base width number of bytes to extract base value
+ long base = utils.bytesToLongBE(input, bw);
+ long mask = (1L << ((bw * 8) - 1));
+ // if MSB of base value is 1 then base is negative value else positive
+ if ((base & mask) != 0) {
+ base = base & ~mask;
+ base = -base;
+ }
+
+ // unpack the data blob
+ long[] unpacked = new long[len];
+ utils.readInts(unpacked, 0, len, fb, input);
+
+ // unpack the patch blob
+ long[] unpackedPatch = new long[pl];
+
+ if ((pw + pgw) > 64 && !skipCorrupt) {
+ throw new IOException("Corruption in ORC data encountered. To skip" +
+ " reading corrupted data, set hive.exec.orc.skip.corrupt.data to" +
+ " true");
+ }
+ int bitSize = utils.getClosestFixedBits(pw + pgw);
+ utils.readInts(unpackedPatch, 0, pl, bitSize, input);
+
+ // apply the patch directly when decoding the packed data
+ int patchIdx = 0;
+ long currGap = 0;
+ long currPatch = 0;
+ long patchMask = ((1L << pw) - 1);
+ currGap = unpackedPatch[patchIdx] >>> pw;
+ currPatch = unpackedPatch[patchIdx] & patchMask;
+ long actualGap = 0;
+
+ // special case: gap is >255 then patch value will be 0.
+ // if gap is <=255 then patch value cannot be 0
+ while (currGap == 255 && currPatch == 0) {
+ actualGap += 255;
+ patchIdx++;
+ currGap = unpackedPatch[patchIdx] >>> pw;
+ currPatch = unpackedPatch[patchIdx] & patchMask;
+ }
+ // add the left over gap
+ actualGap += currGap;
+
+ // unpack data blob, patch it (if required), add base to get final result
+ for(int i = 0; i < unpacked.length; i++) {
+ if (i == actualGap) {
+ // extract the patch value
+ long patchedVal = unpacked[i] | (currPatch << fb);
+
+ // add base to patched value
+ literals[numLiterals++] = base + patchedVal;
+
+ // increment the patch to point to next entry in patch list
+ patchIdx++;
+
+ if (patchIdx < pl) {
+ // read the next gap and patch
+ currGap = unpackedPatch[patchIdx] >>> pw;
+ currPatch = unpackedPatch[patchIdx] & patchMask;
+ actualGap = 0;
+
+ // special case: gap is >255 then patch will be 0. if gap is
+ // <=255 then patch cannot be 0
+ while (currGap == 255 && currPatch == 0) {
+ actualGap += 255;
+ patchIdx++;
+ currGap = unpackedPatch[patchIdx] >>> pw;
+ currPatch = unpackedPatch[patchIdx] & patchMask;
+ }
+ // add the left over gap
+ actualGap += currGap;
+
+ // next gap is relative to the current gap
+ actualGap += i;
+ }
+ } else {
+ // no patching required. add base to unpacked value to get final value
+ literals[numLiterals++] = base + unpacked[i];
+ }
+ }
+
+ }
+
+ private void readDirectValues(int firstByte) throws IOException {
+
+ // extract the number of fixed bits
+ int fbo = (firstByte >>> 1) & 0x1f;
+ int fb = utils.decodeBitWidth(fbo);
+
+ // extract the run length
+ int len = (firstByte & 0x01) << 8;
+ len |= input.read();
+ // runs are one off
+ len += 1;
+
+ // write the unpacked values and zigzag decode to result buffer
+ utils.readInts(literals, numLiterals, len, fb, input);
+ if (signed) {
+ for(int i = 0; i < len; i++) {
+ literals[numLiterals] = utils.zigzagDecode(literals[numLiterals]);
+ numLiterals++;
+ }
+ } else {
+ numLiterals += len;
+ }
+ }
+
+ private void readShortRepeatValues(int firstByte) throws IOException {
+
+ // read the number of bytes occupied by the value
+ int size = (firstByte >>> 3) & 0x07;
+ // #bytes are one off
+ size += 1;
+
+ // read the run length
+ int len = firstByte & 0x07;
+    // run length values are stored only after the MIN_REPEAT count is met
+ len += RunLengthIntegerWriterV2.MIN_REPEAT;
+
+    // read the repeated value which is stored using fixed bytes
+ long val = utils.bytesToLongBE(input, size);
+
+ if (signed) {
+ val = utils.zigzagDecode(val);
+ }
+
+ if (numLiterals != 0) {
+ // Currently this always holds, which makes peekNextAvailLength simpler.
+ // If this changes, peekNextAvailLength should be adjusted accordingly.
+ throw new AssertionError("readValues called with existing values present");
+ }
+ // repeat the value for length times
+ isRepeating = true;
+ // TODO: this is not so useful and V1 reader doesn't do that. Fix? Same if delta == 0
+ for(int i = 0; i < len; i++) {
+ literals[i] = val;
+ }
+ numLiterals = len;
+ }
+
+ @Override
+ public boolean hasNext() throws IOException {
+ return used != numLiterals || input.available() > 0;
+ }
+
+ @Override
+ public long next() throws IOException {
+ long result;
+ if (used == numLiterals) {
+ numLiterals = 0;
+ used = 0;
+ readValues(false);
+ }
+ result = literals[used++];
+ return result;
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ input.seek(index);
+ int consumed = (int) index.getNext();
+ if (consumed != 0) {
+ // a loop is required for cases where we break the run into two
+ // parts
+ while (consumed > 0) {
+ numLiterals = 0;
+ readValues(false);
+ used = consumed;
+ consumed -= numLiterals;
+ }
+ } else {
+ used = 0;
+ numLiterals = 0;
+ }
+ }
+
+ @Override
+ public void skip(long numValues) throws IOException {
+ while (numValues > 0) {
+ if (used == numLiterals) {
+ numLiterals = 0;
+ used = 0;
+ readValues(false);
+ }
+ long consume = Math.min(numValues, numLiterals - used);
+ used += consume;
+ numValues -= consume;
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector previous,
+ long[] data,
+ int previousLen) throws IOException {
+ previous.isRepeating = true;
+ for (int i = 0; i < previousLen; i++) {
+ if (!previous.isNull[i]) {
+ data[i] = next();
+ } else {
+ // The default value of null for int type in vectorized
+ // processing is 1, so set that if the value is null
+ data[i] = 1;
+ }
+
+ // The default value for nulls in Vectorization for int types is 1
+ // and given that non null value can also be 1, we need to check for isNull also
+ // when determining the isRepeating flag.
+ if (previous.isRepeating
+ && i > 0
+ && (data[0] != data[i] ||
+ previous.isNull[0] != previous.isNull[i])) {
+ previous.isRepeating = false;
+ }
+ }
+ }
+
+ @Override
+ public void nextVector(ColumnVector vector,
+ int[] data,
+ int size) throws IOException {
+ if (vector.noNulls) {
+ for(int r=0; r < data.length && r < size; ++r) {
+ data[r] = (int) next();
+ }
+ } else if (!(vector.isRepeating && vector.isNull[0])) {
+ for(int r=0; r < data.length && r < size; ++r) {
+ if (!vector.isNull[r]) {
+ data[r] = (int) next();
+ } else {
+ data[r] = 1;
+ }
+ }
+ }
+ }
+}
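
readValues above dispatches on the top two bits of the first header byte, and readShortRepeatValues unpacks the remaining six bits into a value width and a run length. A self-contained sketch of that header decoding (illustrative names, not the ORC API):

public class HeaderByteSketch {
  enum Encoding { SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA }

  public static void main(String[] args) {
    // SHORT_REPEAT header: encoding 00, width field 1 (=> 2 bytes), run field 5 (=> 5 + MIN_REPEAT)
    int firstByte = (0 << 6) | (1 << 3) | 5;

    Encoding enc = Encoding.values()[(firstByte >>> 6) & 0x03]; // top 2 bits pick the encoding
    int byteWidth = ((firstByte >>> 3) & 0x07) + 1;             // 3 bits, stored one off
    int runLength = (firstByte & 0x07) + 3;                     // 3 bits plus MIN_REPEAT (3)

    // prints: SHORT_REPEAT width=2 bytes, run=8
    System.out.println(enc + " width=" + byteWidth + " bytes, run=" + runLength);
  }
}
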
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerWriter.java b/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerWriter.java
new file mode 100644
index 0000000..2153001
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerWriter.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+
+/**
+ * A writer that writes a sequence of integers. A control byte is written before
+ * each run with positive values 0 to 127 meaning 3 to 130 repetitions, each
+ * repetition is offset by a delta. If the control byte is -1 to -128, 1 to 128
+ * literal vint values follow.
+ */
+public class RunLengthIntegerWriter implements IntegerWriter {
+ static final int MIN_REPEAT_SIZE = 3;
+ static final int MAX_DELTA = 127;
+ static final int MIN_DELTA = -128;
+ static final int MAX_LITERAL_SIZE = 128;
+ private static final int MAX_REPEAT_SIZE = 127 + MIN_REPEAT_SIZE;
+ private final PositionedOutputStream output;
+ private final boolean signed;
+ private final long[] literals = new long[MAX_LITERAL_SIZE];
+ private int numLiterals = 0;
+ private long delta = 0;
+ private boolean repeat = false;
+ private int tailRunLength = 0;
+ private SerializationUtils utils;
+
+ public RunLengthIntegerWriter(PositionedOutputStream output,
+ boolean signed) {
+ this.output = output;
+ this.signed = signed;
+ this.utils = new SerializationUtils();
+ }
+
+ private void writeValues() throws IOException {
+ if (numLiterals != 0) {
+ if (repeat) {
+ output.write(numLiterals - MIN_REPEAT_SIZE);
+ output.write((byte) delta);
+ if (signed) {
+ utils.writeVslong(output, literals[0]);
+ } else {
+ utils.writeVulong(output, literals[0]);
+ }
+ } else {
+ output.write(-numLiterals);
+ for(int i=0; i < numLiterals; ++i) {
+ if (signed) {
+ utils.writeVslong(output, literals[i]);
+ } else {
+ utils.writeVulong(output, literals[i]);
+ }
+ }
+ }
+ repeat = false;
+ numLiterals = 0;
+ tailRunLength = 0;
+ }
+ }
+
+ @Override
+ public void flush() throws IOException {
+ writeValues();
+ output.flush();
+ }
+
+ @Override
+ public void write(long value) throws IOException {
+ if (numLiterals == 0) {
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ } else if (repeat) {
+ if (value == literals[0] + delta * numLiterals) {
+ numLiterals += 1;
+ if (numLiterals == MAX_REPEAT_SIZE) {
+ writeValues();
+ }
+ } else {
+ writeValues();
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ }
+ } else {
+ if (tailRunLength == 1) {
+ delta = value - literals[numLiterals - 1];
+ if (delta < MIN_DELTA || delta > MAX_DELTA) {
+ tailRunLength = 1;
+ } else {
+ tailRunLength = 2;
+ }
+ } else if (value == literals[numLiterals - 1] + delta) {
+ tailRunLength += 1;
+ } else {
+ delta = value - literals[numLiterals - 1];
+ if (delta < MIN_DELTA || delta > MAX_DELTA) {
+ tailRunLength = 1;
+ } else {
+ tailRunLength = 2;
+ }
+ }
+ if (tailRunLength == MIN_REPEAT_SIZE) {
+ if (numLiterals + 1 == MIN_REPEAT_SIZE) {
+ repeat = true;
+ numLiterals += 1;
+ } else {
+ numLiterals -= MIN_REPEAT_SIZE - 1;
+ long base = literals[numLiterals];
+ writeValues();
+ literals[0] = base;
+ repeat = true;
+ numLiterals = MIN_REPEAT_SIZE;
+ }
+ } else {
+ literals[numLiterals++] = value;
+ if (numLiterals == MAX_LITERAL_SIZE) {
+ writeValues();
+ }
+ }
+ }
+ }
+
+ @Override
+ public void getPosition(PositionRecorder recorder) throws IOException {
+ output.getPosition(recorder);
+ recorder.addPosition(numLiterals);
+ }
+
+}
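
The class comment above describes the v1 control byte: 0 to 127 means a run of 3 to 130 values built from a base value and a per-repetition delta, while -1 to -128 means that many literal vints follow. A small standalone sketch of the control-byte arithmetic (hypothetical names; nothing here writes an actual ORC stream):

public class RleV1ControlByteSketch {
  public static void main(String[] args) {
    // A run of 10 values starting at 7 and stepping by 1 (7, 8, ..., 16)
    int runLength = 10;
    byte control = (byte) (runLength - 3); // 7: control bytes 0..127 mean runs of 3..130
    byte delta = 1;                        // per-repetition delta, -128..127
    long base = 7;                         // first value, written as a vint after the delta
    System.out.println("run: control=" + control + " delta=" + delta + " base=" + base);

    // Eight values with no usable run are written as literals instead
    byte literalControl = (byte) -8;       // -1..-128 mean 1..128 literal vints follow
    System.out.println("literals: control=" + literalControl);
  }
}
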
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerWriterV2.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerWriterV2.java b/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerWriterV2.java
new file mode 100644
index 0000000..1140ab4
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/RunLengthIntegerWriterV2.java
@@ -0,0 +1,831 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+
+/**
+ * A writer that performs light weight compression over sequence of integers.
+ * <p>
+ * There are four types of lightweight integer compression
+ * <ul>
+ * <li>SHORT_REPEAT</li>
+ * <li>DIRECT</li>
+ * <li>PATCHED_BASE</li>
+ * <li>DELTA</li>
+ * </ul>
+ * </p>
+ * The description and format for these types are as below:
+ * <p>
+ * <b>SHORT_REPEAT:</b> Used for short repeated integer sequences.
+ * <ul>
+ * <li>1 byte header
+ * <ul>
+ * <li>2 bits for encoding type</li>
+ * <li>3 bits for bytes required for repeating value</li>
+ * <li>3 bits for repeat count (MIN_REPEAT + run length)</li>
+ * </ul>
+ * </li>
+ * <li>Blob - repeat value (fixed bytes)</li>
+ * </ul>
+ * </p>
+ * <p>
+ * <b>DIRECT:</b> Used for random integer sequences whose bit-width
+ * requirement doesn't vary much.
+ * <ul>
+ * <li>2 bytes header
+ * <ul>
+ * 1st byte
+ * <li>2 bits for encoding type</li>
+ * <li>5 bits for fixed bit width of values in blob</li>
+ * <li>1 bit for storing MSB of run length</li>
+ * </ul>
+ * <ul>
+ * 2nd byte
+ * <li>8 bits for lower run length bits</li>
+ * </ul>
+ * </li>
+ * <li>Blob - stores the direct values using fixed bit width. The length of the
+ * data blob is (fixed width * run length) bits long</li>
+ * </ul>
+ * </p>
+ * <p>
+ * <b>PATCHED_BASE:</b> Used for random integer sequences whose bit-width
+ * requirement varies beyond a threshold.
+ * <ul>
+ * <li>4 bytes header
+ * <ul>
+ * 1st byte
+ * <li>2 bits for encoding type</li>
+ * <li>5 bits for fixed bit width of values in blob</li>
+ * <li>1 bit for storing MSB of run length</li>
+ * </ul>
+ * <ul>
+ * 2nd byte
+ * <li>8 bits for lower run length bits</li>
+ * </ul>
+ * <ul>
+ * 3rd byte
+ * <li>3 bits for bytes required to encode base value</li>
+ * <li>5 bits for patch width</li>
+ * </ul>
+ * <ul>
+ * 4th byte
+ * <li>3 bits for patch gap width</li>
+ * <li>5 bits for patch length</li>
+ * </ul>
+ * </li>
+ * <li>Base value - Stored using fixed number of bytes. If MSB is set, base
+ * value is negative else positive. Length of base value is (base width * 8)
+ * bits.</li>
+ * <li>Data blob - Base reduced values stored using fixed bit width. Length
+ * of data blob is (fixed width * run length) bits.</li>
+ * <li>Patch blob - A list of gap and patch value pairs. Each entry in
+ * the patch list is (patch width + patch gap width) bits long. Gaps between
+ * subsequent elements to be patched are stored in the upper part of an entry,
+ * whereas patch values are stored in the lower part. Length of the patch blob is
+ * ((patch width + patch gap width) * patch length) bits.</li>
+ * </ul>
+ * </p>
+ * <p>
+ * <b>DELTA</b> Used for monotonically increasing or decreasing sequences,
+ * sequences with fixed delta values or long repeated sequences.
+ * <ul>
+ * <li>2 bytes header
+ * <ul>
+ * 1st byte
+ * <li>2 bits for encoding type</li>
+ * <li>5 bits for fixed bit width of values in blob</li>
+ * <li>1 bit for storing MSB of run length</li>
+ * </ul>
+ * <ul>
+ * 2nd byte
+ * <li>8 bits for lower run length bits</li>
+ * </ul>
+ * </li>
+ * <li>Base value - zigzag encoded value written as varint</li>
+ * <li>Delta base - zigzag encoded value written as varint</li>
+ * <li>Delta blob - only positive values. Monotonicity and ordering are decided
+ * based on the sign of the base value and delta base</li>
+ * </ul>
+ * </p>
+ */
+public class RunLengthIntegerWriterV2 implements IntegerWriter {
+
+ public enum EncodingType {
+ SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA
+ }
+
+ static final int MAX_SCOPE = 512;
+ static final int MIN_REPEAT = 3;
+ private static final int MAX_SHORT_REPEAT_LENGTH = 10;
+ private long prevDelta = 0;
+ private int fixedRunLength = 0;
+ private int variableRunLength = 0;
+ private final long[] literals = new long[MAX_SCOPE];
+ private final PositionedOutputStream output;
+ private final boolean signed;
+ private EncodingType encoding;
+ private int numLiterals;
+ private final long[] zigzagLiterals = new long[MAX_SCOPE];
+ private final long[] baseRedLiterals = new long[MAX_SCOPE];
+ private final long[] adjDeltas = new long[MAX_SCOPE];
+ private long fixedDelta;
+ private int zzBits90p;
+ private int zzBits100p;
+ private int brBits95p;
+ private int brBits100p;
+ private int bitsDeltaMax;
+ private int patchWidth;
+ private int patchGapWidth;
+ private int patchLength;
+ private long[] gapVsPatchList;
+ private long min;
+ private boolean isFixedDelta;
+ private SerializationUtils utils;
+ private boolean alignedBitpacking;
+
+ RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) {
+ this(output, signed, true);
+ }
+
+ public RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed,
+ boolean alignedBitpacking) {
+ this.output = output;
+ this.signed = signed;
+ this.alignedBitpacking = alignedBitpacking;
+ this.utils = new SerializationUtils();
+ clear();
+ }
+
+ private void writeValues() throws IOException {
+ if (numLiterals != 0) {
+
+ if (encoding.equals(EncodingType.SHORT_REPEAT)) {
+ writeShortRepeatValues();
+ } else if (encoding.equals(EncodingType.DIRECT)) {
+ writeDirectValues();
+ } else if (encoding.equals(EncodingType.PATCHED_BASE)) {
+ writePatchedBaseValues();
+ } else {
+ writeDeltaValues();
+ }
+
+ // clear all the variables
+ clear();
+ }
+ }
+
+ private void writeDeltaValues() throws IOException {
+ int len = 0;
+ int fb = bitsDeltaMax;
+ int efb = 0;
+
+ if (alignedBitpacking) {
+ fb = utils.getClosestAlignedFixedBits(fb);
+ }
+
+ if (isFixedDelta) {
+ // if fixed run length is greater than threshold then it will be fixed
+ // delta sequence with delta value 0 else fixed delta sequence with
+ // non-zero delta value
+ if (fixedRunLength > MIN_REPEAT) {
+ // ex. sequence: 2 2 2 2 2 2 2 2
+ len = fixedRunLength - 1;
+ fixedRunLength = 0;
+ } else {
+ // ex. sequence: 4 6 8 10 12 14 16
+ len = variableRunLength - 1;
+ variableRunLength = 0;
+ }
+ } else {
+ // fixed width 0 is used for long repeating values.
+ // sequences that require only 1 bit to encode will have an additional bit
+ if (fb == 1) {
+ fb = 2;
+ }
+ efb = utils.encodeBitWidth(fb);
+ efb = efb << 1;
+ len = variableRunLength - 1;
+ variableRunLength = 0;
+ }
+
+ // extract the 9th bit of run length
+ final int tailBits = (len & 0x100) >>> 8;
+
+ // create first byte of the header
+ final int headerFirstByte = getOpcode() | efb | tailBits;
+
+ // second byte of the header stores the remaining 8 bits of runlength
+ final int headerSecondByte = len & 0xff;
+
+ // write header
+ output.write(headerFirstByte);
+ output.write(headerSecondByte);
+
+ // store the first value from zigzag literal array
+ if (signed) {
+ utils.writeVslong(output, literals[0]);
+ } else {
+ utils.writeVulong(output, literals[0]);
+ }
+
+ if (isFixedDelta) {
+ // if delta is fixed then we don't need to store delta blob
+ utils.writeVslong(output, fixedDelta);
+ } else {
+ // store the first value as delta value using zigzag encoding
+ utils.writeVslong(output, adjDeltas[0]);
+
+ // adjacent delta values are bit packed. The length of adjDeltas array is
+ // always one less than the number of literals (delta difference for n
+ // elements is n-1). We have already written one element, write the
+ // remaining numLiterals - 2 elements here
+ utils.writeInts(adjDeltas, 1, numLiterals - 2, fb, output);
+ }
+ }
+
+ private void writePatchedBaseValues() throws IOException {
+
+ // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
+ // because patch is applied to MSB bits. For example: If fixed bit width of
+ // base value is 7 bits and if patch is 3 bits, the actual value is
+ // constructed by shifting the patch to left by 7 positions.
+ // actual_value = patch << 7 | base_value
+ // So, if we align base_value then actual_value can not be reconstructed.
+
+ // write the number of fixed bits required in next 5 bits
+ final int fb = brBits95p;
+ final int efb = utils.encodeBitWidth(fb) << 1;
+
+ // adjust variable run length, they are one off
+ variableRunLength -= 1;
+
+ // extract the 9th bit of run length
+ final int tailBits = (variableRunLength & 0x100) >>> 8;
+
+ // create first byte of the header
+ final int headerFirstByte = getOpcode() | efb | tailBits;
+
+ // second byte of the header stores the remaining 8 bits of runlength
+ final int headerSecondByte = variableRunLength & 0xff;
+
+ // if the min value is negative toggle the sign
+ final boolean isNegative = min < 0 ? true : false;
+ if (isNegative) {
+ min = -min;
+ }
+
+ // find the number of bytes required for base and shift it by 5 bits
+ // to accommodate patch width. The additional bit is used to store the sign
+ // of the base value.
+ final int baseWidth = utils.findClosestNumBits(min) + 1;
+ final int baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1;
+ final int bb = (baseBytes - 1) << 5;
+
+ // if the base value is negative then set MSB to 1
+ if (isNegative) {
+ min |= (1L << ((baseBytes * 8) - 1));
+ }
+
+ // third byte contains 3 bits for number of bytes occupied by base
+ // and 5 bits for patchWidth
+ final int headerThirdByte = bb | utils.encodeBitWidth(patchWidth);
+
+ // fourth byte contains 3 bits for page gap width and 5 bits for
+ // patch length
+ final int headerFourthByte = (patchGapWidth - 1) << 5 | patchLength;
+
+ // write header
+ output.write(headerFirstByte);
+ output.write(headerSecondByte);
+ output.write(headerThirdByte);
+ output.write(headerFourthByte);
+
+ // write the base value using fixed bytes in big endian order
+ for(int i = baseBytes - 1; i >= 0; i--) {
+ byte b = (byte) ((min >>> (i * 8)) & 0xff);
+ output.write(b);
+ }
+
+ // base reduced literals are bit packed
+ int closestFixedBits = utils.getClosestFixedBits(fb);
+
+ utils.writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits,
+ output);
+
+ // write patch list
+ closestFixedBits = utils.getClosestFixedBits(patchGapWidth + patchWidth);
+
+ utils.writeInts(gapVsPatchList, 0, gapVsPatchList.length, closestFixedBits,
+ output);
+
+ // reset run length
+ variableRunLength = 0;
+ }
+
+ /**
+ * Store the opcode in 2 MSB bits
+ * @return opcode
+ */
+ private int getOpcode() {
+ return encoding.ordinal() << 6;
+ }
+
+ private void writeDirectValues() throws IOException {
+
+ // write the number of fixed bits required in next 5 bits
+ int fb = zzBits100p;
+
+ if (alignedBitpacking) {
+ fb = utils.getClosestAlignedFixedBits(fb);
+ }
+
+ final int efb = utils.encodeBitWidth(fb) << 1;
+
+ // adjust variable run length
+ variableRunLength -= 1;
+
+ // extract the 9th bit of run length
+ final int tailBits = (variableRunLength & 0x100) >>> 8;
+
+ // create first byte of the header
+ final int headerFirstByte = getOpcode() | efb | tailBits;
+
+ // second byte of the header stores the remaining 8 bits of runlength
+ final int headerSecondByte = variableRunLength & 0xff;
+
+ // write header
+ output.write(headerFirstByte);
+ output.write(headerSecondByte);
+
+ // bit packing the zigzag encoded literals
+ utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output);
+
+ // reset run length
+ variableRunLength = 0;
+ }
+
+ private void writeShortRepeatValues() throws IOException {
+ // get the value that is repeating, compute the bits and bytes required
+ long repeatVal = 0;
+ if (signed) {
+ repeatVal = utils.zigzagEncode(literals[0]);
+ } else {
+ repeatVal = literals[0];
+ }
+
+ final int numBitsRepeatVal = utils.findClosestNumBits(repeatVal);
+ final int numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? numBitsRepeatVal >>> 3
+ : (numBitsRepeatVal >>> 3) + 1;
+
+ // write encoding type in top 2 bits
+ int header = getOpcode();
+
+ // write the number of bytes required for the value
+ header |= ((numBytesRepeatVal - 1) << 3);
+
+ // write the run length
+ fixedRunLength -= MIN_REPEAT;
+ header |= fixedRunLength;
+
+ // write the header
+ output.write(header);
+
+ // write the repeating value in big endian byte order
+ for(int i = numBytesRepeatVal - 1; i >= 0; i--) {
+ int b = (int) ((repeatVal >>> (i * 8)) & 0xff);
+ output.write(b);
+ }
+
+ fixedRunLength = 0;
+ }
+
+ private void determineEncoding() {
+
+ // we need to compute zigzag values for DIRECT encoding if we decide to
+ // break early for delta overflows or for shorter runs
+ computeZigZagLiterals();
+
+ zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+
+ // not a big win for shorter runs to determine encoding
+ if (numLiterals <= MIN_REPEAT) {
+ encoding = EncodingType.DIRECT;
+ return;
+ }
+
+ // DELTA encoding check
+
+ // for identifying monotonic sequences
+ boolean isIncreasing = true;
+ boolean isDecreasing = true;
+ this.isFixedDelta = true;
+
+ this.min = literals[0];
+ long max = literals[0];
+ final long initialDelta = literals[1] - literals[0];
+ long currDelta = initialDelta;
+ long deltaMax = initialDelta;
+ this.adjDeltas[0] = initialDelta;
+
+ for (int i = 1; i < numLiterals; i++) {
+ final long l1 = literals[i];
+ final long l0 = literals[i - 1];
+ currDelta = l1 - l0;
+ min = Math.min(min, l1);
+ max = Math.max(max, l1);
+
+ isIncreasing &= (l0 <= l1);
+ isDecreasing &= (l0 >= l1);
+
+ isFixedDelta &= (currDelta == initialDelta);
+ if (i > 1) {
+ adjDeltas[i - 1] = Math.abs(currDelta);
+ deltaMax = Math.max(deltaMax, adjDeltas[i - 1]);
+ }
+ }
+
+    // it's faster to exit under the delta overflow condition without checking for
+ // PATCHED_BASE condition as encoding using DIRECT is faster and has less
+ // overhead than PATCHED_BASE
+ if (!utils.isSafeSubtract(max, min)) {
+ encoding = EncodingType.DIRECT;
+ return;
+ }
+
+ // invariant - subtracting any number from any other in the literals after
+ // this point won't overflow
+
+ // if min is equal to max then the delta is 0, this condition happens for
+ // fixed values run >10 which cannot be encoded with SHORT_REPEAT
+ if (min == max) {
+ assert isFixedDelta : min + "==" + max +
+ ", isFixedDelta cannot be false";
+ assert currDelta == 0 : min + "==" + max + ", currDelta should be zero";
+ fixedDelta = 0;
+ encoding = EncodingType.DELTA;
+ return;
+ }
+
+ if (isFixedDelta) {
+ assert currDelta == initialDelta
+ : "currDelta should be equal to initialDelta for fixed delta encoding";
+ encoding = EncodingType.DELTA;
+ fixedDelta = currDelta;
+ return;
+ }
+
+ // if initialDelta is 0 then we cannot delta encode as we cannot identify
+ // the sign of deltas (increasing or decreasing)
+ if (initialDelta != 0) {
+ // stores the number of bits required for packing delta blob in
+ // delta encoding
+ bitsDeltaMax = utils.findClosestNumBits(deltaMax);
+
+ // monotonic condition
+ if (isIncreasing || isDecreasing) {
+ encoding = EncodingType.DELTA;
+ return;
+ }
+ }
+
+ // PATCHED_BASE encoding check
+
+ // percentile values are computed for the zigzag encoded values. if the
+    // bit-width requirement between the 90th and 100th percentile varies
+ // beyond a threshold then we need to patch the values. if the variation
+ // is not significant then we can use direct encoding
+
+ zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9);
+ int diffBitsLH = zzBits100p - zzBits90p;
+
+ // if the difference between 90th percentile and 100th percentile fixed
+    // bits is > 1 then we need to patch the values
+ if (diffBitsLH > 1) {
+
+ // patching is done only on base reduced values.
+ // remove base from literals
+ for (int i = 0; i < numLiterals; i++) {
+ baseRedLiterals[i] = literals[i] - min;
+ }
+
+ // 95th percentile width is used to determine max allowed value
+ // after which patching will be done
+ brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
+
+ // 100th percentile is used to compute the max patch width
+ brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0);
+
+ // after base reducing the values, if the difference in bits between
+ // 95th percentile and 100th percentile value is zero then there
+ // is no point in patching the values, in which case we will
+ // fallback to DIRECT encoding.
+ // The decision to use patched base was based on zigzag values, but the
+ // actual patching is done on base reduced literals.
+ if ((brBits100p - brBits95p) != 0) {
+ encoding = EncodingType.PATCHED_BASE;
+ preparePatchedBlob();
+ return;
+ } else {
+ encoding = EncodingType.DIRECT;
+ return;
+ }
+ } else {
+ // if difference in bits between 95th percentile and 100th percentile is
+ // 0, then patch length will become 0. Hence we will fallback to direct
+ encoding = EncodingType.DIRECT;
+ return;
+ }
+ }
+
+ private void computeZigZagLiterals() {
+ // populate zigzag encoded literals
+ long zzEncVal = 0;
+ for (int i = 0; i < numLiterals; i++) {
+ if (signed) {
+ zzEncVal = utils.zigzagEncode(literals[i]);
+ } else {
+ zzEncVal = literals[i];
+ }
+ zigzagLiterals[i] = zzEncVal;
+ }
+ }
+
+ private void preparePatchedBlob() {
+ // mask will be max value beyond which patch will be generated
+ long mask = (1L << brBits95p) - 1;
+
+    // since we are considering only the 95th percentile, the gap and
+    // patch arrays can contain at most 5% of the values
+ patchLength = (int) Math.ceil((numLiterals * 0.05));
+
+ int[] gapList = new int[patchLength];
+ long[] patchList = new long[patchLength];
+
+ // #bit for patch
+ patchWidth = brBits100p - brBits95p;
+ patchWidth = utils.getClosestFixedBits(patchWidth);
+
+    // if patch bit requirement is 64 then it will not be possible to pack
+ // gap and patch together in a long. To make sure gap and patch can be
+ // packed together adjust the patch width
+ if (patchWidth == 64) {
+ patchWidth = 56;
+ brBits95p = 8;
+ mask = (1L << brBits95p) - 1;
+ }
+
+ int gapIdx = 0;
+ int patchIdx = 0;
+ int prev = 0;
+ int gap = 0;
+ int maxGap = 0;
+
+ for(int i = 0; i < numLiterals; i++) {
+ // if value is above mask then create the patch and record the gap
+ if (baseRedLiterals[i] > mask) {
+ gap = i - prev;
+ if (gap > maxGap) {
+ maxGap = gap;
+ }
+
+ // gaps are relative, so store the previous patched value index
+ prev = i;
+ gapList[gapIdx++] = gap;
+
+ // extract the most significant bits that are over mask bits
+ long patch = baseRedLiterals[i] >>> brBits95p;
+ patchList[patchIdx++] = patch;
+
+ // strip off the MSB to enable safe bit packing
+ baseRedLiterals[i] &= mask;
+ }
+ }
+
+ // adjust the patch length to number of entries in gap list
+ patchLength = gapIdx;
+
+ // if the element to be patched is the first and only element then
+    // max gap will be 0, but to store the gap as 0 we need at least 1 bit
+ if (maxGap == 0 && patchLength != 0) {
+ patchGapWidth = 1;
+ } else {
+ patchGapWidth = utils.findClosestNumBits(maxGap);
+ }
+
+    // special case: if the gap is greater than 255 then it needs more than
+    // 8 bits to encode, but we only have 3 bits in the header to record
+    // the gap width (a maximum of 8 bits). To deal with this case, we save
+    // two entries in the patch list in the following way
+    // 255 gap => 0 for patch value
+    // actual gap - 255 => actual patch value
+    // We do the same when the gap is 511. If the element to be patched is
+    // the last element in the scope then the gap will be 511. In this case we
+    // will have 3 entries in the patch list in the following way
+    // 255 gap => 0 for patch value
+    // 255 gap => 0 for patch value
+    // 1 gap => actual patch value
+ if (patchGapWidth > 8) {
+ patchGapWidth = 8;
+ // for gap = 511, we need two additional entries in patch list
+ if (maxGap == 511) {
+ patchLength += 2;
+ } else {
+ patchLength += 1;
+ }
+ }
+
+ // create gap vs patch list
+ gapIdx = 0;
+ patchIdx = 0;
+ gapVsPatchList = new long[patchLength];
+ for(int i = 0; i < patchLength; i++) {
+ long g = gapList[gapIdx++];
+ long p = patchList[patchIdx++];
+ while (g > 255) {
+ gapVsPatchList[i++] = (255L << patchWidth);
+ g -= 255;
+ }
+
+ // store patch value in LSBs and gap in MSBs
+ gapVsPatchList[i] = (g << patchWidth) | p;
+ }
+ }
+
+ /**
+ * clears all the variables
+ */
+ private void clear() {
+ numLiterals = 0;
+ encoding = null;
+ prevDelta = 0;
+ fixedDelta = 0;
+ zzBits90p = 0;
+ zzBits100p = 0;
+ brBits95p = 0;
+ brBits100p = 0;
+ bitsDeltaMax = 0;
+ patchGapWidth = 0;
+ patchLength = 0;
+ patchWidth = 0;
+ gapVsPatchList = null;
+ min = 0;
+ isFixedDelta = true;
+ }
+
+ @Override
+ public void flush() throws IOException {
+ if (numLiterals != 0) {
+ if (variableRunLength != 0) {
+ determineEncoding();
+ writeValues();
+ } else if (fixedRunLength != 0) {
+ if (fixedRunLength < MIN_REPEAT) {
+ variableRunLength = fixedRunLength;
+ fixedRunLength = 0;
+ determineEncoding();
+ writeValues();
+ } else if (fixedRunLength >= MIN_REPEAT
+ && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+ encoding = EncodingType.SHORT_REPEAT;
+ writeValues();
+ } else {
+ encoding = EncodingType.DELTA;
+ isFixedDelta = true;
+ writeValues();
+ }
+ }
+ }
+ output.flush();
+ }
+
+ @Override
+ public void write(long val) throws IOException {
+ if (numLiterals == 0) {
+ initializeLiterals(val);
+ } else {
+ if (numLiterals == 1) {
+ prevDelta = val - literals[0];
+ literals[numLiterals++] = val;
+        // if both values are the same, count it as a fixed run, else a variable run
+ if (val == literals[0]) {
+ fixedRunLength = 2;
+ variableRunLength = 0;
+ } else {
+ fixedRunLength = 0;
+ variableRunLength = 2;
+ }
+ } else {
+ long currentDelta = val - literals[numLiterals - 1];
+ if (prevDelta == 0 && currentDelta == 0) {
+ // fixed delta run
+
+ literals[numLiterals++] = val;
+
+ // if variable run is non-zero then we are seeing repeating
+ // values at the end of variable run in which case keep
+ // updating variable and fixed runs
+ if (variableRunLength > 0) {
+ fixedRunLength = 2;
+ }
+ fixedRunLength += 1;
+
+ // if fixed run met the minimum condition and if variable
+ // run is non-zero then flush the variable run and shift the
+ // tail fixed runs to start of the buffer
+ if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
+ numLiterals -= MIN_REPEAT;
+ variableRunLength -= MIN_REPEAT - 1;
+ // copy the tail fixed runs
+ long[] tailVals = new long[MIN_REPEAT];
+ System.arraycopy(literals, numLiterals, tailVals, 0, MIN_REPEAT);
+
+ // determine variable encoding and flush values
+ determineEncoding();
+ writeValues();
+
+ // shift tail fixed runs to beginning of the buffer
+ for(long l : tailVals) {
+ literals[numLiterals++] = l;
+ }
+ }
+
+ // if fixed runs reached max repeat length then write values
+ if (fixedRunLength == MAX_SCOPE) {
+ determineEncoding();
+ writeValues();
+ }
+ } else {
+ // variable delta run
+
+ // if fixed run length is non-zero and if it satisfies the
+ // short repeat conditions then write the values as short repeats
+ // else use delta encoding
+ if (fixedRunLength >= MIN_REPEAT) {
+ if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+ encoding = EncodingType.SHORT_REPEAT;
+ writeValues();
+ } else {
+ encoding = EncodingType.DELTA;
+ isFixedDelta = true;
+ writeValues();
+ }
+ }
+
+ // if fixed run length is <MIN_REPEAT and current value is
+ // different from previous then treat it as variable run
+ if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT) {
+ if (val != literals[numLiterals - 1]) {
+ variableRunLength = fixedRunLength;
+ fixedRunLength = 0;
+ }
+ }
+
+ // after writing values re-initialize the variables
+ if (numLiterals == 0) {
+ initializeLiterals(val);
+ } else {
+ // keep updating variable run lengths
+ prevDelta = val - literals[numLiterals - 1];
+ literals[numLiterals++] = val;
+ variableRunLength += 1;
+
+ // if variable run length reach the max scope, write it
+ if (variableRunLength == MAX_SCOPE) {
+ determineEncoding();
+ writeValues();
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private void initializeLiterals(long val) {
+ literals[numLiterals++] = val;
+ fixedRunLength = 1;
+ variableRunLength = 1;
+ }
+
+ @Override
+ public void getPosition(PositionRecorder recorder) throws IOException {
+ output.getPosition(recorder);
+ recorder.addPosition(numLiterals);
+ }
+}
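
To make the DELTA header layout in writeDeltaValues concrete, here is a standalone sketch (not part of this patch) that assembles the two header bytes for the fixed-delta run 4, 6, 8, ..., 16 (seven values, delta 2); the class name is illustrative and nothing here writes a real ORC stream:

public class DeltaHeaderSketch {
  public static void main(String[] args) {
    int opcode = 3 << 6;  // EncodingType.DELTA ordinal placed in the top 2 bits
    int efb = 0;          // fixed-delta runs carry no delta blob, so the width field is 0
    int len = 7 - 1;      // run lengths are stored one off

    int headerFirstByte = opcode | efb | ((len & 0x100) >>> 8); // 9th bit of the run length
    int headerSecondByte = len & 0xff;                          // low 8 bits of the run length

    // prints: DELTA header = 0xc0 0x06
    System.out.printf("DELTA header = 0x%02x 0x%02x%n", headerFirstByte, headerSecondByte);
    // The header is followed by the base value (4) and the fixed delta (2), both as varints.
  }
}
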
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/SchemaEvolution.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/SchemaEvolution.java b/orc/src/java/org/apache/hive/orc/impl/SchemaEvolution.java
new file mode 100644
index 0000000..3c7124b
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/SchemaEvolution.java
@@ -0,0 +1,399 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hive.orc.TypeDescription;
+
+/**
+ * Take the file types and the (optional) configuration column names/types and see if there
+ * has been schema evolution.
+ */
+public class SchemaEvolution {
+ // indexed by reader column id
+ private final TypeDescription[] readerFileTypes;
+ // indexed by reader column id
+ private final boolean[] readerIncluded;
+ // the offset to the first column id ignoring any ACID columns
+ private final int readerColumnOffset;
+ // indexed by file column id
+ private final boolean[] fileIncluded;
+ private final TypeDescription fileSchema;
+ private final TypeDescription readerSchema;
+ private boolean hasConversion;
+ // indexed by reader column id
+ private final boolean[] ppdSafeConversion;
+
+ public SchemaEvolution(TypeDescription fileSchema, boolean[] includedCols) {
+ this(fileSchema, null, includedCols);
+ }
+
+ public SchemaEvolution(TypeDescription fileSchema,
+ TypeDescription readerSchema,
+ boolean[] includeCols) {
+ this.readerIncluded = includeCols == null ? null : Arrays.copyOf(includeCols, includeCols.length);
+ this.fileIncluded = new boolean[fileSchema.getMaximumId() + 1];
+ this.hasConversion = false;
+ this.fileSchema = fileSchema;
+ boolean isAcid = checkAcidSchema(fileSchema);
+ this.readerColumnOffset = isAcid ? acidEventFieldNames.size() : 0;
+ if (readerSchema != null) {
+ if (isAcid) {
+ this.readerSchema = createEventSchema(readerSchema);
+ } else {
+ this.readerSchema = readerSchema;
+ }
+ if (readerIncluded != null &&
+ readerIncluded.length + readerColumnOffset != this.readerSchema.getMaximumId() + 1) {
+ throw new IllegalArgumentException("Include vector the wrong length: " +
+ this.readerSchema.toJson() + " with include length " +
+ readerIncluded.length);
+ }
+ this.readerFileTypes = new TypeDescription[this.readerSchema.getMaximumId() + 1];
+ buildConversionFileTypesArray(fileSchema, this.readerSchema);
+ } else {
+ this.readerSchema = fileSchema;
+ this.readerFileTypes = new TypeDescription[this.readerSchema.getMaximumId() + 1];
+ if (readerIncluded != null &&
+ readerIncluded.length + readerColumnOffset != this.readerSchema.getMaximumId() + 1) {
+ throw new IllegalArgumentException("Include vector the wrong length: " +
+ this.readerSchema.toJson() + " with include length " +
+ readerIncluded.length);
+ }
+ buildSameSchemaFileTypesArray();
+ }
+ this.ppdSafeConversion = populatePpdSafeConversion();
+ }
+
+ public TypeDescription getReaderSchema() {
+ return readerSchema;
+ }
+
+ /**
+ * Returns the non-ACID (aka base) reader type description.
+ *
+ * @return the reader type ignoring the ACID rowid columns, if any
+ */
+ public TypeDescription getReaderBaseSchema() {
+ return readerSchema.findSubtype(readerColumnOffset);
+ }
+
+ /**
+ * Is there Schema Evolution data type conversion?
+   * @return true if a data type conversion is required between the file and reader schemas
+ */
+ public boolean hasConversion() {
+ return hasConversion;
+ }
+
+ public TypeDescription getFileType(TypeDescription readerType) {
+ return getFileType(readerType.getId());
+ }
+
+ /**
+ * Get whether each column is included from the reader's point of view.
+ * @return a boolean array indexed by reader column id
+ */
+ public boolean[] getReaderIncluded() {
+ return readerIncluded;
+ }
+
+ /**
+ * Get whether each column is included from the file's point of view.
+ * @return a boolean array indexed by file column id
+ */
+ public boolean[] getFileIncluded() {
+ return fileIncluded;
+ }
+
+ /**
+ * Get the file type by reader type id.
+ * @param id reader column id
+   * @return the file type for the given reader column id, or null if the column is not mapped
+ */
+ public TypeDescription getFileType(int id) {
+ return readerFileTypes[id];
+ }
+
+ /**
+ * Check if column is safe for ppd evaluation
+ * @param colId reader column id
+ * @return true if the specified column is safe for ppd evaluation else false
+ */
+ public boolean isPPDSafeConversion(final int colId) {
+ if (hasConversion()) {
+ if (colId < 0 || colId >= ppdSafeConversion.length) {
+ return false;
+ }
+ return ppdSafeConversion[colId];
+ }
+
+ // when there is no schema evolution PPD is safe
+ return true;
+ }
+
+ private boolean[] populatePpdSafeConversion() {
+ if (fileSchema == null || readerSchema == null || readerFileTypes == null) {
+ return null;
+ }
+
+ boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
+ boolean safePpd = validatePPDConversion(fileSchema, readerSchema);
+ result[readerSchema.getId()] = safePpd;
+ List<TypeDescription> children = readerSchema.getChildren();
+ if (children != null) {
+ for (TypeDescription child : children) {
+ TypeDescription fileType = getFileType(child.getId());
+ safePpd = validatePPDConversion(fileType, child);
+ result[child.getId()] = safePpd;
+ }
+ }
+ return result;
+ }
+
+ private boolean validatePPDConversion(final TypeDescription fileType,
+ final TypeDescription readerType) {
+ if (fileType == null) {
+ return false;
+ }
+ if (fileType.getCategory().isPrimitive()) {
+ if (fileType.getCategory().equals(readerType.getCategory())) {
+ // for decimals alone do equality check to not mess up with precision change
+ if (fileType.getCategory().equals(TypeDescription.Category.DECIMAL) &&
+ !fileType.equals(readerType)) {
+ return false;
+ }
+ return true;
+ }
+
+ // only integer and string evolutions are safe
+ // byte -> short -> int -> long
+ // string <-> char <-> varchar
+ // NOTE: Float to double evolution is not safe as floats are stored as doubles in ORC's
+ // internal index, but when doing predicate evaluation for queries like "select * from
+ // orc_float where f = 74.72" the constant on the filter is converted from string -> double
+ // so the precisions will be different and the comparison will fail.
+ // Soon, we should convert all sargs that compare equality between floats or
+ // doubles to range predicates.
+
+ // Similarly string -> char and varchar -> char and vice versa is not possible, as ORC stores
+ // char with padded spaces in its internal index.
+ switch (fileType.getCategory()) {
+ case BYTE:
+ if (readerType.getCategory().equals(TypeDescription.Category.SHORT) ||
+ readerType.getCategory().equals(TypeDescription.Category.INT) ||
+ readerType.getCategory().equals(TypeDescription.Category.LONG)) {
+ return true;
+ }
+ break;
+ case SHORT:
+ if (readerType.getCategory().equals(TypeDescription.Category.INT) ||
+ readerType.getCategory().equals(TypeDescription.Category.LONG)) {
+ return true;
+ }
+ break;
+ case INT:
+ if (readerType.getCategory().equals(TypeDescription.Category.LONG)) {
+ return true;
+ }
+ break;
+ case STRING:
+ if (readerType.getCategory().equals(TypeDescription.Category.VARCHAR)) {
+ return true;
+ }
+ break;
+ case VARCHAR:
+ if (readerType.getCategory().equals(TypeDescription.Category.STRING)) {
+ return true;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Should we read the given reader column?
+ * @param readerId the id of column in the extended reader schema
+ * @return true if the column should be read
+ */
+ public boolean includeReaderColumn(int readerId) {
+ return readerIncluded == null ||
+ readerId <= readerColumnOffset ||
+ readerIncluded[readerId - readerColumnOffset];
+ }
+
+ void buildConversionFileTypesArray(TypeDescription fileType,
+ TypeDescription readerType) {
+ // if the column isn't included, don't map it
+ int readerId = readerType.getId();
+ if (!includeReaderColumn(readerId)) {
+ return;
+ }
+ boolean isOk = true;
+ // check the easy case first
+ if (fileType.getCategory() == readerType.getCategory()) {
+ switch (readerType.getCategory()) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case DOUBLE:
+ case FLOAT:
+ case STRING:
+ case TIMESTAMP:
+ case BINARY:
+ case DATE:
+ // these are always a match
+ break;
+ case CHAR:
+ case VARCHAR:
+ // We do conversion when same CHAR/VARCHAR type but different maxLength.
+ if (fileType.getMaxLength() != readerType.getMaxLength()) {
+ hasConversion = true;
+ }
+ break;
+ case DECIMAL:
+ // We do conversion when same DECIMAL type but different precision/scale.
+ if (fileType.getPrecision() != readerType.getPrecision() ||
+ fileType.getScale() != readerType.getScale()) {
+ hasConversion = true;
+ }
+ break;
+ case UNION:
+ case MAP:
+ case LIST: {
+ // these must be an exact match
+ List<TypeDescription> fileChildren = fileType.getChildren();
+ List<TypeDescription> readerChildren = readerType.getChildren();
+ if (fileChildren.size() == readerChildren.size()) {
+ for(int i=0; i < fileChildren.size(); ++i) {
+ buildConversionFileTypesArray(fileChildren.get(i), readerChildren.get(i));
+ }
+ } else {
+ isOk = false;
+ }
+ break;
+ }
+ case STRUCT: {
+ // allow either side to have fewer fields than the other
+ List<TypeDescription> fileChildren = fileType.getChildren();
+ List<TypeDescription> readerChildren = readerType.getChildren();
+ if (fileChildren.size() != readerChildren.size()) {
+ hasConversion = true;
+ }
+ int jointSize = Math.min(fileChildren.size(), readerChildren.size());
+ for(int i=0; i < jointSize; ++i) {
+ buildConversionFileTypesArray(fileChildren.get(i), readerChildren.get(i));
+ }
+ break;
+ }
+ default:
+ throw new IllegalArgumentException("Unknown type " + readerType);
+ }
+ } else {
+ /*
+       * Check for the few cases where we will not convert.
+ */
+
+ isOk = ConvertTreeReaderFactory.canConvert(fileType, readerType);
+ hasConversion = true;
+ }
+ if (isOk) {
+ if (readerFileTypes[readerId] != null) {
+ throw new RuntimeException("reader to file type entry already assigned");
+ }
+ readerFileTypes[readerId] = fileType;
+ fileIncluded[fileType.getId()] = true;
+ } else {
+ throw new IllegalArgumentException(
+ String.format(
+ "ORC does not support type conversion from file type %s (%d) to reader type %s (%d)",
+ fileType.toString(), fileType.getId(),
+ readerType.toString(), readerId));
+ }
+ }
+
+ /**
+   * Populate the reader-to-file type array when the reader schema is the
+   * same as the file schema.
+ */
+ private void buildSameSchemaFileTypesArray() {
+ buildSameSchemaFileTypesArrayRecurse(readerSchema);
+ }
+
+ void buildSameSchemaFileTypesArrayRecurse(TypeDescription readerType) {
+ int id = readerType.getId();
+ if (!includeReaderColumn(id)) {
+ return;
+ }
+ if (readerFileTypes[id] != null) {
+ throw new RuntimeException("reader to file type entry already assigned");
+ }
+ readerFileTypes[id] = readerType;
+ fileIncluded[id] = true;
+ List<TypeDescription> children = readerType.getChildren();
+ if (children != null) {
+ for (TypeDescription child : children) {
+ buildSameSchemaFileTypesArrayRecurse(child);
+ }
+ }
+ }
+
+ private static boolean checkAcidSchema(TypeDescription type) {
+ if (type.getCategory().equals(TypeDescription.Category.STRUCT)) {
+ List<String> rootFields = type.getFieldNames();
+ if (acidEventFieldNames.equals(rootFields)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * @param typeDescr
+ * @return ORC types for the ACID event based on the row's type description
+ */
+ public static TypeDescription createEventSchema(TypeDescription typeDescr) {
+ TypeDescription result = TypeDescription.createStruct()
+ .addField("operation", TypeDescription.createInt())
+ .addField("originalTransaction", TypeDescription.createLong())
+ .addField("bucket", TypeDescription.createInt())
+ .addField("rowId", TypeDescription.createLong())
+ .addField("currentTransaction", TypeDescription.createLong())
+ .addField("row", typeDescr.clone());
+ return result;
+ }
+
+ public static final List<String> acidEventFieldNames= new ArrayList<String>();
+ static {
+ acidEventFieldNames.add("operation");
+ acidEventFieldNames.add("originalTransaction");
+ acidEventFieldNames.add("bucket");
+ acidEventFieldNames.add("rowId");
+ acidEventFieldNames.add("currentTransaction");
+ acidEventFieldNames.add("row");
+ }
+}
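
A hedged usage sketch of the class above, assuming only the TypeDescription builder calls that appear elsewhere in this patch and that ConvertTreeReaderFactory accepts the int-to-long widening; it checks how an int column read as long is reported:

import org.apache.hive.orc.TypeDescription;
import org.apache.hive.orc.impl.SchemaEvolution;

public class SchemaEvolutionSketch {
  public static void main(String[] args) {
    // file written with an int column, reader asks for a long
    TypeDescription fileSchema = TypeDescription.createStruct()
        .addField("id", TypeDescription.createInt());
    TypeDescription readerSchema = TypeDescription.createStruct()
        .addField("id", TypeDescription.createLong());

    // a null include vector means "read every column"
    SchemaEvolution evolution = new SchemaEvolution(fileSchema, readerSchema, null);

    System.out.println(evolution.hasConversion());        // true: int -> long needs conversion
    System.out.println(evolution.isPPDSafeConversion(1)); // true: integer widening is PPD safe
  }
}
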
[29/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/RecordReaderImpl.java b/orc/src/java/org/apache/hive/orc/impl/RecordReaderImpl.java
new file mode 100644
index 0000000..d0edce5
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/RecordReaderImpl.java
@@ -0,0 +1,1238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hive.orc.BooleanColumnStatistics;
+import org.apache.hive.orc.ColumnStatistics;
+import org.apache.hive.orc.CompressionCodec;
+import org.apache.hive.orc.DoubleColumnStatistics;
+import org.apache.hive.orc.IntegerColumnStatistics;
+import org.apache.hive.orc.Reader;
+import org.apache.hive.orc.RecordReader;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.DataReader;
+import org.apache.hive.orc.DateColumnStatistics;
+import org.apache.hive.orc.DecimalColumnStatistics;
+import org.apache.hive.orc.OrcConf;
+import org.apache.hive.orc.StringColumnStatistics;
+import org.apache.hive.orc.StripeInformation;
+import org.apache.hive.orc.TimestampColumnStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hive.orc.BloomFilterIO;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.ql.util.TimestampUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hive.orc.OrcProto;
+
+public class RecordReaderImpl implements RecordReader {
+ static final Logger LOG = LoggerFactory.getLogger(RecordReaderImpl.class);
+ private static final boolean isLogDebugEnabled = LOG.isDebugEnabled();
+ private static final Object UNKNOWN_VALUE = new Object();
+ protected final Path path;
+ private final long firstRow;
+ private final List<StripeInformation> stripes =
+ new ArrayList<StripeInformation>();
+ private OrcProto.StripeFooter stripeFooter;
+ private final long totalRowCount;
+ private final CompressionCodec codec;
+ protected final TypeDescription schema;
+ private final List<OrcProto.Type> types;
+ private final int bufferSize;
+ private final SchemaEvolution evolution;
+ // the columns included from the file, indexed by the file's column ids.
+ private final boolean[] included;
+ private final long rowIndexStride;
+ private long rowInStripe = 0;
+ private int currentStripe = -1;
+ private long rowBaseInStripe = 0;
+ private long rowCountInStripe = 0;
+ private final Map<StreamName, InStream> streams =
+ new HashMap<StreamName, InStream>();
+ DiskRangeList bufferChunks = null;
+ private final TreeReaderFactory.TreeReader reader;
+ private final OrcProto.RowIndex[] indexes;
+ private final OrcProto.BloomFilterIndex[] bloomFilterIndices;
+ private final SargApplier sargApp;
+ // an array indicating which row groups are included (not skipped)
+ private boolean[] includedRowGroups = null;
+ private final DataReader dataReader;
+
+ /**
+ * Given a column name, find the corresponding column in the reader schema and return its file column id.
+ *
+ * @param evolution the mapping from reader to file schema
+ * @param columnName the column name to look for
+ * @return the file column id or -1 if the column wasn't found
+ */
+ static int findColumns(SchemaEvolution evolution,
+ String columnName) {
+ TypeDescription readerSchema = evolution.getReaderBaseSchema();
+ List<String> fieldNames = readerSchema.getFieldNames();
+ List<TypeDescription> children = readerSchema.getChildren();
+ for (int i = 0; i < fieldNames.size(); ++i) {
+ if (columnName.equals(fieldNames.get(i))) {
+ TypeDescription result = evolution.getFileType(children.get(i).getId());
+ return result == null ? -1 : result.getId();
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Find the mapping from predicate leaves to columns.
+ * @param sargLeaves the leaves of the search argument that we need to map
+ * @param evolution the mapping from reader to file schema
+ * @return an array mapping the sarg leaves to file column ids
+ */
+ public static int[] mapSargColumnsToOrcInternalColIdx(List<PredicateLeaf> sargLeaves,
+ SchemaEvolution evolution) {
+ int[] result = new int[sargLeaves.size()];
+ Arrays.fill(result, -1);
+ for(int i=0; i < result.length; ++i) {
+ String colName = sargLeaves.get(i).getColumnName();
+ result[i] = findColumns(evolution, colName);
+ }
+ return result;
+ }
+
+ protected RecordReaderImpl(ReaderImpl fileReader,
+ Reader.Options options) throws IOException {
+ boolean[] readerIncluded = options.getInclude();
+ if (options.getSchema() == null) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Reader schema not provided -- using file schema " +
+ fileReader.getSchema());
+ }
+ evolution = new SchemaEvolution(fileReader.getSchema(), readerIncluded);
+ } else {
+
+ // Now that we are creating a record reader for a file, validate that the schema to read
+ // is compatible with the file schema.
+ //
+ evolution = new SchemaEvolution(fileReader.getSchema(),
+ options.getSchema(), readerIncluded);
+ if (LOG.isDebugEnabled() && evolution.hasConversion()) {
+ LOG.debug("ORC file " + fileReader.path.toString() +
+ " has data type conversion --\n" +
+ "reader schema: " + options.getSchema().toString() + "\n" +
+ "file schema: " + fileReader.getSchema());
+ }
+ }
+ this.schema = evolution.getReaderSchema();
+ this.path = fileReader.path;
+ this.codec = fileReader.codec;
+ this.types = fileReader.types;
+ this.bufferSize = fileReader.bufferSize;
+ this.rowIndexStride = fileReader.rowIndexStride;
+ SearchArgument sarg = options.getSearchArgument();
+ if (sarg != null && rowIndexStride != 0) {
+ sargApp = new SargApplier(sarg, options.getColumnNames(), rowIndexStride,
+ evolution);
+ } else {
+ sargApp = null;
+ }
+ long rows = 0;
+ long skippedRows = 0;
+ long offset = options.getOffset();
+ long maxOffset = options.getMaxOffset();
+ for(StripeInformation stripe: fileReader.getStripes()) {
+ long stripeStart = stripe.getOffset();
+ if (offset > stripeStart) {
+ skippedRows += stripe.getNumberOfRows();
+ } else if (stripeStart < maxOffset) {
+ this.stripes.add(stripe);
+ rows += stripe.getNumberOfRows();
+ }
+ }
+
+ Boolean zeroCopy = options.getUseZeroCopy();
+ if (zeroCopy == null) {
+ zeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(fileReader.conf);
+ }
+ if (options.getDataReader() != null) {
+ this.dataReader = options.getDataReader();
+ } else {
+ this.dataReader = RecordReaderUtils.createDefaultDataReader(
+ DataReaderProperties.builder()
+ .withBufferSize(bufferSize)
+ .withCompression(fileReader.compressionKind)
+ .withFileSystem(fileReader.fileSystem)
+ .withPath(fileReader.path)
+ .withTypeCount(types.size())
+ .withZeroCopy(zeroCopy)
+ .build());
+ }
+ this.dataReader.open();
+
+ firstRow = skippedRows;
+ totalRowCount = rows;
+ Boolean skipCorrupt = options.getSkipCorruptRecords();
+ if (skipCorrupt == null) {
+ skipCorrupt = OrcConf.SKIP_CORRUPT_DATA.getBoolean(fileReader.conf);
+ }
+
+ reader = TreeReaderFactory.createTreeReader(evolution.getReaderSchema(),
+ evolution, readerIncluded, skipCorrupt);
+ indexes = new OrcProto.RowIndex[types.size()];
+ bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
+ this.included = evolution.getFileIncluded();
+ advanceToNextRow(reader, 0L, true);
+ }
+
+ public static final class PositionProviderImpl implements PositionProvider {
+ private final OrcProto.RowIndexEntry entry;
+ private int index;
+
+ public PositionProviderImpl(OrcProto.RowIndexEntry entry) {
+ this(entry, 0);
+ }
+
+ public PositionProviderImpl(OrcProto.RowIndexEntry entry, int startPos) {
+ this.entry = entry;
+ this.index = startPos;
+ }
+
+ @Override
+ public long getNext() {
+ return entry.getPositions(index++);
+ }
+
+ @Override
+ public String toString() {
+ return "{" + entry.getPositionsList() + "; " + index + "}";
+ }
+ }
+
+ public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe
+ ) throws IOException {
+ return dataReader.readStripeFooter(stripe);
+ }
+
+ enum Location {
+ BEFORE, MIN, MIDDLE, MAX, AFTER
+ }
+
+ /**
+ * Given a point and min and max, determine if the point is before, at the
+ * min, in the middle, at the max, or after the range.
+ * @param point the point to test
+ * @param min the minimum point
+ * @param max the maximum point
+ * @param <T> the type of the comparison
+ * @return the location of the point
+ */
+ static <T> Location compareToRange(Comparable<T> point, T min, T max) {
+ int minCompare = point.compareTo(min);
+ if (minCompare < 0) {
+ return Location.BEFORE;
+ } else if (minCompare == 0) {
+ return Location.MIN;
+ }
+ int maxCompare = point.compareTo(max);
+ if (maxCompare > 0) {
+ return Location.AFTER;
+ } else if (maxCompare == 0) {
+ return Location.MAX;
+ }
+ return Location.MIDDLE;
+ }
+
+ /**
+ * Get the maximum value out of an index entry.
+ * @param index
+ * the index entry
+ * @return the object for the maximum value or null if there isn't one
+ */
+ static Object getMax(ColumnStatistics index) {
+ if (index instanceof IntegerColumnStatistics) {
+ return ((IntegerColumnStatistics) index).getMaximum();
+ } else if (index instanceof DoubleColumnStatistics) {
+ return ((DoubleColumnStatistics) index).getMaximum();
+ } else if (index instanceof StringColumnStatistics) {
+ return ((StringColumnStatistics) index).getMaximum();
+ } else if (index instanceof DateColumnStatistics) {
+ return ((DateColumnStatistics) index).getMaximum();
+ } else if (index instanceof DecimalColumnStatistics) {
+ return ((DecimalColumnStatistics) index).getMaximum();
+ } else if (index instanceof TimestampColumnStatistics) {
+ return ((TimestampColumnStatistics) index).getMaximum();
+ } else if (index instanceof BooleanColumnStatistics) {
+ if (((BooleanColumnStatistics)index).getTrueCount()!=0) {
+ return Boolean.TRUE;
+ } else {
+ return Boolean.FALSE;
+ }
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * Get the minimum value out of an index entry.
+ * @param index
+ * the index entry
+ * @return the object for the minimum value or null if there isn't one
+ */
+ static Object getMin(ColumnStatistics index) {
+ if (index instanceof IntegerColumnStatistics) {
+ return ((IntegerColumnStatistics) index).getMinimum();
+ } else if (index instanceof DoubleColumnStatistics) {
+ return ((DoubleColumnStatistics) index).getMinimum();
+ } else if (index instanceof StringColumnStatistics) {
+ return ((StringColumnStatistics) index).getMinimum();
+ } else if (index instanceof DateColumnStatistics) {
+ return ((DateColumnStatistics) index).getMinimum();
+ } else if (index instanceof DecimalColumnStatistics) {
+ return ((DecimalColumnStatistics) index).getMinimum();
+ } else if (index instanceof TimestampColumnStatistics) {
+ return ((TimestampColumnStatistics) index).getMinimum();
+ } else if (index instanceof BooleanColumnStatistics) {
+ if (((BooleanColumnStatistics)index).getFalseCount()!=0) {
+ return Boolean.FALSE;
+ } else {
+ return Boolean.TRUE;
+ }
+ } else {
+ return UNKNOWN_VALUE; // null is not safe here
+ }
+ }
+
+ /**
+ * Evaluate a predicate with respect to the statistics from the column
+ * that is referenced in the predicate.
+ * @param statsProto the statistics for the column mentioned in the predicate
+ * @param predicate the leaf predicate we need to evaluate
+ * @param bloomFilter
+ * @return the set of truth values that may be returned for the given
+ * predicate.
+ */
+ static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
+ PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) {
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto);
+ Object minValue = getMin(cs);
+ Object maxValue = getMax(cs);
+ BloomFilterIO bf = null;
+ if (bloomFilter != null) {
+ bf = new BloomFilterIO(bloomFilter);
+ }
+ return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf);
+ }
+
+ /**
+ * Evaluate a predicate with respect to the statistics from the column
+ * that is referenced in the predicate.
+ * @param stats the statistics for the column mentioned in the predicate
+ * @param predicate the leaf predicate we need to evaluate
+ * @return the set of truth values that may be returned for the given
+ * predicate.
+ */
+ public static TruthValue evaluatePredicate(ColumnStatistics stats,
+ PredicateLeaf predicate,
+ BloomFilterIO bloomFilter) {
+ Object minValue = getMin(stats);
+ Object maxValue = getMax(stats);
+ return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter);
+ }
+
+ static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min,
+ Object max, boolean hasNull, BloomFilterIO bloomFilter) {
+ // if we didn't have any values, everything must have been null
+ if (min == null) {
+ if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) {
+ return TruthValue.YES;
+ } else {
+ return TruthValue.NULL;
+ }
+ } else if (min == UNKNOWN_VALUE) {
+ return TruthValue.YES_NO_NULL;
+ }
+
+ // TODO: Enabling PPD for timestamp requires ORC-101 and ORC-135
+ if (min != null && min instanceof Timestamp) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Not using predication pushdown on {} because it doesn't " +
+ "include ORC-135.", predicate.getColumnName());
+ }
+ return TruthValue.YES_NO_NULL;
+ }
+
+ TruthValue result;
+ Object baseObj = predicate.getLiteral();
+ try {
+ // Predicate object and stats objects are converted to the type of the predicate object.
+ Object minValue = getBaseObjectForComparison(predicate.getType(), min);
+ Object maxValue = getBaseObjectForComparison(predicate.getType(), max);
+ Object predObj = getBaseObjectForComparison(predicate.getType(), baseObj);
+
+ result = evaluatePredicateMinMax(predicate, predObj, minValue, maxValue, hasNull);
+ if (shouldEvaluateBloomFilter(predicate, result, bloomFilter)) {
+ result = evaluatePredicateBloomFilter(predicate, predObj, bloomFilter, hasNull);
+ }
+ // in case failed conversion, return the default YES_NO_NULL truth value
+ } catch (Exception e) {
+ if (LOG.isDebugEnabled()) {
+ final String statsType = min == null ?
+ (max == null ? "null" : max.getClass().getSimpleName()) :
+ min.getClass().getSimpleName();
+ final String predicateType = baseObj == null ? "null" : baseObj.getClass().getSimpleName();
+ final String reason = e.getClass().getSimpleName() + " when evaluating predicate." +
+ " Skipping ORC PPD." +
+ " Exception: " + e.getMessage() +
+ " StatsType: " + statsType +
+ " PredicateType: " + predicateType;
+ LOG.debug(reason);
+ }
+ if (predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS) || !hasNull) {
+ result = TruthValue.YES_NO;
+ } else {
+ result = TruthValue.YES_NO_NULL;
+ }
+ }
+ return result;
+ }
+
+ private static boolean shouldEvaluateBloomFilter(PredicateLeaf predicate,
+ TruthValue result, BloomFilterIO bloomFilter) {
+ // evaluate bloom filter only when
+ // 1) Bloom filter is available
+ // 2) Min/Max evaluation yield YES or MAYBE
+ // 3) Predicate is EQUALS or IN list
+ if (bloomFilter != null
+ && result != TruthValue.NO_NULL && result != TruthValue.NO
+ && (predicate.getOperator().equals(PredicateLeaf.Operator.EQUALS)
+ || predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS)
+ || predicate.getOperator().equals(PredicateLeaf.Operator.IN))) {
+ return true;
+ }
+ return false;
+ }
+
+ private static TruthValue evaluatePredicateMinMax(PredicateLeaf predicate, Object predObj,
+ Object minValue,
+ Object maxValue,
+ boolean hasNull) {
+ Location loc;
+
+ switch (predicate.getOperator()) {
+ case NULL_SAFE_EQUALS:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.BEFORE || loc == Location.AFTER) {
+ return TruthValue.NO;
+ } else {
+ return TruthValue.YES_NO;
+ }
+ case EQUALS:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (minValue.equals(maxValue) && loc == Location.MIN) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ } else if (loc == Location.BEFORE || loc == Location.AFTER) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ case LESS_THAN:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.AFTER) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ } else if (loc == Location.BEFORE || loc == Location.MIN) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ case LESS_THAN_EQUALS:
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.AFTER || loc == Location.MAX) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ } else if (loc == Location.BEFORE) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ case IN:
+ if (minValue.equals(maxValue)) {
+ // for a single value, look through to see if that value is in the
+ // set
+ for (Object arg : predicate.getLiteralList()) {
+ predObj = getBaseObjectForComparison(predicate.getType(), arg);
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.MIN) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ }
+ }
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ // are all of the values outside of the range?
+ for (Object arg : predicate.getLiteralList()) {
+ predObj = getBaseObjectForComparison(predicate.getType(), arg);
+ loc = compareToRange((Comparable) predObj, minValue, maxValue);
+ if (loc == Location.MIN || loc == Location.MIDDLE ||
+ loc == Location.MAX) {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ }
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ }
+ case BETWEEN:
+ List<Object> args = predicate.getLiteralList();
+ Object predObj1 = getBaseObjectForComparison(predicate.getType(), args.get(0));
+
+ loc = compareToRange((Comparable) predObj1, minValue, maxValue);
+ if (loc == Location.BEFORE || loc == Location.MIN) {
+ Object predObj2 = getBaseObjectForComparison(predicate.getType(), args.get(1));
+
+ Location loc2 = compareToRange((Comparable) predObj2, minValue, maxValue);
+ if (loc2 == Location.AFTER || loc2 == Location.MAX) {
+ return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
+ } else if (loc2 == Location.BEFORE) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ } else if (loc == Location.AFTER) {
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ } else {
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ case IS_NULL:
+ // min = null condition above handles the all-nulls YES case
+ return hasNull ? TruthValue.YES_NO : TruthValue.NO;
+ default:
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ }
+
+ private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate,
+ final Object predObj, BloomFilterIO bloomFilter, boolean hasNull) {
+ switch (predicate.getOperator()) {
+ case NULL_SAFE_EQUALS:
+ // null safe equals does not return *_NULL variant. So set hasNull to false
+ return checkInBloomFilter(bloomFilter, predObj, false);
+ case EQUALS:
+ return checkInBloomFilter(bloomFilter, predObj, hasNull);
+ case IN:
+ for (Object arg : predicate.getLiteralList()) {
+ // if at least one value in the IN list exists in the bloom filter, qualify the row group/stripe
+ Object predObjItem = getBaseObjectForComparison(predicate.getType(), arg);
+ TruthValue result = checkInBloomFilter(bloomFilter, predObjItem, hasNull);
+ if (result == TruthValue.YES_NO_NULL || result == TruthValue.YES_NO) {
+ return result;
+ }
+ }
+ return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+ default:
+ return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
+ }
+ }
+
+ private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) {
+ TruthValue result = hasNull ? TruthValue.NO_NULL : TruthValue.NO;
+
+ if (predObj instanceof Long) {
+ if (bf.testLong(((Long) predObj).longValue())) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else if (predObj instanceof Double) {
+ if (bf.testDouble(((Double) predObj).doubleValue())) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else if (predObj instanceof String || predObj instanceof Text ||
+ predObj instanceof HiveDecimalWritable ||
+ predObj instanceof BigDecimal) {
+ if (bf.testString(predObj.toString())) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else if (predObj instanceof Timestamp) {
+ if (bf.testLong(((Timestamp) predObj).getTime())) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else if (predObj instanceof Date) {
+ if (bf.testLong(DateWritable.dateToDays((Date) predObj))) {
+ result = TruthValue.YES_NO_NULL;
+ }
+ } else {
+ // if the predicate object is null and if hasNull says there are no nulls then return NO
+ if (predObj == null && !hasNull) {
+ result = TruthValue.NO;
+ } else {
+ result = TruthValue.YES_NO_NULL;
+ }
+ }
+
+ if (result == TruthValue.YES_NO_NULL && !hasNull) {
+ result = TruthValue.YES_NO;
+ }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Bloom filter evaluation: " + result.toString());
+ }
+
+ return result;
+ }
+
+ private static Object getBaseObjectForComparison(PredicateLeaf.Type type, Object obj) {
+ if (obj == null) {
+ return null;
+ }
+ switch (type) {
+ case BOOLEAN:
+ if (obj instanceof Boolean) {
+ return obj;
+ } else {
+ // will only be true if the string conversion yields "true", all other values are
+ // considered false
+ return Boolean.valueOf(obj.toString());
+ }
+ case DATE:
+ if (obj instanceof Date) {
+ return obj;
+ } else if (obj instanceof String) {
+ return Date.valueOf((String) obj);
+ } else if (obj instanceof Timestamp) {
+ return DateWritable.timeToDate(((Timestamp) obj).getTime() / 1000L);
+ }
+ // always string, but prevent the comparison to numbers (are they days/seconds/milliseconds?)
+ break;
+ case DECIMAL:
+ if (obj instanceof Boolean) {
+ return new HiveDecimalWritable(((Boolean) obj).booleanValue() ?
+ HiveDecimal.ONE : HiveDecimal.ZERO);
+ } else if (obj instanceof Integer) {
+ return new HiveDecimalWritable(((Integer) obj).intValue());
+ } else if (obj instanceof Long) {
+ return new HiveDecimalWritable(((Long) obj));
+ } else if (obj instanceof Float || obj instanceof Double ||
+ obj instanceof String) {
+ return new HiveDecimalWritable(obj.toString());
+ } else if (obj instanceof BigDecimal) {
+ return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) obj));
+ } else if (obj instanceof HiveDecimal) {
+ return new HiveDecimalWritable((HiveDecimal) obj);
+ } else if (obj instanceof HiveDecimalWritable) {
+ return obj;
+ } else if (obj instanceof Timestamp) {
+ return new HiveDecimalWritable(Double.toString(
+ TimestampUtils.getDouble((Timestamp) obj)));
+ }
+ break;
+ case FLOAT:
+ if (obj instanceof Number) {
+ // widening conversion
+ return ((Number) obj).doubleValue();
+ } else if (obj instanceof HiveDecimal) {
+ return ((HiveDecimal) obj).doubleValue();
+ } else if (obj instanceof String) {
+ return Double.valueOf(obj.toString());
+ } else if (obj instanceof Timestamp) {
+ return TimestampUtils.getDouble((Timestamp) obj);
+ } else if (obj instanceof BigDecimal) {
+ return ((BigDecimal) obj).doubleValue();
+ }
+ break;
+ case LONG:
+ if (obj instanceof Number) {
+ // widening conversion
+ return ((Number) obj).longValue();
+ } else if (obj instanceof HiveDecimal) {
+ return ((HiveDecimal) obj).longValue();
+ } else if (obj instanceof String) {
+ return Long.valueOf(obj.toString());
+ }
+ break;
+ case STRING:
+ if (obj != null) {
+ return (obj.toString());
+ }
+ break;
+ case TIMESTAMP:
+ if (obj instanceof Timestamp) {
+ return obj;
+ } else if (obj instanceof Integer) {
+ return new Timestamp(((Number) obj).longValue());
+ } else if (obj instanceof Float) {
+ return TimestampUtils.doubleToTimestamp(((Float) obj).doubleValue());
+ } else if (obj instanceof Double) {
+ return TimestampUtils.doubleToTimestamp(((Double) obj).doubleValue());
+ } else if (obj instanceof HiveDecimal) {
+ return TimestampUtils.decimalToTimestamp((HiveDecimal) obj);
+ } else if (obj instanceof HiveDecimalWritable) {
+ return TimestampUtils.decimalToTimestamp(((HiveDecimalWritable) obj).getHiveDecimal());
+ } else if (obj instanceof Date) {
+ return new Timestamp(((Date) obj).getTime());
+ }
+ // float/double conversion to timestamp is interpreted as seconds whereas integer conversion
+ // to timestamp is interpreted as milliseconds by default. The integer to timestamp casting
+ // is also config driven. The filter operator changes its promotion based on config:
+ // "int.timestamp.conversion.in.seconds". Disable PPD for integer cases.
+ break;
+ default:
+ break;
+ }
+
+ throw new IllegalArgumentException(String.format(
+ "ORC SARGS could not convert from %s to %s", obj == null ? "(null)" : obj.getClass()
+ .getSimpleName(), type));
+ }
+
+ public static class SargApplier {
+ public final static boolean[] READ_ALL_RGS = null;
+ public final static boolean[] READ_NO_RGS = new boolean[0];
+
+ private final SearchArgument sarg;
+ private final List<PredicateLeaf> sargLeaves;
+ private final int[] filterColumns;
+ private final long rowIndexStride;
+ // the same columns as filterColumns, expressed as a boolean array indexed by file column id
+ private final boolean[] sargColumns;
+ private SchemaEvolution evolution;
+
+ public SargApplier(SearchArgument sarg, String[] columnNames,
+ long rowIndexStride,
+ SchemaEvolution evolution) {
+ this.sarg = sarg;
+ sargLeaves = sarg.getLeaves();
+ filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, evolution);
+ this.rowIndexStride = rowIndexStride;
+ // the file-included array will not be null; the reader options fill it with true entries when unset
+ sargColumns = new boolean[evolution.getFileIncluded().length];
+ for (int i : filterColumns) {
+ // filter columns may have an index of -1, e.g. when the SARG references a partition column.
+ if (i > 0) {
+ sargColumns[i] = true;
+ }
+ }
+ this.evolution = evolution;
+ }
+
+ /**
+ * Pick the row groups that we need to load from the current stripe.
+ *
+ * @return an array with a boolean for each row group or null if all of the
+ * row groups must be read.
+ * @throws IOException
+ */
+ public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes,
+ OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException {
+ long rowsInStripe = stripe.getNumberOfRows();
+ int groupsInStripe = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
+ boolean[] result = new boolean[groupsInStripe]; // TODO: avoid alloc?
+ TruthValue[] leafValues = new TruthValue[sargLeaves.size()];
+ boolean hasSelected = false, hasSkipped = false;
+ for (int rowGroup = 0; rowGroup < result.length; ++rowGroup) {
+ for (int pred = 0; pred < leafValues.length; ++pred) {
+ int columnIx = filterColumns[pred];
+ if (columnIx != -1) {
+ if (indexes[columnIx] == null) {
+ throw new AssertionError("Index is not populated for " + columnIx);
+ }
+ OrcProto.RowIndexEntry entry = indexes[columnIx].getEntry(rowGroup);
+ if (entry == null) {
+ throw new AssertionError("RG is not populated for " + columnIx + " rg " + rowGroup);
+ }
+ OrcProto.ColumnStatistics stats = entry.getStatistics();
+ OrcProto.BloomFilter bf = null;
+ if (bloomFilterIndices != null && bloomFilterIndices[columnIx] != null) {
+ bf = bloomFilterIndices[columnIx].getBloomFilter(rowGroup);
+ }
+ if (evolution != null && evolution.isPPDSafeConversion(columnIx)) {
+ leafValues[pred] = evaluatePredicateProto(stats, sargLeaves.get(pred), bf);
+ } else {
+ leafValues[pred] = TruthValue.YES_NO_NULL;
+ }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Stats = " + stats);
+ LOG.trace("Setting " + sargLeaves.get(pred) + " to " + leafValues[pred]);
+ }
+ } else {
+ // the column is a virtual column
+ leafValues[pred] = TruthValue.YES_NO_NULL;
+ }
+ }
+ result[rowGroup] = sarg.evaluate(leafValues).isNeeded();
+ hasSelected = hasSelected || result[rowGroup];
+ hasSkipped = hasSkipped || (!result[rowGroup]);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Row group " + (rowIndexStride * rowGroup) + " to " +
+ (rowIndexStride * (rowGroup + 1) - 1) + " is " +
+ (result[rowGroup] ? "" : "not ") + "included.");
+ }
+ }
+
+ return hasSkipped ? ((hasSelected || !returnNone) ? result : READ_NO_RGS) : READ_ALL_RGS;
+ }
+ }
+
+ /**
+ * Pick the row groups that we need to load from the current stripe.
+ *
+ * @return an array with a boolean for each row group or null if all of the
+ * row groups must be read.
+ * @throws IOException
+ */
+ protected boolean[] pickRowGroups() throws IOException {
+ // if we don't have a sarg or indexes, we read everything
+ if (sargApp == null) {
+ return null;
+ }
+ readRowIndex(currentStripe, included, sargApp.sargColumns);
+ return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, bloomFilterIndices, false);
+ }
+
+ private void clearStreams() {
+ // explicit close of all streams to de-ref ByteBuffers
+ for (InStream is : streams.values()) {
+ is.close();
+ }
+ if (bufferChunks != null) {
+ if (dataReader.isTrackingDiskRanges()) {
+ for (DiskRangeList range = bufferChunks; range != null; range = range.next) {
+ if (!(range instanceof BufferChunk)) {
+ continue;
+ }
+ dataReader.releaseBuffer(((BufferChunk) range).getChunk());
+ }
+ }
+ }
+ bufferChunks = null;
+ streams.clear();
+ }
+
+ /**
+ * Read the current stripe into memory.
+ *
+ * @throws IOException
+ */
+ private void readStripe() throws IOException {
+ StripeInformation stripe = beginReadStripe();
+ includedRowGroups = pickRowGroups();
+
+ // move forward to the first unskipped row
+ if (includedRowGroups != null) {
+ while (rowInStripe < rowCountInStripe &&
+ !includedRowGroups[(int) (rowInStripe / rowIndexStride)]) {
+ rowInStripe = Math.min(rowCountInStripe, rowInStripe + rowIndexStride);
+ }
+ }
+
+ // if we haven't skipped the whole stripe, read the data
+ if (rowInStripe < rowCountInStripe) {
+ // if we aren't projecting columns or filtering rows, just read it all
+ if (included == null && includedRowGroups == null) {
+ readAllDataStreams(stripe);
+ } else {
+ readPartialDataStreams(stripe);
+ }
+ reader.startStripe(streams, stripeFooter);
+ // if we skipped the first row group, move the pointers forward
+ if (rowInStripe != 0) {
+ seekToRowEntry(reader, (int) (rowInStripe / rowIndexStride));
+ }
+ }
+ }
+
+ private StripeInformation beginReadStripe() throws IOException {
+ StripeInformation stripe = stripes.get(currentStripe);
+ stripeFooter = readStripeFooter(stripe);
+ clearStreams();
+ // setup the position in the stripe
+ rowCountInStripe = stripe.getNumberOfRows();
+ rowInStripe = 0;
+ rowBaseInStripe = 0;
+ for (int i = 0; i < currentStripe; ++i) {
+ rowBaseInStripe += stripes.get(i).getNumberOfRows();
+ }
+ // reset all of the indexes
+ for (int i = 0; i < indexes.length; ++i) {
+ indexes[i] = null;
+ }
+ return stripe;
+ }
+
+ private void readAllDataStreams(StripeInformation stripe) throws IOException {
+ long start = stripe.getIndexLength();
+ long end = start + stripe.getDataLength();
+ // explicitly trigger 1 big read
+ DiskRangeList toRead = new DiskRangeList(start, end);
+ bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
+ List<OrcProto.Stream> streamDescriptions = stripeFooter.getStreamsList();
+ createStreams(streamDescriptions, bufferChunks, null, codec, bufferSize, streams);
+ }
+
+ /**
+ * Plan the ranges of the file that we need to read given the list of
+ * columns and row groups.
+ *
+ * @param streamList the list of streams available
+ * @param indexes the indexes that have been loaded
+ * @param includedColumns which columns are needed
+ * @param includedRowGroups which row groups are needed
+ * @param isCompressed does the file have generic compression
+ * @param encodings the encodings for each column
+ * @param types the types of the columns
+ * @param compressionSize the compression block size
+ * @return the list of disk ranges that will be loaded
+ */
+ static DiskRangeList planReadPartialDataStreams
+ (List<OrcProto.Stream> streamList,
+ OrcProto.RowIndex[] indexes,
+ boolean[] includedColumns,
+ boolean[] includedRowGroups,
+ boolean isCompressed,
+ List<OrcProto.ColumnEncoding> encodings,
+ List<OrcProto.Type> types,
+ int compressionSize,
+ boolean doMergeBuffers) {
+ long offset = 0;
+ // figure out which columns have a present stream
+ boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types);
+ CreateHelper list = new CreateHelper();
+ for (OrcProto.Stream stream : streamList) {
+ long length = stream.getLength();
+ int column = stream.getColumn();
+ OrcProto.Stream.Kind streamKind = stream.getKind();
+ // since stream kind is optional, first check if it exists
+ if (stream.hasKind() &&
+ (StreamName.getArea(streamKind) == StreamName.Area.DATA) &&
+ (column < includedColumns.length && includedColumns[column])) {
+ // if we aren't filtering or it is a dictionary, load it.
+ if (includedRowGroups == null
+ || RecordReaderUtils.isDictionary(streamKind, encodings.get(column))) {
+ RecordReaderUtils.addEntireStreamToRanges(offset, length, list, doMergeBuffers);
+ } else {
+ RecordReaderUtils.addRgFilteredStreamToRanges(stream, includedRowGroups,
+ isCompressed, indexes[column], encodings.get(column), types.get(column),
+ compressionSize, hasNull[column], offset, length, list, doMergeBuffers);
+ }
+ }
+ offset += length;
+ }
+ return list.extract();
+ }
+
+ void createStreams(List<OrcProto.Stream> streamDescriptions,
+ DiskRangeList ranges,
+ boolean[] includeColumn,
+ CompressionCodec codec,
+ int bufferSize,
+ Map<StreamName, InStream> streams) throws IOException {
+ long streamOffset = 0;
+ for (OrcProto.Stream streamDesc : streamDescriptions) {
+ int column = streamDesc.getColumn();
+ if ((includeColumn != null &&
+ (column < included.length && !includeColumn[column])) ||
+ streamDesc.hasKind() &&
+ (StreamName.getArea(streamDesc.getKind()) != StreamName.Area.DATA)) {
+ streamOffset += streamDesc.getLength();
+ continue;
+ }
+ List<DiskRange> buffers = RecordReaderUtils.getStreamBuffers(
+ ranges, streamOffset, streamDesc.getLength());
+ StreamName name = new StreamName(column, streamDesc.getKind());
+ streams.put(name, InStream.create(name.toString(), buffers,
+ streamDesc.getLength(), codec, bufferSize));
+ streamOffset += streamDesc.getLength();
+ }
+ }
+
+ private void readPartialDataStreams(StripeInformation stripe) throws IOException {
+ List<OrcProto.Stream> streamList = stripeFooter.getStreamsList();
+ DiskRangeList toRead = planReadPartialDataStreams(streamList,
+ indexes, included, includedRowGroups, codec != null,
+ stripeFooter.getColumnsList(), types, bufferSize, true);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("chunks = " + RecordReaderUtils.stringifyDiskRanges(toRead));
+ }
+ bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("merge = " + RecordReaderUtils.stringifyDiskRanges(bufferChunks));
+ }
+
+ createStreams(streamList, bufferChunks, included, codec, bufferSize, streams);
+ }
+
+ /**
+ * Read the next stripe until we find a row that we don't skip.
+ *
+ * @throws IOException
+ */
+ private void advanceStripe() throws IOException {
+ rowInStripe = rowCountInStripe;
+ while (rowInStripe >= rowCountInStripe &&
+ currentStripe < stripes.size() - 1) {
+ currentStripe += 1;
+ readStripe();
+ }
+ }
+
+ /**
+ * Skip over rows that we aren't selecting, so that the next row is
+ * one that we will read.
+ *
+ * @param nextRow the row we want to go to
+ * @throws IOException
+ */
+ private boolean advanceToNextRow(
+ TreeReaderFactory.TreeReader reader, long nextRow, boolean canAdvanceStripe)
+ throws IOException {
+ long nextRowInStripe = nextRow - rowBaseInStripe;
+ // check for row skipping
+ if (rowIndexStride != 0 &&
+ includedRowGroups != null &&
+ nextRowInStripe < rowCountInStripe) {
+ int rowGroup = (int) (nextRowInStripe / rowIndexStride);
+ if (!includedRowGroups[rowGroup]) {
+ while (rowGroup < includedRowGroups.length && !includedRowGroups[rowGroup]) {
+ rowGroup += 1;
+ }
+ if (rowGroup >= includedRowGroups.length) {
+ if (canAdvanceStripe) {
+ advanceStripe();
+ }
+ return canAdvanceStripe;
+ }
+ nextRowInStripe = Math.min(rowCountInStripe, rowGroup * rowIndexStride);
+ }
+ }
+ if (nextRowInStripe >= rowCountInStripe) {
+ if (canAdvanceStripe) {
+ advanceStripe();
+ }
+ return canAdvanceStripe;
+ }
+ if (nextRowInStripe != rowInStripe) {
+ if (rowIndexStride != 0) {
+ int rowGroup = (int) (nextRowInStripe / rowIndexStride);
+ seekToRowEntry(reader, rowGroup);
+ reader.skipRows(nextRowInStripe - rowGroup * rowIndexStride);
+ } else {
+ reader.skipRows(nextRowInStripe - rowInStripe);
+ }
+ rowInStripe = nextRowInStripe;
+ }
+ return true;
+ }
+
+ @Override
+ public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+ try {
+ if (rowInStripe >= rowCountInStripe) {
+ currentStripe += 1;
+ if (currentStripe >= stripes.size()) {
+ batch.size = 0;
+ return false;
+ }
+ readStripe();
+ }
+
+ int batchSize = computeBatchSize(batch.getMaxSize());
+
+ rowInStripe += batchSize;
+ reader.setVectorColumnCount(batch.getDataColumnCount());
+ reader.nextBatch(batch, batchSize);
+ batch.selectedInUse = false;
+ batch.size = batchSize;
+ advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true);
+ return batch.size != 0;
+ } catch (IOException e) {
+ // Rethrow exception with file name in log message
+ throw new IOException("Error reading file: " + path, e);
+ }
+ }
+
+ private int computeBatchSize(long targetBatchSize) {
+ final int batchSize;
+ // In case of PPD, the batch size should be aware of row group boundaries. If only a subset of row
+ // groups is selected, the marker position is set to the end of that range (the subset of row groups
+ // within the stripe). Computing the batch size from the marker position ensures that it respects
+ // row group boundaries and will not overflow when reading rows.
+ // An illustration of this case is at https://issues.apache.org/jira/browse/HIVE-6287
+ if (rowIndexStride != 0 && includedRowGroups != null && rowInStripe < rowCountInStripe) {
+ int startRowGroup = (int) (rowInStripe / rowIndexStride);
+ if (!includedRowGroups[startRowGroup]) {
+ while (startRowGroup < includedRowGroups.length && !includedRowGroups[startRowGroup]) {
+ startRowGroup += 1;
+ }
+ }
+
+ int endRowGroup = startRowGroup;
+ while (endRowGroup < includedRowGroups.length && includedRowGroups[endRowGroup]) {
+ endRowGroup += 1;
+ }
+
+ final long markerPosition =
+ (endRowGroup * rowIndexStride) < rowCountInStripe ? (endRowGroup * rowIndexStride)
+ : rowCountInStripe;
+ batchSize = (int) Math.min(targetBatchSize, (markerPosition - rowInStripe));
+
+ if (isLogDebugEnabled && batchSize < targetBatchSize) {
+ LOG.debug("markerPosition: " + markerPosition + " batchSize: " + batchSize);
+ }
+ } else {
+ batchSize = (int) Math.min(targetBatchSize, (rowCountInStripe - rowInStripe));
+ }
+ return batchSize;
+ }
+
+ @Override
+ public void close() throws IOException {
+ clearStreams();
+ dataReader.close();
+ }
+
+ @Override
+ public long getRowNumber() {
+ return rowInStripe + rowBaseInStripe + firstRow;
+ }
+
+ /**
+ * Return the fraction of rows that have been read from the selected
+ * section of the file.
+ *
+ * @return fraction between 0.0 and 1.0 of rows consumed
+ */
+ @Override
+ public float getProgress() {
+ return ((float) rowBaseInStripe + rowInStripe) / totalRowCount;
+ }
+
+ private int findStripe(long rowNumber) {
+ for (int i = 0; i < stripes.size(); i++) {
+ StripeInformation stripe = stripes.get(i);
+ if (stripe.getNumberOfRows() > rowNumber) {
+ return i;
+ }
+ rowNumber -= stripe.getNumberOfRows();
+ }
+ throw new IllegalArgumentException("Seek after the end of reader range");
+ }
+
+ public OrcIndex readRowIndex(int stripeIndex, boolean[] included,
+ boolean[] sargColumns) throws IOException {
+ return readRowIndex(stripeIndex, included, null, null, sargColumns);
+ }
+
+ public OrcIndex readRowIndex(int stripeIndex, boolean[] included,
+ OrcProto.RowIndex[] indexes,
+ OrcProto.BloomFilterIndex[] bloomFilterIndex,
+ boolean[] sargColumns) throws IOException {
+ StripeInformation stripe = stripes.get(stripeIndex);
+ OrcProto.StripeFooter stripeFooter = null;
+ // if this is the current stripe, use the cached objects.
+ if (stripeIndex == currentStripe) {
+ stripeFooter = this.stripeFooter;
+ indexes = indexes == null ? this.indexes : indexes;
+ bloomFilterIndex = bloomFilterIndex == null ? this.bloomFilterIndices : bloomFilterIndex;
+ sargColumns = sargColumns == null ?
+ (sargApp == null ? null : sargApp.sargColumns) : sargColumns;
+ }
+ return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, sargColumns,
+ bloomFilterIndex);
+ }
+
+ private void seekToRowEntry(TreeReaderFactory.TreeReader reader, int rowEntry)
+ throws IOException {
+ PositionProvider[] index = new PositionProvider[indexes.length];
+ for (int i = 0; i < indexes.length; ++i) {
+ if (indexes[i] != null) {
+ index[i] = new PositionProviderImpl(indexes[i].getEntry(rowEntry));
+ }
+ }
+ reader.seek(index);
+ }
+
+ @Override
+ public void seekToRow(long rowNumber) throws IOException {
+ if (rowNumber < 0) {
+ throw new IllegalArgumentException("Seek to a negative row number " +
+ rowNumber);
+ } else if (rowNumber < firstRow) {
+ throw new IllegalArgumentException("Seek before reader range " +
+ rowNumber);
+ }
+ // convert to our internal form (rows from the beginning of slice)
+ rowNumber -= firstRow;
+
+ // move to the right stripe
+ int rightStripe = findStripe(rowNumber);
+ if (rightStripe != currentStripe) {
+ currentStripe = rightStripe;
+ readStripe();
+ }
+ readRowIndex(currentStripe, included, sargApp == null ? null : sargApp.sargColumns);
+
+ // if we aren't to the right row yet, advance in the stripe.
+ advanceToNextRow(reader, rowNumber, true);
+ }
+
+ private static final String TRANSLATED_SARG_SEPARATOR = "_";
+ public static String encodeTranslatedSargColumn(int rootColumn, Integer indexInSourceTable) {
+ return rootColumn + TRANSLATED_SARG_SEPARATOR
+ + ((indexInSourceTable == null) ? -1 : indexInSourceTable);
+ }
+
+ public static int[] mapTranslatedSargColumns(
+ List<OrcProto.Type> types, List<PredicateLeaf> sargLeaves) {
+ int[] result = new int[sargLeaves.size()];
+ OrcProto.Type lastRoot = null; // Root will be the same for everyone as of now.
+ String lastRootStr = null;
+ for (int i = 0; i < result.length; ++i) {
+ String[] rootAndIndex = sargLeaves.get(i).getColumnName().split(TRANSLATED_SARG_SEPARATOR);
+ assert rootAndIndex.length == 2;
+ String rootStr = rootAndIndex[0], indexStr = rootAndIndex[1];
+ int index = Integer.parseInt(indexStr);
+ // First, check if the column even maps to anything.
+ if (index == -1) {
+ result[i] = -1;
+ continue;
+ }
+ assert index >= 0;
+ // Then, find the root type if needed.
+ if (!rootStr.equals(lastRootStr)) {
+ lastRoot = types.get(Integer.parseInt(rootStr));
+ lastRootStr = rootStr;
+ }
+ // Subtypes of the root types correspond, in order, to the columns in the table schema
+ // (disregarding schema evolution that doesn't presently work). Get the index for the
+ // corresponding subtype.
+ result[i] = lastRoot.getSubtypes(index);
+ }
+ return result;
+ }
+}
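As a rough usage sketch (not part of this patch), callers do not construct RecordReaderImpl directly; they obtain a RecordReader from the Reader API and consume rows in vectorized batches. The class names assume the org.apache.hive.orc package layout introduced by this change, and the input path is hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hive.orc.OrcFile;
import org.apache.hive.orc.Reader;
import org.apache.hive.orc.RecordReader;

public class ReadOrcSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hypothetical input path, for illustration only.
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));
    RecordReader rows = reader.rows();                  // a RecordReaderImpl under the hood
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    long total = 0;
    while (rows.nextBatch(batch)) {                     // fills up to batch.getMaxSize() rows
      total += batch.size;
    }
    rows.close();
    System.out.println("rows read: " + total);
  }
}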
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/RecordReaderUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/RecordReaderUtils.java b/orc/src/java/org/apache/hive/orc/impl/RecordReaderUtils.java
new file mode 100644
index 0000000..16af69d
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/RecordReaderUtils.java
@@ -0,0 +1,578 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import com.google.common.collect.Lists;
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
+import org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper;
+import org.apache.hive.orc.CompressionCodec;
+import org.apache.hive.orc.DataReader;
+import org.apache.hive.orc.OrcProto;
+import org.apache.hive.orc.StripeInformation;
+
+import com.google.common.collect.ComparisonChain;
+
+/**
+ * Stateless methods shared between RecordReaderImpl and EncodedReaderImpl.
+ */
+public class RecordReaderUtils {
+ private static final HadoopShims SHIMS = HadoopShims.Factory.get();
+
+ private static class DefaultDataReader implements DataReader {
+ private FSDataInputStream file = null;
+ private final ByteBufferAllocatorPool pool;
+ private HadoopShims.ZeroCopyReaderShim zcr = null;
+ private final FileSystem fs;
+ private final Path path;
+ private final boolean useZeroCopy;
+ private final CompressionCodec codec;
+ private final int bufferSize;
+ private final int typeCount;
+
+ private DefaultDataReader(DefaultDataReader other) {
+ this.pool = other.pool;
+ this.bufferSize = other.bufferSize;
+ this.typeCount = other.typeCount;
+ this.fs = other.fs;
+ this.path = other.path;
+ this.useZeroCopy = other.useZeroCopy;
+ this.codec = other.codec;
+ }
+
+ private DefaultDataReader(DataReaderProperties properties) {
+ this.fs = properties.getFileSystem();
+ this.path = properties.getPath();
+ this.useZeroCopy = properties.getZeroCopy();
+ this.codec = PhysicalFsWriter.createCodec(properties.getCompression());
+ this.bufferSize = properties.getBufferSize();
+ this.typeCount = properties.getTypeCount();
+ if (useZeroCopy) {
+ this.pool = new ByteBufferAllocatorPool();
+ } else {
+ this.pool = null;
+ }
+ }
+
+ @Override
+ public void open() throws IOException {
+ this.file = fs.open(path);
+ if (useZeroCopy) {
+ zcr = RecordReaderUtils.createZeroCopyShim(file, codec, pool);
+ } else {
+ zcr = null;
+ }
+ }
+
+ @Override
+ public OrcIndex readRowIndex(StripeInformation stripe,
+ OrcProto.StripeFooter footer,
+ boolean[] included,
+ OrcProto.RowIndex[] indexes,
+ boolean[] sargColumns,
+ OrcProto.BloomFilterIndex[] bloomFilterIndices
+ ) throws IOException {
+ if (file == null) {
+ open();
+ }
+ if (footer == null) {
+ footer = readStripeFooter(stripe);
+ }
+ if (indexes == null) {
+ indexes = new OrcProto.RowIndex[typeCount];
+ }
+ if (bloomFilterIndices == null) {
+ bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
+ }
+ long offset = stripe.getOffset();
+ List<OrcProto.Stream> streams = footer.getStreamsList();
+ for (int i = 0; i < streams.size(); i++) {
+ OrcProto.Stream stream = streams.get(i);
+ OrcProto.Stream nextStream = null;
+ if (i < streams.size() - 1) {
+ nextStream = streams.get(i+1);
+ }
+ int col = stream.getColumn();
+ int len = (int) stream.getLength();
+ // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
+ // filter and combine the io to read row index and bloom filters for that column together
+ if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
+ boolean readBloomFilter = false;
+ if (sargColumns != null && sargColumns[col] && nextStream != null &&
+ nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
+ len += nextStream.getLength();
+ i += 1;
+ readBloomFilter = true;
+ }
+ if ((included == null || included[col]) && indexes[col] == null) {
+ byte[] buffer = new byte[len];
+ file.readFully(offset, buffer, 0, buffer.length);
+ ByteBuffer bb = ByteBuffer.wrap(buffer);
+ indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
+ Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), stream.getLength(),
+ codec, bufferSize));
+ if (readBloomFilter) {
+ bb.position((int) stream.getLength());
+ bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
+ "bloom_filter", Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)),
+ nextStream.getLength(), codec, bufferSize));
+ }
+ }
+ }
+ offset += len;
+ }
+
+ OrcIndex index = new OrcIndex(indexes, bloomFilterIndices);
+ return index;
+ }
+
+ @Override
+ public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException {
+ if (file == null) {
+ open();
+ }
+ long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
+ int tailLength = (int) stripe.getFooterLength();
+
+ // read the footer
+ ByteBuffer tailBuf = ByteBuffer.allocate(tailLength);
+ file.readFully(offset, tailBuf.array(), tailBuf.arrayOffset(), tailLength);
+ return OrcProto.StripeFooter.parseFrom(InStream.createCodedInputStream("footer",
+ Lists.<DiskRange>newArrayList(new BufferChunk(tailBuf, 0)),
+ tailLength, codec, bufferSize));
+ }
+
+ @Override
+ public DiskRangeList readFileData(
+ DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException {
+ return RecordReaderUtils.readDiskRanges(file, zcr, baseOffset, range, doForceDirect);
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (pool != null) {
+ pool.clear();
+ }
+ // close both zcr and file
+ try (HadoopShims.ZeroCopyReaderShim myZcr = zcr) {
+ if (file != null) {
+ file.close();
+ }
+ }
+ }
+
+ @Override
+ public boolean isTrackingDiskRanges() {
+ return zcr != null;
+ }
+
+ @Override
+ public void releaseBuffer(ByteBuffer buffer) {
+ zcr.releaseBuffer(buffer);
+ }
+
+ @Override
+ public DataReader clone() {
+ return new DefaultDataReader(this);
+ }
+
+ }
+
+ public static DataReader createDefaultDataReader(DataReaderProperties properties) {
+ return new DefaultDataReader(properties);
+ }
+
+ public static boolean[] findPresentStreamsByColumn(
+ List<OrcProto.Stream> streamList, List<OrcProto.Type> types) {
+ boolean[] hasNull = new boolean[types.size()];
+ for(OrcProto.Stream stream: streamList) {
+ if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.PRESENT)) {
+ hasNull[stream.getColumn()] = true;
+ }
+ }
+ return hasNull;
+ }
+
+ /**
+ * Does region A overlap region B? The end points are inclusive on both sides.
+ * @param leftA A's left point
+ * @param rightA A's right point
+ * @param leftB B's left point
+ * @param rightB B's right point
+ * @return Does region A overlap region B?
+ */
+ static boolean overlap(long leftA, long rightA, long leftB, long rightB) {
+ if (leftA <= leftB) {
+ return rightA >= leftB;
+ }
+ return rightB >= leftA;
+ }
+
+ public static void addEntireStreamToRanges(
+ long offset, long length, CreateHelper list, boolean doMergeBuffers) {
+ list.addOrMerge(offset, offset + length, doMergeBuffers, false);
+ }
+
+ public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
+ boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index,
+ OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, boolean hasNull,
+ long offset, long length, CreateHelper list, boolean doMergeBuffers) {
+ for (int group = 0; group < includedRowGroups.length; ++group) {
+ if (!includedRowGroups[group]) continue;
+ int posn = getIndexPosition(
+ encoding.getKind(), type.getKind(), stream.getKind(), isCompressed, hasNull);
+ long start = index.getEntry(group).getPositions(posn);
+ final long nextGroupOffset;
+ boolean isLast = group == (includedRowGroups.length - 1);
+ nextGroupOffset = isLast ? length : index.getEntry(group + 1).getPositions(posn);
+
+ start += offset;
+ long end = offset + estimateRgEndOffset(
+ isCompressed, isLast, nextGroupOffset, length, compressionSize);
+ list.addOrMerge(start, end, doMergeBuffers, true);
+ }
+ }
+
+ public static long estimateRgEndOffset(boolean isCompressed, boolean isLast,
+ long nextGroupOffset, long streamLength, int bufferSize) {
+ // figure out the worst case last location
+ // if adjacent groups have the same compressed block offset then stretch the slop
+ // by factor of 2 to safely accommodate the next compression block.
+ // One for the current compression block and another for the next compression block.
+ long slop = isCompressed ? 2 * (OutStream.HEADER_SIZE + bufferSize) : WORST_UNCOMPRESSED_SLOP;
+ return isLast ? streamLength : Math.min(streamLength, nextGroupOffset + slop);
+ }
+
+ private static final int BYTE_STREAM_POSITIONS = 1;
+ private static final int RUN_LENGTH_BYTE_POSITIONS = BYTE_STREAM_POSITIONS + 1;
+ private static final int BITFIELD_POSITIONS = RUN_LENGTH_BYTE_POSITIONS + 1;
+ private static final int RUN_LENGTH_INT_POSITIONS = BYTE_STREAM_POSITIONS + 1;
+
+ /**
+ * Get the offset within a row index entry's position list at which the
+ * given stream's positions start.
+ * @param columnEncoding the encoding of the column
+ * @param columnType the type of the column
+ * @param streamType the kind of the stream
+ * @param isCompressed is the file compressed
+ * @param hasNulls does the column have a PRESENT stream?
+ * @return the offset within the index positions at which this stream's positions start
+ */
+ public static int getIndexPosition(OrcProto.ColumnEncoding.Kind columnEncoding,
+ OrcProto.Type.Kind columnType,
+ OrcProto.Stream.Kind streamType,
+ boolean isCompressed,
+ boolean hasNulls) {
+ if (streamType == OrcProto.Stream.Kind.PRESENT) {
+ return 0;
+ }
+ int compressionValue = isCompressed ? 1 : 0;
+ int base = hasNulls ? (BITFIELD_POSITIONS + compressionValue) : 0;
+ switch (columnType) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case FLOAT:
+ case DOUBLE:
+ case DATE:
+ case STRUCT:
+ case MAP:
+ case LIST:
+ case UNION:
+ return base;
+ case CHAR:
+ case VARCHAR:
+ case STRING:
+ if (columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+ columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ return base;
+ } else {
+ if (streamType == OrcProto.Stream.Kind.DATA) {
+ return base;
+ } else {
+ return base + BYTE_STREAM_POSITIONS + compressionValue;
+ }
+ }
+ case BINARY:
+ if (streamType == OrcProto.Stream.Kind.DATA) {
+ return base;
+ }
+ return base + BYTE_STREAM_POSITIONS + compressionValue;
+ case DECIMAL:
+ if (streamType == OrcProto.Stream.Kind.DATA) {
+ return base;
+ }
+ return base + BYTE_STREAM_POSITIONS + compressionValue;
+ case TIMESTAMP:
+ if (streamType == OrcProto.Stream.Kind.DATA) {
+ return base;
+ }
+ return base + RUN_LENGTH_INT_POSITIONS + compressionValue;
+ default:
+ throw new IllegalArgumentException("Unknown type " + columnType);
+ }
+ }
+
+ // for uncompressed streams, the worst-case overlap with the following set
+ // of rows (a long vint literal group).
+ static final int WORST_UNCOMPRESSED_SLOP = 2 + 8 * 512;
+
+ /**
+ * Is this stream part of a dictionary?
+ * @return is this part of a dictionary?
+ */
+ public static boolean isDictionary(OrcProto.Stream.Kind kind,
+ OrcProto.ColumnEncoding encoding) {
+ assert kind != OrcProto.Stream.Kind.DICTIONARY_COUNT;
+ OrcProto.ColumnEncoding.Kind encodingKind = encoding.getKind();
+ return kind == OrcProto.Stream.Kind.DICTIONARY_DATA ||
+ (kind == OrcProto.Stream.Kind.LENGTH &&
+ (encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+ encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2));
+ }
+
+ /**
+ * Build a string representation of a list of disk ranges.
+ * @param range ranges to stringify
+ * @return the resulting string
+ */
+ public static String stringifyDiskRanges(DiskRangeList range) {
+ StringBuilder buffer = new StringBuilder();
+ buffer.append("[");
+ boolean isFirst = true;
+ while (range != null) {
+ if (!isFirst) {
+ buffer.append(", {");
+ } else {
+ buffer.append("{");
+ }
+ isFirst = false;
+ buffer.append(range.toString());
+ buffer.append("}");
+ range = range.next;
+ }
+ buffer.append("]");
+ return buffer.toString();
+ }
+
+ /**
+ * Read the list of ranges from the file.
+ * @param file the file to read
+ * @param base the base of the stripe
+ * @param range the disk ranges within the stripe to read
+ * @return the bytes read for each disk range, which is the same length as
+ * ranges
+ * @throws IOException
+ */
+ static DiskRangeList readDiskRanges(FSDataInputStream file,
+ HadoopShims.ZeroCopyReaderShim zcr,
+ long base,
+ DiskRangeList range,
+ boolean doForceDirect) throws IOException {
+ if (range == null) return null;
+ DiskRangeList prev = range.prev;
+ if (prev == null) {
+ prev = new MutateHelper(range);
+ }
+ while (range != null) {
+ if (range.hasData()) {
+ range = range.next;
+ continue;
+ }
+ int len = (int) (range.getEnd() - range.getOffset());
+ long off = range.getOffset();
+ if (zcr != null) {
+ file.seek(base + off);
+ boolean hasReplaced = false;
+ while (len > 0) {
+ ByteBuffer partial = zcr.readBuffer(len, false);
+ BufferChunk bc = new BufferChunk(partial, off);
+ if (!hasReplaced) {
+ range.replaceSelfWith(bc);
+ hasReplaced = true;
+ } else {
+ range.insertAfter(bc);
+ }
+ range = bc;
+ int read = partial.remaining();
+ len -= read;
+ off += read;
+ }
+ } else {
+ // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
+ byte[] buffer = new byte[len];
+ file.readFully((base + off), buffer, 0, buffer.length);
+ ByteBuffer bb = null;
+ if (doForceDirect) {
+ bb = ByteBuffer.allocateDirect(len);
+ bb.put(buffer);
+ bb.position(0);
+ bb.limit(len);
+ } else {
+ bb = ByteBuffer.wrap(buffer);
+ }
+ range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
+ }
+ range = range.next;
+ }
+ return prev.next;
+ }
+
+
+ static List<DiskRange> getStreamBuffers(DiskRangeList range, long offset, long length) {
+ // This assumes sorted ranges (as do many other parts of the ORC code).
+ ArrayList<DiskRange> buffers = new ArrayList<DiskRange>();
+ if (length == 0) return buffers;
+ long streamEnd = offset + length;
+ boolean inRange = false;
+ while (range != null) {
+ if (!inRange) {
+ if (range.getEnd() <= offset) {
+ range = range.next;
+ continue; // Skip until we are in range.
+ }
+ inRange = true;
+ if (range.getOffset() < offset) {
+ // Partial first buffer, add a slice of it.
+ buffers.add(range.sliceAndShift(offset, Math.min(streamEnd, range.getEnd()), -offset));
+ if (range.getEnd() >= streamEnd) break; // Partial first buffer is also partial last buffer.
+ range = range.next;
+ continue;
+ }
+ } else if (range.getOffset() >= streamEnd) {
+ break;
+ }
+ if (range.getEnd() > streamEnd) {
+ // Partial last buffer (may also be the first buffer), add a slice of it.
+ buffers.add(range.sliceAndShift(range.getOffset(), streamEnd, -offset));
+ break;
+ }
+ // Buffer that belongs entirely to one stream.
+ // TODO: ideally we would want to reuse the object and remove it from the list, but we cannot
+ // because bufferChunks is also used by clearStreams for zcr. Create a useless dup.
+ buffers.add(range.sliceAndShift(range.getOffset(), range.getEnd(), -offset));
+ if (range.getEnd() == streamEnd) break;
+ range = range.next;
+ }
+ return buffers;
+ }
+
+ static HadoopShims.ZeroCopyReaderShim createZeroCopyShim(FSDataInputStream file,
+ CompressionCodec codec, ByteBufferAllocatorPool pool) throws IOException {
+ if ((codec == null || ((codec instanceof DirectDecompressionCodec)
+ && ((DirectDecompressionCodec) codec).isAvailable()))) {
+ /* codec is null or is available */
+ return SHIMS.getZeroCopyReader(file, pool);
+ }
+ return null;
+ }
+
+ // this is an implementation copied from ElasticByteBufferPool in hadoop-2,
+ // which lacks a clear()/clean() operation
+ public final static class ByteBufferAllocatorPool implements HadoopShims.ByteBufferPoolShim {
+ private static final class Key implements Comparable<Key> {
+ private final int capacity;
+ private final long insertionGeneration;
+
+ Key(int capacity, long insertionGeneration) {
+ this.capacity = capacity;
+ this.insertionGeneration = insertionGeneration;
+ }
+
+ @Override
+ public int compareTo(Key other) {
+ return ComparisonChain.start().compare(capacity, other.capacity)
+ .compare(insertionGeneration, other.insertionGeneration).result();
+ }
+
+ @Override
+ public boolean equals(Object rhs) {
+ if (rhs == null) {
+ return false;
+ }
+ try {
+ Key o = (Key) rhs;
+ return (compareTo(o) == 0);
+ } catch (ClassCastException e) {
+ return false;
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return new HashCodeBuilder().append(capacity).append(insertionGeneration)
+ .toHashCode();
+ }
+ }
+
+ private final TreeMap<Key, ByteBuffer> buffers = new TreeMap<Key, ByteBuffer>();
+
+ private final TreeMap<Key, ByteBuffer> directBuffers = new TreeMap<Key, ByteBuffer>();
+
+ private long currentGeneration = 0;
+
+ private final TreeMap<Key, ByteBuffer> getBufferTree(boolean direct) {
+ return direct ? directBuffers : buffers;
+ }
+
+ public void clear() {
+ buffers.clear();
+ directBuffers.clear();
+ }
+
+ @Override
+ public ByteBuffer getBuffer(boolean direct, int length) {
+ TreeMap<Key, ByteBuffer> tree = getBufferTree(direct);
+ Map.Entry<Key, ByteBuffer> entry = tree.ceilingEntry(new Key(length, 0));
+ if (entry == null) {
+ return direct ? ByteBuffer.allocateDirect(length) : ByteBuffer
+ .allocate(length);
+ }
+ tree.remove(entry.getKey());
+ return entry.getValue();
+ }
+
+ @Override
+ public void putBuffer(ByteBuffer buffer) {
+ TreeMap<Key, ByteBuffer> tree = getBufferTree(buffer.isDirect());
+ while (true) {
+ Key key = new Key(buffer.capacity(), currentGeneration++);
+ if (!tree.containsKey(key)) {
+ tree.put(key, buffer);
+ return;
+ }
+ // Buffers are indexed by (capacity, generation).
+ // If our key is not unique on the first try, we try again
+ }
+ }
+ }
+}
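
A quick usage sketch of the ByteBufferAllocatorPool above may help: getBuffer() hands back the
smallest pooled buffer whose capacity is at least the requested length (via ceilingEntry), or
allocates a fresh one, and putBuffer() returns a buffer to the pool keyed by (capacity, insertion
generation). The sketch below is illustrative only; the demo class name is made up, and because the
pool is a nested class its import path depends on the enclosing class, which is elided here.

import java.nio.ByteBuffer;

public class PoolDemo {
  public static void main(String[] args) {
    // Assumes the ByteBufferAllocatorPool defined above is visible in scope.
    ByteBufferAllocatorPool pool = new ByteBufferAllocatorPool();
    ByteBuffer first = pool.getBuffer(false, 1024);  // pool is empty -> fresh heap allocation
    pool.putBuffer(first);                           // returned under key (1024, generation)
    ByteBuffer again = pool.getBuffer(false, 512);   // ceilingEntry(512) finds the 1024 buffer
    System.out.println(again == first);              // true: the pooled buffer is reused
    pool.clear();                                    // drop everything still held by the pool
  }
}
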
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/RedBlackTree.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/RedBlackTree.java b/orc/src/java/org/apache/hive/orc/impl/RedBlackTree.java
new file mode 100644
index 0000000..a340c50
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/RedBlackTree.java
@@ -0,0 +1,309 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+/**
+ * A memory efficient red-black tree that does not allocate any objects per
+ * element. This class is abstract and assumes that the child class
+ * handles the key and comparisons with the key.
+ */
+abstract class RedBlackTree {
+ public static final int NULL = -1;
+
+ // Various values controlling the offset of the data within the array.
+ private static final int LEFT_OFFSET = 0;
+ private static final int RIGHT_OFFSET = 1;
+ private static final int ELEMENT_SIZE = 2;
+
+ protected int size = 0;
+ private final DynamicIntArray data;
+ protected int root = NULL;
+ protected int lastAdd = 0;
+ private boolean wasAdd = false;
+
+ /**
+ * Create a set with the given initial capacity.
+ */
+ public RedBlackTree(int initialCapacity) {
+ data = new DynamicIntArray(initialCapacity * ELEMENT_SIZE);
+ }
+
+ /**
+ * Insert a new node into the data array, growing the array as necessary.
+ *
+ * @return Returns the position of the new node.
+ */
+ private int insert(int left, int right, boolean isRed) {
+ int position = size;
+ size += 1;
+ setLeft(position, left, isRed);
+ setRight(position, right);
+ return position;
+ }
+
+ /**
+ * Compare the value at the given position to the new value.
+ * @return 0 if the values are the same, -1 if the new value is smaller and
+ * 1 if the new value is larger.
+ */
+ protected abstract int compareValue(int position);
+
+ /**
+ * Is the given node red as opposed to black? To prevent having an extra word
+ * in the data array, we just use the low bit of the left child index.
+ */
+ protected boolean isRed(int position) {
+ return position != NULL &&
+ (data.get(position * ELEMENT_SIZE + LEFT_OFFSET) & 1) == 1;
+ }
+
+ /**
+ * Set the red bit true or false.
+ */
+ private void setRed(int position, boolean isRed) {
+ int offset = position * ELEMENT_SIZE + LEFT_OFFSET;
+ if (isRed) {
+ data.set(offset, data.get(offset) | 1);
+ } else {
+ data.set(offset, data.get(offset) & ~1);
+ }
+ }
+
+ /**
+ * Get the left field of the given position.
+ */
+ protected int getLeft(int position) {
+ return data.get(position * ELEMENT_SIZE + LEFT_OFFSET) >> 1;
+ }
+
+ /**
+ * Get the right field of the given position.
+ */
+ protected int getRight(int position) {
+ return data.get(position * ELEMENT_SIZE + RIGHT_OFFSET);
+ }
+
+ /**
+ * Set the left field of the given position.
+ * Note that we are storing the node color in the low bit of the left pointer.
+ */
+ private void setLeft(int position, int left) {
+ int offset = position * ELEMENT_SIZE + LEFT_OFFSET;
+ data.set(offset, (left << 1) | (data.get(offset) & 1));
+ }
+
+ /**
+ * Set the left field of the given position.
+ * Note that we are storing the node color in the low bit of the left pointer.
+ */
+ private void setLeft(int position, int left, boolean isRed) {
+ int offset = position * ELEMENT_SIZE + LEFT_OFFSET;
+ data.set(offset, (left << 1) | (isRed ? 1 : 0));
+ }
+
+ /**
+ * Set the right field of the given position.
+ */
+ private void setRight(int position, int right) {
+ data.set(position * ELEMENT_SIZE + RIGHT_OFFSET, right);
+ }
+
+ /**
+ * Insert or find a given key in the tree and rebalance the tree correctly.
+ * Rebalancing restores the red-black aspect of the tree to maintain the
+ * invariants:
+ * 1. If a node is red, both of its children are black.
+ * 2. Each child of a node has the same black height (the number of black
+ * nodes between it and the leaves of the tree).
+ *
+ * Inserted nodes are at the leaves and are red, therefore there is at most a
+ * violation of rule 1 at the node we just put in. Instead of always keeping
+ * the parents, this routine passes the context down.
+ *
+ * The fix is broken down into 6 cases (1.{1,2,3} and 2.{1,2,3} that are
+ * left-right mirror images of each other). See Algorithms by Cormen,
+ * Leiserson, and Rivest for the explanation of the subcases.
+ *
+ * @param node The node that we are fixing right now.
+ * @param fromLeft Did we come down from the left?
+ * @param parent Node's parent
+ * @param grandparent Parent's parent
+ * @param greatGrandparent Grandparent's parent
+ * @return Does parent also need to be checked and/or fixed?
+ */
+ private boolean add(int node, boolean fromLeft, int parent,
+ int grandparent, int greatGrandparent) {
+ if (node == NULL) {
+ if (root == NULL) {
+ lastAdd = insert(NULL, NULL, false);
+ root = lastAdd;
+ wasAdd = true;
+ return false;
+ } else {
+ lastAdd = insert(NULL, NULL, true);
+ node = lastAdd;
+ wasAdd = true;
+ // connect the new node into the tree
+ if (fromLeft) {
+ setLeft(parent, node);
+ } else {
+ setRight(parent, node);
+ }
+ }
+ } else {
+ int compare = compareValue(node);
+ boolean keepGoing;
+
+ // Recurse down to find where the node needs to be added
+ if (compare < 0) {
+ keepGoing = add(getLeft(node), true, node, parent, grandparent);
+ } else if (compare > 0) {
+ keepGoing = add(getRight(node), false, node, parent, grandparent);
+ } else {
+ lastAdd = node;
+ wasAdd = false;
+ return false;
+ }
+
+ // we don't need to fix the root (because it is always set to black)
+ if (node == root || !keepGoing) {
+ return false;
+ }
+ }
+
+
+ // Do we need to fix this node? Only if there are two reds right under each
+ // other.
+ if (isRed(node) && isRed(parent)) {
+ if (parent == getLeft(grandparent)) {
+ int uncle = getRight(grandparent);
+ if (isRed(uncle)) {
+ // case 1.1
+ setRed(parent, false);
+ setRed(uncle, false);
+ setRed(grandparent, true);
+ return true;
+ } else {
+ if (node == getRight(parent)) {
+ // case 1.2
+ // swap node and parent
+ int tmp = node;
+ node = parent;
+ parent = tmp;
+ // left-rotate on node
+ setLeft(grandparent, parent);
+ setRight(node, getLeft(parent));
+ setLeft(parent, node);
+ }
+
+ // case 1.2 and 1.3
+ setRed(parent, false);
+ setRed(grandparent, true);
+
+ // right-rotate on grandparent
+ if (greatGrandparent == NULL) {
+ root = parent;
+ } else if (getLeft(greatGrandparent) == grandparent) {
+ setLeft(greatGrandparent, parent);
+ } else {
+ setRight(greatGrandparent, parent);
+ }
+ setLeft(grandparent, getRight(parent));
+ setRight(parent, grandparent);
+ return false;
+ }
+ } else {
+ int uncle = getLeft(grandparent);
+ if (isRed(uncle)) {
+ // case 2.1
+ setRed(parent, false);
+ setRed(uncle, false);
+ setRed(grandparent, true);
+ return true;
+ } else {
+ if (node == getLeft(parent)) {
+ // case 2.2
+ // swap node and parent
+ int tmp = node;
+ node = parent;
+ parent = tmp;
+ // right-rotate on node
+ setRight(grandparent, parent);
+ setLeft(node, getRight(parent));
+ setRight(parent, node);
+ }
+ // case 2.2 and 2.3
+ setRed(parent, false);
+ setRed(grandparent, true);
+ // left-rotate on grandparent
+ if (greatGrandparent == NULL) {
+ root = parent;
+ } else if (getRight(greatGrandparent) == grandparent) {
+ setRight(greatGrandparent, parent);
+ } else {
+ setLeft(greatGrandparent, parent);
+ }
+ setRight(grandparent, getLeft(parent));
+ setLeft(parent, grandparent);
+ return false;
+ }
+ }
+ } else {
+ return true;
+ }
+ }
+
+ /**
+ * Add the new key to the tree.
+ * @return true if the element is a new one.
+ */
+ protected boolean add() {
+ add(root, false, NULL, NULL, NULL);
+ if (wasAdd) {
+ setRed(root, false);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Get the number of elements in the set.
+ */
+ public int size() {
+ return size;
+ }
+
+ /**
+ * Reset the table to empty.
+ */
+ public void clear() {
+ root = NULL;
+ size = 0;
+ data.clear();
+ }
+
+ /**
+ * Get the buffer size in bytes.
+ */
+ public long getSizeInBytes() {
+ return data.getSizeInBytes();
+ }
+}
+
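
As the class comment above says, a concrete subclass supplies the keys and the comparisons while
RedBlackTree itself only stores the left/right/color structure in a flat int array. A minimal
sketch of such a subclass for int keys follows; the name IntRedBlackTree is hypothetical, it would
live in the same package (RedBlackTree is package-private), and it assumes the DynamicIntArray
class from this patch.

// Hypothetical subclass, for illustration only.
class IntRedBlackTree extends RedBlackTree {
  private final DynamicIntArray keys;  // the key for each node, indexed by node position
  private int newKey;                  // the key currently being added or looked up

  IntRedBlackTree(int initialCapacity) {
    super(initialCapacity);
    keys = new DynamicIntArray(initialCapacity);
  }

  @Override
  protected int compareValue(int position) {
    // compare the key stored at 'position' with the key being inserted
    int other = keys.get(position);
    return newKey == other ? 0 : (newKey < other ? -1 : 1);
  }

  /** Add a key to the set; returns true if it was not already present. */
  public boolean add(int key) {
    newKey = key;
    boolean isNew = super.add();   // RedBlackTree leaves the node's position in lastAdd
    if (isNew) {
      keys.set(lastAdd, key);      // remember the key for the newly created node
    }
    return isNew;
  }
}
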
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/RunLengthByteReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/RunLengthByteReader.java b/orc/src/java/org/apache/hive/orc/impl/RunLengthByteReader.java
new file mode 100644
index 0000000..1dd5dab
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/RunLengthByteReader.java
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+
+/**
+ * A reader that reads a sequence of bytes. A control byte is read before
+ * each run with positive values 0 to 127 meaning 3 to 130 repetitions. If the
+ * byte is -1 to -128, 1 to 128 literal byte values follow.
+ */
+public class RunLengthByteReader {
+ private InStream input;
+ private final byte[] literals =
+ new byte[RunLengthByteWriter.MAX_LITERAL_SIZE];
+ private int numLiterals = 0;
+ private int used = 0;
+ private boolean repeat = false;
+
+ public RunLengthByteReader(InStream input) throws IOException {
+ this.input = input;
+ }
+
+ public void setInStream(InStream input) {
+ this.input = input;
+ }
+
+ private void readValues(boolean ignoreEof) throws IOException {
+ int control = input.read();
+ used = 0;
+ if (control == -1) {
+ if (!ignoreEof) {
+ throw new EOFException("Read past end of buffer RLE byte from " + input);
+ }
+ used = numLiterals = 0;
+ return;
+ } else if (control < 0x80) {
+ repeat = true;
+ numLiterals = control + RunLengthByteWriter.MIN_REPEAT_SIZE;
+ int val = input.read();
+ if (val == -1) {
+ throw new EOFException("Reading RLE byte got EOF");
+ }
+ literals[0] = (byte) val;
+ } else {
+ repeat = false;
+ numLiterals = 0x100 - control;
+ int bytes = 0;
+ while (bytes < numLiterals) {
+ int result = input.read(literals, bytes, numLiterals - bytes);
+ if (result == -1) {
+ throw new EOFException("Reading RLE byte literal got EOF in " + this);
+ }
+ bytes += result;
+ }
+ }
+ }
+
+ public boolean hasNext() throws IOException {
+ return used != numLiterals || input.available() > 0;
+ }
+
+ public byte next() throws IOException {
+ byte result;
+ if (used == numLiterals) {
+ readValues(false);
+ }
+ if (repeat) {
+ result = literals[0];
+ } else {
+ result = literals[used];
+ }
+ ++used;
+ return result;
+ }
+
+ public void nextVector(ColumnVector previous, long[] data, long size)
+ throws IOException {
+ previous.isRepeating = true;
+ for (int i = 0; i < size; i++) {
+ if (!previous.isNull[i]) {
+ data[i] = next();
+ } else {
+ // The default value of null for int types in vectorized
+ // processing is 1, so set that if the value is null
+ data[i] = 1;
+ }
+
+ // The default value for nulls in vectorization for int types is 1, and since a
+ // non-null value can also be 1, we also need to check isNull when determining
+ // the isRepeating flag.
+ if (previous.isRepeating
+ && i > 0
+ && ((data[0] != data[i]) ||
+ (previous.isNull[0] != previous.isNull[i]))) {
+ previous.isRepeating = false;
+ }
+ }
+ }
+
+ /**
+ * Read the next size bytes into the data array, skipping over any slots
+ * where isNull is true.
+ * @param isNull if non-null, skip any rows where isNull[r] is true
+ * @param data the array to read into
+ * @param size the number of elements to read
+ * @throws IOException
+ */
+ public void nextVector(boolean[] isNull, int[] data,
+ long size) throws IOException {
+ if (isNull == null) {
+ for(int i=0; i < size; ++i) {
+ data[i] = next();
+ }
+ } else {
+ for(int i=0; i < size; ++i) {
+ if (!isNull[i]) {
+ data[i] = next();
+ }
+ }
+ }
+ }
+
+ public void seek(PositionProvider index) throws IOException {
+ input.seek(index);
+ int consumed = (int) index.getNext();
+ if (consumed != 0) {
+ // a loop is required for cases where we break the run into two parts
+ while (consumed > 0) {
+ readValues(false);
+ used = consumed;
+ consumed -= numLiterals;
+ }
+ } else {
+ used = 0;
+ numLiterals = 0;
+ }
+ }
+
+ public void skip(long items) throws IOException {
+ while (items > 0) {
+ if (used == numLiterals) {
+ readValues(false);
+ }
+ long consume = Math.min(items, numLiterals - used);
+ used += consume;
+ items -= consume;
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "byte rle " + (repeat ? "repeat" : "literal") + " used: " +
+ used + "/" + numLiterals + " from " + input;
+ }
+}
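
To make the control-byte scheme in the class comment concrete: a control byte below 0x80 encodes a
run of (control + 3) copies of the next byte, and a control byte of 0x80 or above encodes
(0x100 - control) literal bytes that follow. A tiny standalone decoder of the same scheme is
sketched below (illustration only; the class and method names here are made up).

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class ByteRleDemo {
  // Decode the byte-RLE scheme described above from a plain InputStream.
  static byte[] decode(InputStream in, int expected) throws IOException {
    byte[] out = new byte[expected];
    int filled = 0;
    int control;
    while (filled < expected && (control = in.read()) != -1) {
      if (control < 0x80) {                 // run: 3 to 130 repetitions of one byte
        int len = control + 3;
        byte value = (byte) in.read();
        for (int i = 0; i < len; i++) {
          out[filled++] = value;
        }
      } else {                              // literals: 1 to 128 raw bytes follow
        int len = 0x100 - control;
        for (int i = 0; i < len; i++) {
          out[filled++] = (byte) in.read();
        }
      }
    }
    return out;
  }

  public static void main(String[] args) throws IOException {
    // {0x02, 0x2A} decodes to five copies of 0x2A; {0xFD, 1, 2, 3} decodes to literals 1, 2, 3.
    byte[] encoded = {0x02, 0x2A, (byte) 0xFD, 1, 2, 3};
    for (byte b : decode(new ByteArrayInputStream(encoded), 8)) {
      System.out.print(b + " ");            // prints: 42 42 42 42 42 1 2 3
    }
  }
}
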
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/ZeroCopyShims.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/ZeroCopyShims.java b/orc/src/java/org/apache/hive/orc/impl/ZeroCopyShims.java
new file mode 100644
index 0000000..6322801
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/ZeroCopyShims.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.EnumSet;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.ReadOption;
+import org.apache.hadoop.io.ByteBufferPool;
+
+class ZeroCopyShims {
+ private static final class ByteBufferPoolAdapter implements ByteBufferPool {
+ private HadoopShims.ByteBufferPoolShim pool;
+
+ public ByteBufferPoolAdapter(HadoopShims.ByteBufferPoolShim pool) {
+ this.pool = pool;
+ }
+
+ @Override
+ public final ByteBuffer getBuffer(boolean direct, int length) {
+ return this.pool.getBuffer(direct, length);
+ }
+
+ @Override
+ public final void putBuffer(ByteBuffer buffer) {
+ this.pool.putBuffer(buffer);
+ }
+ }
+
+ private static final class ZeroCopyAdapter implements HadoopShims.ZeroCopyReaderShim {
+ private final FSDataInputStream in;
+ private final ByteBufferPoolAdapter pool;
+ private final static EnumSet<ReadOption> CHECK_SUM = EnumSet
+ .noneOf(ReadOption.class);
+ private final static EnumSet<ReadOption> NO_CHECK_SUM = EnumSet
+ .of(ReadOption.SKIP_CHECKSUMS);
+
+ public ZeroCopyAdapter(FSDataInputStream in,
+ HadoopShims.ByteBufferPoolShim poolshim) {
+ this.in = in;
+ if (poolshim != null) {
+ pool = new ByteBufferPoolAdapter(poolshim);
+ } else {
+ pool = null;
+ }
+ }
+
+ public final ByteBuffer readBuffer(int maxLength, boolean verifyChecksums)
+ throws IOException {
+ EnumSet<ReadOption> options = NO_CHECK_SUM;
+ if (verifyChecksums) {
+ options = CHECK_SUM;
+ }
+ return this.in.read(this.pool, maxLength, options);
+ }
+
+ public final void releaseBuffer(ByteBuffer buffer) {
+ this.in.releaseBuffer(buffer);
+ }
+
+ @Override
+ public final void close() throws IOException {
+ this.in.close();
+ }
+ }
+
+ public static HadoopShims.ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
+ HadoopShims.ByteBufferPoolShim pool) throws IOException {
+ return new ZeroCopyAdapter(in, pool);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/ZlibCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/ZlibCodec.java b/orc/src/java/org/apache/hive/orc/impl/ZlibCodec.java
new file mode 100644
index 0000000..16bd955
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/ZlibCodec.java
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.EnumSet;
+import java.util.zip.DataFormatException;
+import java.util.zip.Deflater;
+import java.util.zip.Inflater;
+
+import javax.annotation.Nullable;
+
+import org.apache.hive.orc.CompressionCodec;
+
+public class ZlibCodec implements CompressionCodec, DirectDecompressionCodec {
+ private static final HadoopShims SHIMS = HadoopShims.Factory.get();
+ private Boolean direct = null;
+
+ private final int level;
+ private final int strategy;
+
+ public ZlibCodec() {
+ level = Deflater.DEFAULT_COMPRESSION;
+ strategy = Deflater.DEFAULT_STRATEGY;
+ }
+
+ private ZlibCodec(int level, int strategy) {
+ this.level = level;
+ this.strategy = strategy;
+ }
+
+ @Override
+ public boolean compress(ByteBuffer in, ByteBuffer out,
+ ByteBuffer overflow) throws IOException {
+ Deflater deflater = new Deflater(level, true);
+ deflater.setStrategy(strategy);
+ int length = in.remaining();
+ deflater.setInput(in.array(), in.arrayOffset() + in.position(), length);
+ deflater.finish();
+ int outSize = 0;
+ int offset = out.arrayOffset() + out.position();
+ while (!deflater.finished() && (length > outSize)) {
+ int size = deflater.deflate(out.array(), offset, out.remaining());
+ out.position(size + out.position());
+ outSize += size;
+ offset += size;
+ // if we run out of space in the out buffer, use the overflow
+ if (out.remaining() == 0) {
+ if (overflow == null) {
+ deflater.end();
+ return false;
+ }
+ out = overflow;
+ offset = out.arrayOffset() + out.position();
+ }
+ }
+ deflater.end();
+ return length > outSize;
+ }
+
+ @Override
+ public void decompress(ByteBuffer in, ByteBuffer out) throws IOException {
+
+ if(in.isDirect() && out.isDirect()) {
+ directDecompress(in, out);
+ return;
+ }
+
+ Inflater inflater = new Inflater(true);
+ inflater.setInput(in.array(), in.arrayOffset() + in.position(),
+ in.remaining());
+ while (!(inflater.finished() || inflater.needsDictionary() ||
+ inflater.needsInput())) {
+ try {
+ int count = inflater.inflate(out.array(),
+ out.arrayOffset() + out.position(),
+ out.remaining());
+ out.position(count + out.position());
+ } catch (DataFormatException dfe) {
+ throw new IOException("Bad compression data", dfe);
+ }
+ }
+ out.flip();
+ inflater.end();
+ in.position(in.limit());
+ }
+
+ @Override
+ public boolean isAvailable() {
+ if (direct == null) {
+ // see nowrap option in new Inflater(boolean) which disables zlib headers
+ try {
+ if (SHIMS.getDirectDecompressor(
+ HadoopShims.DirectCompressionType.ZLIB_NOHEADER) != null) {
+ direct = Boolean.valueOf(true);
+ } else {
+ direct = Boolean.valueOf(false);
+ }
+ } catch (UnsatisfiedLinkError ule) {
+ direct = Boolean.valueOf(false);
+ }
+ }
+ return direct.booleanValue();
+ }
+
+ @Override
+ public void directDecompress(ByteBuffer in, ByteBuffer out)
+ throws IOException {
+ HadoopShims.DirectDecompressor decompressShim =
+ SHIMS.getDirectDecompressor(HadoopShims.DirectCompressionType.ZLIB_NOHEADER);
+ decompressShim.decompress(in, out);
+ out.flip(); // flip for read
+ }
+
+ @Override
+ public CompressionCodec modify(@Nullable EnumSet<Modifier> modifiers) {
+
+ if (modifiers == null) {
+ return this;
+ }
+
+ int l = this.level;
+ int s = this.strategy;
+
+ for (Modifier m : modifiers) {
+ switch (m) {
+ case BINARY:
+ /* filtered == less LZ77, more huffman */
+ s = Deflater.FILTERED;
+ break;
+ case TEXT:
+ s = Deflater.DEFAULT_STRATEGY;
+ break;
+ case FASTEST:
+ // deflate_fast looking for 8 byte patterns
+ l = Deflater.BEST_SPEED;
+ break;
+ case FAST:
+ // deflate_fast looking for 16 byte patterns
+ l = Deflater.BEST_SPEED + 1;
+ break;
+ case DEFAULT:
+ // deflate_slow looking for 128 byte patterns
+ l = Deflater.DEFAULT_COMPRESSION;
+ break;
+ default:
+ break;
+ }
+ }
+ return new ZlibCodec(l, s);
+ }
+}
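
The compress() contract above can be easy to misread: it deflates into out and then into overflow
if out fills up, and it returns true only when the compressed output ended up smaller than the
input. A hedged usage sketch follows (the demo class is made up; it assumes
org.apache.hive.orc.impl.ZlibCodec from this patch is on the classpath and uses heap buffers,
since the compress path reads the backing arrays).

import java.nio.ByteBuffer;
import org.apache.hive.orc.impl.ZlibCodec;

public class ZlibCodecDemo {
  public static void main(String[] args) throws Exception {
    ZlibCodec codec = new ZlibCodec();
    byte[] raw = new byte[10_000];                      // all zeros: highly compressible
    ByteBuffer in = ByteBuffer.wrap(raw);
    ByteBuffer out = ByteBuffer.allocate(1_000);        // primary output buffer
    ByteBuffer overflow = ByteBuffer.allocate(10_000);  // only used if 'out' fills up
    boolean smaller = codec.compress(in, out, overflow);
    System.out.println("compression helped: " + smaller
        + ", compressed bytes: " + out.position());

    out.flip();                                         // prepare the compressed bytes for reading
    ByteBuffer restored = ByteBuffer.allocate(raw.length);
    codec.decompress(out, restored);                    // decompress() flips 'restored' when done
    System.out.println("round-tripped bytes: " + restored.remaining());
  }
}
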
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/tools/FileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/tools/FileDump.java b/orc/src/java/org/apache/hive/orc/tools/FileDump.java
new file mode 100644
index 0000000..c6b68de
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/tools/FileDump.java
@@ -0,0 +1,946 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.tools;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.text.DecimalFormat;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hive.orc.ColumnStatistics;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.BloomFilterIO;
+import org.apache.hive.orc.CompressionKind;
+import org.apache.hive.orc.Reader;
+import org.apache.hive.orc.RecordReader;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.Writer;
+import org.apache.hive.orc.impl.AcidStats;
+import org.apache.hive.orc.impl.ColumnStatisticsImpl;
+import org.apache.hive.orc.impl.OrcAcidUtils;
+import org.apache.hive.orc.impl.OrcIndex;
+import org.apache.hive.orc.OrcProto;
+import org.apache.hive.orc.StripeInformation;
+import org.apache.hive.orc.StripeStatistics;
+import org.apache.hive.orc.impl.RecordReaderImpl;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONWriter;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Strings;
+import com.google.common.collect.Lists;
+
+/**
+ * A tool for printing out the file structure of ORC files.
+ */
+public final class FileDump {
+ public static final String UNKNOWN = "UNKNOWN";
+ public static final String SEPARATOR = Strings.repeat("_", 120) + "\n";
+ public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
+ public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir");
+ public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new PathFilter() {
+ public boolean accept(Path p) {
+ String name = p.getName();
+ return !name.startsWith("_") && !name.startsWith(".") && !name.endsWith(
+ OrcAcidUtils.DELTA_SIDE_FILE_SUFFIX);
+ }
+ };
+
+ // not used
+ private FileDump() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ Configuration conf = new Configuration();
+
+ List<Integer> rowIndexCols = new ArrayList<Integer>(0);
+ Options opts = createOptions();
+ CommandLine cli = new GnuParser().parse(opts, args);
+
+ if (cli.hasOption('h')) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("orcfiledump", opts);
+ return;
+ }
+
+ boolean dumpData = cli.hasOption('d');
+ boolean recover = cli.hasOption("recover");
+ boolean skipDump = cli.hasOption("skip-dump");
+ String backupPath = DEFAULT_BACKUP_PATH;
+ if (cli.hasOption("backup-path")) {
+ backupPath = cli.getOptionValue("backup-path");
+ }
+
+ if (cli.hasOption("r")) {
+ String val = cli.getOptionValue("r");
+ if (val != null && val.trim().equals("*")) {
+ rowIndexCols = null; // All the columns
+ } else {
+ String[] colStrs = cli.getOptionValue("r").split(",");
+ rowIndexCols = new ArrayList<Integer>(colStrs.length);
+ for (String colStr : colStrs) {
+ rowIndexCols.add(Integer.parseInt(colStr));
+ }
+ }
+ }
+
+ boolean printTimeZone = cli.hasOption('t');
+ boolean jsonFormat = cli.hasOption('j');
+ String[] files = cli.getArgs();
+ if (files.length == 0) {
+ System.err.println("Error : ORC files are not specified");
+ return;
+ }
+
+ // if the specified path is directory, iterate through all files and print the file dump
+ List<String> filesInPath = Lists.newArrayList();
+ for (String filename : files) {
+ Path path = new Path(filename);
+ filesInPath.addAll(getAllFilesInPath(path, conf));
+ }
+
+ if (dumpData) {
+ printData(filesInPath, conf);
+ } else if (recover && skipDump) {
+ recoverFiles(filesInPath, conf, backupPath);
+ } else {
+ if (jsonFormat) {
+ boolean prettyPrint = cli.hasOption('p');
+ JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
+ } else {
+ printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
+ }
+ }
+ }
+
+ /**
+ * This method returns an ORC reader object if the specified file is readable. If the specified
+ * file has a side file (_flush_length), then the max footer offset will be read from the side
+ * file and the ORC reader will be created from that offset. Since both the data file and the
+ * side file use hflush() for flushing the data, they can be inconsistent or out of sync.
+ * Null will be returned in the following cases:
+ *
+ * 1) If the file specified by path or its side file is still open for writes
+ * 2) If *_flush_length file does not return any footer offset
+ * 3) If *_flush_length returns a valid footer offset but the data file is not readable at that
+ * position (incomplete data file)
+ * 4) If *_flush_length file length is not a multiple of 8, then reader will be created from
+ * previous valid footer. If there is no such footer (file length > 0 and < 8), then null will
+ * be returned
+ *
+ * Also, if this method detects any file corruption (mismatch between data file and side file)
+ * then it will add the corresponding file to the specified input list for corrupted files.
+ *
+ * In all other cases, where the file is readable this method will return a reader object.
+ *
+ * @param path - file to get reader for
+ * @param conf - configuration object
+ * @param corruptFiles - fills this list with all possible corrupted files
+ * @return - reader for the specified file or null
+ * @throws IOException
+ */
+ static Reader getReader(final Path path, final Configuration conf,
+ final List<String> corruptFiles) throws IOException {
+ FileSystem fs = path.getFileSystem(conf);
+ long dataFileLen = fs.getFileStatus(path).getLen();
+ System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]");
+ Path sideFile = OrcAcidUtils.getSideFile(path);
+ final boolean sideFileExists = fs.exists(sideFile);
+ boolean openDataFile = false;
+ boolean openSideFile = false;
+ if (fs instanceof DistributedFileSystem) {
+ DistributedFileSystem dfs = (DistributedFileSystem) fs;
+ openDataFile = !dfs.isFileClosed(path);
+ openSideFile = sideFileExists && !dfs.isFileClosed(sideFile);
+ }
+
+ if (openDataFile || openSideFile) {
+ if (openDataFile && openSideFile) {
+ System.err.println("Unable to perform file dump as " + path + " and " + sideFile +
+ " are still open for writes.");
+ } else if (openSideFile) {
+ System.err.println("Unable to perform file dump as " + sideFile +
+ " is still open for writes.");
+ } else {
+ System.err.println("Unable to perform file dump as " + path +
+ " is still open for writes.");
+ }
+
+ return null;
+ }
+
+ Reader reader = null;
+ if (sideFileExists) {
+ final long maxLen = OrcAcidUtils.getLastFlushLength(fs, path);
+ final long sideFileLen = fs.getFileStatus(sideFile).getLen();
+ System.err.println("Found flush length file " + sideFile
+ + " [length: " + sideFileLen + ", maxFooterOffset: " + maxLen + "]");
+ // no offsets read from side file
+ if (maxLen == -1) {
+
+ // if data file is larger than last flush length, then additional data could be recovered
+ if (dataFileLen > maxLen) {
+ System.err.println("Data file has more data than max footer offset:" + maxLen +
+ ". Adding data file to recovery list.");
+ if (corruptFiles != null) {
+ corruptFiles.add(path.toUri().toString());
+ }
+ }
+ return null;
+ }
+
+ try {
+ reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen));
+
+ // if data file is larger than last flush length, then additional data could be recovered
+ if (dataFileLen > maxLen) {
+ System.err.println("Data file has more data than max footer offset:" + maxLen +
+ ". Adding data file to recovery list.");
+ if (corruptFiles != null) {
+ corruptFiles.add(path.toUri().toString());
+ }
+ }
+ } catch (Exception e) {
+ if (corruptFiles != null) {
+ corruptFiles.add(path.toUri().toString());
+ }
+ System.err.println("Unable to read data from max footer offset." +
+ " Adding data file to recovery list.");
+ return null;
+ }
+ } else {
+ reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+ }
+
+ return reader;
+ }
+
+ public static Collection<String> getAllFilesInPath(final Path path,
+ final Configuration conf) throws IOException {
+ List<String> filesInPath = Lists.newArrayList();
+ FileSystem fs = path.getFileSystem(conf);
+ FileStatus fileStatus = fs.getFileStatus(path);
+ if (fileStatus.isDir()) {
+ FileStatus[] fileStatuses = fs.listStatus(path, HIDDEN_AND_SIDE_FILE_FILTER);
+ for (FileStatus fileInPath : fileStatuses) {
+ if (fileInPath.isDir()) {
+ filesInPath.addAll(getAllFilesInPath(fileInPath.getPath(), conf));
+ } else {
+ filesInPath.add(fileInPath.getPath().toString());
+ }
+ }
+ } else {
+ filesInPath.add(path.toString());
+ }
+
+ return filesInPath;
+ }
+
+ private static void printData(List<String> files,
+ Configuration conf) throws IOException,
+ JSONException {
+ for (String file : files) {
+ try {
+ Path path = new Path(file);
+ Reader reader = getReader(path, conf, Lists.<String>newArrayList());
+ if (reader == null) {
+ continue;
+ }
+ printJsonData(reader);
+ System.out.println(SEPARATOR);
+ } catch (Exception e) {
+ System.err.println("Unable to dump data for file: " + file);
+ continue;
+ }
+ }
+ }
+
+ private static void printMetaData(List<String> files, Configuration conf,
+ List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
+ final String backupPath)
+ throws IOException {
+ List<String> corruptFiles = Lists.newArrayList();
+ for (String filename : files) {
+ printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
+ System.out.println(SEPARATOR);
+ }
+
+ if (!corruptFiles.isEmpty()) {
+ if (recover) {
+ recoverFiles(corruptFiles, conf, backupPath);
+ } else {
+ System.err.println(corruptFiles.size() + " file(s) are corrupted." +
+ " Run the following command to recover corrupted files.\n");
+ String fileNames = Joiner.on(" ").skipNulls().join(corruptFiles);
+ System.err.println("hive --orcfiledump --recover --skip-dump " + fileNames);
+ System.out.println(SEPARATOR);
+ }
+ }
+ }
+
+ private static void printMetaDataImpl(final String filename,
+ final Configuration conf, List<Integer> rowIndexCols, final boolean printTimeZone,
+ final List<String> corruptFiles) throws IOException {
+ Path file = new Path(filename);
+ Reader reader = getReader(file, conf, corruptFiles);
+ // if we can create a reader then the footer is not corrupt and the file is readable
+ if (reader == null) {
+ return;
+ }
+
+ System.out.println("Structure for " + filename);
+ System.out.println("File Version: " + reader.getFileVersion().getName() +
+ " with " + reader.getWriterVersion());
+ RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+ System.out.println("Rows: " + reader.getNumberOfRows());
+ System.out.println("Compression: " + reader.getCompressionKind());
+ if (reader.getCompressionKind() != CompressionKind.NONE) {
+ System.out.println("Compression size: " + reader.getCompressionSize());
+ }
+ System.out.println("Type: " + reader.getSchema().toString());
+ System.out.println("\nStripe Statistics:");
+ List<StripeStatistics> stripeStats = reader.getStripeStatistics();
+ for (int n = 0; n < stripeStats.size(); n++) {
+ System.out.println(" Stripe " + (n + 1) + ":");
+ StripeStatistics ss = stripeStats.get(n);
+ for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
+ System.out.println(" Column " + i + ": " +
+ ss.getColumnStatistics()[i].toString());
+ }
+ }
+ ColumnStatistics[] stats = reader.getStatistics();
+ int colCount = stats.length;
+ if (rowIndexCols == null) {
+ rowIndexCols = new ArrayList<>(colCount);
+ for (int i = 0; i < colCount; ++i) {
+ rowIndexCols.add(i);
+ }
+ }
+ System.out.println("\nFile Statistics:");
+ for (int i = 0; i < stats.length; ++i) {
+ System.out.println(" Column " + i + ": " + stats[i].toString());
+ }
+ System.out.println("\nStripes:");
+ int stripeIx = -1;
+ for (StripeInformation stripe : reader.getStripes()) {
+ ++stripeIx;
+ long stripeStart = stripe.getOffset();
+ OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+ if (printTimeZone) {
+ String tz = footer.getWriterTimezone();
+ if (tz == null || tz.isEmpty()) {
+ tz = UNKNOWN;
+ }
+ System.out.println(" Stripe: " + stripe.toString() + " timezone: " + tz);
+ } else {
+ System.out.println(" Stripe: " + stripe.toString());
+ }
+ long sectionStart = stripeStart;
+ for (OrcProto.Stream section : footer.getStreamsList()) {
+ String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
+ System.out.println(" Stream: column " + section.getColumn() +
+ " section " + kind + " start: " + sectionStart +
+ " length " + section.getLength());
+ sectionStart += section.getLength();
+ }
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ StringBuilder buf = new StringBuilder();
+ buf.append(" Encoding column ");
+ buf.append(i);
+ buf.append(": ");
+ buf.append(encoding.getKind());
+ if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+ encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ buf.append("[");
+ buf.append(encoding.getDictionarySize());
+ buf.append("]");
+ }
+ System.out.println(buf);
+ }
+ if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
+ // include only the columns that are specified; bloom filters are read only for
+ // the included columns
+ boolean[] sargColumns = new boolean[colCount];
+ for (int colIdx : rowIndexCols) {
+ sargColumns[colIdx] = true;
+ }
+ OrcIndex indices = rows
+ .readRowIndex(stripeIx, null, null, null, sargColumns);
+ for (int col : rowIndexCols) {
+ StringBuilder buf = new StringBuilder();
+ String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
+ buf.append(rowIdxString);
+ String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
+ buf.append(bloomFilString);
+ System.out.println(buf);
+ }
+ }
+ }
+
+ FileSystem fs = file.getFileSystem(conf);
+ long fileLen = fs.getFileStatus(file).getLen();
+ long paddedBytes = getTotalPaddingSize(reader);
+ // an empty ORC file is ~45 bytes; the assumption here is that the file length is always > 0
+ double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
+ DecimalFormat format = new DecimalFormat("##.##");
+ System.out.println("\nFile length: " + fileLen + " bytes");
+ System.out.println("Padding length: " + paddedBytes + " bytes");
+ System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
+ AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
+ if (acidStats != null) {
+ System.out.println("ACID stats:" + acidStats);
+ }
+ rows.close();
+ }
+
+ private static void recoverFiles(final List<String> corruptFiles, final Configuration conf,
+ final String backup)
+ throws IOException {
+ for (String corruptFile : corruptFiles) {
+ System.err.println("Recovering file " + corruptFile);
+ Path corruptPath = new Path(corruptFile);
+ FileSystem fs = corruptPath.getFileSystem(conf);
+ FSDataInputStream fdis = fs.open(corruptPath);
+ try {
+ long corruptFileLen = fs.getFileStatus(corruptPath).getLen();
+ long remaining = corruptFileLen;
+ List<Long> footerOffsets = Lists.newArrayList();
+
+ // start reading the data file from top to bottom and record the valid footers
+ while (remaining > 0) {
+ int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
+ byte[] data = new byte[toRead];
+ long startPos = corruptFileLen - remaining;
+ fdis.readFully(startPos, data, 0, toRead);
+
+ // find all MAGIC string and see if the file is readable from there
+ int index = 0;
+ long nextFooterOffset;
+
+ while (index != -1) {
+ index = indexOf(data, OrcFile.MAGIC.getBytes(), index + 1);
+ if (index != -1) {
+ nextFooterOffset = startPos + index + OrcFile.MAGIC.length() + 1;
+ if (isReadable(corruptPath, conf, nextFooterOffset)) {
+ footerOffsets.add(nextFooterOffset);
+ }
+ }
+ }
+
+ System.err.println("Scanning for valid footers - startPos: " + startPos +
+ " toRead: " + toRead + " remaining: " + remaining);
+ remaining = remaining - toRead;
+ }
+
+ System.err.println("Readable footerOffsets: " + footerOffsets);
+ recoverFile(corruptPath, fs, conf, footerOffsets, backup);
+ } catch (Exception e) {
+ Path recoveryFile = getRecoveryFile(corruptPath);
+ if (fs.exists(recoveryFile)) {
+ fs.delete(recoveryFile, false);
+ }
+ System.err.println("Unable to recover file " + corruptFile);
+ e.printStackTrace();
+ System.err.println(SEPARATOR);
+ continue;
+ } finally {
+ fdis.close();
+ }
+ System.err.println(corruptFile + " recovered successfully!");
+ System.err.println(SEPARATOR);
+ }
+ }
+
+ private static void recoverFile(final Path corruptPath, final FileSystem fs,
+ final Configuration conf, final List<Long> footerOffsets, final String backup)
+ throws IOException {
+
+ // first recover the file to a .recovered file and then, once successful, rename it to the actual file
+ Path recoveredPath = getRecoveryFile(corruptPath);
+
+ // make sure that file does not exist
+ if (fs.exists(recoveredPath)) {
+ fs.delete(recoveredPath, false);
+ }
+
+ // if there are no valid footers, the file should still be readable so create an empty orc file
+ if (footerOffsets == null || footerOffsets.isEmpty()) {
+ System.err.println("No readable footers found. Creating empty orc file.");
+ TypeDescription schema = TypeDescription.createStruct();
+ Writer writer = OrcFile.createWriter(recoveredPath,
+ OrcFile.writerOptions(conf).setSchema(schema));
+ writer.close();
+ } else {
+ FSDataInputStream fdis = fs.open(corruptPath);
+ FileStatus fileStatus = fs.getFileStatus(corruptPath);
+ // read corrupt file and copy it to recovered file until last valid footer
+ FSDataOutputStream fdos = fs.create(recoveredPath, true,
+ conf.getInt("io.file.buffer.size", 4096),
+ fileStatus.getReplication(),
+ fileStatus.getBlockSize());
+ try {
+ long fileLen = footerOffsets.get(footerOffsets.size() - 1);
+ long remaining = fileLen;
+
+ while (remaining > 0) {
+ int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
+ byte[] data = new byte[toRead];
+ long startPos = fileLen - remaining;
+ fdis.readFully(startPos, data, 0, toRead);
+ fdos.write(data);
+ System.err.println("Copying data to recovery file - startPos: " + startPos +
+ " toRead: " + toRead + " remaining: " + remaining);
+ remaining = remaining - toRead;
+ }
+ } catch (Exception e) {
+ fs.delete(recoveredPath, false);
+ throw new IOException(e);
+ } finally {
+ fdis.close();
+ fdos.close();
+ }
+ }
+
+ // validate the recovered file once again and start moving corrupt files to backup folder
+ if (isReadable(recoveredPath, conf, Long.MAX_VALUE)) {
+ Path backupDataPath;
+ String scheme = corruptPath.toUri().getScheme();
+ String authority = corruptPath.toUri().getAuthority();
+ String filePath = corruptPath.toUri().getPath();
+
+ // use the same filesystem as corrupt file if backup-path is not explicitly specified
+ if (backup.equals(DEFAULT_BACKUP_PATH)) {
+ backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
+ } else {
+ backupDataPath = Path.mergePaths(new Path(backup), corruptPath);
+ }
+
+ // Move data file to backup path
+ moveFiles(fs, corruptPath, backupDataPath);
+
+ // Move side file to backup path
+ Path sideFilePath = OrcAcidUtils.getSideFile(corruptPath);
+ Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName());
+ moveFiles(fs, sideFilePath, backupSideFilePath);
+
+ // finally move recovered file to actual file
+ moveFiles(fs, recoveredPath, corruptPath);
+
+ // we are done recovering, backing up and validating
+ System.err.println("Validation of recovered file successful!");
+ }
+ }
+
+ private static void moveFiles(final FileSystem fs, final Path src, final Path dest)
+ throws IOException {
+ try {
+ // create the dest directory if it does not exist
+ if (!fs.exists(dest.getParent())) {
+ fs.mkdirs(dest.getParent());
+ }
+
+ // if the destination file exists for some reason delete it
+ fs.delete(dest, false);
+
+ if (fs.rename(src, dest)) {
+ System.err.println("Moved " + src + " to " + dest);
+ } else {
+ throw new IOException("Unable to move " + src + " to " + dest);
+ }
+
+ } catch (Exception e) {
+ throw new IOException("Unable to move " + src + " to " + dest, e);
+ }
+ }
+
+ private static Path getRecoveryFile(final Path corruptPath) {
+ return new Path(corruptPath.getParent(), corruptPath.getName() + ".recovered");
+ }
+
+ private static boolean isReadable(final Path corruptPath, final Configuration conf,
+ final long maxLen) {
+ try {
+ OrcFile.createReader(corruptPath, OrcFile.readerOptions(conf).maxLength(maxLen));
+ return true;
+ } catch (Exception e) {
+ // ignore this exception as maxLen is unreadable
+ return false;
+ }
+ }
+
+ // search for byte pattern in another byte array
+ private static int indexOf(final byte[] data, final byte[] pattern, final int index) {
+ if (data == null || data.length == 0 || pattern == null || pattern.length == 0 ||
+ index > data.length || index < 0) {
+ return -1;
+ }
+
+ int j = 0;
+ for (int i = index; i < data.length; i++) {
+ if (pattern[j] == data[i]) {
+ j++;
+ } else {
+ // on a mismatch after a partial match, back up so the next iteration restarts the
+ // scan at the byte just after where the partial match began
+ i -= j;
+ j = 0;
+ }
+
+ if (j == pattern.length) {
+ return i - pattern.length + 1;
+ }
+ }
+
+ return -1;
+ }
+
+ private static String getFormattedBloomFilters(int col,
+ OrcProto.BloomFilterIndex[] bloomFilterIndex) {
+ StringBuilder buf = new StringBuilder();
+ BloomFilterIO stripeLevelBF = null;
+ if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
+ int idx = 0;
+ buf.append("\n Bloom filters for column ").append(col).append(":");
+ for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
+ BloomFilterIO toMerge = new BloomFilterIO(bf);
+ buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
+ if (stripeLevelBF == null) {
+ stripeLevelBF = toMerge;
+ } else {
+ stripeLevelBF.merge(toMerge);
+ }
+ }
+ String bloomFilterStats = getBloomFilterStats(stripeLevelBF);
+ buf.append("\n Stripe level merge:").append(bloomFilterStats);
+ }
+ return buf.toString();
+ }
+
+ private static String getBloomFilterStats(BloomFilterIO bf) {
+ StringBuilder sb = new StringBuilder();
+ int bitCount = bf.getBitSize();
+ int popCount = 0;
+ for (long l : bf.getBitSet()) {
+ popCount += Long.bitCount(l);
+ }
+ int k = bf.getNumHashFunctions();
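+ // loadFactor is the fraction of bits that are set; with k independent hash functions
+ // the expected false positive probability is loadFactor^k (for example, loadFactor 0.5
+ // with k = 4 gives an expectedFpp of 0.0625)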
+ float loadFactor = (float) popCount / (float) bitCount;
+ float expectedFpp = (float) Math.pow(loadFactor, k);
+ DecimalFormat df = new DecimalFormat("###.####");
+ sb.append(" numHashFunctions: ").append(k);
+ sb.append(" bitCount: ").append(bitCount);
+ sb.append(" popCount: ").append(popCount);
+ sb.append(" loadFactor: ").append(df.format(loadFactor));
+ sb.append(" expectedFpp: ").append(expectedFpp);
+ return sb.toString();
+ }
+
+ private static String getFormattedRowIndices(int col,
+ OrcProto.RowIndex[] rowGroupIndex) {
+ StringBuilder buf = new StringBuilder();
+ OrcProto.RowIndex index;
+ buf.append(" Row group indices for column ").append(col).append(":");
+ if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
+ ((index = rowGroupIndex[col]) == null)) {
+ buf.append(" not found\n");
+ return buf.toString();
+ }
+
+ for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
+ buf.append("\n Entry ").append(entryIx).append(": ");
+ OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
+ if (entry == null) {
+ buf.append("unknown\n");
+ continue;
+ }
+ OrcProto.ColumnStatistics colStats = entry.getStatistics();
+ if (colStats == null) {
+ buf.append("no stats at ");
+ } else {
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
+ buf.append(cs.toString());
+ }
+ buf.append(" positions: ");
+ for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
+ if (posIx != 0) {
+ buf.append(",");
+ }
+ buf.append(entry.getPositions(posIx));
+ }
+ }
+ return buf.toString();
+ }
+
+ public static long getTotalPaddingSize(Reader reader) throws IOException {
+ long paddedBytes = 0;
+ List<StripeInformation> stripes = reader.getStripes();
+ for (int i = 1; i < stripes.size(); i++) {
+ long prevStripeOffset = stripes.get(i - 1).getOffset();
+ long prevStripeLen = stripes.get(i - 1).getLength();
+ paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
+ }
+ return paddedBytes;
+ }
+
+ @SuppressWarnings("static-access")
+ static Options createOptions() {
+ Options result = new Options();
+
+ // add -d and --data to print the rows
+ result.addOption(OptionBuilder
+ .withLongOpt("data")
+ .withDescription("Should the data be printed")
+ .create('d'));
+
+ // to avoid breaking unit tests (when run in different time zones) for file dump, printing
+ // of timezone is made optional
+ result.addOption(OptionBuilder
+ .withLongOpt("timezone")
+ .withDescription("Print writer's time zone")
+ .create('t'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("help")
+ .withDescription("print help message")
+ .create('h'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("rowindex")
+ .withArgName("comma separated list of column ids for which row index should be printed")
+ .withDescription("Dump stats for column number(s)")
+ .hasArg()
+ .create('r'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("json")
+ .withDescription("Print metadata in JSON format")
+ .create('j'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("pretty")
+ .withDescription("Pretty print json metadata output")
+ .create('p'));
+
+ result.addOption(OptionBuilder
+ .withLongOpt("recover")
+ .withDescription("recover corrupted orc files generated by streaming")
+ .create());
+
+ result.addOption(OptionBuilder
+ .withLongOpt("skip-dump")
+ .withDescription("used along with --recover to directly recover files without dumping")
+ .create());
+
+ result.addOption(OptionBuilder
+ .withLongOpt("backup-path")
+ .withDescription("specify a backup path to store the corrupted files (default: /tmp)")
+ .hasArg()
+ .create());
+ return result;
+ }
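+
+  // Example invocations (illustrative; every flag used here is defined in createOptions above):
+  //   hive --orcfiledump /path/to/file.orc               prints the file structure and metadata
+  //   hive --orcfiledump -d /path/to/file.orc            dumps the rows as JSON
+  //   hive --orcfiledump -j -p -r 1,2 /path/to/file.orc  prints pretty JSON metadata plus row
+  //                                                      indexes for columns 1 and 2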
+
+ private static void printMap(JSONWriter writer,
+ MapColumnVector vector,
+ TypeDescription schema,
+ int row) throws JSONException {
+ writer.array();
+ TypeDescription keyType = schema.getChildren().get(0);
+ TypeDescription valueType = schema.getChildren().get(1);
+ int offset = (int) vector.offsets[row];
+ for (int i = 0; i < vector.lengths[row]; ++i) {
+ writer.object();
+ writer.key("_key");
+ printValue(writer, vector.keys, keyType, offset + i);
+ writer.key("_value");
+ printValue(writer, vector.values, valueType, offset + i);
+ writer.endObject();
+ }
+ writer.endArray();
+ }
+
+ private static void printList(JSONWriter writer,
+ ListColumnVector vector,
+ TypeDescription schema,
+ int row) throws JSONException {
+ writer.array();
+ int offset = (int) vector.offsets[row];
+ TypeDescription childType = schema.getChildren().get(0);
+ for (int i = 0; i < vector.lengths[row]; ++i) {
+ printValue(writer, vector.child, childType, offset + i);
+ }
+ writer.endArray();
+ }
+
+ private static void printUnion(JSONWriter writer,
+ UnionColumnVector vector,
+ TypeDescription schema,
+ int row) throws JSONException {
+ int tag = vector.tags[row];
+ printValue(writer, vector.fields[tag], schema.getChildren().get(tag), row);
+ }
+
+ static void printStruct(JSONWriter writer,
+ StructColumnVector batch,
+ TypeDescription schema,
+ int row) throws JSONException {
+ writer.object();
+ List<String> fieldNames = schema.getFieldNames();
+ List<TypeDescription> fieldTypes = schema.getChildren();
+ for (int i = 0; i < fieldTypes.size(); ++i) {
+ writer.key(fieldNames.get(i));
+ printValue(writer, batch.fields[i], fieldTypes.get(i), row);
+ }
+ writer.endObject();
+ }
+
+ static void printBinary(JSONWriter writer, BytesColumnVector vector,
+ int row) throws JSONException {
+ writer.array();
+ int offset = vector.start[row];
+ for(int i=0; i < vector.length[row]; ++i) {
+ writer.value(0xff & (int) vector.vector[row][offset + i]);
+ }
+ writer.endArray();
+ }
+
+ static void printValue(JSONWriter writer, ColumnVector vector,
+ TypeDescription schema, int row) throws JSONException {
+ if (vector.isRepeating) {
+ row = 0;
+ }
+ if (vector.noNulls || !vector.isNull[row]) {
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ writer.value(((LongColumnVector) vector).vector[row] != 0);
+ break;
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ writer.value(((LongColumnVector) vector).vector[row]);
+ break;
+ case FLOAT:
+ case DOUBLE:
+ writer.value(((DoubleColumnVector) vector).vector[row]);
+ break;
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ writer.value(((BytesColumnVector) vector).toString(row));
+ break;
+ case BINARY:
+ printBinary(writer, (BytesColumnVector) vector, row);
+ break;
+ case DECIMAL:
+ writer.value(((DecimalColumnVector) vector).vector[row].toString());
+ break;
+ case DATE:
+ writer.value(new DateWritable(
+ (int) ((LongColumnVector) vector).vector[row]).toString());
+ break;
+ case TIMESTAMP:
+ writer.value(((TimestampColumnVector) vector)
+ .asScratchTimestamp(row).toString());
+ break;
+ case LIST:
+ printList(writer, (ListColumnVector) vector, schema, row);
+ break;
+ case MAP:
+ printMap(writer, (MapColumnVector) vector, schema, row);
+ break;
+ case STRUCT:
+ printStruct(writer, (StructColumnVector) vector, schema, row);
+ break;
+ case UNION:
+ printUnion(writer, (UnionColumnVector) vector, schema, row);
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown type " +
+ schema.toString());
+ }
+ } else {
+ writer.value(null);
+ }
+ }
+
+ static void printRow(JSONWriter writer,
+ VectorizedRowBatch batch,
+ TypeDescription schema,
+ int row) throws JSONException {
+ if (schema.getCategory() == TypeDescription.Category.STRUCT) {
+ List<TypeDescription> fieldTypes = schema.getChildren();
+ List<String> fieldNames = schema.getFieldNames();
+ writer.object();
+ for (int c = 0; c < batch.cols.length; ++c) {
+ writer.key(fieldNames.get(c));
+ printValue(writer, batch.cols[c], fieldTypes.get(c), row);
+ }
+ writer.endObject();
+ } else {
+ printValue(writer, batch.cols[0], schema, row);
+ }
+ }
+
+ static void printJsonData(final Reader reader) throws IOException, JSONException {
+ PrintStream printStream = System.out;
+ OutputStreamWriter out = new OutputStreamWriter(printStream, "UTF-8");
+ RecordReader rows = reader.rows();
+ try {
+ TypeDescription schema = reader.getSchema();
+ VectorizedRowBatch batch = schema.createRowBatch();
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ JSONWriter writer = new JSONWriter(out);
+ printRow(writer, batch, schema, r);
+ out.write("\n");
+ out.flush();
+ if (printStream.checkError()) {
+ throw new IOException("Error encountered when writing to stdout.");
+ }
+ }
+ }
+ } finally {
+ rows.close();
+ }
+ }
+}
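
A minimal parsing sketch (not part of this commit) showing how the options
defined in createOptions() might be consumed. The GnuParser choice, the class
name, and the printf reporting are illustrative assumptions; only the option
letters and long names come from the code above.

    // Hypothetical driver: parse a command line against createOptions().
    // Lives in the same package because createOptions() is package-private.
    package org.apache.hive.orc.tools;

    import org.apache.commons.cli.CommandLine;
    import org.apache.commons.cli.GnuParser;
    import org.apache.commons.cli.HelpFormatter;
    import org.apache.commons.cli.Options;

    public class FileDumpCliSketch {
      public static void main(String[] args) throws Exception {
        Options opts = FileDump.createOptions();
        CommandLine cli = new GnuParser().parse(opts, args);
        if (cli.hasOption('h')) {
          new HelpFormatter().printHelp("orcfiledump [options] <orc-file>", opts);
          return;
        }
        boolean printData = cli.hasOption('d');        // --data
        boolean printTimeZone = cli.hasOption('t');    // --timezone
        boolean jsonOutput = cli.hasOption('j');       // --json
        boolean pretty = cli.hasOption('p');           // --pretty
        String rowIndexCols = cli.getOptionValue('r'); // --rowindex, may be null
        System.out.printf("data=%b tz=%b json=%b pretty=%b rowindex=%s%n",
            printData, printTimeZone, jsonOutput, pretty, rowIndexCols);
      }
    }
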
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/tools/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/tools/JsonFileDump.java b/orc/src/java/org/apache/hive/orc/tools/JsonFileDump.java
new file mode 100644
index 0000000..5e60eed
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/tools/JsonFileDump.java
@@ -0,0 +1,411 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.tools;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hive.orc.BinaryColumnStatistics;
+import org.apache.hive.orc.BloomFilterIO;
+import org.apache.hive.orc.BooleanColumnStatistics;
+import org.apache.hive.orc.ColumnStatistics;
+import org.apache.hive.orc.CompressionKind;
+import org.apache.hive.orc.DateColumnStatistics;
+import org.apache.hive.orc.DecimalColumnStatistics;
+import org.apache.hive.orc.DoubleColumnStatistics;
+import org.apache.hive.orc.IntegerColumnStatistics;
+import org.apache.hive.orc.OrcProto;
+import org.apache.hive.orc.Reader;
+import org.apache.hive.orc.StringColumnStatistics;
+import org.apache.hive.orc.StripeInformation;
+import org.apache.hive.orc.StripeStatistics;
+import org.apache.hive.orc.TimestampColumnStatistics;
+import org.apache.hive.orc.impl.AcidStats;
+import org.apache.hive.orc.impl.ColumnStatisticsImpl;
+import org.apache.hive.orc.impl.OrcAcidUtils;
+import org.apache.hive.orc.impl.OrcIndex;
+import org.apache.hive.orc.impl.RecordReaderImpl;
+import org.codehaus.jettison.json.JSONArray;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONObject;
+import org.codehaus.jettison.json.JSONStringer;
+import org.codehaus.jettison.json.JSONWriter;
+
+/**
+ * File dump tool with JSON-formatted output.
+ */
+public class JsonFileDump {
+
+ public static void printJsonMetaData(List<String> files,
+ Configuration conf,
+ List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
+ throws JSONException, IOException {
+ if (files.isEmpty()) {
+ return;
+ }
+ JSONStringer writer = new JSONStringer();
+ boolean multiFile = files.size() > 1;
+ if (multiFile) {
+ writer.array();
+ } else {
+ writer.object();
+ }
+ for (String filename : files) {
+ try {
+ if (multiFile) {
+ writer.object();
+ }
+ writer.key("fileName").value(filename);
+ Path path = new Path(filename);
+ Reader reader = FileDump.getReader(path, conf, null);
+ if (reader == null) {
+ writer.key("status").value("FAILED");
+ continue;
+ }
+ writer.key("fileVersion").value(reader.getFileVersion().getName());
+ writer.key("writerVersion").value(reader.getWriterVersion());
+ RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+ writer.key("numberOfRows").value(reader.getNumberOfRows());
+ writer.key("compression").value(reader.getCompressionKind());
+ if (reader.getCompressionKind() != CompressionKind.NONE) {
+ writer.key("compressionBufferSize").value(reader.getCompressionSize());
+ }
+ writer.key("schemaString").value(reader.getSchema().toString());
+ writer.key("schema").array();
+ writeSchema(writer, reader.getTypes());
+ writer.endArray();
+
+ writer.key("stripeStatistics").array();
+ List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
+ for (int n = 0; n < stripeStatistics.size(); n++) {
+ writer.object();
+ writer.key("stripeNumber").value(n + 1);
+ StripeStatistics ss = stripeStatistics.get(n);
+ writer.key("columnStatistics").array();
+ for (int i = 0; i < ss.getColumnStatistics().length; i++) {
+ writer.object();
+ writer.key("columnId").value(i);
+ writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
+ writer.endObject();
+ }
+ writer.endArray();
+ writer.endObject();
+ }
+ writer.endArray();
+
+ ColumnStatistics[] stats = reader.getStatistics();
+ int colCount = stats.length;
+ if (rowIndexCols == null) {
+ rowIndexCols = new ArrayList<>(colCount);
+ for (int i = 0; i < colCount; ++i) {
+ rowIndexCols.add(i);
+ }
+ }
+ writer.key("fileStatistics").array();
+ for (int i = 0; i < stats.length; ++i) {
+ writer.object();
+ writer.key("columnId").value(i);
+ writeColumnStatistics(writer, stats[i]);
+ writer.endObject();
+ }
+ writer.endArray();
+
+ writer.key("stripes").array();
+ int stripeIx = -1;
+ for (StripeInformation stripe : reader.getStripes()) {
+ ++stripeIx;
+ long stripeStart = stripe.getOffset();
+ OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+ writer.object(); // start of stripe information
+ writer.key("stripeNumber").value(stripeIx + 1);
+ writer.key("stripeInformation");
+ writeStripeInformation(writer, stripe);
+ if (printTimeZone) {
+ writer.key("writerTimezone").value(
+ footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN);
+ }
+ long sectionStart = stripeStart;
+
+ writer.key("streams").array();
+ for (OrcProto.Stream section : footer.getStreamsList()) {
+ writer.object();
+ String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN;
+ writer.key("columnId").value(section.getColumn());
+ writer.key("section").value(kind);
+ writer.key("startOffset").value(sectionStart);
+ writer.key("length").value(section.getLength());
+ sectionStart += section.getLength();
+ writer.endObject();
+ }
+ writer.endArray();
+
+ writer.key("encodings").array();
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ writer.object();
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ writer.key("columnId").value(i);
+ writer.key("kind").value(encoding.getKind());
+ if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+ encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+ writer.key("dictionarySize").value(encoding.getDictionarySize());
+ }
+ writer.endObject();
+ }
+ writer.endArray();
+ if (!rowIndexCols.isEmpty()) {
+ // Read indexes only for the columns that are specified; bloom filters are
+ // read only for the columns marked in sargColumns.
+ boolean[] sargColumns = new boolean[colCount];
+ for (int colIdx : rowIndexCols) {
+ sargColumns[colIdx] = true;
+ }
+ OrcIndex indices = rows.readRowIndex(stripeIx, null, sargColumns);
+ writer.key("indexes").array();
+ for (int col : rowIndexCols) {
+ writer.object();
+ writer.key("columnId").value(col);
+ writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
+ writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
+ writer.endObject();
+ }
+ writer.endArray();
+ }
+ writer.endObject(); // end of stripe information
+ }
+ writer.endArray();
+
+ FileSystem fs = path.getFileSystem(conf);
+ long fileLen = fs.getContentSummary(path).getLength();
+ long paddedBytes = FileDump.getTotalPaddingSize(reader);
+ // An empty ORC file is ~45 bytes, so the file length is assumed to always be > 0.
+ double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
+ writer.key("fileLength").value(fileLen);
+ writer.key("paddingLength").value(paddedBytes);
+ writer.key("paddingRatio").value(percentPadding);
+ AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
+ if (acidStats != null) {
+ writer.key("numInserts").value(acidStats.inserts);
+ writer.key("numDeletes").value(acidStats.deletes);
+ writer.key("numUpdates").value(acidStats.updates);
+ }
+ writer.key("status").value("OK");
+ rows.close();
+
+ writer.endObject();
+ } catch (Exception e) {
+ writer.key("status").value("FAILED");
+ throw e;
+ }
+ }
+ if (multiFile) {
+ writer.endArray();
+ }
+
+ if (prettyPrint) {
+ final String prettyJson;
+ if (multiFile) {
+ JSONArray jsonArray = new JSONArray(writer.toString());
+ prettyJson = jsonArray.toString(2);
+ } else {
+ JSONObject jsonObject = new JSONObject(writer.toString());
+ prettyJson = jsonObject.toString(2);
+ }
+ System.out.println(prettyJson);
+ } else {
+ System.out.println(writer.toString());
+ }
+ }
+
+ private static void writeSchema(JSONStringer writer, List<OrcProto.Type> types)
+ throws JSONException {
+ int i = 0;
+ for(OrcProto.Type type : types) {
+ writer.object();
+ writer.key("columnId").value(i++);
+ writer.key("columnType").value(type.getKind());
+ if (type.getFieldNamesCount() > 0) {
+ writer.key("childColumnNames").array();
+ for (String field : type.getFieldNamesList()) {
+ writer.value(field);
+ }
+ writer.endArray();
+ writer.key("childColumnIds").array();
+ for (Integer colId : type.getSubtypesList()) {
+ writer.value(colId);
+ }
+ writer.endArray();
+ }
+ if (type.hasPrecision()) {
+ writer.key("precision").value(type.getPrecision());
+ }
+
+ if (type.hasScale()) {
+ writer.key("scale").value(type.getScale());
+ }
+
+ if (type.hasMaximumLength()) {
+ writer.key("maxLength").value(type.getMaximumLength());
+ }
+ writer.endObject();
+ }
+ }
+
+ private static void writeStripeInformation(JSONWriter writer, StripeInformation stripe)
+ throws JSONException {
+ writer.object();
+ writer.key("offset").value(stripe.getOffset());
+ writer.key("indexLength").value(stripe.getIndexLength());
+ writer.key("dataLength").value(stripe.getDataLength());
+ writer.key("footerLength").value(stripe.getFooterLength());
+ writer.key("rowCount").value(stripe.getNumberOfRows());
+ writer.endObject();
+ }
+
+ private static void writeColumnStatistics(JSONWriter writer, ColumnStatistics cs)
+ throws JSONException {
+ if (cs != null) {
+ writer.key("count").value(cs.getNumberOfValues());
+ writer.key("hasNull").value(cs.hasNull());
+ if (cs instanceof BinaryColumnStatistics) {
+ writer.key("totalLength").value(((BinaryColumnStatistics) cs).getSum());
+ writer.key("type").value(OrcProto.Type.Kind.BINARY);
+ } else if (cs instanceof BooleanColumnStatistics) {
+ writer.key("trueCount").value(((BooleanColumnStatistics) cs).getTrueCount());
+ writer.key("falseCount").value(((BooleanColumnStatistics) cs).getFalseCount());
+ writer.key("type").value(OrcProto.Type.Kind.BOOLEAN);
+ } else if (cs instanceof IntegerColumnStatistics) {
+ writer.key("min").value(((IntegerColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((IntegerColumnStatistics) cs).getMaximum());
+ if (((IntegerColumnStatistics) cs).isSumDefined()) {
+ writer.key("sum").value(((IntegerColumnStatistics) cs).getSum());
+ }
+ writer.key("type").value(OrcProto.Type.Kind.LONG);
+ } else if (cs instanceof DoubleColumnStatistics) {
+ writer.key("min").value(((DoubleColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((DoubleColumnStatistics) cs).getMaximum());
+ writer.key("sum").value(((DoubleColumnStatistics) cs).getSum());
+ writer.key("type").value(OrcProto.Type.Kind.DOUBLE);
+ } else if (cs instanceof StringColumnStatistics) {
+ writer.key("min").value(((StringColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((StringColumnStatistics) cs).getMaximum());
+ writer.key("totalLength").value(((StringColumnStatistics) cs).getSum());
+ writer.key("type").value(OrcProto.Type.Kind.STRING);
+ } else if (cs instanceof DateColumnStatistics) {
+ if (((DateColumnStatistics) cs).getMaximum() != null) {
+ writer.key("min").value(((DateColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((DateColumnStatistics) cs).getMaximum());
+ }
+ writer.key("type").value(OrcProto.Type.Kind.DATE);
+ } else if (cs instanceof TimestampColumnStatistics) {
+ if (((TimestampColumnStatistics) cs).getMaximum() != null) {
+ writer.key("min").value(((TimestampColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((TimestampColumnStatistics) cs).getMaximum());
+ }
+ writer.key("type").value(OrcProto.Type.Kind.TIMESTAMP);
+ } else if (cs instanceof DecimalColumnStatistics) {
+ if (((DecimalColumnStatistics) cs).getMaximum() != null) {
+ writer.key("min").value(((DecimalColumnStatistics) cs).getMinimum());
+ writer.key("max").value(((DecimalColumnStatistics) cs).getMaximum());
+ writer.key("sum").value(((DecimalColumnStatistics) cs).getSum());
+ }
+ writer.key("type").value(OrcProto.Type.Kind.DECIMAL);
+ }
+ }
+ }
+
+ private static void writeBloomFilterIndexes(JSONWriter writer, int col,
+ OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException {
+
+ BloomFilterIO stripeLevelBF = null;
+ if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
+ int entryIx = 0;
+ writer.key("bloomFilterIndexes").array();
+ for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
+ writer.object();
+ writer.key("entryId").value(entryIx++);
+ BloomFilterIO toMerge = new BloomFilterIO(bf);
+ writeBloomFilterStats(writer, toMerge);
+ if (stripeLevelBF == null) {
+ stripeLevelBF = toMerge;
+ } else {
+ stripeLevelBF.merge(toMerge);
+ }
+ writer.endObject();
+ }
+ writer.endArray();
+ }
+ if (stripeLevelBF != null) {
+ writer.key("stripeLevelBloomFilter");
+ writer.object();
+ writeBloomFilterStats(writer, stripeLevelBF);
+ writer.endObject();
+ }
+ }
+
+ private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf)
+ throws JSONException {
+ int bitCount = bf.getBitSize();
+ int popCount = 0;
+ for (long l : bf.getBitSet()) {
+ popCount += Long.bitCount(l);
+ }
+ int k = bf.getNumHashFunctions();
+ float loadFactor = (float) popCount / (float) bitCount;
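+ // A lookup is a false positive only when all k probed bits happen to be set;
+ // treating the probes as independent gives the approximation fpp ~ loadFactor^k.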
+ float expectedFpp = (float) Math.pow(loadFactor, k);
+ writer.key("numHashFunctions").value(k);
+ writer.key("bitCount").value(bitCount);
+ writer.key("popCount").value(popCount);
+ writer.key("loadFactor").value(loadFactor);
+ writer.key("expectedFpp").value(expectedFpp);
+ }
+
+ private static void writeRowGroupIndexes(JSONWriter writer, int col,
+ OrcProto.RowIndex[] rowGroupIndex)
+ throws JSONException {
+
+ OrcProto.RowIndex index;
+ if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
+ ((index = rowGroupIndex[col]) == null)) {
+ return;
+ }
+
+ writer.key("rowGroupIndexes").array();
+ for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
+ writer.object();
+ writer.key("entryId").value(entryIx);
+ OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
+ if (entry == null) {
+ continue;
+ }
+ OrcProto.ColumnStatistics colStats = entry.getStatistics();
+ writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize(colStats));
+ writer.key("positions").array();
+ for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
+ writer.value(entry.getPositions(posIx));
+ }
+ writer.endArray();
+ writer.endObject();
+ }
+ writer.endArray();
+ }
+
+}
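
A usage sketch (not part of this commit) that drives the dump above for a
single file. The Configuration defaults, the class name, and the file path
are illustrative assumptions; the printJsonMetaData signature is as defined
in this file.

    // Hypothetical caller: pretty-printed JSON metadata for one ORC file,
    // with row indexes for columns 0 and 1. The path is illustrative.
    import java.util.Arrays;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hive.orc.tools.JsonFileDump;

    public class JsonDumpSketch {
      public static void main(String[] args) throws Exception {
        JsonFileDump.printJsonMetaData(
            Arrays.asList("/tmp/example.orc"), // files
            new Configuration(),
            Arrays.asList(0, 1),               // rowIndexCols
            true,                              // prettyPrint
            false);                            // printTimeZone
      }
    }
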
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/BinaryColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/BinaryColumnStatistics.java b/orc/src/java/org/apache/orc/BinaryColumnStatistics.java
deleted file mode 100644
index 19db98a..0000000
--- a/orc/src/java/org/apache/orc/BinaryColumnStatistics.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import org.apache.orc.ColumnStatistics;
-
-/**
- * Statistics for binary columns.
- */
-public interface BinaryColumnStatistics extends ColumnStatistics {
- long getSum();
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/BloomFilterIO.java b/orc/src/java/org/apache/orc/BloomFilterIO.java
deleted file mode 100644
index 1406266..0000000
--- a/orc/src/java/org/apache/orc/BloomFilterIO.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import org.apache.hive.common.util.BloomFilter;
-
-import com.google.common.primitives.Longs;
-
-public class BloomFilterIO extends BloomFilter {
-
- public BloomFilterIO(long expectedEntries) {
- super(expectedEntries, DEFAULT_FPP);
- }
-
- public BloomFilterIO(long expectedEntries, double fpp) {
- super(expectedEntries, fpp);
- }
-
-/**
- * Initializes the BloomFilter from the given Orc BloomFilter
- */
- public BloomFilterIO(OrcProto.BloomFilter bloomFilter) {
- this.bitSet = new BitSet(Longs.toArray(bloomFilter.getBitsetList()));
- this.numHashFunctions = bloomFilter.getNumHashFunctions();
- this.numBits = (int) this.bitSet.bitSize();
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/BooleanColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/BooleanColumnStatistics.java b/orc/src/java/org/apache/orc/BooleanColumnStatistics.java
deleted file mode 100644
index af08f06..0000000
--- a/orc/src/java/org/apache/orc/BooleanColumnStatistics.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import org.apache.orc.ColumnStatistics;
-
-/**
- * Statistics for boolean columns.
- */
-public interface BooleanColumnStatistics extends ColumnStatistics {
- long getFalseCount();
-
- long getTrueCount();
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/ColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/ColumnStatistics.java b/orc/src/java/org/apache/orc/ColumnStatistics.java
deleted file mode 100644
index 72d8fbf..0000000
--- a/orc/src/java/org/apache/orc/ColumnStatistics.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-/**
- * Statistics that are available for all types of columns.
- */
-public interface ColumnStatistics {
- /**
- * Get the number of values in this column. It will differ from the number
- * of rows because of NULL values and repeated values.
- * @return the number of values
- */
- long getNumberOfValues();
-
- /**
- * Returns true if there are nulls in the scope of column statistics.
- * @return true if null present else false
- */
- boolean hasNull();
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/CompressionCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/CompressionCodec.java b/orc/src/java/org/apache/orc/CompressionCodec.java
deleted file mode 100644
index 3421969..0000000
--- a/orc/src/java/org/apache/orc/CompressionCodec.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.EnumSet;
-
-import javax.annotation.Nullable;
-
-public interface CompressionCodec {
-
- enum Modifier {
- /* speed/compression tradeoffs */
- FASTEST,
- FAST,
- DEFAULT,
- /* data sensitivity modifiers */
- TEXT,
- BINARY
- };
-
- /**
- * Compress the in buffer to the out buffer.
- * @param in the bytes to compress
- * @param out the uncompressed bytes
- * @param overflow put any additional bytes here
- * @return true if the output is smaller than input
- * @throws IOException
- */
- boolean compress(ByteBuffer in, ByteBuffer out, ByteBuffer overflow
- ) throws IOException;
-
- /**
- * Decompress the in buffer to the out buffer.
- * @param in the bytes to decompress
- * @param out the decompressed bytes
- * @throws IOException
- */
- void decompress(ByteBuffer in, ByteBuffer out) throws IOException;
-
- /**
- * Produce a modified compression codec if the underlying algorithm allows
- * modification.
- *
- * This does not modify the current object, but returns a new object if
- * modifications are possible. Returns the same object if no modifications
- * are possible.
- * @param modifiers compression modifiers
- * @return codec for use after optional modification
- */
- CompressionCodec modify(@Nullable EnumSet<Modifier> modifiers);
-
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/CompressionKind.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/CompressionKind.java b/orc/src/java/org/apache/orc/CompressionKind.java
deleted file mode 100644
index f684bef..0000000
--- a/orc/src/java/org/apache/orc/CompressionKind.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-/**
- * An enumeration that lists the generic compression algorithms that
- * can be applied to ORC files.
- */
-public enum CompressionKind {
- NONE, ZLIB, SNAPPY, LZO
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/DataReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/DataReader.java b/orc/src/java/org/apache/orc/DataReader.java
deleted file mode 100644
index a5dbb76..0000000
--- a/orc/src/java/org/apache/orc/DataReader.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.apache.orc.impl.OrcIndex;
-
-/** An abstract data reader that IO formats can use to read bytes from underlying storage. */
-public interface DataReader extends AutoCloseable {
-
- /** Opens the DataReader, making it ready to use. */
- void open() throws IOException;
-
- OrcIndex readRowIndex(StripeInformation stripe,
- OrcProto.StripeFooter footer,
- boolean[] included, OrcProto.RowIndex[] indexes,
- boolean[] sargColumns,
- OrcProto.BloomFilterIndex[] bloomFilterIndices
- ) throws IOException;
-
- OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException;
-
- /** Reads the data.
- *
- * Note that for the cases such as zero-copy read, caller must release the disk ranges
- * produced after being done with them. Call isTrackingDiskRanges to find out if this is needed.
- * @param range List if disk ranges to read. Ranges with data will be ignored.
- * @param baseOffset Base offset from the start of the file of the ranges in disk range list.
- * @param doForceDirect Whether the data should be read into direct buffers.
- * @return New or modified list of DiskRange-s, where all the ranges are filled with data.
- */
- DiskRangeList readFileData(
- DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException;
-
-
- /**
- * Whether the user should release buffers created by readFileData. See readFileData javadoc.
- */
- boolean isTrackingDiskRanges();
-
- /**
- * Releases buffers created by readFileData. See readFileData javadoc.
- * @param toRelease The buffer to release.
- */
- void releaseBuffer(ByteBuffer toRelease);
-
- /**
- * Clone the entire state of the DataReader with the assumption that the
- * clone will be closed at a different time. Thus, any file handles in the
- * implementation need to be cloned.
- * @return a new instance
- */
- DataReader clone();
-
- @Override
- public void close() throws IOException;
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/DateColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/DateColumnStatistics.java b/orc/src/java/org/apache/orc/DateColumnStatistics.java
deleted file mode 100644
index cdd01af..0000000
--- a/orc/src/java/org/apache/orc/DateColumnStatistics.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import org.apache.orc.ColumnStatistics;
-
-import java.util.Date;
-
-/**
- * Statistics for DATE columns.
- */
-public interface DateColumnStatistics extends ColumnStatistics {
- /**
- * Get the minimum value for the column.
- * @return minimum value
- */
- Date getMinimum();
-
- /**
- * Get the maximum value for the column.
- * @return maximum value
- */
- Date getMaximum();
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/DecimalColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/DecimalColumnStatistics.java b/orc/src/java/org/apache/orc/DecimalColumnStatistics.java
deleted file mode 100644
index 51b6d7d..0000000
--- a/orc/src/java/org/apache/orc/DecimalColumnStatistics.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.orc.ColumnStatistics;
-
-/**
- * Statistics for decimal columns.
- */
-public interface DecimalColumnStatistics extends ColumnStatistics {
-
- /**
- * Get the minimum value for the column.
- * @return the minimum value
- */
- HiveDecimal getMinimum();
-
- /**
- * Get the maximum value for the column.
- * @return the maximum value
- */
- HiveDecimal getMaximum();
-
- /**
- * Get the sum of the values of the column.
- * @return the sum
- */
- HiveDecimal getSum();
-
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/DoubleColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/DoubleColumnStatistics.java b/orc/src/java/org/apache/orc/DoubleColumnStatistics.java
deleted file mode 100644
index 00c728f..0000000
--- a/orc/src/java/org/apache/orc/DoubleColumnStatistics.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import org.apache.orc.ColumnStatistics;
-
-/**
- * Statistics for float and double columns.
- */
-public interface DoubleColumnStatistics extends ColumnStatistics {
-
- /**
- * Get the smallest value in the column. Only defined if getNumberOfValues
- * is non-zero.
- * @return the minimum
- */
- double getMinimum();
-
- /**
- * Get the largest value in the column. Only defined if getNumberOfValues
- * is non-zero.
- * @return the maximum
- */
- double getMaximum();
-
- /**
- * Get the sum of the values in the column.
- * @return the sum
- */
- double getSum();
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/FileFormatException.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/FileFormatException.java b/orc/src/java/org/apache/orc/FileFormatException.java
deleted file mode 100644
index 2cebea7..0000000
--- a/orc/src/java/org/apache/orc/FileFormatException.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import java.io.IOException;
-
-/**
- * Thrown when an invalid file format is encountered.
- */
-public class FileFormatException extends IOException {
-
- public FileFormatException(String errMsg) {
- super(errMsg);
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/FileMetadata.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/FileMetadata.java b/orc/src/java/org/apache/orc/FileMetadata.java
deleted file mode 100644
index 807e696..0000000
--- a/orc/src/java/org/apache/orc/FileMetadata.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import java.util.List;
-
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcProto;
-import org.apache.orc.StripeInformation;
-
-/**
- * Cached file metadata. Right now, it caches everything; we don't have to store all the
- * protobuf structs actually, we could just store what we need, but that would require that
- * ORC stop depending on them too. Luckily, they shouldn't be very big.
- */
-public interface FileMetadata {
- boolean isOriginalFormat();
-
- List<StripeInformation> getStripes();
-
- CompressionKind getCompressionKind();
-
- int getCompressionBufferSize();
-
- int getRowIndexStride();
-
- int getColumnCount();
-
- int getFlattenedColumnCount();
-
- Object getFileKey();
-
- List<Integer> getVersionList();
-
- int getMetadataSize();
-
- int getWriterVersionNum();
-
- List<OrcProto.Type> getTypes();
-
- List<OrcProto.StripeStatistics> getStripeStats();
-
- long getContentLength();
-
- long getNumberOfRows();
-
- List<OrcProto.ColumnStatistics> getFileStats();
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/BinaryColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/BinaryColumnStatistics.java b/orc/src/java/org/apache/hive/orc/BinaryColumnStatistics.java
new file mode 100644
index 0000000..92ddfbe
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/BinaryColumnStatistics.java
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+/**
+ * Statistics for binary columns.
+ */
+public interface BinaryColumnStatistics extends ColumnStatistics {
+ long getSum();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/BloomFilterIO.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/BloomFilterIO.java b/orc/src/java/org/apache/hive/orc/BloomFilterIO.java
new file mode 100644
index 0000000..a6e6408
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/BloomFilterIO.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import org.apache.hive.common.util.BloomFilter;
+
+import com.google.common.primitives.Longs;
+
+public class BloomFilterIO extends BloomFilter {
+
+ public BloomFilterIO(long expectedEntries) {
+ super(expectedEntries, DEFAULT_FPP);
+ }
+
+ public BloomFilterIO(long expectedEntries, double fpp) {
+ super(expectedEntries, fpp);
+ }
+
+ /**
+  * Initializes the BloomFilter from the given ORC BloomFilter.
+  */
+ public BloomFilterIO(OrcProto.BloomFilter bloomFilter) {
+ this.bitSet = new BitSet(Longs.toArray(bloomFilter.getBitsetList()));
+ this.numHashFunctions = bloomFilter.getNumHashFunctions();
+ this.numBits = (int) this.bitSet.bitSize();
+ }
+}
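
A round-trip sketch (not part of this commit). The protobuf builder methods
(setNumHashFunctions, addAllBitset) are assumed from the OrcProto.BloomFilter
fields read by the constructor above, and addLong/testLong are assumed to be
inherited from the hive-common BloomFilter base class.

    // Hypothetical round trip: build a filter, serialize its state into the
    // protobuf message, then reconstruct it with the constructor above.
    import com.google.common.primitives.Longs;

    import org.apache.hive.orc.BloomFilterIO;
    import org.apache.hive.orc.OrcProto;

    public class BloomFilterRoundTripSketch {
      public static void main(String[] args) {
        BloomFilterIO original = new BloomFilterIO(10000, 0.05);
        original.addLong(42L);

        OrcProto.BloomFilter proto = OrcProto.BloomFilter.newBuilder()
            .setNumHashFunctions(original.getNumHashFunctions())
            .addAllBitset(Longs.asList(original.getBitSet()))
            .build();

        BloomFilterIO restored = new BloomFilterIO(proto);
        System.out.println(restored.testLong(42L)); // expected: true
      }
    }
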
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/BooleanColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/BooleanColumnStatistics.java b/orc/src/java/org/apache/hive/orc/BooleanColumnStatistics.java
new file mode 100644
index 0000000..14fc6cf
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/BooleanColumnStatistics.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+/**
+ * Statistics for boolean columns.
+ */
+public interface BooleanColumnStatistics extends ColumnStatistics {
+ long getFalseCount();
+
+ long getTrueCount();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/ColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/ColumnStatistics.java b/orc/src/java/org/apache/hive/orc/ColumnStatistics.java
new file mode 100644
index 0000000..5ab8f55
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/ColumnStatistics.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+/**
+ * Statistics that are available for all types of columns.
+ */
+public interface ColumnStatistics {
+ /**
+ * Get the number of values in this column. It will differ from the number
+ * of rows because of NULL values and repeated values.
+ * @return the number of values
+ */
+ long getNumberOfValues();
+
+ /**
+ * Returns true if there are nulls in the scope of column statistics.
+ * @return true if null present else false
+ */
+ boolean hasNull();
+}
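
A reading sketch (not part of this commit) that surfaces both methods for
every column of a file. OrcFile.createReader, readerOptions, and the path
are illustrative assumptions about the surrounding API.

    // Hypothetical stats scan: one line per column of the file.
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;

    import org.apache.hive.orc.ColumnStatistics;
    import org.apache.hive.orc.OrcFile;
    import org.apache.hive.orc.Reader;

    public class ColumnStatsSketch {
      public static void main(String[] args) throws Exception {
        Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
            OrcFile.readerOptions(new Configuration()));
        ColumnStatistics[] stats = reader.getStatistics();
        for (int col = 0; col < stats.length; ++col) {
          System.out.println("column " + col + ": values="
              + stats[col].getNumberOfValues() + " hasNull=" + stats[col].hasNull());
        }
      }
    }
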
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/CompressionCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/CompressionCodec.java b/orc/src/java/org/apache/hive/orc/CompressionCodec.java
new file mode 100644
index 0000000..eee8dc3
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/CompressionCodec.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.EnumSet;
+
+import javax.annotation.Nullable;
+
+public interface CompressionCodec {
+
+ enum Modifier {
+ /* speed/compression tradeoffs */
+ FASTEST,
+ FAST,
+ DEFAULT,
+ /* data sensitivity modifiers */
+ TEXT,
+ BINARY
+ }
+
+ /**
+ * Compress the in buffer to the out buffer.
+ * @param in the bytes to compress
+ * @param out the uncompressed bytes
+ * @param overflow put any additional bytes here
+ * @return true if the output is smaller than input
+ * @throws IOException
+ */
+ boolean compress(ByteBuffer in, ByteBuffer out, ByteBuffer overflow
+ ) throws IOException;
+
+ /**
+ * Decompress the in buffer to the out buffer.
+ * @param in the bytes to decompress
+ * @param out the decompressed bytes
+ * @throws IOException
+ */
+ void decompress(ByteBuffer in, ByteBuffer out) throws IOException;
+
+ /**
+ * Produce a modified compression codec if the underlying algorithm allows
+ * modification.
+ *
+ * This does not modify the current object, but returns a new object if
+ * modifications are possible. Returns the same object if no modifications
+ * are possible.
+ * @param modifiers compression modifiers
+ * @return codec for use after optional modification
+ */
+ CompressionCodec modify(@Nullable EnumSet<Modifier> modifiers);
+
+}
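
A short sketch (not part of this commit) of the modify() contract described
above; the helper name is an assumption.

    // Ask a codec for a variant tuned for fast compression of textual data.
    // Per the javadoc, the original codec is untouched and may be returned
    // unchanged if the algorithm supports no modifiers.
    import java.util.EnumSet;

    import org.apache.hive.orc.CompressionCodec;

    public class CodecModifySketch {
      static CompressionCodec tuneForText(CompressionCodec codec) {
        return codec.modify(EnumSet.of(CompressionCodec.Modifier.FAST,
                                       CompressionCodec.Modifier.TEXT));
      }
    }
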
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/CompressionKind.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/CompressionKind.java b/orc/src/java/org/apache/hive/orc/CompressionKind.java
new file mode 100644
index 0000000..0d78642
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/CompressionKind.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+/**
+ * An enumeration that lists the generic compression algorithms that
+ * can be applied to ORC files.
+ */
+public enum CompressionKind {
+ NONE, ZLIB, SNAPPY, LZO
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/DataReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/DataReader.java b/orc/src/java/org/apache/hive/orc/DataReader.java
new file mode 100644
index 0000000..091a5b9
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/DataReader.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.hive.orc.impl.OrcIndex;
+
+/** An abstract data reader that IO formats can use to read bytes from underlying storage. */
+public interface DataReader extends AutoCloseable {
+
+ /** Opens the DataReader, making it ready to use. */
+ void open() throws IOException;
+
+ OrcIndex readRowIndex(StripeInformation stripe,
+ OrcProto.StripeFooter footer,
+ boolean[] included, OrcProto.RowIndex[] indexes,
+ boolean[] sargColumns,
+ OrcProto.BloomFilterIndex[] bloomFilterIndices
+ ) throws IOException;
+
+ OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException;
+
+ /** Reads the data.
+ *
+ * Note that for cases such as zero-copy reads, the caller must release the
+ * disk ranges produced once done with them. Call isTrackingDiskRanges to find
+ * out if this is needed.
+ * @param range List of disk ranges to read. Ranges that already have data will be ignored.
+ * @param baseOffset Base offset from the start of the file of the ranges in disk range list.
+ * @param doForceDirect Whether the data should be read into direct buffers.
+ * @return New or modified list of DiskRange-s, where all the ranges are filled with data.
+ */
+ DiskRangeList readFileData(
+ DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException;
+
+
+ /**
+ * Whether the user should release buffers created by readFileData. See readFileData javadoc.
+ */
+ boolean isTrackingDiskRanges();
+
+ /**
+ * Releases buffers created by readFileData. See readFileData javadoc.
+ * @param toRelease The buffer to release.
+ */
+ void releaseBuffer(ByteBuffer toRelease);
+
+ /**
+ * Clone the entire state of the DataReader with the assumption that the
+ * clone will be closed at a different time. Thus, any file handles in the
+ * implementation need to be cloned.
+ * @return a new instance
+ */
+ DataReader clone();
+
+ @Override
+ public void close() throws IOException;
+}
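
A sketch (not part of this commit) of the release contract spelled out in the
javadoc above. The DiskRangeList traversal via its public next field and
getData() follows the hive-common API; the helper itself is an assumption.

    // Read ranges, consume them, and release buffers only when the reader
    // tracks them, per isTrackingDiskRanges().
    import org.apache.hadoop.hive.common.io.DiskRangeList;

    import org.apache.hive.orc.DataReader;

    public class DataReaderContractSketch {
      static void readAndRelease(DataReader reader, DiskRangeList ranges,
                                 long baseOffset) throws Exception {
        DiskRangeList filled = reader.readFileData(ranges, baseOffset, false);
        try {
          // ... consume the filled ranges here ...
        } finally {
          if (reader.isTrackingDiskRanges()) {
            for (DiskRangeList r = filled; r != null; r = r.next) {
              reader.releaseBuffer(r.getData());
            }
          }
          reader.close();
        }
      }
    }
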
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/DateColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/DateColumnStatistics.java b/orc/src/java/org/apache/hive/orc/DateColumnStatistics.java
new file mode 100644
index 0000000..b03dcec
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/DateColumnStatistics.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import java.util.Date;
+
+/**
+ * Statistics for DATE columns.
+ */
+public interface DateColumnStatistics extends ColumnStatistics {
+ /**
+ * Get the minimum value for the column.
+ * @return minimum value
+ */
+ Date getMinimum();
+
+ /**
+ * Get the maximum value for the column.
+ * @return maximum value
+ */
+ Date getMaximum();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/DecimalColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/DecimalColumnStatistics.java b/orc/src/java/org/apache/hive/orc/DecimalColumnStatistics.java
new file mode 100644
index 0000000..4dbbc12
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/DecimalColumnStatistics.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+
+/**
+ * Statistics for decimal columns.
+ */
+public interface DecimalColumnStatistics extends ColumnStatistics {
+
+ /**
+ * Get the minimum value for the column.
+ * @return the minimum value
+ */
+ HiveDecimal getMinimum();
+
+ /**
+ * Get the maximum value for the column.
+ * @return the maximum value
+ */
+ HiveDecimal getMaximum();
+
+ /**
+ * Get the sum of the values of the column.
+ * @return the sum
+ */
+ HiveDecimal getSum();
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/DoubleColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/DoubleColumnStatistics.java b/orc/src/java/org/apache/hive/orc/DoubleColumnStatistics.java
new file mode 100644
index 0000000..5f2d426
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/DoubleColumnStatistics.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+/**
+ * Statistics for float and double columns.
+ */
+public interface DoubleColumnStatistics extends ColumnStatistics {
+
+ /**
+ * Get the smallest value in the column. Only defined if getNumberOfValues
+ * is non-zero.
+ * @return the minimum
+ */
+ double getMinimum();
+
+ /**
+ * Get the largest value in the column. Only defined if getNumberOfValues
+ * is non-zero.
+ * @return the maximum
+ */
+ double getMaximum();
+
+ /**
+ * Get the sum of the values in the column.
+ * @return the sum
+ */
+ double getSum();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/FileFormatException.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/FileFormatException.java b/orc/src/java/org/apache/hive/orc/FileFormatException.java
new file mode 100644
index 0000000..30356c3
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/FileFormatException.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import java.io.IOException;
+
+/**
+ * Thrown when an invalid file format is encountered.
+ */
+public class FileFormatException extends IOException {
+
+ public FileFormatException(String errMsg) {
+ super(errMsg);
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/FileMetadata.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/FileMetadata.java b/orc/src/java/org/apache/hive/orc/FileMetadata.java
new file mode 100644
index 0000000..acb8a78
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/FileMetadata.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import java.util.List;
+
+/**
+ * Cached file metadata. Right now, it caches everything; we don't actually have to store all
+ * the protobuf structs, we could store just what we need, but that would require that
+ * ORC stop depending on them too. Luckily, they shouldn't be very big.
+ */
+public interface FileMetadata {
+ boolean isOriginalFormat();
+
+ List<StripeInformation> getStripes();
+
+ CompressionKind getCompressionKind();
+
+ int getCompressionBufferSize();
+
+ int getRowIndexStride();
+
+ int getColumnCount();
+
+ int getFlattenedColumnCount();
+
+ Object getFileKey();
+
+ List<Integer> getVersionList();
+
+ int getMetadataSize();
+
+ int getWriterVersionNum();
+
+ List<OrcProto.Type> getTypes();
+
+ List<OrcProto.StripeStatistics> getStripeStats();
+
+ long getContentLength();
+
+ long getNumberOfRows();
+
+ List<OrcProto.ColumnStatistics> getFileStats();
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/IntegerColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/IntegerColumnStatistics.java b/orc/src/java/org/apache/hive/orc/IntegerColumnStatistics.java
new file mode 100644
index 0000000..00d17eb
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/IntegerColumnStatistics.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+/**
+ * Statistics for all of the integer columns, such as byte, short, int, and
+ * long.
+ */
+public interface IntegerColumnStatistics extends ColumnStatistics {
+ /**
+ * Get the smallest value in the column. Only defined if getNumberOfValues
+ * is non-zero.
+ * @return the minimum
+ */
+ long getMinimum();
+
+ /**
+ * Get the largest value in the column. Only defined if getNumberOfValues
+ * is non-zero.
+ * @return the maximum
+ */
+ long getMaximum();
+
+ /**
+ * Is the sum defined? If the sum overflowed the counter this will be false.
+ * @return is the sum available
+ */
+ boolean isSumDefined();
+
+ /**
+ * Get the sum of the column. Only valid if isSumDefined returns true.
+ * @return the sum of the column
+ */
+ long getSum();
+}
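
A small hedged example of the overflow check above; it assumes the ColumnStatistics
instance comes from Reader.getStatistics() and simply prints what is defined:

import org.apache.hive.orc.ColumnStatistics;
import org.apache.hive.orc.IntegerColumnStatistics;

public class IntegerStatsExample {
  // Prints min/max and, only when the counter did not overflow, the sum.
  static void describe(ColumnStatistics stats) {
    if (stats instanceof IntegerColumnStatistics) {
      IntegerColumnStatistics intStats = (IntegerColumnStatistics) stats;
      if (intStats.getNumberOfValues() > 0) {
        System.out.println("min=" + intStats.getMinimum()
            + " max=" + intStats.getMaximum());
      }
      if (intStats.isSumDefined()) {
        System.out.println("sum=" + intStats.getSum());
      }
    }
  }
}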
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/OrcConf.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/OrcConf.java b/orc/src/java/org/apache/hive/orc/OrcConf.java
new file mode 100644
index 0000000..dc2f865
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/OrcConf.java
@@ -0,0 +1,193 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import org.apache.hadoop.conf.Configuration;
+
+import java.util.Properties;
+
+/**
+ * Define the configuration properties that Orc understands.
+ */
+public enum OrcConf {
+ STRIPE_SIZE("orc.stripe.size", "hive.exec.orc.default.stripe.size",
+ 64L * 1024 * 1024,
+ "Define the default ORC stripe size, in bytes."),
+ BLOCK_SIZE("orc.block.size", "hive.exec.orc.default.block.size",
+ 256L * 1024 * 1024,
+ "Define the default file system block size for ORC files."),
+ ENABLE_INDEXES("orc.create.index", "orc.create.index", true,
+ "Should the ORC writer create indexes as part of the file."),
+ ROW_INDEX_STRIDE("orc.row.index.stride",
+ "hive.exec.orc.default.row.index.stride", 10000,
+ "Define the default ORC index stride in number of rows. (Stride is the\n"+
+ " number of rows n index entry represents.)"),
+ BUFFER_SIZE("orc.compress.size", "hive.exec.orc.default.buffer.size",
+ 256 * 1024, "Define the default ORC buffer size, in bytes."),
+ BASE_DELTA_RATIO("orc.base.delta.ratio", "hive.exec.orc.base.delta.ratio", 8,
+ "The ratio of base writer and delta writer in terms of STRIPE_SIZE and BUFFER_SIZE."),
+ BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding",
+ true,
+ "Define whether stripes should be padded to the HDFS block boundaries."),
+ COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB",
+ "Define the default compression codec for ORC file"),
+ WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12",
+ "Define the version of the file to write. Possible values are 0.11 and\n"+
+ " 0.12. If this parameter is not defined, ORC will use the run\n" +
+ " length encoding (RLE) introduced in Hive 0.12."),
+ ENCODING_STRATEGY("orc.encoding.strategy", "hive.exec.orc.encoding.strategy",
+ "SPEED",
+ "Define the encoding strategy to use while writing data. Changing this\n"+
+ "will only affect the light weight encoding for integers. This\n" +
+ "flag will not change the compression level of higher level\n" +
+ "compression codec (like ZLIB)."),
+ COMPRESSION_STRATEGY("orc.compression.strategy",
+ "hive.exec.orc.compression.strategy", "SPEED",
+ "Define the compression strategy to use while writing data.\n" +
+ "This changes the compression level of higher level compression\n" +
+ "codec (like ZLIB)."),
+ BLOCK_PADDING_TOLERANCE("orc.block.padding.tolerance",
+ "hive.exec.orc.block.padding.tolerance", 0.05,
+ "Define the tolerance for block padding as a decimal fraction of\n" +
+ "stripe size (for example, the default value 0.05 is 5% of the\n" +
+ "stripe size). For the defaults of 64Mb ORC stripe and 256Mb HDFS\n" +
+ "blocks, the default block padding tolerance of 5% will\n" +
+ "reserve a maximum of 3.2Mb for padding within the 256Mb block.\n" +
+ "In that case, if the available size within the block is more than\n"+
+ "3.2Mb, a new smaller stripe will be inserted to fit within that\n" +
+ "space. This will make sure that no stripe written will block\n" +
+ " boundaries and cause remote reads within a node local task."),
+ BLOOM_FILTER_FPP("orc.bloom.filter.fpp", "orc.default.bloom.fpp", 0.05,
+ "Define the default false positive probability for bloom filters."),
+ USE_ZEROCOPY("orc.use.zerocopy", "hive.exec.orc.zerocopy", false,
+ "Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)"),
+ SKIP_CORRUPT_DATA("orc.skip.corrupt.data", "hive.exec.orc.skip.corrupt.data",
+ false,
+ "If ORC reader encounters corrupt data, this value will be used to\n" +
+ "determine whether to skip the corrupt data or throw exception.\n" +
+ "The default behavior is to throw exception."),
+ MEMORY_POOL("orc.memory.pool", "hive.exec.orc.memory.pool", 0.5,
+ "Maximum fraction of heap that can be used by ORC file writers"),
+ DICTIONARY_KEY_SIZE_THRESHOLD("orc.dictionary.key.threshold",
+ "hive.exec.orc.dictionary.key.size.threshold",
+ 0.8,
+ "If the number of distinct keys in a dictionary is greater than this\n" +
+ "fraction of the total number of non-null rows, turn off \n" +
+ "dictionary encoding. Use 1 to always use dictionary encoding."),
+ ROW_INDEX_STRIDE_DICTIONARY_CHECK("orc.dictionary.early.check",
+ "hive.orc.row.index.stride.dictionary.check",
+ true,
+ "If enabled dictionary check will happen after first row index stride\n" +
+ "(default 10000 rows) else dictionary check will happen before\n" +
+ "writing first stripe. In both cases, the decision to use\n" +
+ "dictionary or not will be retained thereafter."),
+ BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
+ "", "List of columns to create bloom filters for when writing.")
+ ;
+
+ private final String attribute;
+ private final String hiveConfName;
+ private final Object defaultValue;
+ private final String description;
+
+ OrcConf(String attribute,
+ String hiveConfName,
+ Object defaultValue,
+ String description) {
+ this.attribute = attribute;
+ this.hiveConfName = hiveConfName;
+ this.defaultValue = defaultValue;
+ this.description = description;
+ }
+
+ public String getAttribute() {
+ return attribute;
+ }
+
+ public String getHiveConfName() {
+ return hiveConfName;
+ }
+
+ public Object getDefaultValue() {
+ return defaultValue;
+ }
+
+ public String getDescription() {
+ return description;
+ }
+
+ private String lookupValue(Properties tbl, Configuration conf) {
+ String result = null;
+ if (tbl != null) {
+ result = tbl.getProperty(attribute);
+ }
+ if (result == null && conf != null) {
+ result = conf.get(attribute);
+ if (result == null) {
+ result = conf.get(hiveConfName);
+ }
+ }
+ return result;
+ }
+
+ public long getLong(Properties tbl, Configuration conf) {
+ String value = lookupValue(tbl, conf);
+ if (value != null) {
+ return Long.parseLong(value);
+ }
+ return ((Number) defaultValue).longValue();
+ }
+
+ public long getLong(Configuration conf) {
+ return getLong(null, conf);
+ }
+
+ public String getString(Properties tbl, Configuration conf) {
+ String value = lookupValue(tbl, conf);
+ return value == null ? (String) defaultValue : value;
+ }
+
+ public String getString(Configuration conf) {
+ return getString(null, conf);
+ }
+
+ public boolean getBoolean(Properties tbl, Configuration conf) {
+ String value = lookupValue(tbl, conf);
+ if (value != null) {
+ return Boolean.parseBoolean(value);
+ }
+ return (Boolean) defaultValue;
+ }
+
+ public boolean getBoolean(Configuration conf) {
+ return getBoolean(null, conf);
+ }
+
+ public double getDouble(Properties tbl, Configuration conf) {
+ String value = lookupValue(tbl, conf);
+ if (value != null) {
+ return Double.parseDouble(value);
+ }
+ return ((Number) defaultValue).doubleValue();
+ }
+
+ public double getDouble(Configuration conf) {
+ return getDouble(null, conf);
+ }
+}
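
A minimal sketch of the lookup precedence implemented by lookupValue() above: a table
property wins over the Configuration, the "orc.*" attribute wins over the legacy Hive
name, and the built-in default applies otherwise. The property values are illustrative:

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hive.orc.OrcConf;

public class OrcConfLookup {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.set("hive.exec.orc.default.stripe.size", "33554432");   // legacy Hive name

    Properties tableProps = new Properties();
    tableProps.setProperty("orc.compress", "SNAPPY");            // table-level override

    long stripeSize = OrcConf.STRIPE_SIZE.getLong(tableProps, conf);  // 33554432
    String codec = OrcConf.COMPRESS.getString(tableProps, conf);      // "SNAPPY"
    double fpp = OrcConf.BLOOM_FILTER_FPP.getDouble(null, conf);      // default 0.05
    System.out.println(stripeSize + " " + codec + " " + fpp);
  }
}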
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/OrcFile.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/OrcFile.java b/orc/src/java/org/apache/hive/orc/OrcFile.java
new file mode 100644
index 0000000..5670a61
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/OrcFile.java
@@ -0,0 +1,574 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import java.io.IOException;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hive.orc.impl.MemoryManager;
+import org.apache.hive.orc.impl.OrcTail;
+import org.apache.hive.orc.impl.ReaderImpl;
+import org.apache.hive.orc.impl.WriterImpl;
+
+/**
+ * Contains factory methods to read or write ORC files.
+ */
+public class OrcFile {
+ public static final String MAGIC = "ORC";
+
+ /**
+ * Create a version number for the ORC file format, so that we can add
+ * non-forward compatible changes in the future. To make it easier for users
+ * to understand the version numbers, we use the Hive release number that
+ * first wrote that version of ORC files.
+ *
+ * Thus, if you add new encodings or other non-forward compatible changes
+ * to ORC files, which prevent the old reader from reading the new format,
+ * you should change these variables to reflect the next Hive release number.
+ * Non-forward compatible changes should never be added in patch releases.
+ *
+ * Do not make any changes that break backwards compatibility, which would
+ * prevent the new reader from reading ORC files generated by any released
+ * version of Hive.
+ */
+ public enum Version {
+ V_0_11("0.11", 0, 11),
+ V_0_12("0.12", 0, 12);
+
+ public static final Version CURRENT = V_0_12;
+
+ private final String name;
+ private final int major;
+ private final int minor;
+
+ Version(String name, int major, int minor) {
+ this.name = name;
+ this.major = major;
+ this.minor = minor;
+ }
+
+ public static Version byName(String name) {
+ for(Version version: values()) {
+ if (version.name.equals(name)) {
+ return version;
+ }
+ }
+ throw new IllegalArgumentException("Unknown ORC version " + name);
+ }
+
+ /**
+ * Get the human readable name for the version.
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * Get the major version number.
+ */
+ public int getMajor() {
+ return major;
+ }
+
+ /**
+ * Get the minor version number.
+ */
+ public int getMinor() {
+ return minor;
+ }
+ }
+
+ /**
+ * Records the version of the writer in terms of which bugs have been fixed.
+ * For writer bugs whose output old readers can still read correctly,
+ * bump this version instead of the file format Version.
+ */
+ public enum WriterVersion {
+ ORIGINAL(0),
+ HIVE_8732(1), // corrupted stripe/file maximum column statistics
+ HIVE_4243(2), // use real column names from Hive tables
+ HIVE_12055(3), // vectorized writer
+ HIVE_13083(4), // decimal writer updating present stream wrongly
+
+ // Don't use any magic numbers here except for the below:
+ FUTURE(Integer.MAX_VALUE); // a version from a future writer
+
+ private final int id;
+
+ public int getId() {
+ return id;
+ }
+
+ WriterVersion(int id) {
+ this.id = id;
+ }
+
+ private static final WriterVersion[] values;
+ static {
+ // Assumes few non-negative values close to zero.
+ int max = Integer.MIN_VALUE;
+ for (WriterVersion v : WriterVersion.values()) {
+ if (v.id < 0) throw new AssertionError();
+ if (v.id > max && FUTURE.id != v.id) {
+ max = v.id;
+ }
+ }
+ values = new WriterVersion[max + 1];
+ for (WriterVersion v : WriterVersion.values()) {
+ if (v.id < values.length) {
+ values[v.id] = v;
+ }
+ }
+ }
+
+ /**
+ * Convert the integer from OrcProto.PostScript.writerVersion
+ * to the enumeration with unknown versions being mapped to FUTURE.
+ * @param val the serialized writer version
+ * @return the corresponding enumeration value
+ */
+ public static WriterVersion from(int val) {
+ if (val >= values.length) {
+ return FUTURE;
+ }
+ return values[val];
+ }
+ }
+ public static final WriterVersion CURRENT_WRITER = WriterVersion.HIVE_13083;
+
+ public enum EncodingStrategy {
+ SPEED, COMPRESSION
+ }
+
+ public enum CompressionStrategy {
+ SPEED, COMPRESSION
+ }
+
+ // unused
+ protected OrcFile() {}
+
+ public static class ReaderOptions {
+ private final Configuration conf;
+ private FileSystem filesystem;
+ private long maxLength = Long.MAX_VALUE;
+ private OrcTail orcTail;
+ // TODO: We can generalize the FileMetadata interface. Make OrcTail implement the FileMetadata
+ // interface and remove this class altogether. Both footer caching and LLAP caching just need
+ // OrcTail. For now, keeping this around to avoid complex surgery.
+ private FileMetadata fileMetadata;
+
+ public ReaderOptions(Configuration conf) {
+ this.conf = conf;
+ }
+
+ public ReaderOptions filesystem(FileSystem fs) {
+ this.filesystem = fs;
+ return this;
+ }
+
+ public ReaderOptions maxLength(long val) {
+ maxLength = val;
+ return this;
+ }
+
+ public ReaderOptions orcTail(OrcTail tail) {
+ this.orcTail = tail;
+ return this;
+ }
+
+ public Configuration getConfiguration() {
+ return conf;
+ }
+
+ public FileSystem getFilesystem() {
+ return filesystem;
+ }
+
+ public long getMaxLength() {
+ return maxLength;
+ }
+
+ public OrcTail getOrcTail() {
+ return orcTail;
+ }
+
+ public ReaderOptions fileMetadata(final FileMetadata metadata) {
+ fileMetadata = metadata;
+ return this;
+ }
+
+ public FileMetadata getFileMetadata() {
+ return fileMetadata;
+ }
+ }
+
+ public static ReaderOptions readerOptions(Configuration conf) {
+ return new ReaderOptions(conf);
+ }
+
+ public static Reader createReader(Path path,
+ ReaderOptions options) throws IOException {
+ return new ReaderImpl(path, options);
+ }
+
+ public interface WriterContext {
+ Writer getWriter();
+ }
+
+ public interface WriterCallback {
+ void preStripeWrite(WriterContext context) throws IOException;
+ void preFooterWrite(WriterContext context) throws IOException;
+ }
+
+ /**
+ * Options for creating ORC file writers.
+ */
+ public static class WriterOptions {
+ private final Configuration configuration;
+ private FileSystem fileSystemValue = null;
+ private TypeDescription schema = null;
+ private long stripeSizeValue;
+ private long blockSizeValue;
+ private int rowIndexStrideValue;
+ private int bufferSizeValue;
+ private boolean enforceBufferSize = false;
+ private boolean blockPaddingValue;
+ private CompressionKind compressValue;
+ private MemoryManager memoryManagerValue;
+ private Version versionValue;
+ private WriterCallback callback;
+ private EncodingStrategy encodingStrategy;
+ private CompressionStrategy compressionStrategy;
+ private double paddingTolerance;
+ private String bloomFilterColumns;
+ private double bloomFilterFpp;
+
+ protected WriterOptions(Properties tableProperties, Configuration conf) {
+ configuration = conf;
+ memoryManagerValue = getStaticMemoryManager(conf);
+ stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(tableProperties, conf);
+ blockSizeValue = OrcConf.BLOCK_SIZE.getLong(tableProperties, conf);
+ rowIndexStrideValue =
+ (int) OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf);
+ bufferSizeValue = (int) OrcConf.BUFFER_SIZE.getLong(tableProperties,
+ conf);
+ blockPaddingValue =
+ OrcConf.BLOCK_PADDING.getBoolean(tableProperties, conf);
+ compressValue =
+ CompressionKind.valueOf(OrcConf.COMPRESS.getString(tableProperties,
+ conf).toUpperCase());
+ String versionName = OrcConf.WRITE_FORMAT.getString(tableProperties,
+ conf);
+ versionValue = Version.byName(versionName);
+ String enString = OrcConf.ENCODING_STRATEGY.getString(tableProperties,
+ conf);
+ encodingStrategy = EncodingStrategy.valueOf(enString);
+
+ String compString =
+ OrcConf.COMPRESSION_STRATEGY.getString(tableProperties, conf);
+ compressionStrategy = CompressionStrategy.valueOf(compString);
+
+ paddingTolerance =
+ OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf);
+
+ bloomFilterColumns = OrcConf.BLOOM_FILTER_COLUMNS.getString(tableProperties,
+ conf);
+ bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties,
+ conf);
+ }
+
+ /**
+ * Provide the filesystem for the path, if the client has it available.
+ * If it is not provided, it will be found from the path.
+ */
+ public WriterOptions fileSystem(FileSystem value) {
+ fileSystemValue = value;
+ return this;
+ }
+
+ /**
+ * Set the stripe size for the file. The writer stores the contents of the
+ * stripe in memory until this memory limit is reached, at which point the stripe
+ * is flushed to the HDFS file and the next stripe is started.
+ */
+ public WriterOptions stripeSize(long value) {
+ stripeSizeValue = value;
+ return this;
+ }
+
+ /**
+ * Set the file system block size for the file. For optimal performance,
+ * set the block size to be a multiple of the stripe size.
+ */
+ public WriterOptions blockSize(long value) {
+ blockSizeValue = value;
+ return this;
+ }
+
+ /**
+ * Set the distance between entries in the row index. The minimum value is
+ * 1000 to prevent the index from overwhelming the data. If the stride is
+ * set to 0, no indexes will be included in the file.
+ */
+ public WriterOptions rowIndexStride(int value) {
+ rowIndexStrideValue = value;
+ return this;
+ }
+
+ /**
+ * The size of the memory buffers used for compressing and storing the
+ * stripe in memory. NOTE: the ORC writer may choose a smaller buffer
+ * size based on stripe size and number of columns for efficient stripe
+ * writing and memory utilization. To force the writer to use the requested
+ * buffer size, use enforceBufferSize().
+ */
+ public WriterOptions bufferSize(int value) {
+ bufferSizeValue = value;
+ return this;
+ }
+
+ /**
+ * Force the writer to use the requested buffer size instead of estimating
+ * buffer size based on stripe size and number of columns.
+ * See bufferSize() method for more info.
+ * Default: false
+ */
+ public WriterOptions enforceBufferSize() {
+ enforceBufferSize = true;
+ return this;
+ }
+
+ /**
+ * Sets whether the HDFS blocks are padded to prevent stripes from
+ * straddling blocks. Padding improves locality and thus the speed of
+ * reading, but costs space.
+ */
+ public WriterOptions blockPadding(boolean value) {
+ blockPaddingValue = value;
+ return this;
+ }
+
+ /**
+ * Sets the encoding strategy that is used to encode the data.
+ */
+ public WriterOptions encodingStrategy(EncodingStrategy strategy) {
+ encodingStrategy = strategy;
+ return this;
+ }
+
+ /**
+ * Sets the tolerance for block padding as a percentage of stripe size.
+ */
+ public WriterOptions paddingTolerance(double value) {
+ paddingTolerance = value;
+ return this;
+ }
+
+ /**
+ * Comma-separated list of column names for which bloom filters are to be created.
+ */
+ public WriterOptions bloomFilterColumns(String columns) {
+ bloomFilterColumns = columns;
+ return this;
+ }
+
+ /**
+ * Specify the false positive probability for bloom filters.
+ * @param fpp - false positive probability
+ * @return this
+ */
+ public WriterOptions bloomFilterFpp(double fpp) {
+ bloomFilterFpp = fpp;
+ return this;
+ }
+
+ /**
+ * Sets the generic compression that is used to compress the data.
+ */
+ public WriterOptions compress(CompressionKind value) {
+ compressValue = value;
+ return this;
+ }
+
+ /**
+ * Set the schema for the file. This is a required parameter.
+ * @param schema the schema for the file.
+ * @return this
+ */
+ public WriterOptions setSchema(TypeDescription schema) {
+ this.schema = schema;
+ return this;
+ }
+
+ /**
+ * Sets the version of the file that will be written.
+ */
+ public WriterOptions version(Version value) {
+ versionValue = value;
+ return this;
+ }
+
+ /**
+ * Add a listener for when the stripe and file are about to be closed.
+ * @param callback the object to be called when the stripe is closed
+ * @return this
+ */
+ public WriterOptions callback(WriterCallback callback) {
+ this.callback = callback;
+ return this;
+ }
+
+ /**
+ * A package local option to set the memory manager.
+ */
+ protected WriterOptions memory(MemoryManager value) {
+ memoryManagerValue = value;
+ return this;
+ }
+
+ public boolean getBlockPadding() {
+ return blockPaddingValue;
+ }
+
+ public long getBlockSize() {
+ return blockSizeValue;
+ }
+
+ public String getBloomFilterColumns() {
+ return bloomFilterColumns;
+ }
+
+ public FileSystem getFileSystem() {
+ return fileSystemValue;
+ }
+
+ public Configuration getConfiguration() {
+ return configuration;
+ }
+
+ public TypeDescription getSchema() {
+ return schema;
+ }
+
+ public long getStripeSize() {
+ return stripeSizeValue;
+ }
+
+ public CompressionKind getCompress() {
+ return compressValue;
+ }
+
+ public WriterCallback getCallback() {
+ return callback;
+ }
+
+ public Version getVersion() {
+ return versionValue;
+ }
+
+ public MemoryManager getMemoryManager() {
+ return memoryManagerValue;
+ }
+
+ public int getBufferSize() {
+ return bufferSizeValue;
+ }
+
+ public boolean isEnforceBufferSize() {
+ return enforceBufferSize;
+ }
+
+ public int getRowIndexStride() {
+ return rowIndexStrideValue;
+ }
+
+ public CompressionStrategy getCompressionStrategy() {
+ return compressionStrategy;
+ }
+
+ public EncodingStrategy getEncodingStrategy() {
+ return encodingStrategy;
+ }
+
+ public double getPaddingTolerance() {
+ return paddingTolerance;
+ }
+
+ public double getBloomFilterFpp() {
+ return bloomFilterFpp;
+ }
+ }
+
+ /**
+ * Create a set of writer options based on a configuration.
+ * @param conf the configuration to use for values
+ * @return A WriterOptions object that can be modified
+ */
+ public static WriterOptions writerOptions(Configuration conf) {
+ return new WriterOptions(null, conf);
+ }
+
+ /**
+ * Create a set of writer options based on a set of table properties and
+ * configuration.
+ * @param tableProperties the properties of the table
+ * @param conf the configuration of the query
+ * @return a WriterOptions object that can be modified
+ */
+ public static WriterOptions writerOptions(Properties tableProperties,
+ Configuration conf) {
+ return new WriterOptions(tableProperties, conf);
+ }
+
+ private static ThreadLocal<MemoryManager> memoryManager = null;
+
+ private static synchronized MemoryManager getStaticMemoryManager(
+ final Configuration conf) {
+ if (memoryManager == null) {
+ memoryManager = new ThreadLocal<MemoryManager>() {
+ @Override
+ protected MemoryManager initialValue() {
+ return new MemoryManager(conf);
+ }
+ };
+ }
+ return memoryManager.get();
+ }
+
+ /**
+ * Create an ORC file writer. This is the public interface for creating
+ * writers going forward and new options will only be added to this method.
+ * @param path filename to write to
+ * @param opts the options
+ * @return a new ORC file writer
+ * @throws IOException
+ */
+ public static Writer createWriter(Path path,
+ WriterOptions opts
+ ) throws IOException {
+ FileSystem fs = opts.getFileSystem() == null ?
+ path.getFileSystem(opts.getConfiguration()) : opts.getFileSystem();
+
+ return new WriterImpl(fs, path, opts);
+ }
+
+}
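
A hedged end-to-end sketch of the factory methods above. The schema-building calls
mirror the TypeDescription methods referenced elsewhere in this commit, the output path
is a placeholder, and writer.close() follows the Writer interface of the upstream ORC API:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hive.orc.CompressionKind;
import org.apache.hive.orc.OrcFile;
import org.apache.hive.orc.Reader;
import org.apache.hive.orc.TypeDescription;
import org.apache.hive.orc.Writer;

public class OrcFileExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    TypeDescription schema = TypeDescription.createStruct()
        .addField("id", TypeDescription.createLong())
        .addField("name", TypeDescription.createString());

    // Writer options start from the configuration defaults and can be overridden.
    OrcFile.WriterOptions opts = OrcFile.writerOptions(conf)
        .setSchema(schema)
        .compress(CompressionKind.ZLIB)
        .stripeSize(64L * 1024 * 1024);
    Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"), opts);
    // ... add VectorizedRowBatch instances here ...
    writer.close();

    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));
    System.out.println("rows: " + reader.getNumberOfRows());
  }
}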
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/OrcUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/OrcUtils.java b/orc/src/java/org/apache/hive/orc/OrcUtils.java
new file mode 100644
index 0000000..12cb1f7
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/OrcUtils.java
@@ -0,0 +1,623 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hive.orc.impl.ReaderImpl;
+
+import com.google.common.collect.Lists;
+
+public class OrcUtils {
+
+ /**
+ * Returns the selected columns as a boolean array with true set for the specified column names.
+ * The result contains a number of elements equal to the flattened number of columns.
+ * For example:
+ * selectedColumns - a,b,c
+ * allColumns - a,b,c,d
+ * If column c is a complex type, say list<string>, and the other types are primitives, then the
+ * result will be [false, true, true, true, true, false].
+ * Index 0 is the root element of the struct, which is set to false by default; indexes 1 and 2
+ * correspond to columns a and b. Indexes 3 and 4 correspond to column c, which is list<string>,
+ * and index 5 corresponds to column d. After flattening, list<string> gets 2 columns.
+ *
+ * @param selectedColumns - comma separated list of selected column names
+ * @param schema - object schema
+ * @return - boolean array with true value set for the specified column names
+ */
+ public static boolean[] includeColumns(String selectedColumns,
+ TypeDescription schema) {
+ int numFlattenedCols = schema.getMaximumId();
+ boolean[] results = new boolean[numFlattenedCols + 1];
+ if ("*".equals(selectedColumns)) {
+ Arrays.fill(results, true);
+ return results;
+ }
+ if (selectedColumns != null &&
+ schema.getCategory() == TypeDescription.Category.STRUCT) {
+ List<String> fieldNames = schema.getFieldNames();
+ List<TypeDescription> fields = schema.getChildren();
+ for (String column: selectedColumns.split((","))) {
+ TypeDescription col = findColumn(column, fieldNames, fields);
+ if (col != null) {
+ for(int i=col.getId(); i <= col.getMaximumId(); ++i) {
+ results[i] = true;
+ }
+ }
+ }
+ }
+ return results;
+ }
+
+ private static TypeDescription findColumn(String columnName,
+ List<String> fieldNames,
+ List<TypeDescription> fields) {
+ int i = 0;
+ for(String fieldName: fieldNames) {
+ if (fieldName.equalsIgnoreCase(columnName)) {
+ return fields.get(i);
+ } else {
+ i += 1;
+ }
+ }
+ return null;
+ }
+
+ public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr) {
+ List<OrcProto.Type> result = Lists.newArrayList();
+ appendOrcTypes(result, typeDescr);
+ return result;
+ }
+
+ private static void appendOrcTypes(List<OrcProto.Type> result, TypeDescription typeDescr) {
+ OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
+ List<TypeDescription> children = typeDescr.getChildren();
+ switch (typeDescr.getCategory()) {
+ case BOOLEAN:
+ type.setKind(OrcProto.Type.Kind.BOOLEAN);
+ break;
+ case BYTE:
+ type.setKind(OrcProto.Type.Kind.BYTE);
+ break;
+ case SHORT:
+ type.setKind(OrcProto.Type.Kind.SHORT);
+ break;
+ case INT:
+ type.setKind(OrcProto.Type.Kind.INT);
+ break;
+ case LONG:
+ type.setKind(OrcProto.Type.Kind.LONG);
+ break;
+ case FLOAT:
+ type.setKind(OrcProto.Type.Kind.FLOAT);
+ break;
+ case DOUBLE:
+ type.setKind(OrcProto.Type.Kind.DOUBLE);
+ break;
+ case STRING:
+ type.setKind(OrcProto.Type.Kind.STRING);
+ break;
+ case CHAR:
+ type.setKind(OrcProto.Type.Kind.CHAR);
+ type.setMaximumLength(typeDescr.getMaxLength());
+ break;
+ case VARCHAR:
+ type.setKind(OrcProto.Type.Kind.VARCHAR);
+ type.setMaximumLength(typeDescr.getMaxLength());
+ break;
+ case BINARY:
+ type.setKind(OrcProto.Type.Kind.BINARY);
+ break;
+ case TIMESTAMP:
+ type.setKind(OrcProto.Type.Kind.TIMESTAMP);
+ break;
+ case DATE:
+ type.setKind(OrcProto.Type.Kind.DATE);
+ break;
+ case DECIMAL:
+ type.setKind(OrcProto.Type.Kind.DECIMAL);
+ type.setPrecision(typeDescr.getPrecision());
+ type.setScale(typeDescr.getScale());
+ break;
+ case LIST:
+ type.setKind(OrcProto.Type.Kind.LIST);
+ type.addSubtypes(children.get(0).getId());
+ break;
+ case MAP:
+ type.setKind(OrcProto.Type.Kind.MAP);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
+ break;
+ case STRUCT:
+ type.setKind(OrcProto.Type.Kind.STRUCT);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
+ for(String field: typeDescr.getFieldNames()) {
+ type.addFieldNames(field);
+ }
+ break;
+ case UNION:
+ type.setKind(OrcProto.Type.Kind.UNION);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown category: " +
+ typeDescr.getCategory());
+ }
+ result.add(type.build());
+ if (children != null) {
+ for(TypeDescription child: children) {
+ appendOrcTypes(result, child);
+ }
+ }
+ }
+
+ /**
+ * NOTE: This method ignores the subtype numbers in the TypeDescription and rebuilds the subtype
+ * numbers based on the length of the result list being appended.
+ *
+ * @param result
+ * @param typeDescr
+ */
+ public static void appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
+ TypeDescription typeDescr) {
+
+ int subtype = result.size();
+ OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
+ boolean needsAdd = true;
+ List<TypeDescription> children = typeDescr.getChildren();
+ switch (typeDescr.getCategory()) {
+ case BOOLEAN:
+ type.setKind(OrcProto.Type.Kind.BOOLEAN);
+ break;
+ case BYTE:
+ type.setKind(OrcProto.Type.Kind.BYTE);
+ break;
+ case SHORT:
+ type.setKind(OrcProto.Type.Kind.SHORT);
+ break;
+ case INT:
+ type.setKind(OrcProto.Type.Kind.INT);
+ break;
+ case LONG:
+ type.setKind(OrcProto.Type.Kind.LONG);
+ break;
+ case FLOAT:
+ type.setKind(OrcProto.Type.Kind.FLOAT);
+ break;
+ case DOUBLE:
+ type.setKind(OrcProto.Type.Kind.DOUBLE);
+ break;
+ case STRING:
+ type.setKind(OrcProto.Type.Kind.STRING);
+ break;
+ case CHAR:
+ type.setKind(OrcProto.Type.Kind.CHAR);
+ type.setMaximumLength(typeDescr.getMaxLength());
+ break;
+ case VARCHAR:
+ type.setKind(OrcProto.Type.Kind.VARCHAR);
+ type.setMaximumLength(typeDescr.getMaxLength());
+ break;
+ case BINARY:
+ type.setKind(OrcProto.Type.Kind.BINARY);
+ break;
+ case TIMESTAMP:
+ type.setKind(OrcProto.Type.Kind.TIMESTAMP);
+ break;
+ case DATE:
+ type.setKind(OrcProto.Type.Kind.DATE);
+ break;
+ case DECIMAL:
+ type.setKind(OrcProto.Type.Kind.DECIMAL);
+ type.setPrecision(typeDescr.getPrecision());
+ type.setScale(typeDescr.getScale());
+ break;
+ case LIST:
+ type.setKind(OrcProto.Type.Kind.LIST);
+ type.addSubtypes(++subtype);
+ result.add(type.build());
+ needsAdd = false;
+ appendOrcTypesRebuildSubtypes(result, children.get(0));
+ break;
+ case MAP:
+ {
+ // Make room for MAP type.
+ result.add(null);
+
+ // Add MAP type pair in order to determine their subtype values.
+ appendOrcTypesRebuildSubtypes(result, children.get(0));
+ int subtype2 = result.size();
+ appendOrcTypesRebuildSubtypes(result, children.get(1));
+ type.setKind(OrcProto.Type.Kind.MAP);
+ type.addSubtypes(subtype + 1);
+ type.addSubtypes(subtype2);
+ result.set(subtype, type.build());
+ needsAdd = false;
+ }
+ break;
+ case STRUCT:
+ {
+ List<String> fieldNames = typeDescr.getFieldNames();
+
+ // Make room for STRUCT type.
+ result.add(null);
+
+ List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
+ for(TypeDescription child: children) {
+ int fieldSubtype = result.size();
+ fieldSubtypes.add(fieldSubtype);
+ appendOrcTypesRebuildSubtypes(result, child);
+ }
+
+ type.setKind(OrcProto.Type.Kind.STRUCT);
+
+ for (int i = 0 ; i < fieldNames.size(); i++) {
+ type.addSubtypes(fieldSubtypes.get(i));
+ type.addFieldNames(fieldNames.get(i));
+ }
+ result.set(subtype, type.build());
+ needsAdd = false;
+ }
+ break;
+ case UNION:
+ {
+ // Make room for UNION type.
+ result.add(null);
+
+ List<Integer> unionSubtypes = new ArrayList<Integer>(children.size());
+ for(TypeDescription child: children) {
+ int unionSubtype = result.size();
+ unionSubtypes.add(unionSubtype);
+ appendOrcTypesRebuildSubtypes(result, child);
+ }
+
+ type.setKind(OrcProto.Type.Kind.UNION);
+ for (int i = 0 ; i < children.size(); i++) {
+ type.addSubtypes(unionSubtypes.get(i));
+ }
+ result.set(subtype, type.build());
+ needsAdd = false;
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown category: " + typeDescr.getCategory());
+ }
+ if (needsAdd) {
+ result.add(type.build());
+ }
+ }
+
+ /**
+ * NOTE: This method ignores the subtype numbers in the OrcProto.Type and rebuilds the subtype
+ * numbers based on the length of the result list being appended.
+ *
+ * @param result
+ * @param types
+ * @param columnId
+ */
+ public static int appendOrcTypesRebuildSubtypes(List<OrcProto.Type> result,
+ List<OrcProto.Type> types, int columnId) {
+
+ OrcProto.Type oldType = types.get(columnId++);
+
+ int subtype = result.size();
+ OrcProto.Type.Builder builder = OrcProto.Type.newBuilder();
+ boolean needsAdd = true;
+ switch (oldType.getKind()) {
+ case BOOLEAN:
+ builder.setKind(OrcProto.Type.Kind.BOOLEAN);
+ break;
+ case BYTE:
+ builder.setKind(OrcProto.Type.Kind.BYTE);
+ break;
+ case SHORT:
+ builder.setKind(OrcProto.Type.Kind.SHORT);
+ break;
+ case INT:
+ builder.setKind(OrcProto.Type.Kind.INT);
+ break;
+ case LONG:
+ builder.setKind(OrcProto.Type.Kind.LONG);
+ break;
+ case FLOAT:
+ builder.setKind(OrcProto.Type.Kind.FLOAT);
+ break;
+ case DOUBLE:
+ builder.setKind(OrcProto.Type.Kind.DOUBLE);
+ break;
+ case STRING:
+ builder.setKind(OrcProto.Type.Kind.STRING);
+ break;
+ case CHAR:
+ builder.setKind(OrcProto.Type.Kind.CHAR);
+ builder.setMaximumLength(oldType.getMaximumLength());
+ break;
+ case VARCHAR:
+ builder.setKind(OrcProto.Type.Kind.VARCHAR);
+ builder.setMaximumLength(oldType.getMaximumLength());
+ break;
+ case BINARY:
+ builder.setKind(OrcProto.Type.Kind.BINARY);
+ break;
+ case TIMESTAMP:
+ builder.setKind(OrcProto.Type.Kind.TIMESTAMP);
+ break;
+ case DATE:
+ builder.setKind(OrcProto.Type.Kind.DATE);
+ break;
+ case DECIMAL:
+ builder.setKind(OrcProto.Type.Kind.DECIMAL);
+ builder.setPrecision(oldType.getPrecision());
+ builder.setScale(oldType.getScale());
+ break;
+ case LIST:
+ builder.setKind(OrcProto.Type.Kind.LIST);
+ builder.addSubtypes(++subtype);
+ result.add(builder.build());
+ needsAdd = false;
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ break;
+ case MAP:
+ {
+ // Make room for MAP type.
+ result.add(null);
+
+ // Add MAP type pair in order to determine their subtype values.
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ int subtype2 = result.size();
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ builder.setKind(OrcProto.Type.Kind.MAP);
+ builder.addSubtypes(subtype + 1);
+ builder.addSubtypes(subtype2);
+ result.set(subtype, builder.build());
+ needsAdd = false;
+ }
+ break;
+ case STRUCT:
+ {
+ List<String> fieldNames = oldType.getFieldNamesList();
+
+ // Make room for STRUCT type.
+ result.add(null);
+
+ List<Integer> fieldSubtypes = new ArrayList<Integer>(fieldNames.size());
+ for(int i = 0 ; i < fieldNames.size(); i++) {
+ int fieldSubtype = result.size();
+ fieldSubtypes.add(fieldSubtype);
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ }
+
+ builder.setKind(OrcProto.Type.Kind.STRUCT);
+
+ for (int i = 0 ; i < fieldNames.size(); i++) {
+ builder.addSubtypes(fieldSubtypes.get(i));
+ builder.addFieldNames(fieldNames.get(i));
+ }
+ result.set(subtype, builder.build());
+ needsAdd = false;
+ }
+ break;
+ case UNION:
+ {
+ int subtypeCount = oldType.getSubtypesCount();
+
+ // Make room for UNION type.
+ result.add(null);
+
+ List<Integer> unionSubtypes = new ArrayList<Integer>(subtypeCount);
+ for(int i = 0 ; i < subtypeCount; i++) {
+ int unionSubtype = result.size();
+ unionSubtypes.add(unionSubtype);
+ columnId = appendOrcTypesRebuildSubtypes(result, types, columnId);
+ }
+
+ builder.setKind(OrcProto.Type.Kind.UNION);
+ for (int i = 0 ; i < subtypeCount; i++) {
+ builder.addSubtypes(unionSubtypes.get(i));
+ }
+ result.set(subtype, builder.build());
+ needsAdd = false;
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown category: " + oldType.getKind());
+ }
+ if (needsAdd) {
+ result.add(builder.build());
+ }
+ return columnId;
+ }
+
+ /**
+ * Translate the given rootColumn from the list of types to a TypeDescription.
+ * @param types all of the types
+ * @param rootColumn translate this type
+ * @return a new TypeDescription that matches the given rootColumn
+ */
+ public static
+ TypeDescription convertTypeFromProtobuf(List<OrcProto.Type> types,
+ int rootColumn) {
+ OrcProto.Type type = types.get(rootColumn);
+ switch (type.getKind()) {
+ case BOOLEAN:
+ return TypeDescription.createBoolean();
+ case BYTE:
+ return TypeDescription.createByte();
+ case SHORT:
+ return TypeDescription.createShort();
+ case INT:
+ return TypeDescription.createInt();
+ case LONG:
+ return TypeDescription.createLong();
+ case FLOAT:
+ return TypeDescription.createFloat();
+ case DOUBLE:
+ return TypeDescription.createDouble();
+ case STRING:
+ return TypeDescription.createString();
+ case CHAR:
+ case VARCHAR: {
+ TypeDescription result = type.getKind() == OrcProto.Type.Kind.CHAR ?
+ TypeDescription.createChar() : TypeDescription.createVarchar();
+ if (type.hasMaximumLength()) {
+ result.withMaxLength(type.getMaximumLength());
+ }
+ return result;
+ }
+ case BINARY:
+ return TypeDescription.createBinary();
+ case TIMESTAMP:
+ return TypeDescription.createTimestamp();
+ case DATE:
+ return TypeDescription.createDate();
+ case DECIMAL: {
+ TypeDescription result = TypeDescription.createDecimal();
+ if (type.hasScale()) {
+ result.withScale(type.getScale());
+ }
+ if (type.hasPrecision()) {
+ result.withPrecision(type.getPrecision());
+ }
+ return result;
+ }
+ case LIST:
+ return TypeDescription.createList(
+ convertTypeFromProtobuf(types, type.getSubtypes(0)));
+ case MAP:
+ return TypeDescription.createMap(
+ convertTypeFromProtobuf(types, type.getSubtypes(0)),
+ convertTypeFromProtobuf(types, type.getSubtypes(1)));
+ case STRUCT: {
+ TypeDescription result = TypeDescription.createStruct();
+ for(int f=0; f < type.getSubtypesCount(); ++f) {
+ result.addField(type.getFieldNames(f),
+ convertTypeFromProtobuf(types, type.getSubtypes(f)));
+ }
+ return result;
+ }
+ case UNION: {
+ TypeDescription result = TypeDescription.createUnion();
+ for(int f=0; f < type.getSubtypesCount(); ++f) {
+ result.addUnionChild(
+ convertTypeFromProtobuf(types, type.getSubtypes(f)));
+ }
+ return result;
+ }
+ }
+ throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
+ }
+
+ public static List<StripeInformation> convertProtoStripesToStripes(
+ List<OrcProto.StripeInformation> stripes) {
+ List<StripeInformation> result = new ArrayList<StripeInformation>(stripes.size());
+ for (OrcProto.StripeInformation info : stripes) {
+ result.add(new ReaderImpl.StripeInformationImpl(info));
+ }
+ return result;
+ }
+
+ public static List<TypeDescription> setTypeBuilderFromSchema(
+ OrcProto.Type.Builder type, TypeDescription schema) {
+ List<TypeDescription> children = schema.getChildren();
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ type.setKind(OrcProto.Type.Kind.BOOLEAN);
+ break;
+ case BYTE:
+ type.setKind(OrcProto.Type.Kind.BYTE);
+ break;
+ case SHORT:
+ type.setKind(OrcProto.Type.Kind.SHORT);
+ break;
+ case INT:
+ type.setKind(OrcProto.Type.Kind.INT);
+ break;
+ case LONG:
+ type.setKind(OrcProto.Type.Kind.LONG);
+ break;
+ case FLOAT:
+ type.setKind(OrcProto.Type.Kind.FLOAT);
+ break;
+ case DOUBLE:
+ type.setKind(OrcProto.Type.Kind.DOUBLE);
+ break;
+ case STRING:
+ type.setKind(OrcProto.Type.Kind.STRING);
+ break;
+ case CHAR:
+ type.setKind(OrcProto.Type.Kind.CHAR);
+ type.setMaximumLength(schema.getMaxLength());
+ break;
+ case VARCHAR:
+ type.setKind(OrcProto.Type.Kind.VARCHAR);
+ type.setMaximumLength(schema.getMaxLength());
+ break;
+ case BINARY:
+ type.setKind(OrcProto.Type.Kind.BINARY);
+ break;
+ case TIMESTAMP:
+ type.setKind(OrcProto.Type.Kind.TIMESTAMP);
+ break;
+ case DATE:
+ type.setKind(OrcProto.Type.Kind.DATE);
+ break;
+ case DECIMAL:
+ type.setKind(OrcProto.Type.Kind.DECIMAL);
+ type.setPrecision(schema.getPrecision());
+ type.setScale(schema.getScale());
+ break;
+ case LIST:
+ type.setKind(OrcProto.Type.Kind.LIST);
+ type.addSubtypes(children.get(0).getId());
+ break;
+ case MAP:
+ type.setKind(OrcProto.Type.Kind.MAP);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
+ break;
+ case STRUCT:
+ type.setKind(OrcProto.Type.Kind.STRUCT);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
+ for(String field: schema.getFieldNames()) {
+ type.addFieldNames(field);
+ }
+ break;
+ case UNION:
+ type.setKind(OrcProto.Type.Kind.UNION);
+ for(TypeDescription t: children) {
+ type.addSubtypes(t.getId());
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown category: " +
+ schema.getCategory());
+ }
+ return children;
+ }
+}
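
A short sketch of includeColumns() on the kind of schema described in its javadoc; the
field names and the expected output are illustrative:

import java.util.Arrays;
import org.apache.hive.orc.OrcUtils;
import org.apache.hive.orc.TypeDescription;

public class IncludeColumnsExample {
  public static void main(String[] args) {
    // a: bigint, b: string, c: list<string>, d: int
    TypeDescription schema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createLong())
        .addField("b", TypeDescription.createString())
        .addField("c", TypeDescription.createList(TypeDescription.createString()))
        .addField("d", TypeDescription.createInt());

    // Root is column 0; "c" flattens into the list column plus its element column.
    boolean[] included = OrcUtils.includeColumns("a,c", schema);
    System.out.println(Arrays.toString(included));
    // expected: [false, true, false, true, true, false]
  }
}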
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/Reader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/Reader.java b/orc/src/java/org/apache/hive/orc/Reader.java
new file mode 100644
index 0000000..61ce186
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/Reader.java
@@ -0,0 +1,375 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+
+/**
+ * The interface for reading ORC files.
+ *
+ * One Reader can support multiple concurrent RecordReaders.
+ */
+public interface Reader {
+
+ /**
+ * Get the number of rows in the file.
+ * @return the number of rows
+ */
+ long getNumberOfRows();
+
+ /**
+ * Get the deserialized data size of the file
+ * @return raw data size
+ */
+ long getRawDataSize();
+
+ /**
+ * Get the deserialized data size of the specified columns
+ * @param colNames
+ * @return raw data size of columns
+ */
+ long getRawDataSizeOfColumns(List<String> colNames);
+
+ /**
+ * Get the deserialized data size of the specified column ids.
+ * @param colIds - internal column id (check orcfiledump for column ids)
+ * @return raw data size of columns
+ */
+ long getRawDataSizeFromColIndices(List<Integer> colIds);
+
+ /**
+ * Get the user metadata keys.
+ * @return the set of metadata keys
+ */
+ List<String> getMetadataKeys();
+
+ /**
+ * Get a user metadata value.
+ * @param key a key given by the user
+ * @return the bytes associated with the given key
+ */
+ ByteBuffer getMetadataValue(String key);
+
+ /**
+ * Did the user set the given metadata value.
+ * @param key the key to check
+ * @return true if the metadata value was set
+ */
+ boolean hasMetadataValue(String key);
+
+ /**
+ * Get the compression kind.
+ * @return the kind of compression in the file
+ */
+ CompressionKind getCompressionKind();
+
+ /**
+ * Get the buffer size for the compression.
+ * @return number of bytes to buffer for the compression codec.
+ */
+ int getCompressionSize();
+
+ /**
+ * Get the number of rows per entry in the row index.
+ * @return the number of rows per entry in the row index, or 0 if there
+ * is no row index.
+ */
+ int getRowIndexStride();
+
+ /**
+ * Get the list of stripes.
+ * @return the information about the stripes in order
+ */
+ List<StripeInformation> getStripes();
+
+ /**
+ * Get the length of the file.
+ * @return the number of bytes in the file
+ */
+ long getContentLength();
+
+ /**
+ * Get the statistics about the columns in the file.
+ * @return the information about the column
+ */
+ ColumnStatistics[] getStatistics();
+
+ /**
+ * Get the type of rows in this ORC file.
+ */
+ TypeDescription getSchema();
+
+ /**
+ * Get the list of types contained in the file. The root type is the first
+ * type in the list.
+ * @return the list of flattened types
+ * @deprecated use getSchema instead
+ */
+ List<OrcProto.Type> getTypes();
+
+ /**
+ * Get the file format version.
+ */
+ OrcFile.Version getFileVersion();
+
+ /**
+ * Get the version of the writer of this file.
+ */
+ OrcFile.WriterVersion getWriterVersion();
+
+ /**
+ * Get the file tail (footer + postscript)
+ *
+ * @return - file tail
+ */
+ OrcProto.FileTail getFileTail();
+
+ /**
+ * Options for creating a RecordReader.
+ */
+ public static class Options {
+ private boolean[] include;
+ private long offset = 0;
+ private long length = Long.MAX_VALUE;
+ private SearchArgument sarg = null;
+ private String[] columnNames = null;
+ private Boolean useZeroCopy = null;
+ private Boolean skipCorruptRecords = null;
+ private TypeDescription schema = null;
+ private DataReader dataReader = null;
+
+ /**
+ * Set the list of columns to read.
+ * @param include a list of columns to read
+ * @return this
+ */
+ public Options include(boolean[] include) {
+ this.include = include;
+ return this;
+ }
+
+ /**
+ * Set the range of bytes to read
+ * @param offset the starting byte offset
+ * @param length the number of bytes to read
+ * @return this
+ */
+ public Options range(long offset, long length) {
+ this.offset = offset;
+ this.length = length;
+ return this;
+ }
+
+ /**
+ * Set the schema on read type description.
+ */
+ public Options schema(TypeDescription schema) {
+ this.schema = schema;
+ return this;
+ }
+
+ /**
+ * Set search argument for predicate push down.
+ * @param sarg the search argument
+ * @param columnNames the column names used by the search argument
+ * @return this
+ */
+ public Options searchArgument(SearchArgument sarg, String[] columnNames) {
+ this.sarg = sarg;
+ this.columnNames = columnNames;
+ return this;
+ }
+
+ /**
+ * Set whether to use zero copy from HDFS.
+ * @param value the new zero copy flag
+ * @return this
+ */
+ public Options useZeroCopy(boolean value) {
+ this.useZeroCopy = value;
+ return this;
+ }
+
+ public Options dataReader(DataReader value) {
+ this.dataReader = value;
+ return this;
+ }
+
+ /**
+ * Set whether to skip corrupt records.
+ * @param value the new skip corrupt records flag
+ * @return this
+ */
+ public Options skipCorruptRecords(boolean value) {
+ this.skipCorruptRecords = value;
+ return this;
+ }
+
+ public boolean[] getInclude() {
+ return include;
+ }
+
+ public long getOffset() {
+ return offset;
+ }
+
+ public long getLength() {
+ return length;
+ }
+
+ public TypeDescription getSchema() {
+ return schema;
+ }
+
+ public SearchArgument getSearchArgument() {
+ return sarg;
+ }
+
+ public String[] getColumnNames() {
+ return columnNames;
+ }
+
+ public long getMaxOffset() {
+ long result = offset + length;
+ if (result < 0) {
+ result = Long.MAX_VALUE;
+ }
+ return result;
+ }
+
+ public Boolean getUseZeroCopy() {
+ return useZeroCopy;
+ }
+
+ public Boolean getSkipCorruptRecords() {
+ return skipCorruptRecords;
+ }
+
+ public DataReader getDataReader() {
+ return dataReader;
+ }
+
+ public Options clone() {
+ Options result = new Options();
+ result.include = include;
+ result.offset = offset;
+ result.length = length;
+ result.sarg = sarg;
+ result.schema = schema;
+ result.columnNames = columnNames;
+ result.useZeroCopy = useZeroCopy;
+ result.skipCorruptRecords = skipCorruptRecords;
+ result.dataReader = dataReader == null ? null : dataReader.clone();
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buffer = new StringBuilder();
+ buffer.append("{include: ");
+ if (include == null) {
+ buffer.append("null");
+ } else {
+ buffer.append("[");
+ for(int i=0; i < include.length; ++i) {
+ if (i != 0) {
+ buffer.append(", ");
+ }
+ buffer.append(include[i]);
+ }
+ buffer.append("]");
+ }
+ buffer.append(", offset: ");
+ buffer.append(offset);
+ buffer.append(", length: ");
+ buffer.append(length);
+ if (sarg != null) {
+ buffer.append(", sarg: ");
+ buffer.append(sarg.toString());
+ buffer.append(", columns: [");
+ for(int i=0; i < columnNames.length; ++i) {
+ if (i != 0) {
+ buffer.append(", ");
+ }
+ buffer.append("'");
+ buffer.append(columnNames[i]);
+ buffer.append("'");
+ }
+ buffer.append("]");
+ }
+ if (schema != null) {
+ buffer.append(", schema: ");
+ schema.printToBuffer(buffer);
+ }
+ buffer.append("}");
+ return buffer.toString();
+ }
+ }
+
+ /**
+ * Create a RecordReader that reads everything with the default options.
+ * @return a new RecordReader
+ * @throws IOException
+ */
+ RecordReader rows() throws IOException;
+
+ /**
+ * Create a RecordReader that uses the options given.
+ * This method can't be named rows, because many callers used rows(null)
+ * before the rows() method was introduced.
+ * @param options the options to read with
+ * @return a new RecordReader
+ * @throws IOException
+ */
+ RecordReader rows(Options options) throws IOException;
+
+ /**
+ * @return List of integers representing the version of the file, in order from major to minor.
+ */
+ List<Integer> getVersionList();
+
+ /**
+ * @return the size of the file metadata, in bytes.
+ */
+ int getMetadataSize();
+
+ /**
+ * @return Stripe statistics, in original protobuf form.
+ */
+ List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics();
+
+ /**
+ * @return Stripe statistics.
+ */
+ List<StripeStatistics> getStripeStatistics() throws IOException;
+
+ /**
+ * @return File statistics, in original protobuf form.
+ */
+ List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics();
+
+ /**
+ * @return Serialized file metadata read from disk for the purposes of caching, etc.
+ */
+ ByteBuffer getSerializedFileFooter();
+}
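Not part of this patch: a minimal usage sketch for the Reader interface above. It assumes the usual OrcFile.createReader/readerOptions entry points and Hadoop's Configuration and Path classes; the file path is a placeholder.

    // Sketch only (not from this patch). Assumed imports: org.apache.hive.orc.OrcFile,
    // org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.Path.
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
                                         OrcFile.readerOptions(conf));
    System.out.println("schema:      " + reader.getSchema());
    System.out.println("compression: " + reader.getCompressionKind()
        + " (buffer " + reader.getCompressionSize() + " bytes)");
    System.out.println("stripes:     " + reader.getStripes().size());
    System.out.println("file length: " + reader.getContentLength() + " bytes");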
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/RecordReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/RecordReader.java b/orc/src/java/org/apache/hive/orc/RecordReader.java
new file mode 100644
index 0000000..f86fa0e
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/RecordReader.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * A row-by-row iterator for ORC files.
+ */
+public interface RecordReader {
+ /**
+ * Read the next row batch. The size of the batch to read cannot be
+ * controlled by the caller. The caller needs to look at
+ * VectorizedRowBatch.size of the returned object to know the batch
+ * size read.
+ * @param batch a row batch object to read into
+ * @return true if more rows were available to read
+ * @throws java.io.IOException
+ */
+ boolean nextBatch(VectorizedRowBatch batch) throws IOException;
+
+ /**
+ * Get the row number of the row that will be returned by the following
+ * call to nextBatch().
+ * @return the row number from 0 to the number of rows in the file
+ * @throws java.io.IOException
+ */
+ long getRowNumber() throws IOException;
+
+ /**
+ * Get the progress of the reader through the rows.
+ * @return a fraction between 0.0 and 1.0 of rows read
+ * @throws java.io.IOException
+ */
+ float getProgress() throws IOException;
+
+ /**
+ * Release the resources associated with the given reader.
+ * @throws java.io.IOException
+ */
+ void close() throws IOException;
+
+ /**
+ * Seek to a particular row number.
+ */
+ void seekToRow(long rowCount) throws IOException;
+}
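Not part of this patch: a sketch of the batch-read loop implied by the interface above. It reuses the `reader` and `conf` from the previous sketch and assumes TypeDescription.createRowBatch() is available for allocating the batch.

    // Sketch only. batch.size tells how many rows the reader filled in on each call.
    TypeDescription schema = reader.getSchema();
    VectorizedRowBatch batch = schema.createRowBatch();   // assumed factory method
    RecordReader rows = reader.rows(new Reader.Options()
        .range(0, reader.getContentLength())              // read the whole file
        .skipCorruptRecords(true));
    try {
      while (rows.nextBatch(batch)) {
        for (int r = 0; r < batch.size; ++r) {
          // consume row r of the current batch here
        }
      }
    } finally {
      rows.close();
    }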
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/StringColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/StringColumnStatistics.java b/orc/src/java/org/apache/hive/orc/StringColumnStatistics.java
new file mode 100644
index 0000000..8a81413
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/StringColumnStatistics.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+/**
+ * Statistics for string columns.
+ */
+public interface StringColumnStatistics extends ColumnStatistics {
+ /**
+ * Get the minimum string.
+ * @return the minimum
+ */
+ String getMinimum();
+
+ /**
+ * Get the maximum string.
+ * @return the maximum
+ */
+ String getMaximum();
+
+ /**
+ * Get the total length of all strings.
+ * @return the sum (total length)
+ */
+ long getSum();
+}
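Not part of this patch: a sketch of how these statistics surface through Reader.getStatistics(). The column index used here is a placeholder; the cast is only safe for string-typed columns.

    // Sketch only. Index 0 is the root struct in ORC's flattened column numbering;
    // 1 is a placeholder index for a string column in this particular file.
    ColumnStatistics[] stats = reader.getStatistics();
    if (stats[1] instanceof StringColumnStatistics) {
      StringColumnStatistics s = (StringColumnStatistics) stats[1];
      System.out.println("min=" + s.getMinimum() + " max=" + s.getMaximum()
          + " totalLength=" + s.getSum());
    }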
[20/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/DataReaderProperties.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/DataReaderProperties.java b/orc/src/java/org/apache/orc/impl/DataReaderProperties.java
deleted file mode 100644
index 22301e8..0000000
--- a/orc/src/java/org/apache/orc/impl/DataReaderProperties.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import com.google.common.base.Preconditions;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.orc.CompressionKind;
-
-import javax.annotation.Nullable;
-
-public final class DataReaderProperties {
-
- private final FileSystem fileSystem;
- private final Path path;
- private final CompressionKind compression;
- private final boolean zeroCopy;
- private final int typeCount;
- private final int bufferSize;
-
- private DataReaderProperties(Builder builder) {
- this.fileSystem = builder.fileSystem;
- this.path = builder.path;
- this.compression = builder.compression;
- this.zeroCopy = builder.zeroCopy;
- this.typeCount = builder.typeCount;
- this.bufferSize = builder.bufferSize;
- }
-
- public FileSystem getFileSystem() {
- return fileSystem;
- }
-
- public Path getPath() {
- return path;
- }
-
- public CompressionKind getCompression() {
- return compression;
- }
-
- public boolean getZeroCopy() {
- return zeroCopy;
- }
-
- public int getTypeCount() {
- return typeCount;
- }
-
- public int getBufferSize() {
- return bufferSize;
- }
-
- public static Builder builder() {
- return new Builder();
- }
-
- public static class Builder {
-
- private FileSystem fileSystem;
- private Path path;
- private CompressionKind compression;
- private boolean zeroCopy;
- private int typeCount;
- private int bufferSize;
-
- private Builder() {
-
- }
-
- public Builder withFileSystem(FileSystem fileSystem) {
- this.fileSystem = fileSystem;
- return this;
- }
-
- public Builder withPath(Path path) {
- this.path = path;
- return this;
- }
-
- public Builder withCompression(CompressionKind value) {
- this.compression = value;
- return this;
- }
-
- public Builder withZeroCopy(boolean zeroCopy) {
- this.zeroCopy = zeroCopy;
- return this;
- }
-
- public Builder withTypeCount(int value) {
- this.typeCount = value;
- return this;
- }
-
- public Builder withBufferSize(int value) {
- this.bufferSize = value;
- return this;
- }
-
- public DataReaderProperties build() {
- Preconditions.checkNotNull(fileSystem);
- Preconditions.checkNotNull(path);
-
- return new DataReaderProperties(this);
- }
-
- }
-}
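For reference, not part of the diff itself: a sketch of how the builder being removed here was typically populated. The values are placeholders, and the class presumably reappears under the renamed org.apache.hive.orc.impl package elsewhere in this patch.

    // Sketch only; the file system, path and sizes are placeholders.
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    DataReaderProperties props = DataReaderProperties.builder()
        .withFileSystem(fs)
        .withPath(new Path("/tmp/example.orc"))
        .withCompression(CompressionKind.ZLIB)
        .withZeroCopy(false)
        .withTypeCount(12)
        .withBufferSize(256 * 1024)
        .build();          // build() requires fileSystem and path to be set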
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/DirectDecompressionCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/DirectDecompressionCodec.java b/orc/src/java/org/apache/orc/impl/DirectDecompressionCodec.java
deleted file mode 100644
index 7e0110d..0000000
--- a/orc/src/java/org/apache/orc/impl/DirectDecompressionCodec.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import org.apache.orc.CompressionCodec;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-public interface DirectDecompressionCodec extends CompressionCodec {
- public boolean isAvailable();
- public void directDecompress(ByteBuffer in, ByteBuffer out) throws IOException;
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/DynamicByteArray.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/DynamicByteArray.java b/orc/src/java/org/apache/orc/impl/DynamicByteArray.java
deleted file mode 100644
index 986c2ac..0000000
--- a/orc/src/java/org/apache/orc/impl/DynamicByteArray.java
+++ /dev/null
@@ -1,303 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-
-import org.apache.hadoop.io.Text;
-
-/**
- * A class that is a growable array of bytes. Growth is managed in terms of
- * chunks that are allocated when needed.
- */
-public final class DynamicByteArray {
- static final int DEFAULT_CHUNKSIZE = 32 * 1024;
- static final int DEFAULT_NUM_CHUNKS = 128;
-
- private final int chunkSize; // our allocation sizes
- private byte[][] data; // the real data
- private int length; // max set element index +1
- private int initializedChunks = 0; // the number of chunks created
-
- public DynamicByteArray() {
- this(DEFAULT_NUM_CHUNKS, DEFAULT_CHUNKSIZE);
- }
-
- public DynamicByteArray(int numChunks, int chunkSize) {
- if (chunkSize == 0) {
- throw new IllegalArgumentException("bad chunksize");
- }
- this.chunkSize = chunkSize;
- data = new byte[numChunks][];
- }
-
- /**
- * Ensure that the given index is valid.
- */
- private void grow(int chunkIndex) {
- if (chunkIndex >= initializedChunks) {
- if (chunkIndex >= data.length) {
- int newSize = Math.max(chunkIndex + 1, 2 * data.length);
- byte[][] newChunk = new byte[newSize][];
- System.arraycopy(data, 0, newChunk, 0, data.length);
- data = newChunk;
- }
- for(int i=initializedChunks; i <= chunkIndex; ++i) {
- data[i] = new byte[chunkSize];
- }
- initializedChunks = chunkIndex + 1;
- }
- }
-
- public byte get(int index) {
- if (index >= length) {
- throw new IndexOutOfBoundsException("Index " + index +
- " is outside of 0.." +
- (length - 1));
- }
- int i = index / chunkSize;
- int j = index % chunkSize;
- return data[i][j];
- }
-
- public void set(int index, byte value) {
- int i = index / chunkSize;
- int j = index % chunkSize;
- grow(i);
- if (index >= length) {
- length = index + 1;
- }
- data[i][j] = value;
- }
-
- public int add(byte value) {
- int i = length / chunkSize;
- int j = length % chunkSize;
- grow(i);
- data[i][j] = value;
- int result = length;
- length += 1;
- return result;
- }
-
- /**
- * Copy a slice of a byte array into our buffer.
- * @param value the array to copy from
- * @param valueOffset the first location to copy from value
- * @param valueLength the number of bytes to copy from value
- * @return the offset of the start of the value
- */
- public int add(byte[] value, int valueOffset, int valueLength) {
- int i = length / chunkSize;
- int j = length % chunkSize;
- grow((length + valueLength) / chunkSize);
- int remaining = valueLength;
- while (remaining > 0) {
- int size = Math.min(remaining, chunkSize - j);
- System.arraycopy(value, valueOffset, data[i], j, size);
- remaining -= size;
- valueOffset += size;
- i += 1;
- j = 0;
- }
- int result = length;
- length += valueLength;
- return result;
- }
-
- /**
- * Read the entire stream into this array.
- * @param in the stream to read from
- * @throws IOException
- */
- public void readAll(InputStream in) throws IOException {
- int currentChunk = length / chunkSize;
- int currentOffset = length % chunkSize;
- grow(currentChunk);
- int currentLength = in.read(data[currentChunk], currentOffset,
- chunkSize - currentOffset);
- while (currentLength > 0) {
- length += currentLength;
- currentOffset = length % chunkSize;
- if (currentOffset == 0) {
- currentChunk = length / chunkSize;
- grow(currentChunk);
- }
- currentLength = in.read(data[currentChunk], currentOffset,
- chunkSize - currentOffset);
- }
- }
-
- /**
- * Byte compare a set of bytes against the bytes in this dynamic array.
- * @param other source of the other bytes
- * @param otherOffset start offset in the other array
- * @param otherLength number of bytes in the other array
- * @param ourOffset the offset in our array
- * @param ourLength the number of bytes in our array
- * @return negative for less, 0 for equal, positive for greater
- */
- public int compare(byte[] other, int otherOffset, int otherLength,
- int ourOffset, int ourLength) {
- int currentChunk = ourOffset / chunkSize;
- int currentOffset = ourOffset % chunkSize;
- int maxLength = Math.min(otherLength, ourLength);
- while (maxLength > 0 &&
- other[otherOffset] == data[currentChunk][currentOffset]) {
- otherOffset += 1;
- currentOffset += 1;
- if (currentOffset == chunkSize) {
- currentChunk += 1;
- currentOffset = 0;
- }
- maxLength -= 1;
- }
- if (maxLength == 0) {
- return otherLength - ourLength;
- }
- int otherByte = 0xff & other[otherOffset];
- int ourByte = 0xff & data[currentChunk][currentOffset];
- return otherByte > ourByte ? 1 : -1;
- }
-
- /**
- * Get the size of the array.
- * @return the number of bytes in the array
- */
- public int size() {
- return length;
- }
-
- /**
- * Clear the array to its original pristine state.
- */
- public void clear() {
- length = 0;
- for(int i=0; i < data.length; ++i) {
- data[i] = null;
- }
- initializedChunks = 0;
- }
-
- /**
- * Set a text value from the bytes in this dynamic array.
- * @param result the value to set
- * @param offset the start of the bytes to copy
- * @param length the number of bytes to copy
- */
- public void setText(Text result, int offset, int length) {
- result.clear();
- int currentChunk = offset / chunkSize;
- int currentOffset = offset % chunkSize;
- int currentLength = Math.min(length, chunkSize - currentOffset);
- while (length > 0) {
- result.append(data[currentChunk], currentOffset, currentLength);
- length -= currentLength;
- currentChunk += 1;
- currentOffset = 0;
- currentLength = Math.min(length, chunkSize - currentOffset);
- }
- }
-
- /**
- * Write out a range of this dynamic array to an output stream.
- * @param out the stream to write to
- * @param offset the first offset to write
- * @param length the number of bytes to write
- * @throws IOException
- */
- public void write(OutputStream out, int offset,
- int length) throws IOException {
- int currentChunk = offset / chunkSize;
- int currentOffset = offset % chunkSize;
- while (length > 0) {
- int currentLength = Math.min(length, chunkSize - currentOffset);
- out.write(data[currentChunk], currentOffset, currentLength);
- length -= currentLength;
- currentChunk += 1;
- currentOffset = 0;
- }
- }
-
- @Override
- public String toString() {
- int i;
- StringBuilder sb = new StringBuilder(length * 3);
-
- sb.append('{');
- int l = length - 1;
- for (i=0; i<l; i++) {
- sb.append(Integer.toHexString(get(i)));
- sb.append(',');
- }
- sb.append(get(i));
- sb.append('}');
-
- return sb.toString();
- }
-
- public void setByteBuffer(ByteBuffer result, int offset, int length) {
- result.clear();
- int currentChunk = offset / chunkSize;
- int currentOffset = offset % chunkSize;
- int currentLength = Math.min(length, chunkSize - currentOffset);
- while (length > 0) {
- result.put(data[currentChunk], currentOffset, currentLength);
- length -= currentLength;
- currentChunk += 1;
- currentOffset = 0;
- currentLength = Math.min(length, chunkSize - currentOffset);
- }
- }
-
- /**
- * Gets all the bytes of the array.
- *
- * @return Bytes of the array
- */
- public byte[] get() {
- byte[] result = null;
- if (length > 0) {
- int currentChunk = 0;
- int currentOffset = 0;
- int currentLength = Math.min(length, chunkSize);
- int destOffset = 0;
- result = new byte[length];
- int totalLength = length;
- while (totalLength > 0) {
- System.arraycopy(data[currentChunk], currentOffset, result, destOffset, currentLength);
- destOffset += currentLength;
- totalLength -= currentLength;
- currentChunk += 1;
- currentOffset = 0;
- currentLength = Math.min(totalLength, chunkSize - currentOffset);
- }
- }
- return result;
- }
-
- /**
- * Get the size of the buffers.
- */
- public long getSizeInBytes() {
- return initializedChunks * chunkSize;
- }
-}
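Not part of the diff: a sketch of the chunked-growth behaviour described in the class comment above. The only outside dependencies are org.apache.hadoop.io.Text and java.nio.charset.StandardCharsets.

    // Sketch only. Appending never copies existing chunks; it only allocates new ones.
    DynamicByteArray buffer = new DynamicByteArray();
    byte[] payload = "hello orc".getBytes(StandardCharsets.UTF_8);
    int start = buffer.add(payload, 0, payload.length);  // returns the offset of the copied slice
    Text roundTrip = new Text();
    buffer.setText(roundTrip, start, payload.length);    // copies the slice back out
    assert roundTrip.toString().equals("hello orc");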
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/DynamicIntArray.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/DynamicIntArray.java b/orc/src/java/org/apache/orc/impl/DynamicIntArray.java
deleted file mode 100644
index 3b2884b..0000000
--- a/orc/src/java/org/apache/orc/impl/DynamicIntArray.java
+++ /dev/null
@@ -1,142 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-/**
- * Dynamic int array that uses primitive types and chunks to avoid copying
- * large numbers of integers when it resizes.
- *
- * The motivation for this class is memory optimization, i.e. space efficient
- * storage of potentially huge arrays without good a-priori size guesses.
- *
- * The API of this class is between a primitive array and an AbstractList. It's
- * not a Collection implementation because it handles primitive types, but the
- * API could be extended to support iterators and the like.
- *
- * NOTE: Like standard Collection implementations/arrays, this class is not
- * synchronized.
- */
-public final class DynamicIntArray {
- static final int DEFAULT_CHUNKSIZE = 8 * 1024;
- static final int INIT_CHUNKS = 128;
-
- private final int chunkSize; // our allocation size
- private int[][] data; // the real data
- private int length; // max set element index +1
- private int initializedChunks = 0; // the number of created chunks
-
- public DynamicIntArray() {
- this(DEFAULT_CHUNKSIZE);
- }
-
- public DynamicIntArray(int chunkSize) {
- this.chunkSize = chunkSize;
-
- data = new int[INIT_CHUNKS][];
- }
-
- /**
- * Ensure that the given index is valid.
- */
- private void grow(int chunkIndex) {
- if (chunkIndex >= initializedChunks) {
- if (chunkIndex >= data.length) {
- int newSize = Math.max(chunkIndex + 1, 2 * data.length);
- int[][] newChunk = new int[newSize][];
- System.arraycopy(data, 0, newChunk, 0, data.length);
- data = newChunk;
- }
- for (int i=initializedChunks; i <= chunkIndex; ++i) {
- data[i] = new int[chunkSize];
- }
- initializedChunks = chunkIndex + 1;
- }
- }
-
- public int get(int index) {
- if (index >= length) {
- throw new IndexOutOfBoundsException("Index " + index +
- " is outside of 0.." +
- (length - 1));
- }
- int i = index / chunkSize;
- int j = index % chunkSize;
- return data[i][j];
- }
-
- public void set(int index, int value) {
- int i = index / chunkSize;
- int j = index % chunkSize;
- grow(i);
- if (index >= length) {
- length = index + 1;
- }
- data[i][j] = value;
- }
-
- public void increment(int index, int value) {
- int i = index / chunkSize;
- int j = index % chunkSize;
- grow(i);
- if (index >= length) {
- length = index + 1;
- }
- data[i][j] += value;
- }
-
- public void add(int value) {
- int i = length / chunkSize;
- int j = length % chunkSize;
- grow(i);
- data[i][j] = value;
- length += 1;
- }
-
- public int size() {
- return length;
- }
-
- public void clear() {
- length = 0;
- for(int i=0; i < data.length; ++i) {
- data[i] = null;
- }
- initializedChunks = 0;
- }
-
- public String toString() {
- int i;
- StringBuilder sb = new StringBuilder(length * 4);
-
- sb.append('{');
- int l = length - 1;
- for (i=0; i<l; i++) {
- sb.append(get(i));
- sb.append(',');
- }
- sb.append(get(i));
- sb.append('}');
-
- return sb.toString();
- }
-
- public int getSizeInBytes() {
- return 4 * initializedChunks * chunkSize;
- }
-}
-
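Not part of the diff: a sketch of the same chunking idea for primitive ints, using only methods defined in the class above.

    // Sketch only. Growth happens chunk by chunk, with no wholesale array copies.
    DynamicIntArray offsets = new DynamicIntArray();
    for (int i = 0; i < 100000; ++i) {
      offsets.add(i * 3);
    }
    offsets.increment(0, 7);       // adjust an existing slot in place
    assert offsets.get(0) == 7 && offsets.size() == 100000;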
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/HadoopShims.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/HadoopShims.java b/orc/src/java/org/apache/orc/impl/HadoopShims.java
deleted file mode 100644
index ef7d70f..0000000
--- a/orc/src/java/org/apache/orc/impl/HadoopShims.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.VersionInfo;
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-
-public interface HadoopShims {
-
- enum DirectCompressionType {
- NONE,
- ZLIB_NOHEADER,
- ZLIB,
- SNAPPY,
- }
-
- interface DirectDecompressor {
- void decompress(ByteBuffer var1, ByteBuffer var2) throws IOException;
- }
-
- /**
- * Get a direct decompressor codec, if it is available.
- * @param codec the direct compression type
- * @return the direct decompressor, or null if it is not available
- */
- DirectDecompressor getDirectDecompressor(DirectCompressionType codec);
-
- /**
- * a hadoop.io ByteBufferPool shim.
- */
- public interface ByteBufferPoolShim {
- /**
- * Get a new ByteBuffer from the pool. The pool can provide this from
- * removing a buffer from its internal cache, or by allocating a
- * new buffer.
- *
- * @param direct Whether the buffer should be direct.
- * @param length The minimum length the buffer will have.
- * @return A new ByteBuffer. Its capacity can be less
- * than what was requested, but must be at
- * least 1 byte.
- */
- ByteBuffer getBuffer(boolean direct, int length);
-
- /**
- * Release a buffer back to the pool.
- * The pool may choose to put this buffer into its cache/free it.
- *
- * @param buffer a direct bytebuffer
- */
- void putBuffer(ByteBuffer buffer);
- }
-
- /**
- * Provides an HDFS ZeroCopyReader shim.
- * @param in FSDataInputStream to read from (where the cached/mmap buffers are tied to)
- * @param pool ByteBufferPoolShim to allocate fallback buffers with
- *
- * @return returns null if not supported
- */
- public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, ByteBufferPoolShim pool) throws IOException;
-
- public interface ZeroCopyReaderShim extends Closeable {
- /**
- * Get a ByteBuffer from the FSDataInputStream - this can be either a HeapByteBuffer or a MappedByteBuffer.
- * Also move the input stream by that amount. The data read can be smaller than maxLength.
- *
- * @return the ByteBuffer read from the stream
- */
- public ByteBuffer readBuffer(int maxLength, boolean verifyChecksums) throws IOException;
- /**
- * Release a ByteBuffer obtained from a read on the stream so that it can be
- * reused or returned to the pool.
- *
- */
- public void releaseBuffer(ByteBuffer buffer);
-
- /**
- * Close the underlying stream.
- * @throws IOException
- */
- public void close() throws IOException;
- }
- /**
- * Read data into a Text object in the fastest way possible
- */
- public interface TextReaderShim {
- /**
- * @param txt the Text object to read into
- * @param size the number of bytes to read
- * @throws IOException
- */
- void read(Text txt, int size) throws IOException;
- }
-
- /**
- * Wrap a TextReaderShim around an input stream. The reader shim will not
- * buffer any reads from the underlying stream and will only consume bytes
- * which are required for TextReaderShim.read() input.
- */
- public TextReaderShim getTextReaderShim(InputStream input) throws IOException;
-
- class Factory {
- private static HadoopShims SHIMS = null;
-
- public static synchronized HadoopShims get() {
- if (SHIMS == null) {
- String[] versionParts = VersionInfo.getVersion().split("[.]");
- int major = Integer.parseInt(versionParts[0]);
- int minor = Integer.parseInt(versionParts[1]);
- if (major < 2 || (major == 2 && minor < 3)) {
- SHIMS = new HadoopShims_2_2();
- } else {
- SHIMS = new HadoopShimsCurrent();
- }
- }
- return SHIMS;
- }
- }
-}
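Not part of the diff: a sketch of how callers pick up the shims above. Factory.get() parses the Hadoop version string and hands back either the 2.2 shim or the current one; only the latter offers direct decompression.

    // Sketch only. A null return from getDirectDecompressor means the running
    // Hadoop (< 2.3) has no direct decompressor and the ordinary codec path applies.
    HadoopShims shims = HadoopShims.Factory.get();
    HadoopShims.DirectDecompressor zlib =
        shims.getDirectDecompressor(HadoopShims.DirectCompressionType.ZLIB);
    if (zlib != null) {
      ByteBuffer compressedBuffer = ByteBuffer.allocateDirect(1024);    // placeholder input
      ByteBuffer uncompressedBuffer = ByteBuffer.allocateDirect(4096);  // placeholder output
      zlib.decompress(compressedBuffer, uncompressedBuffer);
    }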
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java b/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
deleted file mode 100644
index 5c53f74..0000000
--- a/orc/src/java/org/apache/orc/impl/HadoopShimsCurrent.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.compress.snappy.SnappyDecompressor;
-import org.apache.hadoop.io.compress.zlib.ZlibDecompressor;
-
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-
-/**
- * Shims for recent versions of Hadoop
- */
-public class HadoopShimsCurrent implements HadoopShims {
-
- private static class DirectDecompressWrapper implements DirectDecompressor {
- private final org.apache.hadoop.io.compress.DirectDecompressor root;
-
- DirectDecompressWrapper(org.apache.hadoop.io.compress.DirectDecompressor root) {
- this.root = root;
- }
-
- public void decompress(ByteBuffer input,
- ByteBuffer output) throws IOException {
- root.decompress(input, output);
- }
- }
-
- public DirectDecompressor getDirectDecompressor(
- DirectCompressionType codec) {
- switch (codec) {
- case ZLIB:
- return new DirectDecompressWrapper
- (new ZlibDecompressor.ZlibDirectDecompressor());
- case ZLIB_NOHEADER:
- return new DirectDecompressWrapper
- (new ZlibDecompressor.ZlibDirectDecompressor
- (ZlibDecompressor.CompressionHeader.NO_HEADER, 0));
- case SNAPPY:
- return new DirectDecompressWrapper
- (new SnappyDecompressor.SnappyDirectDecompressor());
- default:
- return null;
- }
- }
-
- @Override
- public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
- ByteBufferPoolShim pool
- ) throws IOException {
- return ZeroCopyShims.getZeroCopyReader(in, pool);
- }
-
- private final class FastTextReaderShim implements TextReaderShim {
- private final DataInputStream din;
-
- public FastTextReaderShim(InputStream in) {
- this.din = new DataInputStream(in);
- }
-
- @Override
- public void read(Text txt, int len) throws IOException {
- txt.readWithKnownLength(din, len);
- }
- }
-
- @Override
- public TextReaderShim getTextReaderShim(InputStream in) throws IOException {
- return new FastTextReaderShim(in);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java b/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
deleted file mode 100644
index 3f65e74..0000000
--- a/orc/src/java/org/apache/orc/impl/HadoopShims_2_2.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.io.Text;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.reflect.Method;
-
-/**
- * Shims for versions of Hadoop up to and including 2.2.x
- */
-public class HadoopShims_2_2 implements HadoopShims {
-
- final boolean zeroCopy;
- final boolean fastRead;
-
- HadoopShims_2_2() {
- boolean zcr = false;
- try {
- Class.forName("org.apache.hadoop.fs.CacheFlag", false,
- HadoopShims_2_2.class.getClassLoader());
- zcr = true;
- } catch (ClassNotFoundException ce) {
- }
- zeroCopy = zcr;
- boolean fastRead = false;
- if (zcr) {
- for (Method m : Text.class.getMethods()) {
- if ("readWithKnownLength".equals(m.getName())) {
- fastRead = true;
- }
- }
- }
- this.fastRead = fastRead;
- }
-
- public DirectDecompressor getDirectDecompressor(
- DirectCompressionType codec) {
- return null;
- }
-
- @Override
- public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
- ByteBufferPoolShim pool
- ) throws IOException {
- if(zeroCopy) {
- return ZeroCopyShims.getZeroCopyReader(in, pool);
- }
- /* not supported */
- return null;
- }
-
- private final class BasicTextReaderShim implements TextReaderShim {
- private final InputStream in;
-
- public BasicTextReaderShim(InputStream in) {
- this.in = in;
- }
-
- @Override
- public void read(Text txt, int len) throws IOException {
- int offset = 0;
- byte[] bytes = new byte[len];
- while (len > 0) {
- int written = in.read(bytes, offset, len);
- if (written < 0) {
- throw new EOFException("Can't finish read from " + in + " read "
- + (offset) + " bytes out of " + bytes.length);
- }
- len -= written;
- offset += written;
- }
- txt.set(bytes);
- }
- }
-
- @Override
- public TextReaderShim getTextReaderShim(InputStream in) throws IOException {
- return new BasicTextReaderShim(in);
- }
-}
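Not part of the diff: a sketch of the text-reader shim contract illustrated by BasicTextReaderShim above; the stream contents are placeholders.

    // Sketch only. read() fills 'value' with exactly 12 bytes, or throws
    // EOFException if the stream ends early.
    HadoopShims shims = HadoopShims.Factory.get();
    InputStream inputStream =
        new ByteArrayInputStream("hello world!".getBytes(StandardCharsets.UTF_8));
    HadoopShims.TextReaderShim textReader = shims.getTextReaderShim(inputStream);
    Text value = new Text();
    textReader.read(value, 12);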
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/InStream.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/InStream.java b/orc/src/java/org/apache/orc/impl/InStream.java
deleted file mode 100644
index 851f645..0000000
--- a/orc/src/java/org/apache/orc/impl/InStream.java
+++ /dev/null
@@ -1,498 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.ListIterator;
-
-import org.apache.orc.CompressionCodec;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.hive.common.io.DiskRange;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.protobuf.CodedInputStream;
-
-public abstract class InStream extends InputStream {
-
- private static final Logger LOG = LoggerFactory.getLogger(InStream.class);
- public static final int PROTOBUF_MESSAGE_MAX_LIMIT = 1024 << 20; // 1GB
-
- protected final String name;
- protected long length;
-
- public InStream(String name, long length) {
- this.name = name;
- this.length = length;
- }
-
- public String getStreamName() {
- return name;
- }
-
- public long getStreamLength() {
- return length;
- }
-
- @Override
- public abstract void close();
-
- public static class UncompressedStream extends InStream {
- private List<DiskRange> bytes;
- private long length;
- protected long currentOffset;
- private ByteBuffer range;
- private int currentRange;
-
- public UncompressedStream(String name, List<DiskRange> input, long length) {
- super(name, length);
- reset(input, length);
- }
-
- protected void reset(List<DiskRange> input, long length) {
- this.bytes = input;
- this.length = length;
- currentRange = 0;
- currentOffset = 0;
- range = null;
- }
-
- @Override
- public int read() {
- if (range == null || range.remaining() == 0) {
- if (currentOffset == length) {
- return -1;
- }
- seek(currentOffset);
- }
- currentOffset += 1;
- return 0xff & range.get();
- }
-
- @Override
- public int read(byte[] data, int offset, int length) {
- if (range == null || range.remaining() == 0) {
- if (currentOffset == this.length) {
- return -1;
- }
- seek(currentOffset);
- }
- int actualLength = Math.min(length, range.remaining());
- range.get(data, offset, actualLength);
- currentOffset += actualLength;
- return actualLength;
- }
-
- @Override
- public int available() {
- if (range != null && range.remaining() > 0) {
- return range.remaining();
- }
- return (int) (length - currentOffset);
- }
-
- @Override
- public void close() {
- currentRange = bytes.size();
- currentOffset = length;
- // explicit de-ref of bytes[]
- bytes.clear();
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- seek(index.getNext());
- }
-
- public void seek(long desired) {
- if (desired == 0 && bytes.isEmpty()) {
- logEmptySeek(name);
- return;
- }
- int i = 0;
- for (DiskRange curRange : bytes) {
- if (desired == 0 && curRange.getData().remaining() == 0) {
- logEmptySeek(name);
- return;
- }
- if (curRange.getOffset() <= desired &&
- (desired - curRange.getOffset()) < curRange.getLength()) {
- currentOffset = desired;
- currentRange = i;
- this.range = curRange.getData().duplicate();
- int pos = range.position();
- pos += (int)(desired - curRange.getOffset()); // this is why we duplicate
- this.range.position(pos);
- return;
- }
- ++i;
- }
- // if they are seeking to the precise end, go ahead and let them go there
- int segments = bytes.size();
- if (segments != 0 && desired == bytes.get(segments - 1).getEnd()) {
- currentOffset = desired;
- currentRange = segments - 1;
- DiskRange curRange = bytes.get(currentRange);
- this.range = curRange.getData().duplicate();
- int pos = range.position();
- pos += (int)(desired - curRange.getOffset()); // this is why we duplicate
- this.range.position(pos);
- return;
- }
- throw new IllegalArgumentException("Seek in " + name + " to " +
- desired + " is outside of the data");
- }
-
- @Override
- public String toString() {
- return "uncompressed stream " + name + " position: " + currentOffset +
- " length: " + length + " range: " + currentRange +
- " offset: " + (range == null ? 0 : range.position()) + " limit: " + (range == null ? 0 : range.limit());
- }
- }
-
- private static ByteBuffer allocateBuffer(int size, boolean isDirect) {
- // TODO: use the same pool as the ORC readers
- if (isDirect) {
- return ByteBuffer.allocateDirect(size);
- } else {
- return ByteBuffer.allocate(size);
- }
- }
-
- private static class CompressedStream extends InStream {
- private final List<DiskRange> bytes;
- private final int bufferSize;
- private ByteBuffer uncompressed;
- private final CompressionCodec codec;
- private ByteBuffer compressed;
- private long currentOffset;
- private int currentRange;
- private boolean isUncompressedOriginal;
-
- public CompressedStream(String name, List<DiskRange> input, long length,
- CompressionCodec codec, int bufferSize) {
- super(name, length);
- this.bytes = input;
- this.codec = codec;
- this.bufferSize = bufferSize;
- currentOffset = 0;
- currentRange = 0;
- }
-
- private void allocateForUncompressed(int size, boolean isDirect) {
- uncompressed = allocateBuffer(size, isDirect);
- }
-
- private void readHeader() throws IOException {
- if (compressed == null || compressed.remaining() <= 0) {
- seek(currentOffset);
- }
- if (compressed.remaining() > OutStream.HEADER_SIZE) {
- int b0 = compressed.get() & 0xff;
- int b1 = compressed.get() & 0xff;
- int b2 = compressed.get() & 0xff;
- boolean isOriginal = (b0 & 0x01) == 1;
- int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1);
-
- if (chunkLength > bufferSize) {
- throw new IllegalArgumentException("Buffer size too small. size = " +
- bufferSize + " needed = " + chunkLength);
- }
- // read 3 bytes, which should be equal to OutStream.HEADER_SIZE always
- assert OutStream.HEADER_SIZE == 3 : "The Orc HEADER_SIZE must be the same in OutStream and InStream";
- currentOffset += OutStream.HEADER_SIZE;
-
- ByteBuffer slice = this.slice(chunkLength);
-
- if (isOriginal) {
- uncompressed = slice;
- isUncompressedOriginal = true;
- } else {
- if (isUncompressedOriginal) {
- allocateForUncompressed(bufferSize, slice.isDirect());
- isUncompressedOriginal = false;
- } else if (uncompressed == null) {
- allocateForUncompressed(bufferSize, slice.isDirect());
- } else {
- uncompressed.clear();
- }
- codec.decompress(slice, uncompressed);
- }
- } else {
- throw new IllegalStateException("Can't read header at " + this);
- }
- }
-
- @Override
- public int read() throws IOException {
- if (uncompressed == null || uncompressed.remaining() == 0) {
- if (currentOffset == length) {
- return -1;
- }
- readHeader();
- }
- return 0xff & uncompressed.get();
- }
-
- @Override
- public int read(byte[] data, int offset, int length) throws IOException {
- if (uncompressed == null || uncompressed.remaining() == 0) {
- if (currentOffset == this.length) {
- return -1;
- }
- readHeader();
- }
- int actualLength = Math.min(length, uncompressed.remaining());
- uncompressed.get(data, offset, actualLength);
- return actualLength;
- }
-
- @Override
- public int available() throws IOException {
- if (uncompressed == null || uncompressed.remaining() == 0) {
- if (currentOffset == length) {
- return 0;
- }
- readHeader();
- }
- return uncompressed.remaining();
- }
-
- @Override
- public void close() {
- uncompressed = null;
- compressed = null;
- currentRange = bytes.size();
- currentOffset = length;
- bytes.clear();
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- seek(index.getNext());
- long uncompressedBytes = index.getNext();
- if (uncompressedBytes != 0) {
- readHeader();
- uncompressed.position(uncompressed.position() +
- (int) uncompressedBytes);
- } else if (uncompressed != null) {
- // mark the uncompressed buffer as done
- uncompressed.position(uncompressed.limit());
- }
- }
-
- /* slices a read only contiguous buffer of chunkLength */
- private ByteBuffer slice(int chunkLength) throws IOException {
- int len = chunkLength;
- final long oldOffset = currentOffset;
- ByteBuffer slice;
- if (compressed.remaining() >= len) {
- slice = compressed.slice();
- // simple case
- slice.limit(len);
- currentOffset += len;
- compressed.position(compressed.position() + len);
- return slice;
- } else if (currentRange >= (bytes.size() - 1)) {
- // nothing has been modified yet
- throw new IOException("EOF in " + this + " while trying to read " +
- chunkLength + " bytes");
- }
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format(
- "Crossing into next BufferChunk because compressed only has %d bytes (needs %d)",
- compressed.remaining(), len));
- }
-
- // we need to consolidate 2 or more buffers into 1
- // first copy out compressed buffers
- ByteBuffer copy = allocateBuffer(chunkLength, compressed.isDirect());
- currentOffset += compressed.remaining();
- len -= compressed.remaining();
- copy.put(compressed);
- ListIterator<DiskRange> iter = bytes.listIterator(currentRange);
-
- while (len > 0 && iter.hasNext()) {
- ++currentRange;
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("Read slow-path, >1 cross block reads with %s", this.toString()));
- }
- DiskRange range = iter.next();
- compressed = range.getData().duplicate();
- if (compressed.remaining() >= len) {
- slice = compressed.slice();
- slice.limit(len);
- copy.put(slice);
- currentOffset += len;
- compressed.position(compressed.position() + len);
- return copy;
- }
- currentOffset += compressed.remaining();
- len -= compressed.remaining();
- copy.put(compressed);
- }
-
- // restore offsets for exception clarity
- seek(oldOffset);
- throw new IOException("EOF in " + this + " while trying to read " +
- chunkLength + " bytes");
- }
-
- private void seek(long desired) throws IOException {
- if (desired == 0 && bytes.isEmpty()) {
- logEmptySeek(name);
- return;
- }
- int i = 0;
- for (DiskRange range : bytes) {
- if (range.getOffset() <= desired && desired < range.getEnd()) {
- currentRange = i;
- compressed = range.getData().duplicate();
- int pos = compressed.position();
- pos += (int)(desired - range.getOffset());
- compressed.position(pos);
- currentOffset = desired;
- return;
- }
- ++i;
- }
- // if they are seeking to the precise end, go ahead and let them go there
- int segments = bytes.size();
- if (segments != 0 && desired == bytes.get(segments - 1).getEnd()) {
- DiskRange range = bytes.get(segments - 1);
- currentRange = segments - 1;
- compressed = range.getData().duplicate();
- compressed.position(compressed.limit());
- currentOffset = desired;
- return;
- }
- throw new IOException("Seek outside of data in " + this + " to " + desired);
- }
-
- private String rangeString() {
- StringBuilder builder = new StringBuilder();
- int i = 0;
- for (DiskRange range : bytes) {
- if (i != 0) {
- builder.append("; ");
- }
- builder.append(" range " + i + " = " + range.getOffset()
- + " to " + (range.getEnd() - range.getOffset()));
- ++i;
- }
- return builder.toString();
- }
-
- @Override
- public String toString() {
- return "compressed stream " + name + " position: " + currentOffset +
- " length: " + length + " range: " + currentRange +
- " offset: " + (compressed == null ? 0 : compressed.position()) + " limit: " + (compressed == null ? 0 : compressed.limit()) +
- rangeString() +
- (uncompressed == null ? "" :
- " uncompressed: " + uncompressed.position() + " to " +
- uncompressed.limit());
- }
- }
-
- public abstract void seek(PositionProvider index) throws IOException;
-
- private static void logEmptySeek(String name) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Attempting seek into empty stream (" + name + ") Skipping stream.");
- }
- }
-
- /**
- * Create an input stream from a list of buffers.
- * @param streamName the name of the stream
- * @param buffers the list of ranges of bytes for the stream
- * @param offsets a list of offsets (the same length as input) that must
- * contain the first offset of each set of bytes in input
- * @param length the length in bytes of the stream
- * @param codec the compression codec
- * @param bufferSize the compression buffer size
- * @return an input stream
- * @throws IOException
- */
- @VisibleForTesting
- @Deprecated
- public static InStream create(String streamName,
- ByteBuffer[] buffers,
- long[] offsets,
- long length,
- CompressionCodec codec,
- int bufferSize) throws IOException {
- List<DiskRange> input = new ArrayList<DiskRange>(buffers.length);
- for (int i = 0; i < buffers.length; ++i) {
- input.add(new BufferChunk(buffers[i], offsets[i]));
- }
- return create(streamName, input, length, codec, bufferSize);
- }
-
- /**
- * Create an input stream from a list of disk ranges with data.
- * @param name the name of the stream
- * @param input the list of ranges of bytes for the stream; from disk or cache
- * @param length the length in bytes of the stream
- * @param codec the compression codec
- * @param bufferSize the compression buffer size
- * @return an input stream
- * @throws IOException
- */
- public static InStream create(String name,
- List<DiskRange> input,
- long length,
- CompressionCodec codec,
- int bufferSize) throws IOException {
- if (codec == null) {
- return new UncompressedStream(name, input, length);
- } else {
- return new CompressedStream(name, input, length, codec, bufferSize);
- }
- }
-
- /**
- * Creates coded input stream (used for protobuf message parsing) with higher message size limit.
- *
- * @param name the name of the stream
- * @param input the list of ranges of bytes for the stream; from disk or cache
- * @param length the length in bytes of the stream
- * @param codec the compression codec
- * @param bufferSize the compression buffer size
- * @return coded input stream
- * @throws IOException
- */
- public static CodedInputStream createCodedInputStream(
- String name,
- List<DiskRange> input,
- long length,
- CompressionCodec codec,
- int bufferSize) throws IOException {
- InStream inStream = create(name, input, length, codec, bufferSize);
- CodedInputStream codedInputStream = CodedInputStream.newInstance(inStream);
- codedInputStream.setSizeLimit(PROTOBUF_MESSAGE_MAX_LIMIT);
- return codedInputStream;
- }
-}
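Not part of the diff: a sketch of assembling an InStream from in-memory ranges, following the create() overloads above. The byte array, stream names and buffer size are placeholders; BufferChunk is the DiskRange implementation already used by the deprecated create() overload.

    // Sketch only. A null codec yields an UncompressedStream; otherwise a
    // CompressedStream that inflates one chunk header at a time.
    byte[] rawBytes = new byte[1024];            // placeholder for bytes already read from the file
    CompressionCodec codec = null;               // null means the stream is uncompressed
    List<DiskRange> ranges = new ArrayList<>();
    ranges.add(new BufferChunk(ByteBuffer.wrap(rawBytes), 0));
    InStream stream = InStream.create("col0-data", ranges, rawBytes.length, codec, 256 * 1024);
    CodedInputStream footer = InStream.createCodedInputStream(
        "footer", ranges, rawBytes.length, codec, 256 * 1024);   // raises the protobuf size limit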
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/IntegerReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/IntegerReader.java b/orc/src/java/org/apache/orc/impl/IntegerReader.java
deleted file mode 100644
index 3e64d54..0000000
--- a/orc/src/java/org/apache/orc/impl/IntegerReader.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import java.io.IOException;
-
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-
-/**
- * Interface for reading integers.
- */
-public interface IntegerReader {
-
- /**
- * Seek to the position provided by index.
- * @param index
- * @throws IOException
- */
- void seek(PositionProvider index) throws IOException;
-
- /**
- * Skip the specified number of values.
- * @param numValues the number of values to skip
- * @throws IOException
- */
- void skip(long numValues) throws IOException;
-
- /**
- * Check if there are any more values left.
- * @return true if there are more values to read
- * @throws IOException
- */
- boolean hasNext() throws IOException;
-
- /**
- * Return the next available value.
- * @return the next value
- * @throws IOException
- */
- long next() throws IOException;
-
- /**
- * Return the next available vector for values.
- * @param column the column being read
- * @param data the vector to read into
- * @param length the number of numbers to read
- * @throws IOException
- */
- void nextVector(ColumnVector column,
- long[] data,
- int length
- ) throws IOException;
-
- /**
- * Return the next available vector for values. Does not change the
- * value of column.isRepeating.
- * @param column the column being read
- * @param data the vector to read into
- * @param length the number of numbers to read
- * @throws IOException
- */
- void nextVector(ColumnVector column,
- int[] data,
- int length
- ) throws IOException;
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/IntegerWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/IntegerWriter.java b/orc/src/java/org/apache/orc/impl/IntegerWriter.java
deleted file mode 100644
index 419054f..0000000
--- a/orc/src/java/org/apache/orc/impl/IntegerWriter.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import java.io.IOException;
-
-/**
- * Interface for writing integers.
- */
-public interface IntegerWriter {
-
- /**
- * Get position from the stream.
- * @param recorder
- * @throws IOException
- */
- void getPosition(PositionRecorder recorder) throws IOException;
-
- /**
- * Write the integer value
- * @param value
- * @throws IOException
- */
- void write(long value) throws IOException;
-
- /**
- * Flush the buffer
- * @throws IOException
- */
- void flush() throws IOException;
-}
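The writer side mirrors that contract: a position is recorded before each run of values so a reader can later seek back to it. A rough sketch of the call sequence, where `writer` and `recorder` are placeholder instances of an IntegerWriter implementation and a PositionRecorder supplied by the stream layer:

    import java.io.IOException;
    import org.apache.orc.impl.IntegerWriter;
    import org.apache.orc.impl.PositionRecorder;

    public class IntegerWriterSketch {
      // Record where this batch starts, write it, then flush buffered bytes.
      static void writeBatch(IntegerWriter writer, PositionRecorder recorder,
                             long[] values) throws IOException {
        writer.getPosition(recorder);
        for (long v : values) {
          writer.write(v);
        }
        writer.flush();
      }
    }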
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/MemoryManager.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/MemoryManager.java b/orc/src/java/org/apache/orc/impl/MemoryManager.java
deleted file mode 100644
index 757c0b4..0000000
--- a/orc/src/java/org/apache/orc/impl/MemoryManager.java
+++ /dev/null
@@ -1,214 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.orc.OrcConf;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-
-import com.google.common.base.Preconditions;
-
-import java.io.IOException;
-import java.lang.management.ManagementFactory;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.locks.ReentrantLock;
-
-/**
- * Implements a memory manager that keeps a global context of how many ORC
- * writers there are and manages the memory between them. For use cases with
- * dynamic partitions, it is easy to end up with many writers in the same task.
- * By managing each writer's allocation, we try to cut down the size of each
- * buffer and keep the task from running out of memory.
- *
- * This class is not thread safe, but is re-entrant - ensure creation and all
- * invocations are triggered from the same thread.
- */
-public class MemoryManager {
-
- private static final Logger LOG = LoggerFactory.getLogger(MemoryManager.class);
-
- /**
- * How often should we check the memory sizes? Measured in rows added
- * to all of the writers.
- */
- private static final int ROWS_BETWEEN_CHECKS = 5000;
- private final long totalMemoryPool;
- private final Map<Path, WriterInfo> writerList =
- new HashMap<Path, WriterInfo>();
- private long totalAllocation = 0;
- private double currentScale = 1;
- private int rowsAddedSinceCheck = 0;
- private final OwnedLock ownerLock = new OwnedLock();
-
- @SuppressWarnings("serial")
- private static class OwnedLock extends ReentrantLock {
- public Thread getOwner() {
- return super.getOwner();
- }
- }
-
- private static class WriterInfo {
- long allocation;
- Callback callback;
- WriterInfo(long allocation, Callback callback) {
- this.allocation = allocation;
- this.callback = callback;
- }
- }
-
- public interface Callback {
- /**
- * The writer needs to check its memory usage
- * @param newScale the current scale factor for memory allocations
- * @return true if the writer was over the limit
- * @throws IOException
- */
- boolean checkMemory(double newScale) throws IOException;
- }
-
- /**
- * Create the memory manager.
- * @param conf use the configuration to find the maximum size of the memory
- * pool.
- */
- public MemoryManager(Configuration conf) {
- double maxLoad = OrcConf.MEMORY_POOL.getDouble(conf);
- totalMemoryPool = Math.round(ManagementFactory.getMemoryMXBean().
- getHeapMemoryUsage().getMax() * maxLoad);
- ownerLock.lock();
- }
-
- /**
- * Lightweight thread-safety check for multi-threaded access patterns
- */
- private void checkOwner() {
- if (!ownerLock.isHeldByCurrentThread()) {
- LOG.warn("Owner thread expected {}, got {}",
- ownerLock.getOwner(), Thread.currentThread());
- }
- }
-
- /**
- * Add a new writer's memory allocation to the pool. We use the path
- * as a unique key to ensure that we don't get duplicates.
- * @param path the file that is being written
- * @param requestedAllocation the requested buffer size
- */
- public void addWriter(Path path, long requestedAllocation,
- Callback callback) throws IOException {
- checkOwner();
- WriterInfo oldVal = writerList.get(path);
- // this should always be null, but we handle the case where the memory
- // manager wasn't told that a writer is no longer in use and the task
- // starts writing to the same path.
- if (oldVal == null) {
- oldVal = new WriterInfo(requestedAllocation, callback);
- writerList.put(path, oldVal);
- totalAllocation += requestedAllocation;
- } else {
- // handle a new writer that is writing to the same path
- totalAllocation += requestedAllocation - oldVal.allocation;
- oldVal.allocation = requestedAllocation;
- oldVal.callback = callback;
- }
- updateScale(true);
- }
-
- /**
- * Remove the given writer from the pool.
- * @param path the file that has been closed
- */
- public void removeWriter(Path path) throws IOException {
- checkOwner();
- WriterInfo val = writerList.get(path);
- if (val != null) {
- writerList.remove(path);
- totalAllocation -= val.allocation;
- if (writerList.isEmpty()) {
- rowsAddedSinceCheck = 0;
- }
- updateScale(false);
- }
- if(writerList.isEmpty()) {
- rowsAddedSinceCheck = 0;
- }
- }
-
- /**
- * Get the total pool size that is available for ORC writers.
- * @return the number of bytes in the pool
- */
- public long getTotalMemoryPool() {
- return totalMemoryPool;
- }
-
- /**
- * The scaling factor for each allocation to ensure that the pool isn't
- * oversubscribed.
- * @return a fraction between 0.0 and 1.0 of the requested size that is
- * available for each writer.
- */
- public double getAllocationScale() {
- return currentScale;
- }
-
- /**
- * Give the memory manager an opportunity for doing a memory check.
- * @param rows number of rows added
- * @throws IOException
- */
- public void addedRow(int rows) throws IOException {
- rowsAddedSinceCheck += rows;
- if (rowsAddedSinceCheck >= ROWS_BETWEEN_CHECKS) {
- notifyWriters();
- }
- }
-
- /**
- * Notify all of the writers that they should check their memory usage.
- * @throws IOException
- */
- public void notifyWriters() throws IOException {
- checkOwner();
- LOG.debug("Notifying writers after " + rowsAddedSinceCheck);
- for(WriterInfo writer: writerList.values()) {
- boolean flushed = writer.callback.checkMemory(currentScale);
- if (LOG.isDebugEnabled() && flushed) {
- LOG.debug("flushed " + writer.toString());
- }
- }
- rowsAddedSinceCheck = 0;
- }
-
- /**
- * Update the currentScale based on the current allocation and pool size.
- * This also updates the notificationTrigger.
- * @param isAllocate is this an allocation?
- */
- private void updateScale(boolean isAllocate) throws IOException {
- if (totalAllocation <= totalMemoryPool) {
- currentScale = 1;
- } else {
- currentScale = (double) totalMemoryPool / totalAllocation;
- }
- }
-}
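The scaling rule in updateScale() is the heart of this class: while the sum of requests fits in the pool the scale stays at 1.0, and once it exceeds the pool each writer is expected to use only totalMemoryPool / totalAllocation of what it asked for, with checks triggered every ROWS_BETWEEN_CHECKS (5,000) rows. A small, hedged sketch of that flow; the file paths and the two 400 MB requests are invented for illustration.

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.impl.MemoryManager;

    public class MemoryManagerSketch {
      public static void main(String[] args) throws IOException {
        MemoryManager mgr = new MemoryManager(new Configuration());
        // Each writer shrinks its buffers when checkMemory(newScale) reports a
        // scale below 1.0; this sketch only logs the request.
        MemoryManager.Callback shrinkIfNeeded = newScale -> {
          System.out.println("asked to check memory at scale " + newScale);
          return false; // nothing was flushed in this sketch
        };
        mgr.addWriter(new Path("/tmp/part-00000.orc"), 400L << 20, shrinkIfNeeded);
        mgr.addWriter(new Path("/tmp/part-00001.orc"), 400L << 20, shrinkIfNeeded);
        System.out.println("pool=" + mgr.getTotalMemoryPool()
            + " scale=" + mgr.getAllocationScale());
        mgr.addedRow(5000); // crossing ROWS_BETWEEN_CHECKS calls notifyWriters()
      }
    }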
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java b/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java
deleted file mode 100644
index 7ca9e1d..0000000
--- a/orc/src/java/org/apache/orc/impl/OrcAcidUtils.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.orc.Reader;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-
-public class OrcAcidUtils {
- public static final String ACID_STATS = "hive.acid.stats";
- public static final String DELTA_SIDE_FILE_SUFFIX = "_flush_length";
-
- /**
- * Get the filename of the ORC ACID side file that contains the lengths
- * of the intermediate footers.
- * @param main the main ORC filename
- * @return the name of the side file
- */
- public static Path getSideFile(Path main) {
- return new Path(main + DELTA_SIDE_FILE_SUFFIX);
- }
-
- /**
- * Read the side file to get the last flush length.
- * @param fs the file system to use
- * @param deltaFile the path of the delta file
- * @return the maximum size of the file to use
- * @throws IOException
- */
- public static long getLastFlushLength(FileSystem fs,
- Path deltaFile) throws IOException {
- Path lengths = getSideFile(deltaFile);
- long result = Long.MAX_VALUE;
- if(!fs.exists(lengths)) {
- return result;
- }
- try (FSDataInputStream stream = fs.open(lengths)) {
- result = -1;
- while (stream.available() > 0) {
- result = stream.readLong();
- }
- return result;
- } catch (IOException ioe) {
- return result;
- }
- }
-
- private static final Charset utf8 = Charset.forName("UTF-8");
- private static final CharsetDecoder utf8Decoder = utf8.newDecoder();
-
- public static AcidStats parseAcidStats(Reader reader) {
- if (reader.hasMetadataValue(ACID_STATS)) {
- try {
- ByteBuffer val = reader.getMetadataValue(ACID_STATS).duplicate();
- return new AcidStats(utf8Decoder.decode(val).toString());
- } catch (CharacterCodingException e) {
- throw new IllegalArgumentException("Bad string encoding for " +
- ACID_STATS, e);
- }
- } else {
- return null;
- }
- }
-
-}
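In other words, a reader of a delta file that is still being written consults the "_flush_length" side file to learn how many bytes of the delta it may safely trust. A hedged sketch of that lookup; the warehouse path below is a placeholder, not taken from this commit.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.orc.impl.OrcAcidUtils;

    public class AcidSideFileSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path delta = new Path("/warehouse/t/delta_0000001_0000001/bucket_00000");
        FileSystem fs = delta.getFileSystem(conf);
        // The side file sits next to the delta with the "_flush_length" suffix.
        Path side = OrcAcidUtils.getSideFile(delta);
        // Long.MAX_VALUE means there is no side file and the whole delta is
        // readable; otherwise only bytes up to the last recorded flush are.
        long usable = OrcAcidUtils.getLastFlushLength(fs, delta);
        System.out.println(side + " -> usable bytes: " + usable);
      }
    }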
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/OrcIndex.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/OrcIndex.java b/orc/src/java/org/apache/orc/impl/OrcIndex.java
deleted file mode 100644
index 50a15f2..0000000
--- a/orc/src/java/org/apache/orc/impl/OrcIndex.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.orc.OrcProto;
-
-public final class OrcIndex {
- OrcProto.RowIndex[] rowGroupIndex;
- OrcProto.BloomFilterIndex[] bloomFilterIndex;
-
- public OrcIndex(OrcProto.RowIndex[] rgIndex, OrcProto.BloomFilterIndex[] bfIndex) {
- this.rowGroupIndex = rgIndex;
- this.bloomFilterIndex = bfIndex;
- }
-
- public OrcProto.RowIndex[] getRowGroupIndex() {
- return rowGroupIndex;
- }
-
- public OrcProto.BloomFilterIndex[] getBloomFilterIndex() {
- return bloomFilterIndex;
- }
-
- public void setRowGroupIndex(OrcProto.RowIndex[] rowGroupIndex) {
- this.rowGroupIndex = rowGroupIndex;
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/OrcTail.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/OrcTail.java b/orc/src/java/org/apache/orc/impl/OrcTail.java
deleted file mode 100644
index f095603..0000000
--- a/orc/src/java/org/apache/orc/impl/OrcTail.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * http://www.apache.org/licenses/LICENSE-2.0
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import static org.apache.orc.impl.ReaderImpl.extractMetadata;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.orc.CompressionCodec;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcFile;
-import org.apache.orc.OrcProto;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.StripeStatistics;
-
-// TODO: Make OrcTail implement FileMetadata or Reader interface
-public final class OrcTail {
- // postscript + footer - Serialized in OrcSplit
- private final OrcProto.FileTail fileTail;
- // serialized representation of metadata, footer and postscript
- private final ByteBuffer serializedTail;
- // used to invalidate cache entries
- private final long fileModificationTime;
- // lazily deserialized
- private OrcProto.Metadata metadata;
-
- public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail) {
- this(fileTail, serializedTail, -1);
- }
-
- public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail, long fileModificationTime) {
- this.fileTail = fileTail;
- this.serializedTail = serializedTail;
- this.fileModificationTime = fileModificationTime;
- this.metadata = null;
- }
-
- public ByteBuffer getSerializedTail() {
- return serializedTail;
- }
-
- public long getFileModificationTime() {
- return fileModificationTime;
- }
-
- public OrcProto.Footer getFooter() {
- return fileTail.getFooter();
- }
-
- public OrcProto.PostScript getPostScript() {
- return fileTail.getPostscript();
- }
-
- public OrcFile.WriterVersion getWriterVersion() {
- OrcProto.PostScript ps = fileTail.getPostscript();
- return (ps.hasWriterVersion()
- ? OrcFile.WriterVersion.from(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL);
- }
-
- public List<StripeInformation> getStripes() {
- List<StripeInformation> result = new ArrayList<>(fileTail.getFooter().getStripesCount());
- for (OrcProto.StripeInformation stripeProto : fileTail.getFooter().getStripesList()) {
- result.add(new ReaderImpl.StripeInformationImpl(stripeProto));
- }
- return result;
- }
-
- public CompressionKind getCompressionKind() {
- return CompressionKind.valueOf(fileTail.getPostscript().getCompression().name());
- }
-
- public CompressionCodec getCompressionCodec() {
- return PhysicalFsWriter.createCodec(getCompressionKind());
- }
-
- public int getCompressionBufferSize() {
- return (int) fileTail.getPostscript().getCompressionBlockSize();
- }
-
- public List<StripeStatistics> getStripeStatistics() throws IOException {
- List<StripeStatistics> result = new ArrayList<>();
- List<OrcProto.StripeStatistics> ssProto = getStripeStatisticsProto();
- if (ssProto != null) {
- for (OrcProto.StripeStatistics ss : ssProto) {
- result.add(new StripeStatistics(ss.getColStatsList()));
- }
- }
- return result;
- }
-
- public List<OrcProto.StripeStatistics> getStripeStatisticsProto() throws IOException {
- if (serializedTail == null) return null;
- if (metadata == null) {
- metadata = extractMetadata(serializedTail, 0,
- (int) fileTail.getPostscript().getMetadataLength(),
- getCompressionCodec(), getCompressionBufferSize());
- // clear does not clear the contents but sets position to 0 and limit = capacity
- serializedTail.clear();
- }
- return metadata.getStripeStatsList();
- }
-
- public int getMetadataSize() {
- return (int) getPostScript().getMetadataLength();
- }
-
- public List<OrcProto.Type> getTypes() {
- return getFooter().getTypesList();
- }
-
- public OrcProto.FileTail getFileTail() {
- return fileTail;
- }
-
- public OrcProto.FileTail getMinimalFileTail() {
- OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder(fileTail);
- OrcProto.Footer.Builder footerBuilder = OrcProto.Footer.newBuilder(fileTail.getFooter());
- footerBuilder.clearStatistics();
- fileTailBuilder.setFooter(footerBuilder.build());
- OrcProto.FileTail result = fileTailBuilder.build();
- return result;
- }
-}
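Because the serialized postscript and footer travel with the split, an OrcTail lets split processing inspect stripes, compression, and writer version without reopening the file, while stripe statistics are decoded only on demand. A small sketch of such an inspection, assuming `tail` was obtained elsewhere, for example during split planning:

    import org.apache.orc.StripeInformation;
    import org.apache.orc.impl.OrcTail;

    public class OrcTailSketch {
      // Print a one-line summary per stripe from an already-deserialized tail.
      static void describe(OrcTail tail) {
        System.out.println("compression=" + tail.getCompressionKind()
            + " writer=" + tail.getWriterVersion()
            + " stripes=" + tail.getStripes().size());
        for (StripeInformation stripe : tail.getStripes()) {
          System.out.println("  offset=" + stripe.getOffset()
              + " rows=" + stripe.getNumberOfRows());
        }
      }
    }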
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/OutStream.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/OutStream.java b/orc/src/java/org/apache/orc/impl/OutStream.java
deleted file mode 100644
index 81662cc..0000000
--- a/orc/src/java/org/apache/orc/impl/OutStream.java
+++ /dev/null
@@ -1,289 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import org.apache.orc.CompressionCodec;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-public class OutStream extends PositionedOutputStream {
-
- public interface OutputReceiver {
- /**
- * Output the given buffer to the final destination
- * @param buffer the buffer to output
- * @throws IOException
- */
- void output(ByteBuffer buffer) throws IOException;
- }
-
- public static final int HEADER_SIZE = 3;
- private final String name;
- private final OutputReceiver receiver;
- // if enabled the stream will be suppressed when writing stripe
- private boolean suppress;
-
- /**
- * Stores the uncompressed bytes that have been serialized, but not
- * compressed yet. When this fills, we compress the entire buffer.
- */
- private ByteBuffer current = null;
-
- /**
- * Stores the compressed bytes until we have a full buffer and then outputs
- * them to the receiver. If no compression is being done, this (and overflow)
- * will always be null and the current buffer will be sent directly to the
- * receiver.
- */
- private ByteBuffer compressed = null;
-
- /**
- * Since the compressed buffer may start with contents from previous
- * compression blocks, we allocate an overflow buffer so that the
- * output of the codec can be split between the two buffers. After the
- * compressed buffer is sent to the receiver, the overflow buffer becomes
- * the new compressed buffer.
- */
- private ByteBuffer overflow = null;
- private final int bufferSize;
- private final CompressionCodec codec;
- private long compressedBytes = 0;
- private long uncompressedBytes = 0;
-
- public OutStream(String name,
- int bufferSize,
- CompressionCodec codec,
- OutputReceiver receiver) throws IOException {
- this.name = name;
- this.bufferSize = bufferSize;
- this.codec = codec;
- this.receiver = receiver;
- this.suppress = false;
- }
-
- public void clear() throws IOException {
- flush();
- suppress = false;
- }
-
- /**
- * Write the length of the compressed bytes. Life is much easier if the
- * header is constant length, so just use 3 bytes. Considering most of the
- * codecs want between 32k (snappy) and 256k (lzo, zlib), 3 bytes should
- * be plenty. We also use the low bit for whether it is the original or
- * compressed bytes.
- * @param buffer the buffer to write the header to
- * @param position the position in the buffer to write at
- * @param val the size in the file
- * @param original is it uncompressed
- */
- private static void writeHeader(ByteBuffer buffer,
- int position,
- int val,
- boolean original) {
- buffer.put(position, (byte) ((val << 1) + (original ? 1 : 0)));
- buffer.put(position + 1, (byte) (val >> 7));
- buffer.put(position + 2, (byte) (val >> 15));
- }
-
- private void getNewInputBuffer() throws IOException {
- if (codec == null) {
- current = ByteBuffer.allocate(bufferSize);
- } else {
- current = ByteBuffer.allocate(bufferSize + HEADER_SIZE);
- writeHeader(current, 0, bufferSize, true);
- current.position(HEADER_SIZE);
- }
- }
-
- /**
- * Allocate a new output buffer if we are compressing.
- */
- private ByteBuffer getNewOutputBuffer() throws IOException {
- return ByteBuffer.allocate(bufferSize + HEADER_SIZE);
- }
-
- private void flip() throws IOException {
- current.limit(current.position());
- current.position(codec == null ? 0 : HEADER_SIZE);
- }
-
- @Override
- public void write(int i) throws IOException {
- if (current == null) {
- getNewInputBuffer();
- }
- if (current.remaining() < 1) {
- spill();
- }
- uncompressedBytes += 1;
- current.put((byte) i);
- }
-
- @Override
- public void write(byte[] bytes, int offset, int length) throws IOException {
- if (current == null) {
- getNewInputBuffer();
- }
- int remaining = Math.min(current.remaining(), length);
- current.put(bytes, offset, remaining);
- uncompressedBytes += remaining;
- length -= remaining;
- while (length != 0) {
- spill();
- offset += remaining;
- remaining = Math.min(current.remaining(), length);
- current.put(bytes, offset, remaining);
- uncompressedBytes += remaining;
- length -= remaining;
- }
- }
-
- private void spill() throws java.io.IOException {
- // if there isn't anything in the current buffer, don't spill
- if (current == null ||
- current.position() == (codec == null ? 0 : HEADER_SIZE)) {
- return;
- }
- flip();
- if (codec == null) {
- receiver.output(current);
- getNewInputBuffer();
- } else {
- if (compressed == null) {
- compressed = getNewOutputBuffer();
- } else if (overflow == null) {
- overflow = getNewOutputBuffer();
- }
- int sizePosn = compressed.position();
- compressed.position(compressed.position() + HEADER_SIZE);
- if (codec.compress(current, compressed, overflow)) {
- uncompressedBytes = 0;
- // move position back to after the header
- current.position(HEADER_SIZE);
- current.limit(current.capacity());
- // find the total bytes in the chunk
- int totalBytes = compressed.position() - sizePosn - HEADER_SIZE;
- if (overflow != null) {
- totalBytes += overflow.position();
- }
- compressedBytes += totalBytes + HEADER_SIZE;
- writeHeader(compressed, sizePosn, totalBytes, false);
- // if we have less than the next header left, spill it.
- if (compressed.remaining() < HEADER_SIZE) {
- compressed.flip();
- receiver.output(compressed);
- compressed = overflow;
- overflow = null;
- }
- } else {
- compressedBytes += uncompressedBytes + HEADER_SIZE;
- uncompressedBytes = 0;
- // we are using the original, but need to spill the current
- // compressed buffer first. So back up to where we started,
- // flip it and add it to done.
- if (sizePosn != 0) {
- compressed.position(sizePosn);
- compressed.flip();
- receiver.output(compressed);
- compressed = null;
- // if we have an overflow, clear it and make it the new compress
- // buffer
- if (overflow != null) {
- overflow.clear();
- compressed = overflow;
- overflow = null;
- }
- } else {
- compressed.clear();
- if (overflow != null) {
- overflow.clear();
- }
- }
-
- // now add the current buffer into the done list and get a new one.
- current.position(0);
- // update the header with the current length
- writeHeader(current, 0, current.limit() - HEADER_SIZE, true);
- receiver.output(current);
- getNewInputBuffer();
- }
- }
- }
-
- @Override
- public void getPosition(PositionRecorder recorder) throws IOException {
- if (codec == null) {
- recorder.addPosition(uncompressedBytes);
- } else {
- recorder.addPosition(compressedBytes);
- recorder.addPosition(uncompressedBytes);
- }
- }
-
- @Override
- public void flush() throws IOException {
- spill();
- if (compressed != null && compressed.position() != 0) {
- compressed.flip();
- receiver.output(compressed);
- }
- compressed = null;
- uncompressedBytes = 0;
- compressedBytes = 0;
- overflow = null;
- current = null;
- }
-
- @Override
- public String toString() {
- return name;
- }
-
- @Override
- public long getBufferSize() {
- long result = 0;
- if (current != null) {
- result += current.capacity();
- }
- if (compressed != null) {
- result += compressed.capacity();
- }
- if (overflow != null) {
- result += overflow.capacity();
- }
- return result;
- }
-
- /**
- * Set suppress flag
- */
- public void suppress() {
- suppress = true;
- }
-
- /**
- * Returns the state of suppress flag
- * @return value of suppress flag
- */
- public boolean isSuppressed() {
- return suppress;
- }
-}
-
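The chunk header described in writeHeader() deserves a worked example: three little-endian bytes hold the chunk length shifted left by one, and the low bit of the first byte marks an uncompressed ("original") chunk. The standalone snippet below reproduces that arithmetic for a 100,000 byte compressed chunk; it mirrors the logic shown above and is purely illustrative.

    import java.nio.ByteBuffer;

    public class ChunkHeaderSketch {
      // Mirrors OutStream.writeHeader: (length << 1) | originalFlag over 3 bytes.
      static void writeHeader(ByteBuffer buf, int position, int len, boolean original) {
        buf.put(position, (byte) ((len << 1) + (original ? 1 : 0)));
        buf.put(position + 1, (byte) (len >> 7));
        buf.put(position + 2, (byte) (len >> 15));
      }

      public static void main(String[] args) {
        ByteBuffer buf = ByteBuffer.allocate(3);
        writeHeader(buf, 0, 100000, false);       // 100,000-byte compressed chunk
        int decoded = ((buf.get(0) & 0xff) >> 1)  // low 7 bits of the length
            | ((buf.get(1) & 0xff) << 7)          // next 8 bits
            | ((buf.get(2) & 0xff) << 15);        // top 8 bits
        boolean isOriginal = (buf.get(0) & 1) != 0;
        System.out.println("len=" + decoded + " original=" + isOriginal);
        // prints: len=100000 original=false
      }
    }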
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java b/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
deleted file mode 100644
index 820674b..0000000
--- a/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ /dev/null
@@ -1,1238 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.math.BigDecimal;
-import java.sql.Date;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.orc.BooleanColumnStatistics;
-import org.apache.orc.Reader;
-import org.apache.orc.RecordReader;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.CompressionCodec;
-import org.apache.orc.DataReader;
-import org.apache.orc.DateColumnStatistics;
-import org.apache.orc.DecimalColumnStatistics;
-import org.apache.orc.DoubleColumnStatistics;
-import org.apache.orc.IntegerColumnStatistics;
-import org.apache.orc.OrcConf;
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.TimestampColumnStatistics;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.io.DiskRange;
-import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.BloomFilterIO;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.hive.ql.util.TimestampUtils;
-import org.apache.hadoop.io.Text;
-import org.apache.orc.OrcProto;
-
-public class RecordReaderImpl implements RecordReader {
- static final Logger LOG = LoggerFactory.getLogger(RecordReaderImpl.class);
- private static final boolean isLogDebugEnabled = LOG.isDebugEnabled();
- private static final Object UNKNOWN_VALUE = new Object();
- protected final Path path;
- private final long firstRow;
- private final List<StripeInformation> stripes =
- new ArrayList<StripeInformation>();
- private OrcProto.StripeFooter stripeFooter;
- private final long totalRowCount;
- private final CompressionCodec codec;
- protected final TypeDescription schema;
- private final List<OrcProto.Type> types;
- private final int bufferSize;
- private final SchemaEvolution evolution;
- // the file included columns indexed by the file's column ids.
- private final boolean[] included;
- private final long rowIndexStride;
- private long rowInStripe = 0;
- private int currentStripe = -1;
- private long rowBaseInStripe = 0;
- private long rowCountInStripe = 0;
- private final Map<StreamName, InStream> streams =
- new HashMap<StreamName, InStream>();
- DiskRangeList bufferChunks = null;
- private final TreeReaderFactory.TreeReader reader;
- private final OrcProto.RowIndex[] indexes;
- private final OrcProto.BloomFilterIndex[] bloomFilterIndices;
- private final SargApplier sargApp;
- // an array recording which row groups aren't skipped
- private boolean[] includedRowGroups = null;
- private final DataReader dataReader;
-
- /**
- * Given a column name, find it in the reader schema and return the matching file column id.
- *
- * @param evolution the mapping from reader to file schema
- * @param columnName the column name to look for
- * @return the file column id or -1 if the column wasn't found
- */
- static int findColumns(SchemaEvolution evolution,
- String columnName) {
- TypeDescription readerSchema = evolution.getReaderBaseSchema();
- List<String> fieldNames = readerSchema.getFieldNames();
- List<TypeDescription> children = readerSchema.getChildren();
- for (int i = 0; i < fieldNames.size(); ++i) {
- if (columnName.equals(fieldNames.get(i))) {
- TypeDescription result = evolution.getFileType(children.get(i).getId());
- return result == null ? -1 : result.getId();
- }
- }
- return -1;
- }
-
- /**
- * Find the mapping from predicate leaves to columns.
- * @param sargLeaves the search argument that we need to map
- * @param evolution the mapping from reader to file schema
- * @return an array mapping the sarg leaves to file column ids
- */
- public static int[] mapSargColumnsToOrcInternalColIdx(List<PredicateLeaf> sargLeaves,
- SchemaEvolution evolution) {
- int[] result = new int[sargLeaves.size()];
- Arrays.fill(result, -1);
- for(int i=0; i < result.length; ++i) {
- String colName = sargLeaves.get(i).getColumnName();
- result[i] = findColumns(evolution, colName);
- }
- return result;
- }
-
- protected RecordReaderImpl(ReaderImpl fileReader,
- Reader.Options options) throws IOException {
- boolean[] readerIncluded = options.getInclude();
- if (options.getSchema() == null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Reader schema not provided -- using file schema " +
- fileReader.getSchema());
- }
- evolution = new SchemaEvolution(fileReader.getSchema(), readerIncluded);
- } else {
-
- // Now that we are creating a record reader for a file, validate that the schema to read
- // is compatible with the file schema.
- //
- evolution = new SchemaEvolution(fileReader.getSchema(),
- options.getSchema(), readerIncluded);
- if (LOG.isDebugEnabled() && evolution.hasConversion()) {
- LOG.debug("ORC file " + fileReader.path.toString() +
- " has data type conversion --\n" +
- "reader schema: " + options.getSchema().toString() + "\n" +
- "file schema: " + fileReader.getSchema());
- }
- }
- this.schema = evolution.getReaderSchema();
- this.path = fileReader.path;
- this.codec = fileReader.codec;
- this.types = fileReader.types;
- this.bufferSize = fileReader.bufferSize;
- this.rowIndexStride = fileReader.rowIndexStride;
- SearchArgument sarg = options.getSearchArgument();
- if (sarg != null && rowIndexStride != 0) {
- sargApp = new SargApplier(sarg, options.getColumnNames(), rowIndexStride,
- evolution);
- } else {
- sargApp = null;
- }
- long rows = 0;
- long skippedRows = 0;
- long offset = options.getOffset();
- long maxOffset = options.getMaxOffset();
- for(StripeInformation stripe: fileReader.getStripes()) {
- long stripeStart = stripe.getOffset();
- if (offset > stripeStart) {
- skippedRows += stripe.getNumberOfRows();
- } else if (stripeStart < maxOffset) {
- this.stripes.add(stripe);
- rows += stripe.getNumberOfRows();
- }
- }
-
- Boolean zeroCopy = options.getUseZeroCopy();
- if (zeroCopy == null) {
- zeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(fileReader.conf);
- }
- if (options.getDataReader() != null) {
- this.dataReader = options.getDataReader();
- } else {
- this.dataReader = RecordReaderUtils.createDefaultDataReader(
- DataReaderProperties.builder()
- .withBufferSize(bufferSize)
- .withCompression(fileReader.compressionKind)
- .withFileSystem(fileReader.fileSystem)
- .withPath(fileReader.path)
- .withTypeCount(types.size())
- .withZeroCopy(zeroCopy)
- .build());
- }
- this.dataReader.open();
-
- firstRow = skippedRows;
- totalRowCount = rows;
- Boolean skipCorrupt = options.getSkipCorruptRecords();
- if (skipCorrupt == null) {
- skipCorrupt = OrcConf.SKIP_CORRUPT_DATA.getBoolean(fileReader.conf);
- }
-
- reader = TreeReaderFactory.createTreeReader(evolution.getReaderSchema(),
- evolution, readerIncluded, skipCorrupt);
- indexes = new OrcProto.RowIndex[types.size()];
- bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
- this.included = evolution.getFileIncluded();
- advanceToNextRow(reader, 0L, true);
- }
-
- public static final class PositionProviderImpl implements PositionProvider {
- private final OrcProto.RowIndexEntry entry;
- private int index;
-
- public PositionProviderImpl(OrcProto.RowIndexEntry entry) {
- this(entry, 0);
- }
-
- public PositionProviderImpl(OrcProto.RowIndexEntry entry, int startPos) {
- this.entry = entry;
- this.index = startPos;
- }
-
- @Override
- public long getNext() {
- return entry.getPositions(index++);
- }
-
- @Override
- public String toString() {
- return "{" + entry.getPositionsList() + "; " + index + "}";
- }
- }
-
- public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe
- ) throws IOException {
- return dataReader.readStripeFooter(stripe);
- }
-
- enum Location {
- BEFORE, MIN, MIDDLE, MAX, AFTER
- }
-
- /**
- * Given a point and min and max, determine if the point is before, at the
- * min, in the middle, at the max, or after the range.
- * @param point the point to test
- * @param min the minimum point
- * @param max the maximum point
- * @param <T> the type of the comparison
- * @return the location of the point
- */
- static <T> Location compareToRange(Comparable<T> point, T min, T max) {
- int minCompare = point.compareTo(min);
- if (minCompare < 0) {
- return Location.BEFORE;
- } else if (minCompare == 0) {
- return Location.MIN;
- }
- int maxCompare = point.compareTo(max);
- if (maxCompare > 0) {
- return Location.AFTER;
- } else if (maxCompare == 0) {
- return Location.MAX;
- }
- return Location.MIDDLE;
- }
-
- /**
- * Get the maximum value out of an index entry.
- * @param index
- * the index entry
- * @return the object for the maximum value or null if there isn't one
- */
- static Object getMax(ColumnStatistics index) {
- if (index instanceof IntegerColumnStatistics) {
- return ((IntegerColumnStatistics) index).getMaximum();
- } else if (index instanceof DoubleColumnStatistics) {
- return ((DoubleColumnStatistics) index).getMaximum();
- } else if (index instanceof StringColumnStatistics) {
- return ((StringColumnStatistics) index).getMaximum();
- } else if (index instanceof DateColumnStatistics) {
- return ((DateColumnStatistics) index).getMaximum();
- } else if (index instanceof DecimalColumnStatistics) {
- return ((DecimalColumnStatistics) index).getMaximum();
- } else if (index instanceof TimestampColumnStatistics) {
- return ((TimestampColumnStatistics) index).getMaximum();
- } else if (index instanceof BooleanColumnStatistics) {
- if (((BooleanColumnStatistics)index).getTrueCount()!=0) {
- return Boolean.TRUE;
- } else {
- return Boolean.FALSE;
- }
- } else {
- return null;
- }
- }
-
- /**
- * Get the minimum value out of an index entry.
- * @param index
- * the index entry
- * @return the object for the minimum value or null if there isn't one
- */
- static Object getMin(ColumnStatistics index) {
- if (index instanceof IntegerColumnStatistics) {
- return ((IntegerColumnStatistics) index).getMinimum();
- } else if (index instanceof DoubleColumnStatistics) {
- return ((DoubleColumnStatistics) index).getMinimum();
- } else if (index instanceof StringColumnStatistics) {
- return ((StringColumnStatistics) index).getMinimum();
- } else if (index instanceof DateColumnStatistics) {
- return ((DateColumnStatistics) index).getMinimum();
- } else if (index instanceof DecimalColumnStatistics) {
- return ((DecimalColumnStatistics) index).getMinimum();
- } else if (index instanceof TimestampColumnStatistics) {
- return ((TimestampColumnStatistics) index).getMinimum();
- } else if (index instanceof BooleanColumnStatistics) {
- if (((BooleanColumnStatistics)index).getFalseCount()!=0) {
- return Boolean.FALSE;
- } else {
- return Boolean.TRUE;
- }
- } else {
- return UNKNOWN_VALUE; // null is not safe here
- }
- }
-
- /**
- * Evaluate a predicate with respect to the statistics from the column
- * that is referenced in the predicate.
- * @param statsProto the statistics for the column mentioned in the predicate
- * @param predicate the leaf predicate we need to evaluate
- * @param bloomFilter
- * @return the set of truth values that may be returned for the given
- * predicate.
- */
- static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
- PredicateLeaf predicate, OrcProto.BloomFilter bloomFilter) {
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(statsProto);
- Object minValue = getMin(cs);
- Object maxValue = getMax(cs);
- BloomFilterIO bf = null;
- if (bloomFilter != null) {
- bf = new BloomFilterIO(bloomFilter);
- }
- return evaluatePredicateRange(predicate, minValue, maxValue, cs.hasNull(), bf);
- }
-
- /**
- * Evaluate a predicate with respect to the statistics from the column
- * that is referenced in the predicate.
- * @param stats the statistics for the column mentioned in the predicate
- * @param predicate the leaf predicate we need to evaluate
- * @return the set of truth values that may be returned for the given
- * predicate.
- */
- public static TruthValue evaluatePredicate(ColumnStatistics stats,
- PredicateLeaf predicate,
- BloomFilterIO bloomFilter) {
- Object minValue = getMin(stats);
- Object maxValue = getMax(stats);
- return evaluatePredicateRange(predicate, minValue, maxValue, stats.hasNull(), bloomFilter);
- }
-
- static TruthValue evaluatePredicateRange(PredicateLeaf predicate, Object min,
- Object max, boolean hasNull, BloomFilterIO bloomFilter) {
- // if we didn't have any values, everything must have been null
- if (min == null) {
- if (predicate.getOperator() == PredicateLeaf.Operator.IS_NULL) {
- return TruthValue.YES;
- } else {
- return TruthValue.NULL;
- }
- } else if (min == UNKNOWN_VALUE) {
- return TruthValue.YES_NO_NULL;
- }
-
- // TODO: Enabling PPD for timestamp requires ORC-101 and ORC-135
- if (min != null && min instanceof Timestamp) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Not using predication pushdown on {} because it doesn't " +
- "include ORC-135.", predicate.getColumnName());
- }
- return TruthValue.YES_NO_NULL;
- }
-
- TruthValue result;
- Object baseObj = predicate.getLiteral();
- try {
- // Predicate object and stats objects are converted to the type of the predicate object.
- Object minValue = getBaseObjectForComparison(predicate.getType(), min);
- Object maxValue = getBaseObjectForComparison(predicate.getType(), max);
- Object predObj = getBaseObjectForComparison(predicate.getType(), baseObj);
-
- result = evaluatePredicateMinMax(predicate, predObj, minValue, maxValue, hasNull);
- if (shouldEvaluateBloomFilter(predicate, result, bloomFilter)) {
- result = evaluatePredicateBloomFilter(predicate, predObj, bloomFilter, hasNull);
- }
- // in case of a failed conversion, return the default YES_NO_NULL truth value
- } catch (Exception e) {
- if (LOG.isDebugEnabled()) {
- final String statsType = min == null ?
- (max == null ? "null" : max.getClass().getSimpleName()) :
- min.getClass().getSimpleName();
- final String predicateType = baseObj == null ? "null" : baseObj.getClass().getSimpleName();
- final String reason = e.getClass().getSimpleName() + " when evaluating predicate." +
- " Skipping ORC PPD." +
- " Exception: " + e.getMessage() +
- " StatsType: " + statsType +
- " PredicateType: " + predicateType;
- LOG.debug(reason);
- }
- if (predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS) || !hasNull) {
- result = TruthValue.YES_NO;
- } else {
- result = TruthValue.YES_NO_NULL;
- }
- }
- return result;
- }
-
- private static boolean shouldEvaluateBloomFilter(PredicateLeaf predicate,
- TruthValue result, BloomFilterIO bloomFilter) {
- // evaluate bloom filter only when
- // 1) Bloom filter is available
- // 2) Min/Max evaluation yield YES or MAYBE
- // 3) Predicate is EQUALS or IN list
- if (bloomFilter != null
- && result != TruthValue.NO_NULL && result != TruthValue.NO
- && (predicate.getOperator().equals(PredicateLeaf.Operator.EQUALS)
- || predicate.getOperator().equals(PredicateLeaf.Operator.NULL_SAFE_EQUALS)
- || predicate.getOperator().equals(PredicateLeaf.Operator.IN))) {
- return true;
- }
- return false;
- }
-
- private static TruthValue evaluatePredicateMinMax(PredicateLeaf predicate, Object predObj,
- Object minValue,
- Object maxValue,
- boolean hasNull) {
- Location loc;
-
- switch (predicate.getOperator()) {
- case NULL_SAFE_EQUALS:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.BEFORE || loc == Location.AFTER) {
- return TruthValue.NO;
- } else {
- return TruthValue.YES_NO;
- }
- case EQUALS:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (minValue.equals(maxValue) && loc == Location.MIN) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- } else if (loc == Location.BEFORE || loc == Location.AFTER) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- case LESS_THAN:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.AFTER) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- } else if (loc == Location.BEFORE || loc == Location.MIN) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- case LESS_THAN_EQUALS:
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.AFTER || loc == Location.MAX) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- } else if (loc == Location.BEFORE) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- case IN:
- if (minValue.equals(maxValue)) {
- // for a single value, look through to see if that value is in the
- // set
- for (Object arg : predicate.getLiteralList()) {
- predObj = getBaseObjectForComparison(predicate.getType(), arg);
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.MIN) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- }
- }
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- // are all of the values outside of the range?
- for (Object arg : predicate.getLiteralList()) {
- predObj = getBaseObjectForComparison(predicate.getType(), arg);
- loc = compareToRange((Comparable) predObj, minValue, maxValue);
- if (loc == Location.MIN || loc == Location.MIDDLE ||
- loc == Location.MAX) {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- }
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- }
- case BETWEEN:
- List<Object> args = predicate.getLiteralList();
- Object predObj1 = getBaseObjectForComparison(predicate.getType(), args.get(0));
-
- loc = compareToRange((Comparable) predObj1, minValue, maxValue);
- if (loc == Location.BEFORE || loc == Location.MIN) {
- Object predObj2 = getBaseObjectForComparison(predicate.getType(), args.get(1));
-
- Location loc2 = compareToRange((Comparable) predObj2, minValue, maxValue);
- if (loc2 == Location.AFTER || loc2 == Location.MAX) {
- return hasNull ? TruthValue.YES_NULL : TruthValue.YES;
- } else if (loc2 == Location.BEFORE) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- } else if (loc == Location.AFTER) {
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- } else {
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- case IS_NULL:
- // min = null condition above handles the all-nulls YES case
- return hasNull ? TruthValue.YES_NO : TruthValue.NO;
- default:
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- }
-
- private static TruthValue evaluatePredicateBloomFilter(PredicateLeaf predicate,
- final Object predObj, BloomFilterIO bloomFilter, boolean hasNull) {
- switch (predicate.getOperator()) {
- case NULL_SAFE_EQUALS:
- // null safe equals does not return *_NULL variant. So set hasNull to false
- return checkInBloomFilter(bloomFilter, predObj, false);
- case EQUALS:
- return checkInBloomFilter(bloomFilter, predObj, hasNull);
- case IN:
- for (Object arg : predicate.getLiteralList()) {
- // if at least one value in the IN list exists in the bloom filter, qualify the row group/stripe
- Object predObjItem = getBaseObjectForComparison(predicate.getType(), arg);
- TruthValue result = checkInBloomFilter(bloomFilter, predObjItem, hasNull);
- if (result == TruthValue.YES_NO_NULL || result == TruthValue.YES_NO) {
- return result;
- }
- }
- return hasNull ? TruthValue.NO_NULL : TruthValue.NO;
- default:
- return hasNull ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
- }
- }
-
- private static TruthValue checkInBloomFilter(BloomFilterIO bf, Object predObj, boolean hasNull) {
- TruthValue result = hasNull ? TruthValue.NO_NULL : TruthValue.NO;
-
- if (predObj instanceof Long) {
- if (bf.testLong(((Long) predObj).longValue())) {
- result = TruthValue.YES_NO_NULL;
- }
- } else if (predObj instanceof Double) {
- if (bf.testDouble(((Double) predObj).doubleValue())) {
- result = TruthValue.YES_NO_NULL;
- }
- } else if (predObj instanceof String || predObj instanceof Text ||
- predObj instanceof HiveDecimalWritable ||
- predObj instanceof BigDecimal) {
- if (bf.testString(predObj.toString())) {
- result = TruthValue.YES_NO_NULL;
- }
- } else if (predObj instanceof Timestamp) {
- if (bf.testLong(((Timestamp) predObj).getTime())) {
- result = TruthValue.YES_NO_NULL;
- }
- } else if (predObj instanceof Date) {
- if (bf.testLong(DateWritable.dateToDays((Date) predObj))) {
- result = TruthValue.YES_NO_NULL;
- }
- } else {
- // if the predicate object is null and if hasNull says there are no nulls then return NO
- if (predObj == null && !hasNull) {
- result = TruthValue.NO;
- } else {
- result = TruthValue.YES_NO_NULL;
- }
- }
-
- if (result == TruthValue.YES_NO_NULL && !hasNull) {
- result = TruthValue.YES_NO;
- }
-
- if (LOG.isDebugEnabled()) {
- LOG.debug("Bloom filter evaluation: " + result.toString());
- }
-
- return result;
- }
-
- private static Object getBaseObjectForComparison(PredicateLeaf.Type type, Object obj) {
- if (obj == null) {
- return null;
- }
- switch (type) {
- case BOOLEAN:
- if (obj instanceof Boolean) {
- return obj;
- } else {
- // will only be true if the string conversion yields "true"; all other values are
- // considered false
- return Boolean.valueOf(obj.toString());
- }
- case DATE:
- if (obj instanceof Date) {
- return obj;
- } else if (obj instanceof String) {
- return Date.valueOf((String) obj);
- } else if (obj instanceof Timestamp) {
- return DateWritable.timeToDate(((Timestamp) obj).getTime() / 1000L);
- }
- // always string, but prevent the comparison to numbers (are they days/seconds/milliseconds?)
- break;
- case DECIMAL:
- if (obj instanceof Boolean) {
- return new HiveDecimalWritable(((Boolean) obj).booleanValue() ?
- HiveDecimal.ONE : HiveDecimal.ZERO);
- } else if (obj instanceof Integer) {
- return new HiveDecimalWritable(((Integer) obj).intValue());
- } else if (obj instanceof Long) {
- return new HiveDecimalWritable(((Long) obj));
- } else if (obj instanceof Float || obj instanceof Double ||
- obj instanceof String) {
- return new HiveDecimalWritable(obj.toString());
- } else if (obj instanceof BigDecimal) {
- return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) obj));
- } else if (obj instanceof HiveDecimal) {
- return new HiveDecimalWritable((HiveDecimal) obj);
- } else if (obj instanceof HiveDecimalWritable) {
- return obj;
- } else if (obj instanceof Timestamp) {
- return new HiveDecimalWritable(Double.toString(
- TimestampUtils.getDouble((Timestamp) obj)));
- }
- break;
- case FLOAT:
- if (obj instanceof Number) {
- // widening conversion
- return ((Number) obj).doubleValue();
- } else if (obj instanceof HiveDecimal) {
- return ((HiveDecimal) obj).doubleValue();
- } else if (obj instanceof String) {
- return Double.valueOf(obj.toString());
- } else if (obj instanceof Timestamp) {
- return TimestampUtils.getDouble((Timestamp) obj);
- } else if (obj instanceof HiveDecimal) {
- return ((HiveDecimal) obj).doubleValue();
- } else if (obj instanceof BigDecimal) {
- return ((BigDecimal) obj).doubleValue();
- }
- break;
- case LONG:
- if (obj instanceof Number) {
- // widening conversion
- return ((Number) obj).longValue();
- } else if (obj instanceof HiveDecimal) {
- return ((HiveDecimal) obj).longValue();
- } else if (obj instanceof String) {
- return Long.valueOf(obj.toString());
- }
- break;
- case STRING:
- if (obj != null) {
- return (obj.toString());
- }
- break;
- case TIMESTAMP:
- if (obj instanceof Timestamp) {
- return obj;
- } else if (obj instanceof Integer) {
- return new Timestamp(((Number) obj).longValue());
- } else if (obj instanceof Float) {
- return TimestampUtils.doubleToTimestamp(((Float) obj).doubleValue());
- } else if (obj instanceof Double) {
- return TimestampUtils.doubleToTimestamp(((Double) obj).doubleValue());
- } else if (obj instanceof HiveDecimal) {
- return TimestampUtils.decimalToTimestamp((HiveDecimal) obj);
- } else if (obj instanceof HiveDecimalWritable) {
- return TimestampUtils.decimalToTimestamp(((HiveDecimalWritable) obj).getHiveDecimal());
- } else if (obj instanceof Date) {
- return new Timestamp(((Date) obj).getTime());
- }
- // float/double conversion to timestamp is interpreted as seconds whereas integer conversion
- // to timestamp is interpreted as milliseconds by default. The integer to timestamp casting
- // is also config driven. The filter operator changes its promotion based on config:
- // "int.timestamp.conversion.in.seconds". Disable PPD for integer cases.
- break;
- default:
- break;
- }
-
- throw new IllegalArgumentException(String.format(
- "ORC SARGS could not convert from %s to %s", obj == null ? "(null)" : obj.getClass()
- .getSimpleName(), type));
- }
-
- public static class SargApplier {
- public final static boolean[] READ_ALL_RGS = null;
- public final static boolean[] READ_NO_RGS = new boolean[0];
-
- private final SearchArgument sarg;
- private final List<PredicateLeaf> sargLeaves;
- private final int[] filterColumns;
- private final long rowIndexStride;
- // same as the above array, but indices are set to true
- private final boolean[] sargColumns;
- private SchemaEvolution evolution;
-
- public SargApplier(SearchArgument sarg, String[] columnNames,
- long rowIndexStride,
- SchemaEvolution evolution) {
- this.sarg = sarg;
- sargLeaves = sarg.getLeaves();
- filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, evolution);
- this.rowIndexStride = rowIndexStride;
- // included will not be null, row options will fill the array with trues if null
- sargColumns = new boolean[evolution.getFileIncluded().length];
- for (int i : filterColumns) {
- // filter columns may have -1 as index which could be partition column in SARG.
- if (i > 0) {
- sargColumns[i] = true;
- }
- }
- this.evolution = evolution;
- }
-
- /**
- * Pick the row groups that we need to load from the current stripe.
- *
- * @return an array with a boolean for each row group or null if all of the
- * row groups must be read.
- * @throws IOException
- */
- public boolean[] pickRowGroups(StripeInformation stripe, OrcProto.RowIndex[] indexes,
- OrcProto.BloomFilterIndex[] bloomFilterIndices, boolean returnNone) throws IOException {
- long rowsInStripe = stripe.getNumberOfRows();
- int groupsInStripe = (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
- boolean[] result = new boolean[groupsInStripe]; // TODO: avoid alloc?
- TruthValue[] leafValues = new TruthValue[sargLeaves.size()];
- boolean hasSelected = false, hasSkipped = false;
- for (int rowGroup = 0; rowGroup < result.length; ++rowGroup) {
- for (int pred = 0; pred < leafValues.length; ++pred) {
- int columnIx = filterColumns[pred];
- if (columnIx != -1) {
- if (indexes[columnIx] == null) {
- throw new AssertionError("Index is not populated for " + columnIx);
- }
- OrcProto.RowIndexEntry entry = indexes[columnIx].getEntry(rowGroup);
- if (entry == null) {
- throw new AssertionError("RG is not populated for " + columnIx + " rg " + rowGroup);
- }
- OrcProto.ColumnStatistics stats = entry.getStatistics();
- OrcProto.BloomFilter bf = null;
- if (bloomFilterIndices != null && bloomFilterIndices[columnIx] != null) {
- bf = bloomFilterIndices[columnIx].getBloomFilter(rowGroup);
- }
- if (evolution != null && evolution.isPPDSafeConversion(columnIx)) {
- leafValues[pred] = evaluatePredicateProto(stats, sargLeaves.get(pred), bf);
- } else {
- leafValues[pred] = TruthValue.YES_NO_NULL;
- }
- if (LOG.isTraceEnabled()) {
- LOG.trace("Stats = " + stats);
- LOG.trace("Setting " + sargLeaves.get(pred) + " to " + leafValues[pred]);
- }
- } else {
- // the column is a virtual column
- leafValues[pred] = TruthValue.YES_NO_NULL;
- }
- }
- result[rowGroup] = sarg.evaluate(leafValues).isNeeded();
- hasSelected = hasSelected || result[rowGroup];
- hasSkipped = hasSkipped || (!result[rowGroup]);
- if (LOG.isDebugEnabled()) {
- LOG.debug("Row group " + (rowIndexStride * rowGroup) + " to " +
- (rowIndexStride * (rowGroup + 1) - 1) + " is " +
- (result[rowGroup] ? "" : "not ") + "included.");
- }
- }
-
- return hasSkipped ? ((hasSelected || !returnNone) ? result : READ_NO_RGS) : READ_ALL_RGS;
- }
- }
-
- /**
- * Pick the row groups that we need to load from the current stripe.
- *
- * @return an array with a boolean for each row group or null if all of the
- * row groups must be read.
- * @throws IOException
- */
- protected boolean[] pickRowGroups() throws IOException {
- // if we don't have a sarg or indexes, we read everything
- if (sargApp == null) {
- return null;
- }
- readRowIndex(currentStripe, included, sargApp.sargColumns);
- return sargApp.pickRowGroups(stripes.get(currentStripe), indexes, bloomFilterIndices, false);
- }
-
- private void clearStreams() {
- // explicit close of all streams to de-ref ByteBuffers
- for (InStream is : streams.values()) {
- is.close();
- }
- if (bufferChunks != null) {
- if (dataReader.isTrackingDiskRanges()) {
- for (DiskRangeList range = bufferChunks; range != null; range = range.next) {
- if (!(range instanceof BufferChunk)) {
- continue;
- }
- dataReader.releaseBuffer(((BufferChunk) range).getChunk());
- }
- }
- }
- bufferChunks = null;
- streams.clear();
- }
-
- /**
- * Read the current stripe into memory.
- *
- * @throws IOException
- */
- private void readStripe() throws IOException {
- StripeInformation stripe = beginReadStripe();
- includedRowGroups = pickRowGroups();
-
- // move forward to the first unskipped row
- if (includedRowGroups != null) {
- while (rowInStripe < rowCountInStripe &&
- !includedRowGroups[(int) (rowInStripe / rowIndexStride)]) {
- rowInStripe = Math.min(rowCountInStripe, rowInStripe + rowIndexStride);
- }
- }
-
- // if we haven't skipped the whole stripe, read the data
- if (rowInStripe < rowCountInStripe) {
- // if we aren't projecting columns or filtering rows, just read it all
- if (included == null && includedRowGroups == null) {
- readAllDataStreams(stripe);
- } else {
- readPartialDataStreams(stripe);
- }
- reader.startStripe(streams, stripeFooter);
- // if we skipped the first row group, move the pointers forward
- if (rowInStripe != 0) {
- seekToRowEntry(reader, (int) (rowInStripe / rowIndexStride));
- }
- }
- }
-
- private StripeInformation beginReadStripe() throws IOException {
- StripeInformation stripe = stripes.get(currentStripe);
- stripeFooter = readStripeFooter(stripe);
- clearStreams();
- // setup the position in the stripe
- rowCountInStripe = stripe.getNumberOfRows();
- rowInStripe = 0;
- rowBaseInStripe = 0;
- for (int i = 0; i < currentStripe; ++i) {
- rowBaseInStripe += stripes.get(i).getNumberOfRows();
- }
- // reset all of the indexes
- for (int i = 0; i < indexes.length; ++i) {
- indexes[i] = null;
- }
- return stripe;
- }
-
- private void readAllDataStreams(StripeInformation stripe) throws IOException {
- long start = stripe.getIndexLength();
- long end = start + stripe.getDataLength();
- // explicitly trigger 1 big read
- DiskRangeList toRead = new DiskRangeList(start, end);
- bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
- List<OrcProto.Stream> streamDescriptions = stripeFooter.getStreamsList();
- createStreams(streamDescriptions, bufferChunks, null, codec, bufferSize, streams);
- }
-
- /**
- * Plan the ranges of the file that we need to read given the list of
- * columns and row groups.
- *
- * @param streamList the list of streams available
- * @param indexes the indexes that have been loaded
- * @param includedColumns which columns are needed
- * @param includedRowGroups which row groups are needed
- * @param isCompressed does the file have generic compression
- * @param encodings the encodings for each column
- * @param types the types of the columns
- * @param compressionSize the compression block size
- * @return the list of disk ranges that will be loaded
- */
- static DiskRangeList planReadPartialDataStreams
- (List<OrcProto.Stream> streamList,
- OrcProto.RowIndex[] indexes,
- boolean[] includedColumns,
- boolean[] includedRowGroups,
- boolean isCompressed,
- List<OrcProto.ColumnEncoding> encodings,
- List<OrcProto.Type> types,
- int compressionSize,
- boolean doMergeBuffers) {
- long offset = 0;
- // figure out which columns have a present stream
- boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types);
- CreateHelper list = new CreateHelper();
- for (OrcProto.Stream stream : streamList) {
- long length = stream.getLength();
- int column = stream.getColumn();
- OrcProto.Stream.Kind streamKind = stream.getKind();
- // since stream kind is optional, first check if it exists
- if (stream.hasKind() &&
- (StreamName.getArea(streamKind) == StreamName.Area.DATA) &&
- (column < includedColumns.length && includedColumns[column])) {
- // if we aren't filtering or it is a dictionary, load it.
- if (includedRowGroups == null
- || RecordReaderUtils.isDictionary(streamKind, encodings.get(column))) {
- RecordReaderUtils.addEntireStreamToRanges(offset, length, list, doMergeBuffers);
- } else {
- RecordReaderUtils.addRgFilteredStreamToRanges(stream, includedRowGroups,
- isCompressed, indexes[column], encodings.get(column), types.get(column),
- compressionSize, hasNull[column], offset, length, list, doMergeBuffers);
- }
- }
- offset += length;
- }
- return list.extract();
- }
-
- void createStreams(List<OrcProto.Stream> streamDescriptions,
- DiskRangeList ranges,
- boolean[] includeColumn,
- CompressionCodec codec,
- int bufferSize,
- Map<StreamName, InStream> streams) throws IOException {
- long streamOffset = 0;
- for (OrcProto.Stream streamDesc : streamDescriptions) {
- int column = streamDesc.getColumn();
- if ((includeColumn != null &&
- (column < included.length && !includeColumn[column])) ||
- streamDesc.hasKind() &&
- (StreamName.getArea(streamDesc.getKind()) != StreamName.Area.DATA)) {
- streamOffset += streamDesc.getLength();
- continue;
- }
- List<DiskRange> buffers = RecordReaderUtils.getStreamBuffers(
- ranges, streamOffset, streamDesc.getLength());
- StreamName name = new StreamName(column, streamDesc.getKind());
- streams.put(name, InStream.create(name.toString(), buffers,
- streamDesc.getLength(), codec, bufferSize));
- streamOffset += streamDesc.getLength();
- }
- }
-
- private void readPartialDataStreams(StripeInformation stripe) throws IOException {
- List<OrcProto.Stream> streamList = stripeFooter.getStreamsList();
- DiskRangeList toRead = planReadPartialDataStreams(streamList,
- indexes, included, includedRowGroups, codec != null,
- stripeFooter.getColumnsList(), types, bufferSize, true);
- if (LOG.isDebugEnabled()) {
- LOG.debug("chunks = " + RecordReaderUtils.stringifyDiskRanges(toRead));
- }
- bufferChunks = dataReader.readFileData(toRead, stripe.getOffset(), false);
- if (LOG.isDebugEnabled()) {
- LOG.debug("merge = " + RecordReaderUtils.stringifyDiskRanges(bufferChunks));
- }
-
- createStreams(streamList, bufferChunks, included, codec, bufferSize, streams);
- }
-
- /**
- * Read the next stripe until we find a row that we don't skip.
- *
- * @throws IOException
- */
- private void advanceStripe() throws IOException {
- rowInStripe = rowCountInStripe;
- while (rowInStripe >= rowCountInStripe &&
- currentStripe < stripes.size() - 1) {
- currentStripe += 1;
- readStripe();
- }
- }
-
- /**
- * Skip over rows that we aren't selecting, so that the next row is
- * one that we will read.
- *
- * @param nextRow the row we want to go to
- * @throws IOException
- */
- private boolean advanceToNextRow(
- TreeReaderFactory.TreeReader reader, long nextRow, boolean canAdvanceStripe)
- throws IOException {
- long nextRowInStripe = nextRow - rowBaseInStripe;
- // check for row skipping
- if (rowIndexStride != 0 &&
- includedRowGroups != null &&
- nextRowInStripe < rowCountInStripe) {
- int rowGroup = (int) (nextRowInStripe / rowIndexStride);
- if (!includedRowGroups[rowGroup]) {
- while (rowGroup < includedRowGroups.length && !includedRowGroups[rowGroup]) {
- rowGroup += 1;
- }
- if (rowGroup >= includedRowGroups.length) {
- if (canAdvanceStripe) {
- advanceStripe();
- }
- return canAdvanceStripe;
- }
- nextRowInStripe = Math.min(rowCountInStripe, rowGroup * rowIndexStride);
- }
- }
- if (nextRowInStripe >= rowCountInStripe) {
- if (canAdvanceStripe) {
- advanceStripe();
- }
- return canAdvanceStripe;
- }
- if (nextRowInStripe != rowInStripe) {
- if (rowIndexStride != 0) {
- int rowGroup = (int) (nextRowInStripe / rowIndexStride);
- seekToRowEntry(reader, rowGroup);
- reader.skipRows(nextRowInStripe - rowGroup * rowIndexStride);
- } else {
- reader.skipRows(nextRowInStripe - rowInStripe);
- }
- rowInStripe = nextRowInStripe;
- }
- return true;
- }
-
- @Override
- public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
- try {
- if (rowInStripe >= rowCountInStripe) {
- currentStripe += 1;
- if (currentStripe >= stripes.size()) {
- batch.size = 0;
- return false;
- }
- readStripe();
- }
-
- int batchSize = computeBatchSize(batch.getMaxSize());
-
- rowInStripe += batchSize;
- reader.setVectorColumnCount(batch.getDataColumnCount());
- reader.nextBatch(batch, batchSize);
- batch.selectedInUse = false;
- batch.size = batchSize;
- advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true);
- return batch.size != 0;
- } catch (IOException e) {
- // Rethrow exception with file name in log message
- throw new IOException("Error reading file: " + path, e);
- }
- }
-
- private int computeBatchSize(long targetBatchSize) {
- final int batchSize;
- // In case of PPD, the batch size should be aware of row group boundaries. If only a subset
- // of row groups is selected, the marker position is set to the end of the range (the subset
- // of row groups within the stripe). Computing the batch size from the marker position ensures
- // that it respects row group boundaries and will not overflow when reading rows.
- // An illustration of this case is at https://issues.apache.org/jira/browse/HIVE-6287
- if (rowIndexStride != 0 && includedRowGroups != null && rowInStripe < rowCountInStripe) {
- int startRowGroup = (int) (rowInStripe / rowIndexStride);
- if (!includedRowGroups[startRowGroup]) {
- while (startRowGroup < includedRowGroups.length && !includedRowGroups[startRowGroup]) {
- startRowGroup += 1;
- }
- }
-
- int endRowGroup = startRowGroup;
- while (endRowGroup < includedRowGroups.length && includedRowGroups[endRowGroup]) {
- endRowGroup += 1;
- }
-
- final long markerPosition =
- (endRowGroup * rowIndexStride) < rowCountInStripe ? (endRowGroup * rowIndexStride)
- : rowCountInStripe;
- batchSize = (int) Math.min(targetBatchSize, (markerPosition - rowInStripe));
-
- if (isLogDebugEnabled && batchSize < targetBatchSize) {
- LOG.debug("markerPosition: " + markerPosition + " batchSize: " + batchSize);
- }
- } else {
- batchSize = (int) Math.min(targetBatchSize, (rowCountInStripe - rowInStripe));
- }
- return batchSize;
- }
-
- @Override
- public void close() throws IOException {
- clearStreams();
- dataReader.close();
- }
-
- @Override
- public long getRowNumber() {
- return rowInStripe + rowBaseInStripe + firstRow;
- }
-
- /**
- * Return the fraction of rows that have been read from the selected
- * section of the file.
- *
- * @return fraction between 0.0 and 1.0 of rows consumed
- */
- @Override
- public float getProgress() {
- return ((float) rowBaseInStripe + rowInStripe) / totalRowCount;
- }
-
- private int findStripe(long rowNumber) {
- for (int i = 0; i < stripes.size(); i++) {
- StripeInformation stripe = stripes.get(i);
- if (stripe.getNumberOfRows() > rowNumber) {
- return i;
- }
- rowNumber -= stripe.getNumberOfRows();
- }
- throw new IllegalArgumentException("Seek after the end of reader range");
- }
-
- public OrcIndex readRowIndex(int stripeIndex, boolean[] included,
- boolean[] sargColumns) throws IOException {
- return readRowIndex(stripeIndex, included, null, null, sargColumns);
- }
-
- public OrcIndex readRowIndex(int stripeIndex, boolean[] included,
- OrcProto.RowIndex[] indexes,
- OrcProto.BloomFilterIndex[] bloomFilterIndex,
- boolean[] sargColumns) throws IOException {
- StripeInformation stripe = stripes.get(stripeIndex);
- OrcProto.StripeFooter stripeFooter = null;
- // if this is the current stripe, use the cached objects.
- if (stripeIndex == currentStripe) {
- stripeFooter = this.stripeFooter;
- indexes = indexes == null ? this.indexes : indexes;
- bloomFilterIndex = bloomFilterIndex == null ? this.bloomFilterIndices : bloomFilterIndex;
- sargColumns = sargColumns == null ?
- (sargApp == null ? null : sargApp.sargColumns) : sargColumns;
- }
- return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, sargColumns,
- bloomFilterIndex);
- }
-
- private void seekToRowEntry(TreeReaderFactory.TreeReader reader, int rowEntry)
- throws IOException {
- PositionProvider[] index = new PositionProvider[indexes.length];
- for (int i = 0; i < indexes.length; ++i) {
- if (indexes[i] != null) {
- index[i] = new PositionProviderImpl(indexes[i].getEntry(rowEntry));
- }
- }
- reader.seek(index);
- }
-
- @Override
- public void seekToRow(long rowNumber) throws IOException {
- if (rowNumber < 0) {
- throw new IllegalArgumentException("Seek to a negative row number " +
- rowNumber);
- } else if (rowNumber < firstRow) {
- throw new IllegalArgumentException("Seek before reader range " +
- rowNumber);
- }
- // convert to our internal form (rows from the beginning of slice)
- rowNumber -= firstRow;
-
- // move to the right stripe
- int rightStripe = findStripe(rowNumber);
- if (rightStripe != currentStripe) {
- currentStripe = rightStripe;
- readStripe();
- }
- readRowIndex(currentStripe, included, sargApp == null ? null : sargApp.sargColumns);
-
- // if we aren't to the right row yet, advance in the stripe.
- advanceToNextRow(reader, rowNumber, true);
- }
-
- private static final String TRANSLATED_SARG_SEPARATOR = "_";
- public static String encodeTranslatedSargColumn(int rootColumn, Integer indexInSourceTable) {
- return rootColumn + TRANSLATED_SARG_SEPARATOR
- + ((indexInSourceTable == null) ? -1 : indexInSourceTable);
- }
-
- public static int[] mapTranslatedSargColumns(
- List<OrcProto.Type> types, List<PredicateLeaf> sargLeaves) {
- int[] result = new int[sargLeaves.size()];
- OrcProto.Type lastRoot = null; // Root will be the same for everyone as of now.
- String lastRootStr = null;
- for (int i = 0; i < result.length; ++i) {
- String[] rootAndIndex = sargLeaves.get(i).getColumnName().split(TRANSLATED_SARG_SEPARATOR);
- assert rootAndIndex.length == 2;
- String rootStr = rootAndIndex[0], indexStr = rootAndIndex[1];
- int index = Integer.parseInt(indexStr);
- // First, check if the column even maps to anything.
- if (index == -1) {
- result[i] = -1;
- continue;
- }
- assert index >= 0;
- // Then, find the root type if needed.
- if (!rootStr.equals(lastRootStr)) {
- lastRoot = types.get(Integer.parseInt(rootStr));
- lastRootStr = rootStr;
- }
- // Subtypes of the root type correspond, in order, to the columns in the table schema
- // (disregarding schema evolution that doesn't presently work). Get the index for the
- // corresponding subtype.
- result[i] = lastRoot.getSubtypes(index);
- }
- return result;
- }
-}
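
The deleted file above encodes "translated" SARG column names as the root column id and the
column's position in the source table joined by an underscore, and later splits them back apart
in mapTranslatedSargColumns. A minimal, self-contained sketch of that round trip follows; the
class and method names are illustrative only and are not part of the original file.

    public class TranslatedSargColumnDemo {
      private static final String SEPARATOR = "_";

      // Mirrors encodeTranslatedSargColumn: -1 marks a column that maps to nothing,
      // e.g. a partition column.
      static String encode(int rootColumn, Integer indexInSourceTable) {
        return rootColumn + SEPARATOR + (indexInSourceTable == null ? -1 : indexInSourceTable);
      }

      public static void main(String[] args) {
        String encoded = encode(0, 3);              // "0_3"
        String[] parts = encoded.split(SEPARATOR);  // the split done by mapTranslatedSargColumns
        int root = Integer.parseInt(parts[0]);      // 0 -> root struct type
        int index = Integer.parseInt(parts[1]);     // 3 -> subtype index under the root
        System.out.println(root + " / " + index);
        System.out.println(encode(0, null));        // "0_-1": no mapping
      }
    }
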
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java b/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java
deleted file mode 100644
index 6100d50..0000000
--- a/orc/src/java/org/apache/orc/impl/RecordReaderUtils.java
+++ /dev/null
@@ -1,578 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-
-import com.google.common.collect.Lists;
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.io.DiskRange;
-import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;
-import org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper;
-import org.apache.orc.CompressionCodec;
-import org.apache.orc.DataReader;
-import org.apache.orc.OrcProto;
-
-import com.google.common.collect.ComparisonChain;
-import org.apache.orc.StripeInformation;
-
-/**
- * Stateless methods shared between RecordReaderImpl and EncodedReaderImpl.
- */
-public class RecordReaderUtils {
- private static final HadoopShims SHIMS = HadoopShims.Factory.get();
-
- private static class DefaultDataReader implements DataReader {
- private FSDataInputStream file = null;
- private final ByteBufferAllocatorPool pool;
- private HadoopShims.ZeroCopyReaderShim zcr = null;
- private final FileSystem fs;
- private final Path path;
- private final boolean useZeroCopy;
- private final CompressionCodec codec;
- private final int bufferSize;
- private final int typeCount;
-
- private DefaultDataReader(DefaultDataReader other) {
- this.pool = other.pool;
- this.bufferSize = other.bufferSize;
- this.typeCount = other.typeCount;
- this.fs = other.fs;
- this.path = other.path;
- this.useZeroCopy = other.useZeroCopy;
- this.codec = other.codec;
- }
-
- private DefaultDataReader(DataReaderProperties properties) {
- this.fs = properties.getFileSystem();
- this.path = properties.getPath();
- this.useZeroCopy = properties.getZeroCopy();
- this.codec = PhysicalFsWriter.createCodec(properties.getCompression());
- this.bufferSize = properties.getBufferSize();
- this.typeCount = properties.getTypeCount();
- if (useZeroCopy) {
- this.pool = new ByteBufferAllocatorPool();
- } else {
- this.pool = null;
- }
- }
-
- @Override
- public void open() throws IOException {
- this.file = fs.open(path);
- if (useZeroCopy) {
- zcr = RecordReaderUtils.createZeroCopyShim(file, codec, pool);
- } else {
- zcr = null;
- }
- }
-
- @Override
- public OrcIndex readRowIndex(StripeInformation stripe,
- OrcProto.StripeFooter footer,
- boolean[] included,
- OrcProto.RowIndex[] indexes,
- boolean[] sargColumns,
- OrcProto.BloomFilterIndex[] bloomFilterIndices
- ) throws IOException {
- if (file == null) {
- open();
- }
- if (footer == null) {
- footer = readStripeFooter(stripe);
- }
- if (indexes == null) {
- indexes = new OrcProto.RowIndex[typeCount];
- }
- if (bloomFilterIndices == null) {
- bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
- }
- long offset = stripe.getOffset();
- List<OrcProto.Stream> streams = footer.getStreamsList();
- for (int i = 0; i < streams.size(); i++) {
- OrcProto.Stream stream = streams.get(i);
- OrcProto.Stream nextStream = null;
- if (i < streams.size() - 1) {
- nextStream = streams.get(i+1);
- }
- int col = stream.getColumn();
- int len = (int) stream.getLength();
- // row index streams and bloom filters are interleaved; check if the sarg column has a bloom
- // filter and combine the I/O to read the row index and bloom filter for that column together
- if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
- boolean readBloomFilter = false;
- if (sargColumns != null && sargColumns[col] &&
- nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
- len += nextStream.getLength();
- i += 1;
- readBloomFilter = true;
- }
- if ((included == null || included[col]) && indexes[col] == null) {
- byte[] buffer = new byte[len];
- file.readFully(offset, buffer, 0, buffer.length);
- ByteBuffer bb = ByteBuffer.wrap(buffer);
- indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
- Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), stream.getLength(),
- codec, bufferSize));
- if (readBloomFilter) {
- bb.position((int) stream.getLength());
- bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
- "bloom_filter", Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)),
- nextStream.getLength(), codec, bufferSize));
- }
- }
- }
- offset += len;
- }
-
- OrcIndex index = new OrcIndex(indexes, bloomFilterIndices);
- return index;
- }
-
- @Override
- public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException {
- if (file == null) {
- open();
- }
- long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
- int tailLength = (int) stripe.getFooterLength();
-
- // read the footer
- ByteBuffer tailBuf = ByteBuffer.allocate(tailLength);
- file.readFully(offset, tailBuf.array(), tailBuf.arrayOffset(), tailLength);
- return OrcProto.StripeFooter.parseFrom(InStream.createCodedInputStream("footer",
- Lists.<DiskRange>newArrayList(new BufferChunk(tailBuf, 0)),
- tailLength, codec, bufferSize));
- }
-
- @Override
- public DiskRangeList readFileData(
- DiskRangeList range, long baseOffset, boolean doForceDirect) throws IOException {
- return RecordReaderUtils.readDiskRanges(file, zcr, baseOffset, range, doForceDirect);
- }
-
- @Override
- public void close() throws IOException {
- if (pool != null) {
- pool.clear();
- }
- // close both zcr and file
- try (HadoopShims.ZeroCopyReaderShim myZcr = zcr) {
- if (file != null) {
- file.close();
- }
- }
- }
-
- @Override
- public boolean isTrackingDiskRanges() {
- return zcr != null;
- }
-
- @Override
- public void releaseBuffer(ByteBuffer buffer) {
- zcr.releaseBuffer(buffer);
- }
-
- @Override
- public DataReader clone() {
- return new DefaultDataReader(this);
- }
-
- }
-
- public static DataReader createDefaultDataReader(DataReaderProperties properties) {
- return new DefaultDataReader(properties);
- }
-
- public static boolean[] findPresentStreamsByColumn(
- List<OrcProto.Stream> streamList, List<OrcProto.Type> types) {
- boolean[] hasNull = new boolean[types.size()];
- for(OrcProto.Stream stream: streamList) {
- if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.PRESENT)) {
- hasNull[stream.getColumn()] = true;
- }
- }
- return hasNull;
- }
-
- /**
- * Does region A overlap region B? The end points are inclusive on both sides.
- * @param leftA A's left point
- * @param rightA A's right point
- * @param leftB B's left point
- * @param rightB B's right point
- * @return Does region A overlap region B?
- */
- static boolean overlap(long leftA, long rightA, long leftB, long rightB) {
- if (leftA <= leftB) {
- return rightA >= leftB;
- }
- return rightB >= leftA;
- }
-
- public static void addEntireStreamToRanges(
- long offset, long length, CreateHelper list, boolean doMergeBuffers) {
- list.addOrMerge(offset, offset + length, doMergeBuffers, false);
- }
-
- public static void addRgFilteredStreamToRanges(OrcProto.Stream stream,
- boolean[] includedRowGroups, boolean isCompressed, OrcProto.RowIndex index,
- OrcProto.ColumnEncoding encoding, OrcProto.Type type, int compressionSize, boolean hasNull,
- long offset, long length, CreateHelper list, boolean doMergeBuffers) {
- for (int group = 0; group < includedRowGroups.length; ++group) {
- if (!includedRowGroups[group]) continue;
- int posn = getIndexPosition(
- encoding.getKind(), type.getKind(), stream.getKind(), isCompressed, hasNull);
- long start = index.getEntry(group).getPositions(posn);
- final long nextGroupOffset;
- boolean isLast = group == (includedRowGroups.length - 1);
- nextGroupOffset = isLast ? length : index.getEntry(group + 1).getPositions(posn);
-
- start += offset;
- long end = offset + estimateRgEndOffset(
- isCompressed, isLast, nextGroupOffset, length, compressionSize);
- list.addOrMerge(start, end, doMergeBuffers, true);
- }
- }
-
- public static long estimateRgEndOffset(boolean isCompressed, boolean isLast,
- long nextGroupOffset, long streamLength, int bufferSize) {
- // figure out the worst case last location
- // if adjacent groups have the same compressed block offset then stretch the slop
- // by a factor of 2 to safely accommodate the next compression block.
- // One for the current compression block and another for the next compression block.
- long slop = isCompressed ? 2 * (OutStream.HEADER_SIZE + bufferSize) : WORST_UNCOMPRESSED_SLOP;
- return isLast ? streamLength : Math.min(streamLength, nextGroupOffset + slop);
- }
-
- private static final int BYTE_STREAM_POSITIONS = 1;
- private static final int RUN_LENGTH_BYTE_POSITIONS = BYTE_STREAM_POSITIONS + 1;
- private static final int BITFIELD_POSITIONS = RUN_LENGTH_BYTE_POSITIONS + 1;
- private static final int RUN_LENGTH_INT_POSITIONS = BYTE_STREAM_POSITIONS + 1;
-
- /**
- * Get the offset within the column's index positions at which the given
- * stream's positions start.
- * @param columnEncoding the encoding of the column
- * @param columnType the type of the column
- * @param streamType the kind of the stream
- * @param isCompressed is the file compressed
- * @param hasNulls does the column have a PRESENT stream?
- * @return the offset of the stream's positions within each row index entry
- */
- public static int getIndexPosition(OrcProto.ColumnEncoding.Kind columnEncoding,
- OrcProto.Type.Kind columnType,
- OrcProto.Stream.Kind streamType,
- boolean isCompressed,
- boolean hasNulls) {
- if (streamType == OrcProto.Stream.Kind.PRESENT) {
- return 0;
- }
- int compressionValue = isCompressed ? 1 : 0;
- int base = hasNulls ? (BITFIELD_POSITIONS + compressionValue) : 0;
- switch (columnType) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- case FLOAT:
- case DOUBLE:
- case DATE:
- case STRUCT:
- case MAP:
- case LIST:
- case UNION:
- return base;
- case CHAR:
- case VARCHAR:
- case STRING:
- if (columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- columnEncoding == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- return base;
- } else {
- if (streamType == OrcProto.Stream.Kind.DATA) {
- return base;
- } else {
- return base + BYTE_STREAM_POSITIONS + compressionValue;
- }
- }
- case BINARY:
- if (streamType == OrcProto.Stream.Kind.DATA) {
- return base;
- }
- return base + BYTE_STREAM_POSITIONS + compressionValue;
- case DECIMAL:
- if (streamType == OrcProto.Stream.Kind.DATA) {
- return base;
- }
- return base + BYTE_STREAM_POSITIONS + compressionValue;
- case TIMESTAMP:
- if (streamType == OrcProto.Stream.Kind.DATA) {
- return base;
- }
- return base + RUN_LENGTH_INT_POSITIONS + compressionValue;
- default:
- throw new IllegalArgumentException("Unknown type " + columnType);
- }
- }
-
- // for uncompressed streams, what is the most overlap with the following set
- // of rows (long vint literal group).
- static final int WORST_UNCOMPRESSED_SLOP = 2 + 8 * 512;
-
- /**
- * Is this stream part of a dictionary?
- * @return is this part of a dictionary?
- */
- public static boolean isDictionary(OrcProto.Stream.Kind kind,
- OrcProto.ColumnEncoding encoding) {
- assert kind != OrcProto.Stream.Kind.DICTIONARY_COUNT;
- OrcProto.ColumnEncoding.Kind encodingKind = encoding.getKind();
- return kind == OrcProto.Stream.Kind.DICTIONARY_DATA ||
- (kind == OrcProto.Stream.Kind.LENGTH &&
- (encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- encodingKind == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2));
- }
-
- /**
- * Build a string representation of a list of disk ranges.
- * @param range ranges to stringify
- * @return the resulting string
- */
- public static String stringifyDiskRanges(DiskRangeList range) {
- StringBuilder buffer = new StringBuilder();
- buffer.append("[");
- boolean isFirst = true;
- while (range != null) {
- if (!isFirst) {
- buffer.append(", {");
- } else {
- buffer.append("{");
- }
- isFirst = false;
- buffer.append(range.toString());
- buffer.append("}");
- range = range.next;
- }
- buffer.append("]");
- return buffer.toString();
- }
-
- /**
- * Read the list of ranges from the file.
- * @param file the file to read
- * @param base the base of the stripe
- * @param range the disk ranges within the stripe to read
- * @return the bytes read for each disk range, which is the same length as
- * ranges
- * @throws IOException
- */
- static DiskRangeList readDiskRanges(FSDataInputStream file,
- HadoopShims.ZeroCopyReaderShim zcr,
- long base,
- DiskRangeList range,
- boolean doForceDirect) throws IOException {
- if (range == null) return null;
- DiskRangeList prev = range.prev;
- if (prev == null) {
- prev = new MutateHelper(range);
- }
- while (range != null) {
- if (range.hasData()) {
- range = range.next;
- continue;
- }
- int len = (int) (range.getEnd() - range.getOffset());
- long off = range.getOffset();
- if (zcr != null) {
- file.seek(base + off);
- boolean hasReplaced = false;
- while (len > 0) {
- ByteBuffer partial = zcr.readBuffer(len, false);
- BufferChunk bc = new BufferChunk(partial, off);
- if (!hasReplaced) {
- range.replaceSelfWith(bc);
- hasReplaced = true;
- } else {
- range.insertAfter(bc);
- }
- range = bc;
- int read = partial.remaining();
- len -= read;
- off += read;
- }
- } else {
- // Don't use HDFS ByteBuffer API because it has no readFully, and is buggy and pointless.
- byte[] buffer = new byte[len];
- file.readFully((base + off), buffer, 0, buffer.length);
- ByteBuffer bb = null;
- if (doForceDirect) {
- bb = ByteBuffer.allocateDirect(len);
- bb.put(buffer);
- bb.position(0);
- bb.limit(len);
- } else {
- bb = ByteBuffer.wrap(buffer);
- }
- range = range.replaceSelfWith(new BufferChunk(bb, range.getOffset()));
- }
- range = range.next;
- }
- return prev.next;
- }
-
-
- static List<DiskRange> getStreamBuffers(DiskRangeList range, long offset, long length) {
- // This assumes sorted ranges (as do many other parts of the ORC code).
- ArrayList<DiskRange> buffers = new ArrayList<DiskRange>();
- if (length == 0) return buffers;
- long streamEnd = offset + length;
- boolean inRange = false;
- while (range != null) {
- if (!inRange) {
- if (range.getEnd() <= offset) {
- range = range.next;
- continue; // Skip until we are in range.
- }
- inRange = true;
- if (range.getOffset() < offset) {
- // Partial first buffer, add a slice of it.
- buffers.add(range.sliceAndShift(offset, Math.min(streamEnd, range.getEnd()), -offset));
- if (range.getEnd() >= streamEnd) break; // Partial first buffer is also partial last buffer.
- range = range.next;
- continue;
- }
- } else if (range.getOffset() >= streamEnd) {
- break;
- }
- if (range.getEnd() > streamEnd) {
- // Partial last buffer (may also be the first buffer), add a slice of it.
- buffers.add(range.sliceAndShift(range.getOffset(), streamEnd, -offset));
- break;
- }
- // Buffer that belongs entirely to one stream.
- // TODO: ideally we would want to reuse the object and remove it from the list, but we cannot
- // because bufferChunks is also used by clearStreams for zcr. Create a useless dup.
- buffers.add(range.sliceAndShift(range.getOffset(), range.getEnd(), -offset));
- if (range.getEnd() == streamEnd) break;
- range = range.next;
- }
- return buffers;
- }
-
- static HadoopShims.ZeroCopyReaderShim createZeroCopyShim(FSDataInputStream file,
- CompressionCodec codec, ByteBufferAllocatorPool pool) throws IOException {
- if ((codec == null || ((codec instanceof DirectDecompressionCodec)
- && ((DirectDecompressionCodec) codec).isAvailable()))) {
- /* codec is null or is available */
- return SHIMS.getZeroCopyReader(file, pool);
- }
- return null;
- }
-
- // this is an implementation copied from ElasticByteBufferPool in hadoop-2,
- // which lacks a clear()/clean() operation
- public final static class ByteBufferAllocatorPool implements HadoopShims.ByteBufferPoolShim {
- private static final class Key implements Comparable<Key> {
- private final int capacity;
- private final long insertionGeneration;
-
- Key(int capacity, long insertionGeneration) {
- this.capacity = capacity;
- this.insertionGeneration = insertionGeneration;
- }
-
- @Override
- public int compareTo(Key other) {
- return ComparisonChain.start().compare(capacity, other.capacity)
- .compare(insertionGeneration, other.insertionGeneration).result();
- }
-
- @Override
- public boolean equals(Object rhs) {
- if (rhs == null) {
- return false;
- }
- try {
- Key o = (Key) rhs;
- return (compareTo(o) == 0);
- } catch (ClassCastException e) {
- return false;
- }
- }
-
- @Override
- public int hashCode() {
- return new HashCodeBuilder().append(capacity).append(insertionGeneration)
- .toHashCode();
- }
- }
-
- private final TreeMap<Key, ByteBuffer> buffers = new TreeMap<Key, ByteBuffer>();
-
- private final TreeMap<Key, ByteBuffer> directBuffers = new TreeMap<Key, ByteBuffer>();
-
- private long currentGeneration = 0;
-
- private final TreeMap<Key, ByteBuffer> getBufferTree(boolean direct) {
- return direct ? directBuffers : buffers;
- }
-
- public void clear() {
- buffers.clear();
- directBuffers.clear();
- }
-
- @Override
- public ByteBuffer getBuffer(boolean direct, int length) {
- TreeMap<Key, ByteBuffer> tree = getBufferTree(direct);
- Map.Entry<Key, ByteBuffer> entry = tree.ceilingEntry(new Key(length, 0));
- if (entry == null) {
- return direct ? ByteBuffer.allocateDirect(length) : ByteBuffer
- .allocate(length);
- }
- tree.remove(entry.getKey());
- return entry.getValue();
- }
-
- @Override
- public void putBuffer(ByteBuffer buffer) {
- TreeMap<Key, ByteBuffer> tree = getBufferTree(buffer.isDirect());
- while (true) {
- Key key = new Key(buffer.capacity(), currentGeneration++);
- if (!tree.containsKey(key)) {
- tree.put(key, buffer);
- return;
- }
- // Buffers are indexed by (capacity, generation).
- // If our key is not unique on the first try, we try again
- }
- }
- }
-}
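
estimateRgEndOffset above bounds how far past the next row group's start position the reader must
read: two compression blocks of slop for compressed streams, or a fixed worst case for
uncompressed streams. A standalone sketch of that arithmetic follows; HEADER_SIZE is assumed here
to be the 3-byte ORC compression block header, and the harness names are illustrative.

    public class RgEndOffsetDemo {
      static final int HEADER_SIZE = 3;                       // assumed compression header size
      static final int WORST_UNCOMPRESSED_SLOP = 2 + 8 * 512; // worst-case vint literal group

      static long estimateRgEndOffset(boolean isCompressed, boolean isLast,
                                      long nextGroupOffset, long streamLength, int bufferSize) {
        // One block of slop for the current compression block and one for the next.
        long slop = isCompressed ? 2 * (HEADER_SIZE + bufferSize) : WORST_UNCOMPRESSED_SLOP;
        return isLast ? streamLength : Math.min(streamLength, nextGroupOffset + slop);
      }

      public static void main(String[] args) {
        // Compressed stream with a 256K buffer: read up to two blocks past the next group's start.
        System.out.println(estimateRgEndOffset(true, false, 1_000_000, 5_000_000, 262_144));
        // Uncompressed stream: bounded by the fixed worst-case slop.
        System.out.println(estimateRgEndOffset(false, false, 1_000_000, 5_000_000, 262_144));
      }
    }
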
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/RedBlackTree.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RedBlackTree.java b/orc/src/java/org/apache/orc/impl/RedBlackTree.java
deleted file mode 100644
index 41aa4b9..0000000
--- a/orc/src/java/org/apache/orc/impl/RedBlackTree.java
+++ /dev/null
@@ -1,311 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.orc.impl.DynamicIntArray;
-
-/**
- * A memory-efficient red-black tree that does not allocate any objects per
- * element. This class is abstract and assumes that the child class
- * handles the key and comparisons with the key.
- */
-abstract class RedBlackTree {
- public static final int NULL = -1;
-
- // Various values controlling the offset of the data within the array.
- private static final int LEFT_OFFSET = 0;
- private static final int RIGHT_OFFSET = 1;
- private static final int ELEMENT_SIZE = 2;
-
- protected int size = 0;
- private final DynamicIntArray data;
- protected int root = NULL;
- protected int lastAdd = 0;
- private boolean wasAdd = false;
-
- /**
- * Create a set with the given initial capacity.
- */
- public RedBlackTree(int initialCapacity) {
- data = new DynamicIntArray(initialCapacity * ELEMENT_SIZE);
- }
-
- /**
- * Insert a new node into the data array, growing the array as necessary.
- *
- * @return Returns the position of the new node.
- */
- private int insert(int left, int right, boolean isRed) {
- int position = size;
- size += 1;
- setLeft(position, left, isRed);
- setRight(position, right);
- return position;
- }
-
- /**
- * Compare the value at the given position to the new value.
- * @return 0 if the values are the same, -1 if the new value is smaller and
- * 1 if the new value is larger.
- */
- protected abstract int compareValue(int position);
-
- /**
- * Is the given node red as opposed to black? To prevent having an extra word
- * in the data array, we just use the low bit of the left child index.
- */
- protected boolean isRed(int position) {
- return position != NULL &&
- (data.get(position * ELEMENT_SIZE + LEFT_OFFSET) & 1) == 1;
- }
-
- /**
- * Set the red bit true or false.
- */
- private void setRed(int position, boolean isRed) {
- int offset = position * ELEMENT_SIZE + LEFT_OFFSET;
- if (isRed) {
- data.set(offset, data.get(offset) | 1);
- } else {
- data.set(offset, data.get(offset) & ~1);
- }
- }
-
- /**
- * Get the left field of the given position.
- */
- protected int getLeft(int position) {
- return data.get(position * ELEMENT_SIZE + LEFT_OFFSET) >> 1;
- }
-
- /**
- * Get the right field of the given position.
- */
- protected int getRight(int position) {
- return data.get(position * ELEMENT_SIZE + RIGHT_OFFSET);
- }
-
- /**
- * Set the left field of the given position.
- * Note that we are storing the node color in the low bit of the left pointer.
- */
- private void setLeft(int position, int left) {
- int offset = position * ELEMENT_SIZE + LEFT_OFFSET;
- data.set(offset, (left << 1) | (data.get(offset) & 1));
- }
-
- /**
- * Set the left field of the given position.
- * Note that we are storing the node color in the low bit of the left pointer.
- */
- private void setLeft(int position, int left, boolean isRed) {
- int offset = position * ELEMENT_SIZE + LEFT_OFFSET;
- data.set(offset, (left << 1) | (isRed ? 1 : 0));
- }
-
- /**
- * Set the right field of the given position.
- */
- private void setRight(int position, int right) {
- data.set(position * ELEMENT_SIZE + RIGHT_OFFSET, right);
- }
-
- /**
- * Insert or find a given key in the tree and rebalance the tree correctly.
- * Rebalancing restores the red-black aspect of the tree to maintain the
- * invariants:
- * 1. If a node is red, both of its children are black.
- * 2. Each child of a node has the same black height (the number of black
- * nodes between it and the leaves of the tree).
- *
- * Inserted nodes are at the leaves and are red, therefore there is at most a
- * violation of rule 1 at the node we just put in. Instead of always keeping
- * the parents, this routine passes the context down.
- *
- * The fix is broken down into 6 cases (1.{1,2,3} and 2.{1,2,3} that are
- * left-right mirror images of each other). See Algorithms by Cormen,
- * Leiserson, and Rivest for the explanation of the subcases.
- *
- * @param node The node that we are fixing right now.
- * @param fromLeft Did we come down from the left?
- * @param parent Nodes' parent
- * @param grandparent Parent's parent
- * @param greatGrandparent Grandparent's parent
- * @return Does parent also need to be checked and/or fixed?
- */
- private boolean add(int node, boolean fromLeft, int parent,
- int grandparent, int greatGrandparent) {
- if (node == NULL) {
- if (root == NULL) {
- lastAdd = insert(NULL, NULL, false);
- root = lastAdd;
- wasAdd = true;
- return false;
- } else {
- lastAdd = insert(NULL, NULL, true);
- node = lastAdd;
- wasAdd = true;
- // connect the new node into the tree
- if (fromLeft) {
- setLeft(parent, node);
- } else {
- setRight(parent, node);
- }
- }
- } else {
- int compare = compareValue(node);
- boolean keepGoing;
-
- // Recurse down to find where the node needs to be added
- if (compare < 0) {
- keepGoing = add(getLeft(node), true, node, parent, grandparent);
- } else if (compare > 0) {
- keepGoing = add(getRight(node), false, node, parent, grandparent);
- } else {
- lastAdd = node;
- wasAdd = false;
- return false;
- }
-
- // we don't need to fix the root (because it is always set to black)
- if (node == root || !keepGoing) {
- return false;
- }
- }
-
-
- // Do we need to fix this node? Only if there are two reds right under each
- // other.
- if (isRed(node) && isRed(parent)) {
- if (parent == getLeft(grandparent)) {
- int uncle = getRight(grandparent);
- if (isRed(uncle)) {
- // case 1.1
- setRed(parent, false);
- setRed(uncle, false);
- setRed(grandparent, true);
- return true;
- } else {
- if (node == getRight(parent)) {
- // case 1.2
- // swap node and parent
- int tmp = node;
- node = parent;
- parent = tmp;
- // left-rotate on node
- setLeft(grandparent, parent);
- setRight(node, getLeft(parent));
- setLeft(parent, node);
- }
-
- // case 1.2 and 1.3
- setRed(parent, false);
- setRed(grandparent, true);
-
- // right-rotate on grandparent
- if (greatGrandparent == NULL) {
- root = parent;
- } else if (getLeft(greatGrandparent) == grandparent) {
- setLeft(greatGrandparent, parent);
- } else {
- setRight(greatGrandparent, parent);
- }
- setLeft(grandparent, getRight(parent));
- setRight(parent, grandparent);
- return false;
- }
- } else {
- int uncle = getLeft(grandparent);
- if (isRed(uncle)) {
- // case 2.1
- setRed(parent, false);
- setRed(uncle, false);
- setRed(grandparent, true);
- return true;
- } else {
- if (node == getLeft(parent)) {
- // case 2.2
- // swap node and parent
- int tmp = node;
- node = parent;
- parent = tmp;
- // right-rotate on node
- setRight(grandparent, parent);
- setLeft(node, getRight(parent));
- setRight(parent, node);
- }
- // case 2.2 and 2.3
- setRed(parent, false);
- setRed(grandparent, true);
- // left-rotate on grandparent
- if (greatGrandparent == NULL) {
- root = parent;
- } else if (getRight(greatGrandparent) == grandparent) {
- setRight(greatGrandparent, parent);
- } else {
- setLeft(greatGrandparent, parent);
- }
- setRight(grandparent, getLeft(parent));
- setLeft(parent, grandparent);
- return false;
- }
- }
- } else {
- return true;
- }
- }
-
- /**
- * Add the new key to the tree.
- * @return true if the element is a new one.
- */
- protected boolean add() {
- add(root, false, NULL, NULL, NULL);
- if (wasAdd) {
- setRed(root, false);
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Get the number of elements in the set.
- */
- public int size() {
- return size;
- }
-
- /**
- * Reset the table to empty.
- */
- public void clear() {
- root = NULL;
- size = 0;
- data.clear();
- }
-
- /**
- * Get the buffer size in bytes.
- */
- public long getSizeInBytes() {
- return data.getSizeInBytes();
- }
-}
-
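
RedBlackTree above keeps each node as two ints in a flat array and steals the low bit of the
left-child slot for the node color. A tiny standalone sketch of that packing (the names below are
illustrative, not part of the original file):

    public class PackedNodeDemo {
      public static void main(String[] args) {
        int left = 42;
        boolean red = true;

        int packed = (left << 1) | (red ? 1 : 0);  // what setLeft(position, left, isRed) stores

        int unpackedLeft = packed >> 1;            // what getLeft(position) recovers
        boolean isRed = (packed & 1) == 1;         // what isRed(position) tests

        System.out.println(unpackedLeft + " red=" + isRed);  // prints "42 red=true"
      }
    }
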
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/RunLengthByteReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RunLengthByteReader.java b/orc/src/java/org/apache/orc/impl/RunLengthByteReader.java
deleted file mode 100644
index 24bd051..0000000
--- a/orc/src/java/org/apache/orc/impl/RunLengthByteReader.java
+++ /dev/null
@@ -1,174 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.EOFException;
-import java.io.IOException;
-
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-
-/**
- * A reader that reads a sequence of bytes. A control byte is read before
- * each run with positive values 0 to 127 meaning 3 to 130 repetitions. If the
- * byte is -1 to -128, 1 to 128 literal byte values follow.
- */
-public class RunLengthByteReader {
- private InStream input;
- private final byte[] literals =
- new byte[RunLengthByteWriter.MAX_LITERAL_SIZE];
- private int numLiterals = 0;
- private int used = 0;
- private boolean repeat = false;
-
- public RunLengthByteReader(InStream input) throws IOException {
- this.input = input;
- }
-
- public void setInStream(InStream input) {
- this.input = input;
- }
-
- private void readValues(boolean ignoreEof) throws IOException {
- int control = input.read();
- used = 0;
- if (control == -1) {
- if (!ignoreEof) {
- throw new EOFException("Read past end of buffer RLE byte from " + input);
- }
- used = numLiterals = 0;
- return;
- } else if (control < 0x80) {
- repeat = true;
- numLiterals = control + RunLengthByteWriter.MIN_REPEAT_SIZE;
- int val = input.read();
- if (val == -1) {
- throw new EOFException("Reading RLE byte got EOF");
- }
- literals[0] = (byte) val;
- } else {
- repeat = false;
- numLiterals = 0x100 - control;
- int bytes = 0;
- while (bytes < numLiterals) {
- int result = input.read(literals, bytes, numLiterals - bytes);
- if (result == -1) {
- throw new EOFException("Reading RLE byte literal got EOF in " + this);
- }
- bytes += result;
- }
- }
- }
-
- public boolean hasNext() throws IOException {
- return used != numLiterals || input.available() > 0;
- }
-
- public byte next() throws IOException {
- byte result;
- if (used == numLiterals) {
- readValues(false);
- }
- if (repeat) {
- result = literals[0];
- } else {
- result = literals[used];
- }
- ++used;
- return result;
- }
-
- public void nextVector(ColumnVector previous, long[] data, long size)
- throws IOException {
- previous.isRepeating = true;
- for (int i = 0; i < size; i++) {
- if (!previous.isNull[i]) {
- data[i] = next();
- } else {
- // The default value of null for int types in vectorized
- // processing is 1, so set that if the value is null
- data[i] = 1;
- }
-
- // The default value for nulls in Vectorization for int types is 1,
- // and given that a non-null value can also be 1, we need to check isNull as well
- // when determining the isRepeating flag.
- if (previous.isRepeating
- && i > 0
- && ((data[0] != data[i]) ||
- (previous.isNull[0] != previous.isNull[i]))) {
- previous.isRepeating = false;
- }
- }
- }
-
- /**
- * Read the next size bytes into the data array, skipping over any slots
- * where isNull is true.
- * @param isNull if non-null, skip any rows where isNull[r] is true
- * @param data the array to read into
- * @param size the number of elements to read
- * @throws IOException
- */
- public void nextVector(boolean[] isNull, int[] data,
- long size) throws IOException {
- if (isNull == null) {
- for(int i=0; i < size; ++i) {
- data[i] = next();
- }
- } else {
- for(int i=0; i < size; ++i) {
- if (!isNull[i]) {
- data[i] = next();
- }
- }
- }
- }
-
- public void seek(PositionProvider index) throws IOException {
- input.seek(index);
- int consumed = (int) index.getNext();
- if (consumed != 0) {
- // a loop is required for cases where we break the run into two parts
- while (consumed > 0) {
- readValues(false);
- used = consumed;
- consumed -= numLiterals;
- }
- } else {
- used = 0;
- numLiterals = 0;
- }
- }
-
- public void skip(long items) throws IOException {
- while (items > 0) {
- if (used == numLiterals) {
- readValues(false);
- }
- long consume = Math.min(items, numLiterals - used);
- used += consume;
- items -= consume;
- }
- }
-
- @Override
- public String toString() {
- return "byte rle " + (repeat ? "repeat" : "literal") + " used: " +
- used + "/" + numLiterals + " from " + input;
- }
-}
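
The class comment in RunLengthByteReader above describes the byte RLE format: a control byte of
0 to 127 means a run of (control + 3) copies of the next byte, while a control byte of 128 to 255
(-1 to -128 signed) means (256 - control) literal bytes follow. Below is a minimal decoder sketch
of just that control-byte rule, with an illustrative harness that is not part of the original file.

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    public class ByteRleDemo {
      static int decode(InputStream in, byte[] out) throws IOException {
        int n = 0, control;
        while ((control = in.read()) != -1) {
          if (control < 0x80) {                    // repeat run: control + 3 copies of one byte
            int count = control + 3;
            byte value = (byte) in.read();
            for (int i = 0; i < count; i++) out[n++] = value;
          } else {                                 // literal run: 256 - control raw bytes
            int count = 0x100 - control;
            for (int i = 0; i < count; i++) out[n++] = (byte) in.read();
          }
        }
        return n;
      }

      public static void main(String[] args) throws IOException {
        byte[] encoded = {2, 7, (byte) 0xFE, 1, 2}; // five 7s, then literals 1 and 2
        byte[] out = new byte[16];
        int n = decode(new ByteArrayInputStream(encoded), out);
        for (int i = 0; i < n; i++) System.out.print(out[i] + " "); // 7 7 7 7 7 1 2
      }
    }
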
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestStringDictionary.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestStringDictionary.java b/orc/src/test/org/apache/hive/orc/TestStringDictionary.java
new file mode 100644
index 0000000..6e24819
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestStringDictionary.java
@@ -0,0 +1,291 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import org.apache.hive.orc.impl.RecordReaderImpl;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+public class TestStringDictionary {
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
+ + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testTooManyDistinct() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector col = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ col.setVal(batch.size++, String.valueOf(i).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ col = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(idx++), col.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+ // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ Assert.assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testHalfDistinct() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(123);
+ int[] input = new int[20000];
+ for (int i = 0; i < 20000; i++) {
+ input[i] = rand.nextInt(10000);
+ }
+
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector col = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ col.setVal(batch.size++, String.valueOf(input[i]).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ col = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(input[idx++]), col.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+ // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ Assert.assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testTooManyDistinctCheckDisabled() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), false);
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector string = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ string.setVal(batch.size++, String.valueOf(i).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ string = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(idx++), string.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+ // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ Assert.assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testHalfDistinctCheckDisabled() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(),
+ false);
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(123);
+ int[] input = new int[20000];
+ for (int i = 0; i < 20000; i++) {
+ input[i] = rand.nextInt(10000);
+ }
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector string = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ string.setVal(batch.size++, String.valueOf(input[i]).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ string = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(input[idx++]), string.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky, but it does the job; this cast will work as long as this test resides
+      // within the same package as the ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ Assert.assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
+ }
+ }
+ }
+
+ @Test
+ public void testTooManyDistinctV11AlwaysDictionary() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .version(OrcFile.Version.V_0_11).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector string = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ string.setVal(batch.size++, String.valueOf(i).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ batch = reader.getSchema().createRowBatch();
+ string = (BytesColumnVector) batch.cols[0];
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(idx++), string.toString(r));
+ }
+ }
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+      // hacky, but it does the job; this cast will work as long as this test resides
+      // within the same package as the ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ Assert.assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY, encoding.getKind());
+ }
+ }
+
+ }
+
+}
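Taken together, the tests above pin down the observable effect of OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK: with the per-stride check disabled, a column of 20,000 distinct values still ends up encoded as DIRECT_V2, a column drawn from roughly 10,000 repeated values stays DICTIONARY_V2, and a file written as version 0.11 always uses DICTIONARY regardless. A minimal configuration sketch follows, using only calls that already appear in these tests; the output path and schema are placeholders, and the row-batch population is elided.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hive.orc.CompressionKind;
    import org.apache.hive.orc.OrcConf;
    import org.apache.hive.orc.OrcFile;
    import org.apache.hive.orc.TypeDescription;
    import org.apache.hive.orc.Writer;

    public class DictionaryCheckSketch {
      public static void main(String[] args) throws Exception {
        // Disable the per-row-index-stride dictionary check before creating the writer.
        Configuration conf = new Configuration();
        conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), false);
        TypeDescription schema = TypeDescription.createString();
        Writer writer = OrcFile.createWriter(new Path("/tmp/dictionary-check-example.orc"),
            OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE));
        // ... add row batches exactly as the tests above do, then close.
        writer.close();
      }
    }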
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestTypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestTypeDescription.java b/orc/src/test/org/apache/hive/orc/TestTypeDescription.java
new file mode 100644
index 0000000..1887209
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestTypeDescription.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+public class TestTypeDescription {
+
+ @Test
+ public void testJson() {
+ TypeDescription bin = TypeDescription.createBinary();
+ assertEquals("{\"category\": \"binary\", \"id\": 0, \"max\": 0}",
+ bin.toJson());
+ assertEquals("binary", bin.toString());
+ TypeDescription struct = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal());
+ assertEquals("struct<f1:int,f2:string,f3:decimal(38,10)>",
+ struct.toString());
+ assertEquals("{\"category\": \"struct\", \"id\": 0, \"max\": 3, \"fields\": [\n"
+ + " \"f1\": {\"category\": \"int\", \"id\": 1, \"max\": 1},\n"
+ + " \"f2\": {\"category\": \"string\", \"id\": 2, \"max\": 2},\n"
+ + " \"f3\": {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 38, \"scale\": 10}]}",
+ struct.toJson());
+ struct = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createUnion()
+ .addUnionChild(TypeDescription.createByte())
+ .addUnionChild(TypeDescription.createDecimal()
+ .withPrecision(20).withScale(10)))
+ .addField("f2", TypeDescription.createStruct()
+ .addField("f3", TypeDescription.createDate())
+ .addField("f4", TypeDescription.createDouble())
+ .addField("f5", TypeDescription.createBoolean()))
+ .addField("f6", TypeDescription.createChar().withMaxLength(100));
+ assertEquals("struct<f1:uniontype<tinyint,decimal(20,10)>,f2:struct<f3:date,f4:double,f5:boolean>,f6:char(100)>",
+ struct.toString());
+ assertEquals(
+ "{\"category\": \"struct\", \"id\": 0, \"max\": 8, \"fields\": [\n" +
+ " \"f1\": {\"category\": \"uniontype\", \"id\": 1, \"max\": 3, \"children\": [\n" +
+ " {\"category\": \"tinyint\", \"id\": 2, \"max\": 2},\n" +
+ " {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 20, \"scale\": 10}]},\n" +
+ " \"f2\": {\"category\": \"struct\", \"id\": 4, \"max\": 7, \"fields\": [\n" +
+ " \"f3\": {\"category\": \"date\", \"id\": 5, \"max\": 5},\n" +
+ " \"f4\": {\"category\": \"double\", \"id\": 6, \"max\": 6},\n" +
+ " \"f5\": {\"category\": \"boolean\", \"id\": 7, \"max\": 7}]},\n" +
+ " \"f6\": {\"category\": \"char\", \"id\": 8, \"max\": 8, \"length\": 100}]}",
+ struct.toJson());
+ }
+
+ @Test
+ public void testEquals() {
+ TypeDescription type1 =
+ TypeDescription.createStruct()
+ .addField("a", TypeDescription.createInt())
+ .addField("b", TypeDescription.createStruct()
+ .addField("x", TypeDescription.createString())
+ .addField("y", TypeDescription.createBinary())
+ .addField("z", TypeDescription.createDouble()))
+ .addField("c", TypeDescription.createString());
+ assertEquals(0, type1.getId());
+ assertEquals(6, type1.getMaximumId());
+ TypeDescription type2 =
+ TypeDescription.createStruct()
+ .addField("x", TypeDescription.createString())
+ .addField("y", TypeDescription.createBinary())
+ .addField("z", TypeDescription.createDouble());
+ assertEquals(0, type2.getId());
+ assertEquals(3, type2.getMaximumId());
+ assertEquals(type2, type1.getChildren().get(1));
+ assertEquals(type2.hashCode(), type1.getChildren().get(1).hashCode());
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestUnrolledBitPack.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestUnrolledBitPack.java b/orc/src/test/org/apache/hive/orc/TestUnrolledBitPack.java
new file mode 100644
index 0000000..a7a9de4
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestUnrolledBitPack.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import com.google.common.collect.Lists;
+import com.google.common.primitives.Longs;
+
+@RunWith(value = Parameterized.class)
+public class TestUnrolledBitPack {
+
+ private long val;
+
+ public TestUnrolledBitPack(long val) {
+ this.val = val;
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { -1 }, { 1 }, { 7 }, { -128 }, { 32000 }, { 8300000 },
+ { Integer.MAX_VALUE }, { 540000000000L }, { 140000000000000L }, { 36000000000000000L },
+ { Long.MAX_VALUE } };
+ return Arrays.asList(data);
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
+ + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testBitPacking() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0,
+ val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val,
+ 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0,
+ 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0,
+ val, 0, val, 0, 0, val, 0, val, 0, 0, val, val };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ int row = batch.size++;
+ ((LongColumnVector) batch.cols[0]).vector[row] = l;
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+}
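A brief aside on the parameter values exercised above: ORC's run-length integer encoding packs each run of values at a fixed bit width, and the "unrolled" bit-pack reader tested here decodes those widths with unrolled loops, which is why the parameters range from small constants up to Long.MAX_VALUE. The standalone sketch below only illustrates the width calculation for non-negative values; bitsRequired is a hypothetical helper, not an ORC API, and negative inputs such as -1 and -128 are first mapped to non-negative codes by zigzag encoding (see the sketch after TestSerializationUtils below).

    public class BitWidthSketch {
      // Smallest number of bits needed to hold a non-negative value.
      static int bitsRequired(long value) {
        return value == 0 ? 1 : 64 - Long.numberOfLeadingZeros(value);
      }

      public static void main(String[] args) {
        long[] samples = {1L, 7L, 32000L, 8300000L, Integer.MAX_VALUE, 540000000000L, Long.MAX_VALUE};
        for (long v : samples) {
          System.out.println(v + " fits in " + bitsRequired(v) + " bits");
        }
      }
    }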
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestRunLengthIntegerReader.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestRunLengthIntegerReader.java b/orc/src/test/org/apache/hive/orc/impl/TestRunLengthIntegerReader.java
new file mode 100644
index 0000000..8a6337d
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestRunLengthIntegerReader.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.junit.Test;
+
+public class TestRunLengthIntegerReader {
+
+ public void runSeekTest(CompressionCodec codec) throws Exception {
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ RunLengthIntegerWriter out = new RunLengthIntegerWriter(
+ new OutStream("test", 1000, codec, collect), true);
+ TestInStream.PositionCollector[] positions =
+ new TestInStream.PositionCollector[4096];
+ Random random = new Random(99);
+ int[] junk = new int[2048];
+ for(int i=0; i < junk.length; ++i) {
+ junk[i] = random.nextInt();
+ }
+ for(int i=0; i < 4096; ++i) {
+ positions[i] = new TestInStream.PositionCollector();
+ out.getPosition(positions[i]);
+ // test runs, incrementing runs, non-runs
+ if (i < 1024) {
+ out.write(i/4);
+ } else if (i < 2048) {
+ out.write(2*i);
+ } else {
+ out.write(junk[i-2048]);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ RunLengthIntegerReader in = new RunLengthIntegerReader(InStream.create
+ ("test", new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(),
+ codec, 1000), true);
+ for(int i=0; i < 2048; ++i) {
+ int x = (int) in.next();
+ if (i < 1024) {
+ assertEquals(i/4, x);
+ } else if (i < 2048) {
+ assertEquals(2*i, x);
+ } else {
+ assertEquals(junk[i-2048], x);
+ }
+ }
+ for(int i=2047; i >= 0; --i) {
+ in.seek(positions[i]);
+ int x = (int) in.next();
+ if (i < 1024) {
+ assertEquals(i/4, x);
+ } else if (i < 2048) {
+ assertEquals(2*i, x);
+ } else {
+ assertEquals(junk[i-2048], x);
+ }
+ }
+ }
+
+ @Test
+ public void testUncompressedSeek() throws Exception {
+ runSeekTest(null);
+ }
+
+ @Test
+ public void testCompressedSeek() throws Exception {
+ runSeekTest(new ZlibCodec());
+ }
+
+ @Test
+ public void testSkips() throws Exception {
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ RunLengthIntegerWriter out = new RunLengthIntegerWriter(
+ new OutStream("test", 100, null, collect), true);
+ for(int i=0; i < 2048; ++i) {
+ if (i < 1024) {
+ out.write(i);
+ } else {
+ out.write(256 * i);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ RunLengthIntegerReader in = new RunLengthIntegerReader(InStream.create
+ ("test", new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(),
+ null, 100), true);
+ for(int i=0; i < 2048; i += 10) {
+ int x = (int) in.next();
+ if (i < 1024) {
+ assertEquals(i, x);
+ } else {
+ assertEquals(256 * i, x);
+ }
+ if (i < 2038) {
+ in.skip(9);
+ }
+ in.skip(0);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestSchemaEvolution.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestSchemaEvolution.java b/orc/src/test/org/apache/hive/orc/impl/TestSchemaEvolution.java
new file mode 100644
index 0000000..cc963c8
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestSchemaEvolution.java
@@ -0,0 +1,480 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.RecordReader;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.Writer;
+import org.apache.hive.orc.Reader;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+public class TestSchemaEvolution {
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ Configuration conf;
+ Path testFilePath;
+ FileSystem fs;
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ @Before
+ public void setup() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testDataTypeConversion1() throws IOException {
+ TypeDescription fileStruct1 = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
+ SchemaEvolution same1 = new SchemaEvolution(fileStruct1, null);
+ assertFalse(same1.hasConversion());
+ TypeDescription readerStruct1 = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
+ SchemaEvolution both1 = new SchemaEvolution(fileStruct1, readerStruct1, null);
+ assertFalse(both1.hasConversion());
+ TypeDescription readerStruct1diff = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createLong())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
+ SchemaEvolution both1diff = new SchemaEvolution(fileStruct1, readerStruct1diff, null);
+ assertTrue(both1diff.hasConversion());
+ TypeDescription readerStruct1diffPrecision = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal().withPrecision(12).withScale(10));
+ SchemaEvolution both1diffPrecision = new SchemaEvolution(fileStruct1, readerStruct1diffPrecision, null);
+ assertTrue(both1diffPrecision.hasConversion());
+ }
+
+ @Test
+ public void testDataTypeConversion2() throws IOException {
+ TypeDescription fileStruct2 = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createUnion()
+ .addUnionChild(TypeDescription.createByte())
+ .addUnionChild(TypeDescription.createDecimal()
+ .withPrecision(20).withScale(10)))
+ .addField("f2", TypeDescription.createStruct()
+ .addField("f3", TypeDescription.createDate())
+ .addField("f4", TypeDescription.createDouble())
+ .addField("f5", TypeDescription.createBoolean()))
+ .addField("f6", TypeDescription.createChar().withMaxLength(100));
+ SchemaEvolution same2 = new SchemaEvolution(fileStruct2, null);
+ assertFalse(same2.hasConversion());
+ TypeDescription readerStruct2 = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createUnion()
+ .addUnionChild(TypeDescription.createByte())
+ .addUnionChild(TypeDescription.createDecimal()
+ .withPrecision(20).withScale(10)))
+ .addField("f2", TypeDescription.createStruct()
+ .addField("f3", TypeDescription.createDate())
+ .addField("f4", TypeDescription.createDouble())
+ .addField("f5", TypeDescription.createBoolean()))
+ .addField("f6", TypeDescription.createChar().withMaxLength(100));
+ SchemaEvolution both2 = new SchemaEvolution(fileStruct2, readerStruct2, null);
+ assertFalse(both2.hasConversion());
+ TypeDescription readerStruct2diff = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createUnion()
+ .addUnionChild(TypeDescription.createByte())
+ .addUnionChild(TypeDescription.createDecimal()
+ .withPrecision(20).withScale(10)))
+ .addField("f2", TypeDescription.createStruct()
+ .addField("f3", TypeDescription.createDate())
+ .addField("f4", TypeDescription.createDouble())
+ .addField("f5", TypeDescription.createByte()))
+ .addField("f6", TypeDescription.createChar().withMaxLength(100));
+ SchemaEvolution both2diff = new SchemaEvolution(fileStruct2, readerStruct2diff, null);
+ assertTrue(both2diff.hasConversion());
+ TypeDescription readerStruct2diffChar = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createUnion()
+ .addUnionChild(TypeDescription.createByte())
+ .addUnionChild(TypeDescription.createDecimal()
+ .withPrecision(20).withScale(10)))
+ .addField("f2", TypeDescription.createStruct()
+ .addField("f3", TypeDescription.createDate())
+ .addField("f4", TypeDescription.createDouble())
+ .addField("f5", TypeDescription.createBoolean()))
+ .addField("f6", TypeDescription.createChar().withMaxLength(80));
+ SchemaEvolution both2diffChar = new SchemaEvolution(fileStruct2, readerStruct2diffChar, null);
+ assertTrue(both2diffChar.hasConversion());
+ }
+
+ @Test
+ public void testFloatToDoubleEvolution() throws Exception {
+ testFilePath = new Path(workDir, "TestOrcFile." +
+ testCaseName.getMethodName() + ".orc");
+ TypeDescription schema = TypeDescription.createFloat();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024);
+ DoubleColumnVector dcv = new DoubleColumnVector(1024);
+ batch.cols[0] = dcv;
+ batch.reset();
+ batch.size = 1;
+ dcv.vector[0] = 74.72f;
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ TypeDescription schemaOnRead = TypeDescription.createDouble();
+ RecordReader rows = reader.rows(new Reader.Options().schema(schemaOnRead));
+ batch = schemaOnRead.createRowBatch();
+ rows.nextBatch(batch);
+ assertEquals(74.72, ((DoubleColumnVector) batch.cols[0]).vector[0], 0.00000000001);
+ rows.close();
+ }
+
+ @Test
+ public void testSafePpdEvaluation() throws IOException {
+ TypeDescription fileStruct1 = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
+ SchemaEvolution same1 = new SchemaEvolution(fileStruct1, null);
+ assertTrue(same1.isPPDSafeConversion(0));
+ assertFalse(same1.hasConversion());
+ TypeDescription readerStruct1 = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
+ SchemaEvolution both1 = new SchemaEvolution(fileStruct1, readerStruct1, null);
+ assertFalse(both1.hasConversion());
+ assertTrue(both1.isPPDSafeConversion(0));
+ assertTrue(both1.isPPDSafeConversion(1));
+ assertTrue(both1.isPPDSafeConversion(2));
+ assertTrue(both1.isPPDSafeConversion(3));
+
+ // int -> long
+ TypeDescription readerStruct1diff = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createLong())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10));
+ SchemaEvolution both1diff = new SchemaEvolution(fileStruct1, readerStruct1diff, null);
+ assertTrue(both1diff.hasConversion());
+ assertFalse(both1diff.isPPDSafeConversion(0));
+ assertTrue(both1diff.isPPDSafeConversion(1));
+ assertTrue(both1diff.isPPDSafeConversion(2));
+ assertTrue(both1diff.isPPDSafeConversion(3));
+
+ // decimal(38,10) -> decimal(12, 10)
+ TypeDescription readerStruct1diffPrecision = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal().withPrecision(12).withScale(10));
+ SchemaEvolution both1diffPrecision = new SchemaEvolution(fileStruct1, readerStruct1diffPrecision,
+ new boolean[] {true, false, false, true});
+ assertTrue(both1diffPrecision.hasConversion());
+ assertFalse(both1diffPrecision.isPPDSafeConversion(0));
+ assertFalse(both1diffPrecision.isPPDSafeConversion(1)); // column not included
+ assertFalse(both1diffPrecision.isPPDSafeConversion(2)); // column not included
+ assertFalse(both1diffPrecision.isPPDSafeConversion(3));
+
+ // add columns
+ readerStruct1 = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal().withPrecision(38).withScale(10))
+ .addField("f4", TypeDescription.createBoolean());
+ both1 = new SchemaEvolution(fileStruct1, readerStruct1, null);
+ assertTrue(both1.hasConversion());
+ assertFalse(both1.isPPDSafeConversion(0));
+ assertTrue(both1.isPPDSafeConversion(1));
+ assertTrue(both1.isPPDSafeConversion(2));
+ assertTrue(both1.isPPDSafeConversion(3));
+ assertFalse(both1.isPPDSafeConversion(4));
+ }
+
+ @Test
+ public void testSafePpdEvaluationForInts() throws IOException {
+ // byte -> short -> int -> long
+ TypeDescription fileSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createByte());
+ SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema, null);
+ assertFalse(schemaEvolution.hasConversion());
+
+ // byte -> short
+ TypeDescription readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createShort());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertTrue(schemaEvolution.isPPDSafeConversion(1));
+
+ // byte -> int
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertTrue(schemaEvolution.isPPDSafeConversion(1));
+
+ // byte -> long
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createLong());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertTrue(schemaEvolution.isPPDSafeConversion(1));
+
+ // short -> int -> long
+ fileSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createShort());
+ schemaEvolution = new SchemaEvolution(fileSchema, null);
+ assertFalse(schemaEvolution.hasConversion());
+
+ // unsafe conversion short -> byte
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createByte());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // short -> int
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertTrue(schemaEvolution.isPPDSafeConversion(1));
+
+ // short -> long
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createLong());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertTrue(schemaEvolution.isPPDSafeConversion(1));
+
+ // int -> long
+ fileSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt());
+ schemaEvolution = new SchemaEvolution(fileSchema, null);
+ assertFalse(schemaEvolution.hasConversion());
+
+ // unsafe conversion int -> byte
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createByte());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // unsafe conversion int -> short
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createShort());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // int -> long
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createLong());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertTrue(schemaEvolution.isPPDSafeConversion(1));
+
+ // long
+ fileSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createLong());
+ schemaEvolution = new SchemaEvolution(fileSchema, null);
+ assertTrue(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.hasConversion());
+
+ // unsafe conversion long -> byte
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createByte());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // unsafe conversion long -> short
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createShort());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // unsafe conversion long -> int
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // invalid
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createString());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // invalid
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createFloat());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // invalid
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createTimestamp());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+ }
+
+ @Test
+ public void testSafePpdEvaluationForStrings() throws IOException {
+ TypeDescription fileSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createString());
+ SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema, null);
+ assertTrue(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.hasConversion());
+
+ // string -> char
+ TypeDescription readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createChar());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // string -> varchar
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createVarchar());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertTrue(schemaEvolution.isPPDSafeConversion(1));
+
+ fileSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createChar());
+ schemaEvolution = new SchemaEvolution(fileSchema, null);
+ assertTrue(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.hasConversion());
+
+ // char -> string
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createString());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // char -> varchar
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createVarchar());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ fileSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createVarchar());
+ schemaEvolution = new SchemaEvolution(fileSchema, null);
+ assertTrue(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.hasConversion());
+
+ // varchar -> string
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createString());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertTrue(schemaEvolution.isPPDSafeConversion(1));
+
+ // varchar -> char
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createChar());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // invalid
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createDecimal());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // invalid
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createDate());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+
+ // invalid
+ readerSchema = TypeDescription.createStruct()
+ .addField("f1", TypeDescription.createInt());
+ schemaEvolution = new SchemaEvolution(fileSchema, readerSchema, null);
+ assertTrue(schemaEvolution.hasConversion());
+ assertFalse(schemaEvolution.isPPDSafeConversion(0));
+ assertFalse(schemaEvolution.isPPDSafeConversion(1));
+ }
+
+ @Test
+ public void ensureFileIncluded() throws IOException {
+ TypeDescription file = TypeDescription.fromString("struct<x:int,y:int>");
+ SchemaEvolution evolution = new SchemaEvolution(file, null);
+ boolean[] include = evolution.getFileIncluded();
+ assertEquals(3, include.length);
+ for(int i=0; i < include.length; ++i) {
+ assertTrue("element " + i, include[i]);
+ }
+ }
+}
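Read as a whole, testSafePpdEvaluationForInts and testSafePpdEvaluationForStrings assert a simple rule: predicate push-down remains safe for widening integer conversions (byte -> short -> int -> long) but not for narrowing ones, and string and varchar convert safely in either direction while any conversion involving char is treated as unsafe. The sketch below is a hypothetical restatement of that rule covering only the cases these two tests exercise; it is not the SchemaEvolution API.

    import java.util.Arrays;
    import java.util.List;

    public class PpdSafetySketch {
      static final List<String> INT_ORDER = Arrays.asList("byte", "short", "int", "long");

      // True when a file-type -> reader-type conversion keeps predicate push-down safe,
      // per the assertions in the two PPD tests above.
      static boolean isPpdSafe(String fileType, String readerType) {
        if (INT_ORDER.contains(fileType) && INT_ORDER.contains(readerType)) {
          return INT_ORDER.indexOf(fileType) <= INT_ORDER.indexOf(readerType); // widening only
        }
        boolean fileStringLike = "string".equals(fileType) || "varchar".equals(fileType);
        boolean readerStringLike = "string".equals(readerType) || "varchar".equals(readerType);
        if (fileStringLike && readerStringLike) {
          return true; // string <-> varchar is safe both ways
        }
        return fileType.equals(readerType); // identical types are trivially safe
      }

      public static void main(String[] args) {
        System.out.println("byte -> long: " + isPpdSafe("byte", "long"));     // true
        System.out.println("long -> int: " + isPpdSafe("long", "int"));       // false
        System.out.println("varchar -> string: " + isPpdSafe("varchar", "string")); // true
        System.out.println("char -> varchar: " + isPpdSafe("char", "varchar"));     // false
      }
    }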
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestSerializationUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestSerializationUtils.java b/orc/src/test/org/apache/hive/orc/impl/TestSerializationUtils.java
new file mode 100644
index 0000000..5bcee60
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestSerializationUtils.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.math.BigInteger;
+import java.util.Random;
+
+import org.junit.Test;
+
+import com.google.common.math.LongMath;
+
+public class TestSerializationUtils {
+
+ private InputStream fromBuffer(ByteArrayOutputStream buffer) {
+ return new ByteArrayInputStream(buffer.toByteArray());
+ }
+
+ @Test
+ public void testDoubles() throws Exception {
+ double tolerance = 0.0000000000000001;
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ SerializationUtils utils = new SerializationUtils();
+ utils.writeDouble(buffer, 1343822337.759);
+ assertEquals(1343822337.759, utils.readDouble(fromBuffer(buffer)), tolerance);
+ buffer = new ByteArrayOutputStream();
+ utils.writeDouble(buffer, 0.8);
+ double got = utils.readDouble(fromBuffer(buffer));
+ assertEquals(0.8, got, tolerance);
+ }
+
+ @Test
+ public void testBigIntegers() throws Exception {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(0));
+ assertArrayEquals(new byte[]{0}, buffer.toByteArray());
+ assertEquals(0L,
+ SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
+ buffer.reset();
+ SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(1));
+ assertArrayEquals(new byte[]{2}, buffer.toByteArray());
+ assertEquals(1L,
+ SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
+ buffer.reset();
+ SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(-1));
+ assertArrayEquals(new byte[]{1}, buffer.toByteArray());
+ assertEquals(-1L,
+ SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
+ buffer.reset();
+ SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(50));
+ assertArrayEquals(new byte[]{100}, buffer.toByteArray());
+ assertEquals(50L,
+ SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
+ buffer.reset();
+ SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(-50));
+ assertArrayEquals(new byte[]{99}, buffer.toByteArray());
+ assertEquals(-50L,
+ SerializationUtils.readBigInteger(fromBuffer(buffer)).longValue());
+ for(int i=-8192; i < 8192; ++i) {
+ buffer.reset();
+ SerializationUtils.writeBigInteger(buffer, BigInteger.valueOf(i));
+ assertEquals("compare length for " + i,
+ i >= -64 && i < 64 ? 1 : 2, buffer.size());
+ assertEquals("compare result for " + i,
+ i, SerializationUtils.readBigInteger(fromBuffer(buffer)).intValue());
+ }
+ buffer.reset();
+ SerializationUtils.writeBigInteger(buffer,
+ new BigInteger("123456789abcdef0",16));
+ assertEquals(new BigInteger("123456789abcdef0",16),
+ SerializationUtils.readBigInteger(fromBuffer(buffer)));
+ buffer.reset();
+ SerializationUtils.writeBigInteger(buffer,
+ new BigInteger("-123456789abcdef0",16));
+ assertEquals(new BigInteger("-123456789abcdef0",16),
+ SerializationUtils.readBigInteger(fromBuffer(buffer)));
+ StringBuilder buf = new StringBuilder();
+ for(int i=0; i < 256; ++i) {
+ String num = Integer.toHexString(i);
+ if (num.length() == 1) {
+ buf.append('0');
+ }
+ buf.append(num);
+ }
+ buffer.reset();
+ SerializationUtils.writeBigInteger(buffer,
+ new BigInteger(buf.toString(),16));
+ assertEquals(new BigInteger(buf.toString(),16),
+ SerializationUtils.readBigInteger(fromBuffer(buffer)));
+ buffer.reset();
+ SerializationUtils.writeBigInteger(buffer,
+ new BigInteger("ff000000000000000000000000000000000000000000ff",16));
+ assertEquals(
+ new BigInteger("ff000000000000000000000000000000000000000000ff",16),
+ SerializationUtils.readBigInteger(fromBuffer(buffer)));
+ }
+
+ @Test
+ public void testSubtractionOverflow() {
+ // cross check results with Guava results below
+ SerializationUtils utils = new SerializationUtils();
+ assertEquals(false, utils.isSafeSubtract(22222222222L, Long.MIN_VALUE));
+ assertEquals(false, utils.isSafeSubtract(-22222222222L, Long.MAX_VALUE));
+ assertEquals(false, utils.isSafeSubtract(Long.MIN_VALUE, Long.MAX_VALUE));
+ assertEquals(true, utils.isSafeSubtract(-1553103058346370095L, 6553103058346370095L));
+ assertEquals(true, utils.isSafeSubtract(0, Long.MAX_VALUE));
+ assertEquals(true, utils.isSafeSubtract(Long.MIN_VALUE, 0));
+ }
+
+ @Test
+ public void testSubtractionOverflowGuava() {
+ try {
+ LongMath.checkedSubtract(22222222222L, Long.MIN_VALUE);
+ fail("expected ArithmeticException for overflow");
+ } catch (ArithmeticException ex) {
+      assertEquals("overflow", ex.getMessage());
+ }
+
+ try {
+ LongMath.checkedSubtract(-22222222222L, Long.MAX_VALUE);
+ fail("expected ArithmeticException for overflow");
+ } catch (ArithmeticException ex) {
+      assertEquals("overflow", ex.getMessage());
+ }
+
+ try {
+ LongMath.checkedSubtract(Long.MIN_VALUE, Long.MAX_VALUE);
+ fail("expected ArithmeticException for overflow");
+ } catch (ArithmeticException ex) {
+      assertEquals("overflow", ex.getMessage());
+ }
+
+ assertEquals(-8106206116692740190L,
+ LongMath.checkedSubtract(-1553103058346370095L, 6553103058346370095L));
+ assertEquals(-Long.MAX_VALUE, LongMath.checkedSubtract(0, Long.MAX_VALUE));
+ assertEquals(Long.MIN_VALUE, LongMath.checkedSubtract(Long.MIN_VALUE, 0));
+ }
+
+ @Test
+ public void testRandomFloats() throws Exception {
+ float tolerance = 0.0000000000000001f;
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ SerializationUtils utils = new SerializationUtils();
+ Random rand = new Random();
+ int n = 100_000;
+ float[] expected = new float[n];
+ for (int i = 0; i < n; i++) {
+ float f = rand.nextFloat();
+ expected[i] = f;
+ utils.writeFloat(buffer, f);
+ }
+ InputStream newBuffer = fromBuffer(buffer);
+ for (int i = 0; i < n; i++) {
+ float got = utils.readFloat(newBuffer);
+ assertEquals(expected[i], got, tolerance);
+ }
+ }
+
+ @Test
+ public void testRandomDoubles() throws Exception {
+ double tolerance = 0.0000000000000001;
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ SerializationUtils utils = new SerializationUtils();
+ Random rand = new Random();
+ int n = 100_000;
+ double[] expected = new double[n];
+ for (int i = 0; i < n; i++) {
+ double d = rand.nextDouble();
+ expected[i] = d;
+ utils.writeDouble(buffer, d);
+ }
+ InputStream newBuffer = fromBuffer(buffer);
+ for (int i = 0; i < n; i++) {
+ double got = utils.readDouble(newBuffer);
+ assertEquals(expected[i], got, tolerance);
+ }
+ }
+}
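One detail worth calling out in testBigIntegers above: the single-byte payloads it asserts (0 -> {0}, 1 -> {2}, -1 -> {1}, 50 -> {100}, -50 -> {99}) are exactly a zigzag mapping of signed values onto unsigned ones, which is what lets small magnitudes of either sign fit in a single byte. The diff does not show the internals of SerializationUtils, so treat the following as an illustrative sketch of the mapping only, not the library's implementation.

    public class ZigZagSketch {
      // ZigZag interleaves signed values onto the unsigned range:
      // 0, -1, 1, -2, 2, ... -> 0, 1, 2, 3, 4, ...
      static long zigZag(long value) {
        return (value << 1) ^ (value >> 63);
      }

      public static void main(String[] args) {
        long[] samples = {0L, 1L, -1L, 50L, -50L};
        for (long v : samples) {
          // Prints 0 -> 0, 1 -> 2, -1 -> 1, 50 -> 100, -50 -> 99,
          // matching the byte arrays asserted in testBigIntegers.
          System.out.println(v + " -> " + zigZag(v));
        }
      }
    }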
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestStreamName.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestStreamName.java b/orc/src/test/org/apache/hive/orc/impl/TestStreamName.java
new file mode 100644
index 0000000..4aed06c
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestStreamName.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hive.orc.OrcProto;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestStreamName {
+
+ @Test
+ public void test1() throws Exception {
+ StreamName s1 = new StreamName(3, OrcProto.Stream.Kind.DATA);
+ StreamName s2 = new StreamName(3,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ StreamName s3 = new StreamName(5, OrcProto.Stream.Kind.DATA);
+ StreamName s4 = new StreamName(5,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ StreamName s1p = new StreamName(3, OrcProto.Stream.Kind.DATA);
+ assertEquals(true, s1.equals(s1));
+ assertEquals(false, s1.equals(s2));
+ assertEquals(false, s1.equals(s3));
+ assertEquals(true, s1.equals(s1p));
+ assertEquals(true, s1.compareTo(null) < 0);
+ assertEquals(false, s1.equals(null));
+ assertEquals(true, s1.compareTo(s2) < 0);
+ assertEquals(true, s2.compareTo(s3) < 0);
+ assertEquals(true, s3.compareTo(s4) < 0);
+ assertEquals(true, s4.compareTo(s1p) > 0);
+ assertEquals(0, s1p.compareTo(s1));
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestStringRedBlackTree.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestStringRedBlackTree.java b/orc/src/test/org/apache/hive/orc/impl/TestStringRedBlackTree.java
new file mode 100644
index 0000000..c3e51b8
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestStringRedBlackTree.java
@@ -0,0 +1,232 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.io.IntWritable;
+import org.junit.Test;
+
+import java.io.IOException;
+
+import static junit.framework.Assert.assertEquals;
+
+/**
+ * Test the red-black tree with string keys.
+ */
+public class TestStringRedBlackTree {
+
+ /**
+ * Checks the red-black tree rules to make sure that we have correctly built
+ * a valid tree.
+ *
+ * Properties:
+ * 1. Red nodes must have black children
+ * 2. Each node must have the same black height on both sides.
+ *
+ * @param node The id of the root of the subtree to check for the red-black
+ * tree properties.
+ * @return The black-height of the subtree.
+ */
+ private int checkSubtree(RedBlackTree tree, int node, IntWritable count
+ ) throws IOException {
+ if (node == RedBlackTree.NULL) {
+ return 1;
+ }
+ count.set(count.get() + 1);
+ boolean is_red = tree.isRed(node);
+ int left = tree.getLeft(node);
+ int right = tree.getRight(node);
+ if (is_red) {
+ if (tree.isRed(left)) {
+ printTree(tree, "", tree.root);
+ throw new IllegalStateException("Left node of " + node + " is " + left +
+ " and both are red.");
+ }
+ if (tree.isRed(right)) {
+ printTree(tree, "", tree.root);
+ throw new IllegalStateException("Right node of " + node + " is " +
+ right + " and both are red.");
+ }
+ }
+ int left_depth = checkSubtree(tree, left, count);
+ int right_depth = checkSubtree(tree, right, count);
+ if (left_depth != right_depth) {
+ printTree(tree, "", tree.root);
+ throw new IllegalStateException("Lopsided tree at node " + node +
+ " with depths " + left_depth + " and " + right_depth);
+ }
+ if (is_red) {
+ return left_depth;
+ } else {
+ return left_depth + 1;
+ }
+ }
+
+ /**
+ * Checks the validity of the entire tree. Also ensures that the number of
+ * nodes visited is the same as the size of the set.
+ */
+ void checkTree(RedBlackTree tree) throws IOException {
+ IntWritable count = new IntWritable(0);
+ if (tree.isRed(tree.root)) {
+ printTree(tree, "", tree.root);
+ throw new IllegalStateException("root is red");
+ }
+ checkSubtree(tree, tree.root, count);
+ if (count.get() != tree.size) {
+ printTree(tree, "", tree.root);
+ throw new IllegalStateException("Broken tree! visited= " + count.get() +
+ " size=" + tree.size);
+ }
+ }
+
+ void printTree(RedBlackTree tree, String indent, int node
+ ) throws IOException {
+ if (node == RedBlackTree.NULL) {
+ System.err.println(indent + "NULL");
+ } else {
+ System.err.println(indent + "Node " + node + " color " +
+ (tree.isRed(node) ? "red" : "black"));
+ printTree(tree, indent + " ", tree.getLeft(node));
+ printTree(tree, indent + " ", tree.getRight(node));
+ }
+ }
+
+ private static class MyVisitor implements StringRedBlackTree.Visitor {
+ private final String[] words;
+ private final int[] order;
+ private final DataOutputBuffer buffer = new DataOutputBuffer();
+ int current = 0;
+
+ MyVisitor(String[] args, int[] order) {
+ words = args;
+ this.order = order;
+ }
+
+ @Override
+ public void visit(StringRedBlackTree.VisitorContext context
+ ) throws IOException {
+ String word = context.getText().toString();
+ assertEquals("in word " + current, words[current], word);
+ assertEquals("in word " + current, order[current],
+ context.getOriginalPosition());
+ buffer.reset();
+ context.writeBytes(buffer);
+ assertEquals(word, new String(buffer.getData(),0,buffer.getLength()));
+ current += 1;
+ }
+ }
+
+ void checkContents(StringRedBlackTree tree, int[] order,
+ String... params
+ ) throws IOException {
+ tree.visit(new MyVisitor(params, order));
+ }
+
+ StringRedBlackTree buildTree(String... params) throws IOException {
+ StringRedBlackTree result = new StringRedBlackTree(1000);
+ for(String word: params) {
+ result.add(word);
+ checkTree(result);
+ }
+ return result;
+ }
+
+ @Test
+ public void test1() throws Exception {
+ StringRedBlackTree tree = new StringRedBlackTree(5);
+ assertEquals(0, tree.getSizeInBytes());
+ checkTree(tree);
+ assertEquals(0, tree.add("owen"));
+ checkTree(tree);
+ assertEquals(1, tree.add("ashutosh"));
+ checkTree(tree);
+ assertEquals(0, tree.add("owen"));
+ checkTree(tree);
+ assertEquals(2, tree.add("alan"));
+ checkTree(tree);
+ assertEquals(2, tree.add("alan"));
+ checkTree(tree);
+ assertEquals(1, tree.add("ashutosh"));
+ checkTree(tree);
+ assertEquals(3, tree.add("greg"));
+ checkTree(tree);
+ assertEquals(4, tree.add("eric"));
+ checkTree(tree);
+ assertEquals(5, tree.add("arun"));
+ checkTree(tree);
+ assertEquals(6, tree.size());
+ checkTree(tree);
+ assertEquals(6, tree.add("eric14"));
+ checkTree(tree);
+ assertEquals(7, tree.add("o"));
+ checkTree(tree);
+ assertEquals(8, tree.add("ziggy"));
+ checkTree(tree);
+ assertEquals(9, tree.add("z"));
+ checkTree(tree);
+ checkContents(tree, new int[]{2,5,1,4,6,3,7,0,9,8},
+ "alan", "arun", "ashutosh", "eric", "eric14", "greg",
+ "o", "owen", "z", "ziggy");
+ assertEquals(32888, tree.getSizeInBytes());
+ // check that adding greg again bumps the count
+ assertEquals(3, tree.add("greg"));
+ assertEquals(41, tree.getCharacterSize());
+ // add some more strings to test the different branches of the
+ // rebalancing
+ assertEquals(10, tree.add("zak"));
+ checkTree(tree);
+ assertEquals(11, tree.add("eric1"));
+ checkTree(tree);
+ assertEquals(12, tree.add("ash"));
+ checkTree(tree);
+ assertEquals(13, tree.add("harry"));
+ checkTree(tree);
+ assertEquals(14, tree.add("john"));
+ checkTree(tree);
+ tree.clear();
+ checkTree(tree);
+ assertEquals(0, tree.getSizeInBytes());
+ assertEquals(0, tree.getCharacterSize());
+ }
+
+ @Test
+ public void test2() throws Exception {
+ StringRedBlackTree tree =
+ buildTree("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l",
+ "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z");
+ assertEquals(26, tree.size());
+ checkContents(tree, new int[]{0,1,2, 3,4,5, 6,7,8, 9,10,11, 12,13,14,
+ 15,16,17, 18,19,20, 21,22,23, 24,25},
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j","k", "l", "m", "n", "o",
+ "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z");
+ }
+
+ @Test
+ public void test3() throws Exception {
+ StringRedBlackTree tree =
+ buildTree("z", "y", "x", "w", "v", "u", "t", "s", "r", "q", "p", "o", "n",
+ "m", "l", "k", "j", "i", "h", "g", "f", "e", "d", "c", "b", "a");
+ assertEquals(26, tree.size());
+ checkContents(tree, new int[]{25,24,23, 22,21,20, 19,18,17, 16,15,14,
+ 13,12,11, 10,9,8, 7,6,5, 4,3,2, 1,0},
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
+ "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z");
+ }
+}
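The checkSubtree javadoc above spells out the two invariants being verified: red nodes may only have black children, and every path from a node down to its null links must pass through the same number of black nodes. Below is a compact standalone restatement of that check on a toy node type; the Node class is hypothetical and unrelated to the ORC RedBlackTree API, which stores nodes as int ids rather than objects.

    public class RedBlackInvariantSketch {
      static final class Node {
        final boolean red;
        final Node left, right;
        Node(boolean red, Node left, Node right) {
          this.red = red; this.left = left; this.right = right;
        }
      }

      // Returns the black-height of the subtree, throwing if either invariant is violated.
      static int checkSubtree(Node n) {
        if (n == null) {
          return 1; // null links count as black
        }
        if (n.red && ((n.left != null && n.left.red) || (n.right != null && n.right.red))) {
          throw new IllegalStateException("red node with a red child");
        }
        int left = checkSubtree(n.left);
        int right = checkSubtree(n.right);
        if (left != right) {
          throw new IllegalStateException("unequal black heights: " + left + " vs " + right);
        }
        return n.red ? left : left + 1;
      }

      public static void main(String[] args) {
        // A black root with two red leaves is a valid red-black tree of black-height 2.
        Node root = new Node(false, new Node(true, null, null), new Node(true, null, null));
        System.out.println("black height = " + checkSubtree(root));
      }
    }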
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestZlib.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestZlib.java b/orc/src/test/org/apache/hive/orc/impl/TestZlib.java
new file mode 100644
index 0000000..c87f4a8
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestZlib.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.fail;
+
+public class TestZlib {
+
+ @Test
+ public void testNoOverflow() throws Exception {
+ ByteBuffer in = ByteBuffer.allocate(10);
+ ByteBuffer out = ByteBuffer.allocate(10);
+ in.put(new byte[]{1,2,3,4,5,6,7,10});
+ in.flip();
+ CompressionCodec codec = new ZlibCodec();
+ assertEquals(false, codec.compress(in, out, null));
+ }
+
+ @Test
+ public void testCorrupt() throws Exception {
+ ByteBuffer buf = ByteBuffer.allocate(1000);
+ buf.put(new byte[]{127,-128,0,99,98,-1});
+ buf.flip();
+ CompressionCodec codec = new ZlibCodec();
+ ByteBuffer out = ByteBuffer.allocate(1000);
+ try {
+ codec.decompress(buf, out);
+ fail();
+ } catch (IOException ioe) {
+ // EXPECTED
+ }
+ }
+}
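As a companion to the two tests above, a minimal round-trip sketch (not part of the commit) shows how the compress and decompress calls they exercise fit together. The assumption that compress() returns true when the compressed form fits in the output buffer, and that the buffers need the usual flip() handling, is inferred from testNoOverflow rather than guaranteed by the interface.

    // Hedged sketch of a zlib round trip; decompress() may throw IOException.
    ByteBuffer raw = ByteBuffer.allocate(1024);
    for (int i = 0; i < 1024; ++i) {
      raw.put((byte) (i % 16));               // highly repetitive input, so zlib should shrink it
    }
    raw.flip();
    ByteBuffer packed = ByteBuffer.allocate(1024);
    CompressionCodec codec = new ZlibCodec();
    if (codec.compress(raw, packed, null)) {  // assumed: true means the compressed output fit
      packed.flip();
      ByteBuffer restored = ByteBuffer.allocate(1024);
      codec.decompress(packed, restored);     // should reproduce the original 1024 bytes
    }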
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/tools/TestFileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/tools/TestFileDump.java b/orc/src/test/org/apache/hive/orc/tools/TestFileDump.java
new file mode 100644
index 0000000..50e6208
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/tools/TestFileDump.java
@@ -0,0 +1,485 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.tools;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hive.orc.CompressionKind;
+import org.apache.hive.orc.OrcConf;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.Writer;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestFileDump {
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Before
+ public void openFileSystem () throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ fs.setWorkingDirectory(workDir);
+ testFilePath = new Path("TestFileDump.testDump.orc");
+ fs.delete(testFilePath, false);
+ }
+
+ static TypeDescription getMyRecordType() {
+ return TypeDescription.createStruct()
+ .addField("i", TypeDescription.createInt())
+ .addField("l", TypeDescription.createLong())
+ .addField("s", TypeDescription.createString());
+ }
+
+ static void appendMyRecord(VectorizedRowBatch batch,
+ int i,
+ long l,
+ String str) {
+ ((LongColumnVector) batch.cols[0]).vector[batch.size] = i;
+ ((LongColumnVector) batch.cols[1]).vector[batch.size] = l;
+ if (str == null) {
+ batch.cols[2].noNulls = false;
+ batch.cols[2].isNull[batch.size] = true;
+ } else {
+ ((BytesColumnVector) batch.cols[2]).setVal(batch.size,
+ str.getBytes());
+ }
+ batch.size += 1;
+ }
+
+ static TypeDescription getAllTypesType() {
+ return TypeDescription.createStruct()
+ .addField("b", TypeDescription.createBoolean())
+ .addField("bt", TypeDescription.createByte())
+ .addField("s", TypeDescription.createShort())
+ .addField("i", TypeDescription.createInt())
+ .addField("l", TypeDescription.createLong())
+ .addField("f", TypeDescription.createFloat())
+ .addField("d", TypeDescription.createDouble())
+ .addField("de", TypeDescription.createDecimal())
+ .addField("t", TypeDescription.createTimestamp())
+ .addField("dt", TypeDescription.createDate())
+ .addField("str", TypeDescription.createString())
+ .addField("c", TypeDescription.createChar().withMaxLength(5))
+ .addField("vc", TypeDescription.createVarchar().withMaxLength(10))
+ .addField("m", TypeDescription.createMap(
+ TypeDescription.createString(),
+ TypeDescription.createString()))
+ .addField("a", TypeDescription.createList(TypeDescription.createInt()))
+ .addField("st", TypeDescription.createStruct()
+ .addField("i", TypeDescription.createInt())
+ .addField("s", TypeDescription.createString()));
+ }
+
+ static void appendAllTypes(VectorizedRowBatch batch,
+ boolean b,
+ byte bt,
+ short s,
+ int i,
+ long l,
+ float f,
+ double d,
+ HiveDecimalWritable de,
+ Timestamp t,
+ DateWritable dt,
+ String str,
+ String c,
+ String vc,
+ Map<String, String> m,
+ List<Integer> a,
+ int sti,
+ String sts) {
+ int row = batch.size++;
+ ((LongColumnVector) batch.cols[0]).vector[row] = b ? 1 : 0;
+ ((LongColumnVector) batch.cols[1]).vector[row] = bt;
+ ((LongColumnVector) batch.cols[2]).vector[row] = s;
+ ((LongColumnVector) batch.cols[3]).vector[row] = i;
+ ((LongColumnVector) batch.cols[4]).vector[row] = l;
+ ((DoubleColumnVector) batch.cols[5]).vector[row] = f;
+ ((DoubleColumnVector) batch.cols[6]).vector[row] = d;
+ ((DecimalColumnVector) batch.cols[7]).vector[row].set(de);
+ ((TimestampColumnVector) batch.cols[8]).set(row, t);
+ ((LongColumnVector) batch.cols[9]).vector[row] = dt.getDays();
+ ((BytesColumnVector) batch.cols[10]).setVal(row, str.getBytes());
+ ((BytesColumnVector) batch.cols[11]).setVal(row, c.getBytes());
+ ((BytesColumnVector) batch.cols[12]).setVal(row, vc.getBytes());
+ MapColumnVector map = (MapColumnVector) batch.cols[13];
+ int offset = map.childCount;
+ map.offsets[row] = offset;
+ map.lengths[row] = m.size();
+ map.childCount += map.lengths[row];
+ for(Map.Entry<String, String> entry: m.entrySet()) {
+ ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes());
+ ((BytesColumnVector) map.values).setVal(offset++,
+ entry.getValue().getBytes());
+ }
+ ListColumnVector list = (ListColumnVector) batch.cols[14];
+ offset = list.childCount;
+ list.offsets[row] = offset;
+ list.lengths[row] = a.size();
+ list.childCount += list.lengths[row];
+ for(int e=0; e < a.size(); ++e) {
+ ((LongColumnVector) list.child).vector[offset + e] = a.get(e);
+ }
+ StructColumnVector struct = (StructColumnVector) batch.cols[15];
+ ((LongColumnVector) struct.fields[0]).vector[row] = sti;
+ ((BytesColumnVector) struct.fields[1]).setVal(row, sts.getBytes());
+ }
+
+ public static void checkOutput(String expected,
+ String actual) throws Exception {
+ BufferedReader eStream =
+ new BufferedReader(new FileReader
+ (TestJsonFileDump.getFileFromClasspath(expected)));
+ BufferedReader aStream =
+ new BufferedReader(new FileReader(actual));
+ String expectedLine = eStream.readLine().trim();
+ while (expectedLine != null) {
+ String actualLine = aStream.readLine().trim();
+ System.out.println("actual: " + actualLine);
+ System.out.println("expected: " + expectedLine);
+ Assert.assertEquals(expectedLine, actualLine);
+ expectedLine = eStream.readLine();
+ expectedLine = expectedLine == null ? null : expectedLine.trim();
+ }
+ Assert.assertNull(eStream.readLine());
+ Assert.assertNull(aStream.readLine());
+ eStream.close();
+ aStream.close();
+ }
+
+ @Test
+ public void testDump() throws Exception {
+ TypeDescription schema = getMyRecordType();
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .compress(CompressionKind.ZLIB)
+ .stripeSize(100000)
+ .rowIndexStride(1000));
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ for(int i=0; i < 21000; ++i) {
+ appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
+ words[r1.nextInt(words.length)]);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size > 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=1,2,3"});
+ System.out.flush();
+ System.setOut(origOut);
+
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+
+ @Test
+ public void testDataDump() throws Exception {
+ TypeDescription schema = getAllTypesType();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .rowIndexStride(1000));
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ Map<String, String> m = new HashMap<String, String>(2);
+ m.put("k1", "v1");
+ appendAllTypes(batch,
+ true,
+ (byte) 10,
+ (short) 100,
+ 1000,
+ 10000L,
+ 4.0f,
+ 20.0,
+ new HiveDecimalWritable("4.2222"),
+ new Timestamp(1416967764000L),
+ new DateWritable(new Date(1416967764000L)),
+ "string",
+ "hello",
+ "hello",
+ m,
+ Arrays.asList(100, 200),
+ 10, "foo");
+ m.clear();
+ m.put("k3", "v3");
+ appendAllTypes(
+ batch,
+ false,
+ (byte)20,
+ (short)200,
+ 2000,
+ 20000L,
+ 8.0f,
+ 40.0,
+ new HiveDecimalWritable("2.2222"),
+ new Timestamp(1416967364000L),
+ new DateWritable(new Date(1411967764000L)),
+ "abcd",
+ "world",
+ "world",
+ m,
+ Arrays.asList(200, 300),
+ 20, "bar");
+ writer.addRowBatch(batch);
+
+ writer.close();
+ PrintStream origOut = System.out;
+ ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "-d"});
+ System.out.flush();
+ System.setOut(origOut);
+ String[] lines = myOut.toString().split("\n");
+ Assert.assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.2222\",\"t\":\"2014-11-25 18:09:24.0\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello\",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
+ Assert.assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44.0\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world\",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
+ }
+
+ // Test that if the fraction of rows that have distinct strings is greater than the configured
+ // threshold, dictionary encoding is turned off. If dictionary encoding is turned off, the length
+ // of the dictionary stream for the column will be 0 in the ORC file dump.
+ @Test
+ public void testDictionaryThreshold() throws Exception {
+ TypeDescription schema = getMyRecordType();
+ Configuration conf = new Configuration();
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ conf.setFloat(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.49f);
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.ZLIB)
+ .rowIndexStride(1000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ int nextInt = 0;
+ for(int i=0; i < 21000; ++i) {
+ // Write out the same string twice; this guarantees that the fraction of rows
+ // with distinct strings is 0.5.
+ if (i % 2 == 0) {
+ nextInt = r1.nextInt(words.length);
+ // Append the value of i to the word; this guarantees that when an index or
+ // word is repeated, the actual string is unique.
+ words[nextInt] += "-" + i;
+ }
+ appendMyRecord(batch, r1.nextInt(), r1.nextLong(), words[nextInt]);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size != 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump-dictionary-threshold.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=1,2,3"});
+ System.out.flush();
+ System.setOut(origOut);
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+
+ @Test
+ public void testBloomFilter() throws Exception {
+ TypeDescription schema = getMyRecordType();
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.ZLIB)
+ .bufferSize(10000)
+ .rowIndexStride(1000)
+ .bloomFilterColumns("S");
+ Writer writer = OrcFile.createWriter(testFilePath, options);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ for(int i=0; i < 21000; ++i) {
+ appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
+ words[r1.nextInt(words.length)]);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size > 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump-bloomfilter.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=3"});
+ System.out.flush();
+ System.setOut(origOut);
+
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+
+ @Test
+ public void testBloomFilter2() throws Exception {
+ TypeDescription schema = getMyRecordType();
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.ZLIB)
+ .bufferSize(10000)
+ .rowIndexStride(1000)
+ .bloomFilterColumns("l")
+ .bloomFilterFpp(0.01);
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ Writer writer = OrcFile.createWriter(testFilePath, options);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ for(int i=0; i < 21000; ++i) {
+ appendMyRecord(batch, r1.nextInt(), r1.nextLong(),
+ words[r1.nextInt(words.length)]);
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size > 0) {
+ writer.addRowBatch(batch);
+ }
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump-bloomfilter2.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=2"});
+ System.out.flush();
+ System.setOut(origOut);
+
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+}
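To make the comment at the top of testDictionaryThreshold concrete, the expected fraction can be worked out in a few lines (not part of the commit); the numbers come straight from the test, while reading DICTIONARY_KEY_SIZE_THRESHOLD as a strict cut-off is an assumption to verify against the writer code.

    // Every even row appends "-" + i to the chosen word, so each pair of rows
    // contributes exactly one new distinct string.
    int rows = 21000;
    int distinct = rows / 2;                      // 10500 fresh strings
    double fraction = (double) distinct / rows;   // 0.5
    double threshold = 0.49;                      // value set on DICTIONARY_KEY_SIZE_THRESHOLD in the test
    boolean dictionaryDisabled = fraction > threshold;  // true, hence the 0-length dictionary stream in the dump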
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/tools/TestJsonFileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/tools/TestJsonFileDump.java b/orc/src/test/org/apache/hive/orc/tools/TestJsonFileDump.java
new file mode 100644
index 0000000..efded7a
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/tools/TestJsonFileDump.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.tools;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.PrintStream;
+import java.net.URL;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.CompressionKind;
+import org.apache.hive.orc.OrcConf;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.Writer;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestJsonFileDump {
+ public static String getFileFromClasspath(String name) {
+ URL url = ClassLoader.getSystemResource(name);
+ if (url == null) {
+ throw new IllegalArgumentException("Could not find " + name);
+ }
+ return url.getPath();
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Before
+ public void openFileSystem () throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ fs.setWorkingDirectory(workDir);
+ testFilePath = new Path("TestFileDump.testDump.orc");
+ fs.delete(testFilePath, false);
+ }
+
+ static void checkOutput(String expected,
+ String actual) throws Exception {
+ BufferedReader eStream =
+ new BufferedReader(new FileReader(getFileFromClasspath(expected)));
+ BufferedReader aStream =
+ new BufferedReader(new FileReader(actual));
+ String expectedLine = eStream.readLine();
+ while (expectedLine != null) {
+ String actualLine = aStream.readLine();
+ System.out.println("actual: " + actualLine);
+ System.out.println("expected: " + expectedLine);
+ assertEquals(expectedLine, actualLine);
+ expectedLine = eStream.readLine();
+ }
+ assertNull(eStream.readLine());
+ assertNull(aStream.readLine());
+ }
+
+ @Test
+ public void testJsonDump() throws Exception {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("i", TypeDescription.createInt())
+ .addField("l", TypeDescription.createLong())
+ .addField("s", TypeDescription.createString());
+ conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
+ OrcFile.WriterOptions options = OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.ZLIB)
+ .bufferSize(10000)
+ .rowIndexStride(1000)
+ .bloomFilterColumns("s");
+ Writer writer = OrcFile.createWriter(testFilePath, options);
+ Random r1 = new Random(1);
+ String[] words = new String[]{"It", "was", "the", "best", "of", "times,",
+ "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age",
+ "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it",
+ "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch",
+ "of", "incredulity,", "it", "was", "the", "season", "of", "Light,",
+ "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the",
+ "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,",
+ "we", "had", "everything", "before", "us,", "we", "had", "nothing",
+ "before", "us,", "we", "were", "all", "going", "direct", "to",
+ "Heaven,", "we", "were", "all", "going", "direct", "the", "other",
+ "way"};
+ VectorizedRowBatch batch = schema.createRowBatch(1000);
+ for(int i=0; i < 21000; ++i) {
+ ((LongColumnVector) batch.cols[0]).vector[batch.size] = r1.nextInt();
+ ((LongColumnVector) batch.cols[1]).vector[batch.size] = r1.nextLong();
+ if (i % 100 == 0) {
+ batch.cols[2].noNulls = false;
+ batch.cols[2].isNull[batch.size] = true;
+ } else {
+ ((BytesColumnVector) batch.cols[2]).setVal(batch.size,
+ words[r1.nextInt(words.length)].getBytes());
+ }
+ batch.size += 1;
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ }
+ if (batch.size > 0) {
+ writer.addRowBatch(batch);
+ }
+
+ writer.close();
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-dump.json";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "-j", "-p", "--rowindex=3"});
+ System.out.flush();
+ System.setOut(origOut);
+
+
+ checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+}
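All three dump tests above share the same write loop; the condensed sketch below (not part of the commit) pulls that pattern out on its own. The schema, writer, rowCount, and appendRow names are placeholders standing in for the per-test setup, not symbols from the patch.

    // Hedged sketch of the shared fill-and-flush pattern.
    VectorizedRowBatch batch = schema.createRowBatch(1000);
    for (int i = 0; i < rowCount; ++i) {
      appendRow(batch, i);                     // placeholder for the per-test append helper
      if (batch.size == batch.getMaxSize()) {  // batch is full: hand it to the writer
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    if (batch.size > 0) {                      // flush the final, partially filled batch
      writer.addRowBatch(batch);
    }
    writer.close();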
[02/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
deleted file mode 100644
index 79f67e3..0000000
--- a/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
+++ /dev/null
@@ -1,1709 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import static junit.framework.Assert.assertEquals;
-import static org.hamcrest.core.Is.is;
-import static org.junit.Assert.*;
-import static org.mockito.Mockito.any;
-import static org.mockito.Mockito.atLeastOnce;
-import static org.mockito.Mockito.doThrow;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.when;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.List;
-
-import junit.framework.Assert;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PositionedReadable;
-import org.apache.hadoop.fs.Seekable;
-import org.apache.hadoop.hive.common.io.DiskRangeList;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
-import org.apache.orc.BloomFilterIO;
-import org.apache.orc.DataReader;
-import org.apache.orc.RecordReader;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.apache.orc.impl.RecordReaderImpl.Location;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
-import org.apache.orc.OrcProto;
-
-import org.junit.Test;
-import org.mockito.MockSettings;
-import org.mockito.Mockito;
-
-public class TestRecordReaderImpl {
- /**
- * Create a predicate leaf. This is used by another test.
- */
- public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator,
- PredicateLeaf.Type type,
- String columnName,
- Object literal,
- List<Object> literalList) {
- return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
- literal, literalList, null);
- }
-
- // can add .verboseLogging() to cause Mockito to log invocations
- private final MockSettings settings = Mockito.withSettings().verboseLogging();
-
- static class BufferInStream
- extends InputStream implements PositionedReadable, Seekable {
- private final byte[] buffer;
- private final int length;
- private int position = 0;
-
- BufferInStream(byte[] bytes, int length) {
- this.buffer = bytes;
- this.length = length;
- }
-
- @Override
- public int read() {
- if (position < length) {
- return buffer[position++];
- }
- return -1;
- }
-
- @Override
- public int read(byte[] bytes, int offset, int length) {
- int lengthToRead = Math.min(length, this.length - this.position);
- if (lengthToRead >= 0) {
- for(int i=0; i < lengthToRead; ++i) {
- bytes[offset + i] = buffer[position++];
- }
- return lengthToRead;
- } else {
- return -1;
- }
- }
-
- @Override
- public int read(long position, byte[] bytes, int offset, int length) {
- this.position = (int) position;
- return read(bytes, offset, length);
- }
-
- @Override
- public void readFully(long position, byte[] bytes, int offset,
- int length) throws IOException {
- this.position = (int) position;
- while (length > 0) {
- int result = read(bytes, offset, length);
- offset += result;
- length -= result;
- if (result < 0) {
- throw new IOException("Read past end of buffer at " + offset);
- }
- }
- }
-
- @Override
- public void readFully(long position, byte[] bytes) throws IOException {
- readFully(position, bytes, 0, bytes.length);
- }
-
- @Override
- public void seek(long position) {
- this.position = (int) position;
- }
-
- @Override
- public long getPos() {
- return position;
- }
-
- @Override
- public boolean seekToNewSource(long position) throws IOException {
- this.position = (int) position;
- return false;
- }
- }
-
- @Test
- public void testMaxLengthToReader() throws Exception {
- Configuration conf = new Configuration();
- OrcProto.Type rowType = OrcProto.Type.newBuilder()
- .setKind(OrcProto.Type.Kind.STRUCT).build();
- OrcProto.Footer footer = OrcProto.Footer.newBuilder()
- .setHeaderLength(0).setContentLength(0).setNumberOfRows(0)
- .setRowIndexStride(0).addTypes(rowType).build();
- OrcProto.PostScript ps = OrcProto.PostScript.newBuilder()
- .setCompression(OrcProto.CompressionKind.NONE)
- .setFooterLength(footer.getSerializedSize())
- .setMagic("ORC").addVersion(0).addVersion(11).build();
- DataOutputBuffer buffer = new DataOutputBuffer();
- footer.writeTo(buffer);
- ps.writeTo(buffer);
- buffer.write(ps.getSerializedSize());
- FileSystem fs = mock(FileSystem.class, settings);
- FSDataInputStream file =
- new FSDataInputStream(new BufferInStream(buffer.getData(),
- buffer.getLength()));
- Path p = new Path("/dir/file.orc");
- when(fs.open(p)).thenReturn(file);
- OrcFile.ReaderOptions options = OrcFile.readerOptions(conf);
- options.filesystem(fs);
- options.maxLength(buffer.getLength());
- when(fs.getFileStatus(p))
- .thenReturn(new FileStatus(10, false, 3, 3000, 0, p));
- Reader reader = OrcFile.createReader(p, options);
- }
-
- @Test
- public void testCompareToRangeInt() throws Exception {
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange(19L, 20L, 40L));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange(41L, 20L, 40L));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange(20L, 20L, 40L));
- assertEquals(Location.MIDDLE,
- RecordReaderImpl.compareToRange(21L, 20L, 40L));
- assertEquals(Location.MAX,
- RecordReaderImpl.compareToRange(40L, 20L, 40L));
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange(0L, 1L, 1L));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange(1L, 1L, 1L));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange(2L, 1L, 1L));
- }
-
- @Test
- public void testCompareToRangeString() throws Exception {
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange("a", "b", "c"));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange("d", "b", "c"));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange("b", "b", "c"));
- assertEquals(Location.MIDDLE,
- RecordReaderImpl.compareToRange("bb", "b", "c"));
- assertEquals(Location.MAX,
- RecordReaderImpl.compareToRange("c", "b", "c"));
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange("a", "b", "b"));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange("b", "b", "b"));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange("c", "b", "b"));
- }
-
- @Test
- public void testCompareToCharNeedConvert() throws Exception {
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange("apple", "hello", "world"));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange("zombie", "hello", "world"));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange("hello", "hello", "world"));
- assertEquals(Location.MIDDLE,
- RecordReaderImpl.compareToRange("pilot", "hello", "world"));
- assertEquals(Location.MAX,
- RecordReaderImpl.compareToRange("world", "hello", "world"));
- assertEquals(Location.BEFORE,
- RecordReaderImpl.compareToRange("apple", "hello", "hello"));
- assertEquals(Location.MIN,
- RecordReaderImpl.compareToRange("hello", "hello", "hello"));
- assertEquals(Location.AFTER,
- RecordReaderImpl.compareToRange("zombie", "hello", "hello"));
- }
-
- @Test
- public void testGetMin() throws Exception {
- assertEquals(10L, RecordReaderImpl.getMin(
- ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
- assertEquals(10.0d, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
- .setMinimum(10.0d).setMaximum(100.0d).build()).build())));
- assertEquals(null, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
- .build())));
- assertEquals("a", RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder()
- .setMinimum("a").setMaximum("b").build()).build())));
- assertEquals("hello", RecordReaderImpl.getMin(ColumnStatisticsImpl
- .deserialize(createStringStats("hello", "world"))));
- assertEquals(HiveDecimal.create("111.1"), RecordReaderImpl.getMin(ColumnStatisticsImpl
- .deserialize(createDecimalStats("111.1", "112.1"))));
- }
-
- private static OrcProto.ColumnStatistics createIntStats(Long min,
- Long max) {
- OrcProto.IntegerStatistics.Builder intStats =
- OrcProto.IntegerStatistics.newBuilder();
- if (min != null) {
- intStats.setMinimum(min);
- }
- if (max != null) {
- intStats.setMaximum(max);
- }
- return OrcProto.ColumnStatistics.newBuilder()
- .setIntStatistics(intStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createBooleanStats(int n, int trueCount) {
- OrcProto.BucketStatistics.Builder boolStats = OrcProto.BucketStatistics.newBuilder();
- boolStats.addCount(trueCount);
- return OrcProto.ColumnStatistics.newBuilder().setNumberOfValues(n).setBucketStatistics(
- boolStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createIntStats(int min, int max) {
- OrcProto.IntegerStatistics.Builder intStats = OrcProto.IntegerStatistics.newBuilder();
- intStats.setMinimum(min);
- intStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setIntStatistics(intStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createDoubleStats(double min, double max) {
- OrcProto.DoubleStatistics.Builder dblStats = OrcProto.DoubleStatistics.newBuilder();
- dblStats.setMinimum(min);
- dblStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setDoubleStatistics(dblStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createStringStats(String min, String max,
- boolean hasNull) {
- OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
- strStats.setMinimum(min);
- strStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
- .setHasNull(hasNull).build();
- }
-
- private static OrcProto.ColumnStatistics createStringStats(String min, String max) {
- OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
- strStats.setMinimum(min);
- strStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createDateStats(int min, int max) {
- OrcProto.DateStatistics.Builder dateStats = OrcProto.DateStatistics.newBuilder();
- dateStats.setMinimum(min);
- dateStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setDateStatistics(dateStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createTimestampStats(long min, long max) {
- OrcProto.TimestampStatistics.Builder tsStats = OrcProto.TimestampStatistics.newBuilder();
- tsStats.setMinimum(min);
- tsStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setTimestampStatistics(tsStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createDecimalStats(String min, String max) {
- OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder();
- decStats.setMinimum(min);
- decStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build()).build();
- }
-
- private static OrcProto.ColumnStatistics createDecimalStats(String min, String max,
- boolean hasNull) {
- OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder();
- decStats.setMinimum(min);
- decStats.setMaximum(max);
- return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build())
- .setHasNull(hasNull).build();
- }
-
- @Test
- public void testGetMax() throws Exception {
- assertEquals(100L, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
- assertEquals(100.0d, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
- .setMinimum(10.0d).setMaximum(100.0d).build()).build())));
- assertEquals(null, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
- .build())));
- assertEquals("b", RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
- OrcProto.ColumnStatistics.newBuilder()
- .setStringStatistics(OrcProto.StringStatistics.newBuilder()
- .setMinimum("a").setMaximum("b").build()).build())));
- assertEquals("world", RecordReaderImpl.getMax(ColumnStatisticsImpl
- .deserialize(createStringStats("hello", "world"))));
- assertEquals(HiveDecimal.create("112.1"), RecordReaderImpl.getMax(ColumnStatisticsImpl
- .deserialize(createDecimalStats("111.1", "112.1"))));
- }
-
- @Test
- public void testPredEvalWithBooleanStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
-
- pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
-
- pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", false, null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
- }
-
- @Test
- public void testPredEvalWithIntStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- // Stats gets converted to column type. "15" is outside of "10" and "100"
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- // Integer stats will not be converted date because of days/seconds/millis ambiguity
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
- }
-
- @Test
- public void testPredEvalWithDoubleStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- // Stats gets converted to column type. "15.0" is outside of "10.0" and "100.0"
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- // Double is not converted to date type because of days/seconds/millis ambiguity
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15*1000L), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150*1000L), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
- }
-
- @Test
- public void testPredEvalWithStringStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 100L, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 100.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "100", null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
-
- // IllegalArgumentException is thrown when converting String to Date, hence YES_NO
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(100).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 1000), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("100"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(100), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
- }
-
- @Test
- public void testPredEvalWithDateStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- // Date to Integer conversion is not possible.
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- // Date to Float conversion is also not possible.
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "1970-01-11", null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15.1", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "__a15__1", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "2000-01-16", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "1970-01-16", null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(150).get(), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- // Date to Decimal conversion is also not possible.
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15L * 24L * 60L * 60L * 1000L), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
- }
-
- @Test
- public void testPredEvalWithDecimalStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- // "15" out of range of "10.0" and "100.0"
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- // Decimal to Date not possible.
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15 * 1000L), null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150 * 1000L), null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
- }
-
- @Test
- public void testPredEvalWithTimestampStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", "15", null);
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.STRING, "x", new Timestamp(15).toString(), null);
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10 * 24L * 60L * 60L * 1000L,
- 100 * 24L * 60L * 60L * 1000L), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
-
- pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
- PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
- // timestamp PPD is disable until ORC-135
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
- }
-
- @Test
- public void testEquals() throws Exception {
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG,
- "x", 15L, null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
- }
-
- @Test
- public void testNullSafeEquals() throws Exception {
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG,
- "x", 15L, null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
- }
-
- @Test
- public void testLessThan() throws Exception {
- PredicateLeaf lessThan = createPredicateLeaf
- (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.LONG,
- "x", 15L, null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), lessThan, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), lessThan, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), lessThan, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), lessThan, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), lessThan, null));
- }
-
- @Test
- public void testLessThanEquals() throws Exception {
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.LONG,
- "x", 15L, null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
- }
-
- @Test
- public void testIn() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(10L);
- args.add(20L);
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
- "x", null, args);
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 20L), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 30L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
- }
-
- @Test
- public void testBetween() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(10L);
- args.add(20L);
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG,
- "x", null, args);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 5L), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 40L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 15L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 25L), pred, null));
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 25L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 20L), pred, null));
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
- }
-
- @Test
- public void testIsNull() throws Exception {
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.LONG,
- "x", null, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
- }
-
-
- @Test
- public void testEqualsWithNullInStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING,
- "x", "c", null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
- }
-
- @Test
- public void testNullSafeEqualsWithNullInStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING,
- "x", "c", null);
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
- }
-
- @Test
- public void testLessThanWithNullInStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.STRING,
- "x", "c", null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.NO_NULL, // min, same stats
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
- }
-
- @Test
- public void testLessThanEqualsWithNullInStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.STRING,
- "x", "c", null);
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
- }
-
- @Test
- public void testInWithNullInStats() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add("c");
- args.add("f");
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
- "x", null, args);
- assertEquals(TruthValue.NO_NULL, // before & after
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
- }
-
- @Test
- public void testBetweenWithNullInStats() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add("c");
- args.add("f");
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.STRING,
- "x", null, args);
- assertEquals(TruthValue.YES_NULL, // before & after
- RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
- assertEquals(TruthValue.YES_NULL, // before & max
- RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null));
- assertEquals(TruthValue.NO_NULL, // before & before
- RecordReaderImpl.evaluatePredicateProto(createStringStats("h", "g", true), pred, null));
- assertEquals(TruthValue.YES_NO_NULL, // before & min
- RecordReaderImpl.evaluatePredicateProto(createStringStats("f", "g", true), pred, null));
- assertEquals(TruthValue.YES_NO_NULL, // before & middle
- RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "g", true), pred, null));
-
- assertEquals(TruthValue.YES_NULL, // min & after
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "e", true), pred, null));
- assertEquals(TruthValue.YES_NULL, // min & max
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "f", true), pred, null));
- assertEquals(TruthValue.YES_NO_NULL, // min & middle
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "g", true), pred, null));
-
- assertEquals(TruthValue.NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "c", true), pred, null)); // max
- assertEquals(TruthValue.YES_NO_NULL,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
- assertEquals(TruthValue.YES_NULL, // min & after, same stats
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
- }
-
- @Test
- public void testIsNullWithNullInStats() throws Exception {
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.STRING,
- "x", null, null);
- assertEquals(TruthValue.YES_NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null));
- assertEquals(TruthValue.NO,
- RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", false), pred, null));
- }
-
- @Test
- public void testOverlap() throws Exception {
- assertTrue(!RecordReaderUtils.overlap(0, 10, -10, -1));
- assertTrue(RecordReaderUtils.overlap(0, 10, -1, 0));
- assertTrue(RecordReaderUtils.overlap(0, 10, -1, 1));
- assertTrue(RecordReaderUtils.overlap(0, 10, 2, 8));
- assertTrue(RecordReaderUtils.overlap(0, 10, 5, 10));
- assertTrue(RecordReaderUtils.overlap(0, 10, 10, 11));
- assertTrue(RecordReaderUtils.overlap(0, 10, 0, 10));
- assertTrue(RecordReaderUtils.overlap(0, 10, -1, 11));
- assertTrue(!RecordReaderUtils.overlap(0, 10, 11, 12));
- }
-
- private static DiskRangeList diskRanges(Integer... points) {
- DiskRangeList head = null, tail = null;
- for(int i = 0; i < points.length; i += 2) {
- DiskRangeList range = new DiskRangeList(points[i], points[i+1]);
- if (tail == null) {
- head = tail = range;
- } else {
- tail = tail.insertAfter(range);
- }
- }
- return head;
- }
-
- @Test
- public void testGetIndexPosition() throws Exception {
- assertEquals(0, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
- OrcProto.Stream.Kind.PRESENT, true, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(3, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
- OrcProto.Stream.Kind.DATA, false, true));
- assertEquals(0, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
- OrcProto.Stream.Kind.DATA, true, false));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DICTIONARY, OrcProto.Type.Kind.STRING,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(3, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
- OrcProto.Stream.Kind.DATA, false, true));
- assertEquals(6, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
- OrcProto.Stream.Kind.LENGTH, true, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
- OrcProto.Stream.Kind.LENGTH, false, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(3, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
- OrcProto.Stream.Kind.DATA, false, true));
- assertEquals(6, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
- OrcProto.Stream.Kind.SECONDARY, true, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
- OrcProto.Stream.Kind.SECONDARY, false, true));
- assertEquals(4, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
- OrcProto.Stream.Kind.DATA, true, true));
- assertEquals(3, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
- OrcProto.Stream.Kind.DATA, false, true));
- assertEquals(7, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
- OrcProto.Stream.Kind.SECONDARY, true, true));
- assertEquals(5, RecordReaderUtils.getIndexPosition
- (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
- OrcProto.Stream.Kind.SECONDARY, false, true));
- }
-
- @Test
- public void testPartialPlan() throws Exception {
- DiskRangeList result;
-
- // set the streams
- List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(1).setLength(1000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(1).setLength(99000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(2).setLength(2000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(2).setLength(98000).build());
-
- boolean[] columns = new boolean[]{true, true, false};
- boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
-
- // set the index
- OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
- indexes[1] = OrcProto.RowIndex.newBuilder()
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(0).addPositions(-1).addPositions(-1)
- .addPositions(0)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(100).addPositions(-1).addPositions(-1)
- .addPositions(10000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(200).addPositions(-1).addPositions(-1)
- .addPositions(20000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(300).addPositions(-1).addPositions(-1)
- .addPositions(30000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(400).addPositions(-1).addPositions(-1)
- .addPositions(40000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(500).addPositions(-1).addPositions(-1)
- .addPositions(50000)
- .build())
- .build();
-
- // set encodings
- List<OrcProto.ColumnEncoding> encodings =
- new ArrayList<OrcProto.ColumnEncoding>();
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
-
- // set types struct{x: int, y: int}
- List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
- .addSubtypes(1).addSubtypes(2).addFieldNames("x")
- .addFieldNames("y").build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
-
- // filter by rows and groups
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(0, 1000, 100, 1000, 400, 1000,
- 1000, 11000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, true);
- assertThat(result, is(diskRanges(0, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
-
- // if we read no rows, don't read any bytes
- rowGroups = new boolean[]{false, false, false, false, false, false};
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertNull(result);
-
- // all rows, but only columns 0 and 2.
- rowGroups = null;
- columns = new boolean[]{true, false, true};
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, null, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(100000, 102000, 102000, 200000)));
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, null, false, encodings, types, 32768, true);
- assertThat(result, is(diskRanges(100000, 200000)));
-
- rowGroups = new boolean[]{false, true, false, false, false, false};
- indexes[2] = indexes[1];
- indexes[1] = null;
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(100100, 102000,
- 112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, true);
- assertThat(result, is(diskRanges(100100, 102000,
- 112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
-
- rowGroups = new boolean[]{false, false, false, false, false, true};
- indexes[1] = indexes[2];
- columns = new boolean[]{true, true, true};
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(500, 1000, 51000, 100000, 100500, 102000,
- 152000, 200000)));
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, true);
- assertThat(result, is(diskRanges(500, 1000, 51000, 100000, 100500, 102000,
- 152000, 200000)));
- }
-
-
- @Test
- public void testPartialPlanCompressed() throws Exception {
- DiskRangeList result;
-
- // set the streams
- List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(1).setLength(1000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(1).setLength(99000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(2).setLength(2000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(2).setLength(98000).build());
-
- boolean[] columns = new boolean[]{true, true, false};
- boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
-
- // set the index
- OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
- indexes[1] = OrcProto.RowIndex.newBuilder()
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(0).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(0)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(100).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(10000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(200).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(20000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(300).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(30000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(400).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(40000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(500).addPositions(-1).addPositions(-1).addPositions(-1)
- .addPositions(50000)
- .build())
- .build();
-
- // set encodings
- List<OrcProto.ColumnEncoding> encodings =
- new ArrayList<OrcProto.ColumnEncoding>();
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
-
- // set types struct{x: int, y: int}
- List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
- .addSubtypes(1).addSubtypes(2).addFieldNames("x")
- .addFieldNames("y").build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
-
- // filter by rows and groups
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, true, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(0, 1000, 100, 1000,
- 400, 1000, 1000, 11000+(2*32771),
- 11000, 21000+(2*32771), 41000, 100000)));
-
- rowGroups = new boolean[]{false, false, false, false, false, true};
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, true, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(500, 1000, 51000, 100000)));
- }
-
- @Test
- public void testPartialPlanString() throws Exception {
- DiskRangeList result;
-
- // set the streams
- List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(1).setLength(1000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(1).setLength(94000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.LENGTH)
- .setColumn(1).setLength(2000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DICTIONARY_DATA)
- .setColumn(1).setLength(3000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.PRESENT)
- .setColumn(2).setLength(2000).build());
- streams.add(OrcProto.Stream.newBuilder()
- .setKind(OrcProto.Stream.Kind.DATA)
- .setColumn(2).setLength(98000).build());
-
- boolean[] columns = new boolean[]{true, true, false};
- boolean[] rowGroups = new boolean[]{false, true, false, false, true, true};
-
- // set the index
- OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
- indexes[1] = OrcProto.RowIndex.newBuilder()
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(0).addPositions(-1).addPositions(-1)
- .addPositions(0)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(100).addPositions(-1).addPositions(-1)
- .addPositions(10000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(200).addPositions(-1).addPositions(-1)
- .addPositions(20000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(300).addPositions(-1).addPositions(-1)
- .addPositions(30000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(400).addPositions(-1).addPositions(-1)
- .addPositions(40000)
- .build())
- .addEntry(OrcProto.RowIndexEntry.newBuilder()
- .addPositions(500).addPositions(-1).addPositions(-1)
- .addPositions(50000)
- .build())
- .build();
-
- // set encodings
- List<OrcProto.ColumnEncoding> encodings =
- new ArrayList<OrcProto.ColumnEncoding>();
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY).build());
- encodings.add(OrcProto.ColumnEncoding.newBuilder()
- .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
-
- // set types struct{x: string, y: int}
- List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
- .addSubtypes(1).addSubtypes(2).addFieldNames("x")
- .addFieldNames("y").build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).build());
- types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
-
- // filter by rows and groups
- result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
- columns, rowGroups, false, encodings, types, 32768, false);
- assertThat(result, is(diskRanges(100, 1000, 400, 1000, 500, 1000,
- 11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
- 51000, 95000, 95000, 97000, 97000, 100000)));
- }
-
- @Test
- public void testIntNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong(15);
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testIntEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong(15);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testIntInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(15L);
- args.add(19L);
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong(19);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong(15);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDoubleNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addDouble(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addDouble(15.0);
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDoubleEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addDouble(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addDouble(15.0);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDoubleInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(15.0);
- args.add(19.0);
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.FLOAT,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addDouble(i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addDouble(19.0);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addDouble(15.0);
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testStringNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString("str_" + i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString("str_15");
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testStringEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString("str_" + i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString("str_15");
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testStringInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add("str_15");
- args.add("str_19");
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString("str_" + i);
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString("str_19");
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString("str_15");
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDateWritableNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x",
- new DateWritable(15).get(), null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new DateWritable(i)).getDays());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new DateWritable(15)).getDays());
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDateWritableEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DATE, "x",
- new DateWritable(15).get(), null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new DateWritable(i)).getDays());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new DateWritable(15)).getDays());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDateWritableInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(new DateWritable(15).get());
- args.add(new DateWritable(19).get());
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DATE,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new DateWritable(i)).getDays());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new DateWritable(19)).getDays());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new DateWritable(15)).getDays());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testTimestampNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
- new Timestamp(15),
- null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new Timestamp(i)).getTime());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
- // timestamp PPD is disabled until ORC-135
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new Timestamp(15)).getTime());
- // timestamp PPD is disabled until ORC-135
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testTimestampEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new Timestamp(i)).getTime());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
- // timestamp PPD is disabled until ORC-135
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new Timestamp(15)).getTime());
- // timestamp PPD is disabled until ORC-135
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testTimestampInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(new Timestamp(15));
- args.add(new Timestamp(19));
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.TIMESTAMP,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addLong((new Timestamp(i)).getTime());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
- // timestamp PPD is disabled until ORC-135
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new Timestamp(19)).getTime());
- // timestamp PPD is disabled until ORC-135
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addLong((new Timestamp(15)).getTime());
- // timestamp PPD is disabled until ORC-135
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDecimalNullSafeEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x",
- new HiveDecimalWritable("15"),
- null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString(HiveDecimal.create(i).toString());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(15).toString());
- assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDecimalEqualsBloomFilter() throws Exception {
- PredicateLeaf pred = createPredicateLeaf(
- PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DECIMAL, "x",
- new HiveDecimalWritable("15"),
- null);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString(HiveDecimal.create(i).toString());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(15).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testDecimalInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(new HiveDecimalWritable("15"));
- args.add(new HiveDecimalWritable("19"));
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString(HiveDecimal.create(i).toString());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
- assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(19).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(15).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testNullsInBloomFilter() throws Exception {
- List<Object> args = new ArrayList<Object>();
- args.add(new HiveDecimalWritable("15"));
- args.add(null);
- args.add(new HiveDecimalWritable("19"));
- PredicateLeaf pred = createPredicateLeaf
- (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
- "x", null, args);
- BloomFilterIO bf = new BloomFilterIO(10000);
- for (int i = 20; i < 1000; i++) {
- bf.addString(HiveDecimal.create(i).toString());
- }
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", false));
- // hasNull is false, so bloom filter should return NO
- assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", true));
- // hasNull is true, so bloom filter should return YES_NO_NULL
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(19).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
-
- bf.addString(HiveDecimal.create(15).toString());
- assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
- }
-
- @Test
- public void testClose() throws Exception {
- DataReader mockedDataReader = mock(DataReader.class);
- closeMockedRecordReader(mockedDataReader);
-
- verify(mockedDataReader, atLeastOnce()).close();
- }
-
- @Test
- public void testCloseWithException() throws Exception {
- DataReader mockedDataReader = mock(DataReader.class);
- doThrow(IOException.class).when(mockedDataReader).close();
-
- try {
- closeMockedRecordReader(mockedDataReader);
- fail("Exception should have been thrown when Record Reader was closed");
- } catch (IOException expected) {
-
- }
-
- verify(mockedDataReader, atLeastOnce()).close();
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
-
- private void closeMockedRecordReader(DataReader mockedDataReader) throws IOException {
- Configuration conf = new Configuration();
- Path path = new Path(workDir, "empty.orc");
- FileSystem.get(conf).delete(path, true);
- Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf)
- .setSchema(TypeDescription.createLong()));
- writer.close();
- Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
-
- RecordReader recordReader = reader.rows(new Reader.Options()
- .dataReader(mockedDataReader));
-
- recordReader.close();
- }
-}
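The deleted tests above exercise RecordReaderImpl.evaluatePredicateProto, which folds a predicate leaf and a column's statistics (min, max, whether nulls are present) into a three-valued answer such as NO_NULL or YES_NO_NULL rather than a plain boolean, so a row group is skipped only when the statistics rule it out. The standalone sketch below restates that min/max reasoning for an equality predicate; the Truth enum and MinMaxPruning class are illustrative names only and are not part of the code being moved.

// Simplified min/max pruning with null tracking. It mirrors the cases asserted in
// testEquals above but is not the RecordReaderImpl implementation.
enum Truth { YES, NO, YES_NO, YES_NULL, NO_NULL, YES_NO_NULL }

class MinMaxPruning {
  static Truth evaluateEquals(long min, long max, boolean hasNull, long literal) {
    Truth result;
    if (literal < min || literal > max) {
      result = Truth.NO;        // the literal lies outside the recorded range
    } else if (min == max) {
      result = Truth.YES;       // every non-null value equals the literal
    } else {
      result = Truth.YES_NO;    // the range overlaps the literal; the data must be read
    }
    if (!hasNull) {
      return result;
    }
    // nulls widen the answer: YES -> YES_NULL, NO -> NO_NULL, YES_NO -> YES_NO_NULL
    switch (result) {
      case YES: return Truth.YES_NULL;
      case NO:  return Truth.NO_NULL;
      default:  return Truth.YES_NO_NULL;
    }
  }

  public static void main(String[] args) {
    System.out.println(evaluateEquals(20L, 30L, true, 15L)); // NO_NULL, as in testEquals
    System.out.println(evaluateEquals(10L, 30L, true, 15L)); // YES_NO_NULL
    System.out.println(evaluateEquals(15L, 15L, true, 15L)); // YES_NULL
  }
}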
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestRunLengthByteReader.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestRunLengthByteReader.java b/orc/src/test/org/apache/orc/impl/TestRunLengthByteReader.java
deleted file mode 100644
index a14bef1..0000000
--- a/orc/src/test/org/apache/orc/impl/TestRunLengthByteReader.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.nio.ByteBuffer;
-
-import org.apache.orc.CompressionCodec;
-import org.junit.Test;
-
-public class TestRunLengthByteReader {
-
- @Test
- public void testUncompressedSeek() throws Exception {
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- RunLengthByteWriter out = new RunLengthByteWriter(new OutStream("test", 100,
- null, collect));
- TestInStream.PositionCollector[] positions =
- new TestInStream.PositionCollector[2048];
- for(int i=0; i < 2048; ++i) {
- positions[i] = new TestInStream.PositionCollector();
- out.getPosition(positions[i]);
- if (i < 1024) {
- out.write((byte) (i/4));
- } else {
- out.write((byte) i);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- RunLengthByteReader in = new RunLengthByteReader(InStream.create("test",
- new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(), null, 100));
- for(int i=0; i < 2048; ++i) {
- int x = in.next() & 0xff;
- if (i < 1024) {
- assertEquals((i/4) & 0xff, x);
- } else {
- assertEquals(i & 0xff, x);
- }
- }
- for(int i=2047; i >= 0; --i) {
- in.seek(positions[i]);
- int x = in.next() & 0xff;
- if (i < 1024) {
- assertEquals((i/4) & 0xff, x);
- } else {
- assertEquals(i & 0xff, x);
- }
- }
- }
-
- @Test
- public void testCompressedSeek() throws Exception {
- CompressionCodec codec = new SnappyCodec();
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- RunLengthByteWriter out = new RunLengthByteWriter(new OutStream("test", 500,
- codec, collect));
- TestInStream.PositionCollector[] positions =
- new TestInStream.PositionCollector[2048];
- for(int i=0; i < 2048; ++i) {
- positions[i] = new TestInStream.PositionCollector();
- out.getPosition(positions[i]);
- if (i < 1024) {
- out.write((byte) (i/4));
- } else {
- out.write((byte) i);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- RunLengthByteReader in = new RunLengthByteReader(InStream.create("test",
- new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(), codec, 500));
- for(int i=0; i < 2048; ++i) {
- int x = in.next() & 0xff;
- if (i < 1024) {
- assertEquals((i/4) & 0xff, x);
- } else {
- assertEquals(i & 0xff, x);
- }
- }
- for(int i=2047; i >= 0; --i) {
- in.seek(positions[i]);
- int x = in.next() & 0xff;
- if (i < 1024) {
- assertEquals((i/4) & 0xff, x);
- } else {
- assertEquals(i & 0xff, x);
- }
- }
- }
-
- @Test
- public void testSkips() throws Exception {
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- RunLengthByteWriter out = new RunLengthByteWriter(new OutStream("test", 100,
- null, collect));
- for(int i=0; i < 2048; ++i) {
- if (i < 1024) {
- out.write((byte) (i/16));
- } else {
- out.write((byte) i);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- RunLengthByteReader in = new RunLengthByteReader(InStream.create("test",
- new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(), null, 100));
- for(int i=0; i < 2048; i += 10) {
- int x = in.next() & 0xff;
- if (i < 1024) {
- assertEquals((i/16) & 0xff, x);
- } else {
- assertEquals(i & 0xff, x);
- }
- if (i < 2038) {
- in.skip(9);
- }
- in.skip(0);
- }
- }
-}
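TestRunLengthByteReader above round-trips RunLengthByteWriter output through RunLengthByteReader, including seeks back to recorded positions and forward skips. For readers unfamiliar with byte run-length coding, the self-contained sketch below packs runs of identical bytes as a count plus the value and everything else as literal blocks; the header convention is only illustrative and is not the exact ORC stream layout.

import java.io.ByteArrayOutputStream;
import java.util.Arrays;

// Minimal byte run-length round trip: runs become [runLength, value], other bytes
// become [-count, literal bytes...]. Illustrative only.
class ByteRleSketch {
  static byte[] encode(byte[] input) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int i = 0;
    while (i < input.length) {
      int run = 1;
      while (i + run < input.length && run < 127 && input[i + run] == input[i]) {
        run++;
      }
      if (run >= 3) {
        out.write(run);                  // positive header: a repeated value follows
        out.write(input[i]);
        i += run;
      } else {
        int start = i;
        // collect literals until the next run of at least three identical bytes
        while (i < input.length && i - start < 127
            && !(i + 2 < input.length && input[i] == input[i + 1] && input[i] == input[i + 2])) {
          i++;
        }
        out.write(-(i - start));         // negative header: count of literal bytes
        out.write(input, start, i - start);
      }
    }
    return out.toByteArray();
  }

  static byte[] decode(byte[] encoded, int decodedLength) {
    byte[] out = new byte[decodedLength];
    int pos = 0;
    for (int i = 0; i < encoded.length; ) {
      int header = encoded[i++];
      if (header >= 0) {
        byte value = encoded[i++];
        for (int r = 0; r < header; r++) { out[pos++] = value; }
      } else {
        for (int r = 0; r < -header; r++) { out[pos++] = encoded[i++]; }
      }
    }
    return out;
  }

  public static void main(String[] args) {
    byte[] data = {5, 5, 5, 5, 1, 2, 3, 7, 7, 7};
    System.out.println(Arrays.equals(data, decode(encode(data), data.length))); // true
  }
}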
[25/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/WriterImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/WriterImpl.java b/orc/src/java/org/apache/hive/orc/impl/WriterImpl.java
new file mode 100644
index 0000000..1e273c0
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/WriterImpl.java
@@ -0,0 +1,2443 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.TimeZone;
+import java.util.TreeMap;
+
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.ql.util.JavaDataModel;
+import org.apache.hive.orc.BinaryColumnStatistics;
+import org.apache.hive.orc.BloomFilterIO;
+import org.apache.hive.orc.OrcConf;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.OrcProto;
+import org.apache.hive.orc.StripeInformation;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.Writer;
+import org.apache.hive.orc.OrcUtils;
+import org.apache.hive.orc.StringColumnStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Lists;
+import com.google.common.primitives.Longs;
+import com.google.protobuf.ByteString;
+
+/**
+ * An ORC file writer. The file is divided into stripes, which is the natural
+ * unit of work when reading. Each stripe is buffered in memory until the
+ * buffered data reaches the stripe size, and then it is written out, broken
+ * down by columns. Each column is written by a TreeWriter that is specific to
+ * that type of column. TreeWriters may have child TreeWriters that handle the
+ * sub-types. Each of the TreeWriters writes the column's data as a set of
+ * streams.
+ *
+ * This class is unsynchronized, like most Stream objects, so the creation of
+ * an OrcFile and all access to a single instance have to happen from a single
+ * thread.
+ *
+ * There are no known cases today where a single instance is used from
+ * different threads.
+ *
+ * Caveat: the MemoryManager is created when the WriterOptions are created, so
+ * it has to be confined to a single thread as well.
+ *
+ */
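// As a usage-level orientation for the layout described above (a sketch only, assuming
// this moved package keeps the upstream ORC writer API: OrcFile.createWriter,
// TypeDescription.createRowBatch and Writer.addRowBatch), a caller typically does:
//
//   TypeDescription schema = TypeDescription.createStruct()
//       .addField("x", TypeDescription.createLong());
//   Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
//       OrcFile.writerOptions(new Configuration()).setSchema(schema));
//   VectorizedRowBatch batch = schema.createRowBatch();
//   LongColumnVector x = (LongColumnVector) batch.cols[0];
//   for (int row = 0; row < 10; ++row) {
//     x.vector[batch.size++] = row;        // fill the column vector for this batch
//   }
//   writer.addRowBatch(batch);             // buffered into the current stripe
//   writer.close();                        // flushes the last stripe and the footer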
+public class WriterImpl implements Writer, MemoryManager.Callback {
+
+ private static final Logger LOG = LoggerFactory.getLogger(WriterImpl.class);
+
+ private static final int MIN_ROW_INDEX_STRIDE = 1000;
+
+ private final Path path;
+ private final int rowIndexStride;
+ private final TypeDescription schema;
+
+ @VisibleForTesting
+ protected final PhysicalWriter physWriter;
+ private int columnCount;
+ private long rowCount = 0;
+ private long rowsInStripe = 0;
+ private long rawDataSize = 0;
+ private int rowsInIndex = 0;
+ private int stripesAtLastFlush = -1;
+ private final List<OrcProto.StripeInformation> stripes =
+ new ArrayList<OrcProto.StripeInformation>();
+ private final Map<String, ByteString> userMetadata =
+ new TreeMap<String, ByteString>();
+ private final StreamFactory streamFactory = new StreamFactory();
+ private final TreeWriter treeWriter;
+ private final boolean buildIndex;
+ private final MemoryManager memoryManager;
+ private final OrcFile.Version version;
+ private final Configuration conf;
+ private final OrcFile.WriterCallback callback;
+ private final OrcFile.WriterContext callbackContext;
+ private final OrcFile.EncodingStrategy encodingStrategy;
+ private final boolean[] bloomFilterColumns;
+ private final double bloomFilterFpp;
+ private boolean writeTimeZone;
+
+ public WriterImpl(FileSystem fs,
+ Path path,
+ OrcFile.WriterOptions opts) throws IOException {
+ this(new PhysicalFsWriter(fs, path, opts.getSchema().getMaximumId() + 1, opts), path, opts);
+ }
+
+ public WriterImpl(PhysicalWriter writer,
+ Path pathForMem,
+ OrcFile.WriterOptions opts) throws IOException {
+ this.physWriter = writer;
+ this.path = pathForMem;
+ this.conf = opts.getConfiguration();
+ this.schema = opts.getSchema();
+ this.callback = opts.getCallback();
+ if (callback != null) {
+ callbackContext = new OrcFile.WriterContext(){
+
+ @Override
+ public Writer getWriter() {
+ return WriterImpl.this;
+ }
+ };
+ } else {
+ callbackContext = null;
+ }
+ this.version = opts.getVersion();
+ this.encodingStrategy = opts.getEncodingStrategy();
+ this.rowIndexStride = opts.getRowIndexStride();
+ this.memoryManager = opts.getMemoryManager();
+ buildIndex = rowIndexStride > 0;
+ if (version == OrcFile.Version.V_0_11) {
+ /* do not write bloom filters for ORC v11 */
+ this.bloomFilterColumns = new boolean[schema.getMaximumId() + 1];
+ } else {
+ this.bloomFilterColumns =
+ OrcUtils.includeColumns(opts.getBloomFilterColumns(), schema);
+ }
+ this.bloomFilterFpp = opts.getBloomFilterFpp();
+ treeWriter = createTreeWriter(schema, streamFactory, false);
+ if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
+ throw new IllegalArgumentException("Row stride must be at least " +
+ MIN_ROW_INDEX_STRIDE);
+ }
+
+ // ensure that we are able to handle callbacks before we register ourselves
+ if (path != null) {
+ memoryManager.addWriter(path, opts.getStripeSize(), this);
+ }
+ }
+
+ @Override
+ public boolean checkMemory(double newScale) throws IOException {
+ long limit = (long) Math.round(physWriter.getPhysicalStripeSize() * newScale);
+ long size = estimateStripeSize();
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("ORC writer " + path + " size = " + size + " limit = " +
+ limit);
+ }
+ if (size > limit) {
+ flushStripe();
+ return true;
+ }
+ return false;
+ }
+
+ private static class RowIndexPositionRecorder implements PositionRecorder {
+ private final OrcProto.RowIndexEntry.Builder builder;
+
+ RowIndexPositionRecorder(OrcProto.RowIndexEntry.Builder builder) {
+ this.builder = builder;
+ }
+
+ @Override
+ public void addPosition(long position) {
+ builder.addPositions(position);
+ }
+ }
+
+ /**
+ * Interface from the Writer to the TreeWriters. This limits the visibility
+ * that the TreeWriters have into the Writer.
+ */
+ private class StreamFactory {
+ /**
+ * Create a stream to store part of a column.
+ * @param column the column id for the stream
+ * @param kind the kind of stream
+ * @return the output stream that the section needs to be written to.
+ * @throws IOException
+ */
+ public OutStream createStream(int column,
+ OrcProto.Stream.Kind kind
+ ) throws IOException {
+ final StreamName name = new StreamName(column, kind);
+ return physWriter.getOrCreatePhysicalStream(name);
+ }
+
+ public void writeIndex(int column, OrcProto.RowIndex.Builder rowIndex) throws IOException {
+ physWriter.writeIndexStream(new StreamName(column, OrcProto.Stream.Kind.ROW_INDEX), rowIndex);
+ }
+
+ public void writeBloomFilter(
+ int column, OrcProto.BloomFilterIndex.Builder bloomFilterIndex) throws IOException {
+ physWriter.writeBloomFilterStream(
+ new StreamName(column, OrcProto.Stream.Kind.BLOOM_FILTER), bloomFilterIndex);
+ }
+ /**
+ * Get the next column id.
+ * @return a number from 0 to the number of columns - 1
+ */
+ public int getNextColumnId() {
+ return columnCount++;
+ }
+
+ /**
+ * Get the stride rate of the row index.
+ */
+ public int getRowIndexStride() {
+ return rowIndexStride;
+ }
+
+ /**
+ * Should the writer be building the row index?
+ * @return true if we are building the index
+ */
+ public boolean buildIndex() {
+ return buildIndex;
+ }
+
+ /**
+ * Is the ORC file compressed?
+ * @return are the streams compressed
+ */
+ public boolean isCompressed() {
+ return physWriter.isCompressed();
+ }
+
+ /**
+ * Get the encoding strategy to use.
+ * @return encoding strategy
+ */
+ public OrcFile.EncodingStrategy getEncodingStrategy() {
+ return encodingStrategy;
+ }
+
+ /**
+ * Get the bloom filter columns
+ * @return bloom filter columns
+ */
+ public boolean[] getBloomFilterColumns() {
+ return bloomFilterColumns;
+ }
+
+ /**
+     * Get the bloom filter false positive probability.
+ * @return fpp
+ */
+ public double getBloomFilterFPP() {
+ return bloomFilterFpp;
+ }
+
+ /**
+ * Get the writer's configuration.
+ * @return configuration
+ */
+ public Configuration getConfiguration() {
+ return conf;
+ }
+
+ /**
+ * Get the version of the file to write.
+ */
+ public OrcFile.Version getVersion() {
+ return version;
+ }
+
+ public void useWriterTimeZone(boolean val) {
+ writeTimeZone = val;
+ }
+
+ public boolean hasWriterTimeZone() {
+ return writeTimeZone;
+ }
+ }
+
+ /**
+ * The parent class of all of the writers for each column. Each column
+ * is written by an instance of this class. The compound types (struct,
+ * list, map, and union) have children tree writers that write the children
+ * types.
+ */
+ private abstract static class TreeWriter {
+ protected final int id;
+ protected final BitFieldWriter isPresent;
+ private final boolean isCompressed;
+ protected final ColumnStatisticsImpl indexStatistics;
+ protected final ColumnStatisticsImpl stripeColStatistics;
+ private final ColumnStatisticsImpl fileStatistics;
+ protected TreeWriter[] childrenWriters;
+ protected final RowIndexPositionRecorder rowIndexPosition;
+ private final OrcProto.RowIndex.Builder rowIndex;
+ private final OrcProto.RowIndexEntry.Builder rowIndexEntry;
+ protected final BloomFilterIO bloomFilter;
+ protected final boolean createBloomFilter;
+ private final OrcProto.BloomFilterIndex.Builder bloomFilterIndex;
+ private final OrcProto.BloomFilter.Builder bloomFilterEntry;
+ private boolean foundNulls;
+ private OutStream isPresentOutStream;
+ private final List<OrcProto.StripeStatistics.Builder> stripeStatsBuilders;
+ private final StreamFactory streamFactory;
+
+ /**
+ * Create a tree writer.
+ * @param columnId the column id of the column to write
+ * @param schema the row schema
+ * @param streamFactory limited access to the Writer's data.
+ * @param nullable can the value be null?
+ * @throws IOException
+ */
+ TreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory streamFactory,
+ boolean nullable) throws IOException {
+ this.streamFactory = streamFactory;
+ this.isCompressed = streamFactory.isCompressed();
+ this.id = columnId;
+ if (nullable) {
+ isPresentOutStream = streamFactory.createStream(id,
+ OrcProto.Stream.Kind.PRESENT);
+ isPresent = new BitFieldWriter(isPresentOutStream, 1);
+ } else {
+ isPresent = null;
+ }
+ this.foundNulls = false;
+ createBloomFilter = streamFactory.getBloomFilterColumns()[columnId];
+ indexStatistics = ColumnStatisticsImpl.create(schema);
+ stripeColStatistics = ColumnStatisticsImpl.create(schema);
+ fileStatistics = ColumnStatisticsImpl.create(schema);
+ childrenWriters = new TreeWriter[0];
+ rowIndex = OrcProto.RowIndex.newBuilder();
+ rowIndexEntry = OrcProto.RowIndexEntry.newBuilder();
+ rowIndexPosition = new RowIndexPositionRecorder(rowIndexEntry);
+ stripeStatsBuilders = Lists.newArrayList();
+ if (createBloomFilter) {
+ bloomFilterEntry = OrcProto.BloomFilter.newBuilder();
+ bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder();
+ bloomFilter = new BloomFilterIO(streamFactory.getRowIndexStride(),
+ streamFactory.getBloomFilterFPP());
+ } else {
+ bloomFilterEntry = null;
+ bloomFilterIndex = null;
+ bloomFilter = null;
+ }
+ }
+
+ protected OrcProto.RowIndex.Builder getRowIndex() {
+ return rowIndex;
+ }
+
+ protected ColumnStatisticsImpl getStripeStatistics() {
+ return stripeColStatistics;
+ }
+
+ protected OrcProto.RowIndexEntry.Builder getRowIndexEntry() {
+ return rowIndexEntry;
+ }
+
+ IntegerWriter createIntegerWriter(PositionedOutputStream output,
+ boolean signed, boolean isDirectV2,
+ StreamFactory writer) {
+ if (isDirectV2) {
+ boolean alignedBitpacking = false;
+ if (writer.getEncodingStrategy().equals(OrcFile.EncodingStrategy.SPEED)) {
+ alignedBitpacking = true;
+ }
+ return new RunLengthIntegerWriterV2(output, signed, alignedBitpacking);
+ } else {
+ return new RunLengthIntegerWriter(output, signed);
+ }
+ }
+
+ boolean isNewWriteFormat(StreamFactory writer) {
+ return writer.getVersion() != OrcFile.Version.V_0_11;
+ }
+
+ /**
+ * Handle the top level object write.
+ *
+     * Structs, the typical top-level type, override this method; this
+     * default is used for all other types. VectorizedRowBatch assumes the
+     * top level object is a struct, so we use the first column for all
+     * other types.
+ * @param batch the batch to write from
+ * @param offset the row to start on
+ * @param length the number of rows to write
+ * @throws IOException
+ */
+ void writeRootBatch(VectorizedRowBatch batch, int offset,
+ int length) throws IOException {
+ writeBatch(batch.cols[0], offset, length);
+ }
+
+ /**
+ * Write the values from the given vector from offset for length elements.
+ * @param vector the vector to write from
+ * @param offset the first value from the vector to write
+ * @param length the number of values from the vector to write
+ * @throws IOException
+ */
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ if (vector.noNulls) {
+ indexStatistics.increment(length);
+ if (isPresent != null) {
+ for (int i = 0; i < length; ++i) {
+ isPresent.write(1);
+ }
+ }
+ } else {
+ if (vector.isRepeating) {
+ boolean isNull = vector.isNull[0];
+ if (isPresent != null) {
+ for (int i = 0; i < length; ++i) {
+ isPresent.write(isNull ? 0 : 1);
+ }
+ }
+ if (isNull) {
+ foundNulls = true;
+ indexStatistics.setNull();
+ } else {
+ indexStatistics.increment(length);
+ }
+ } else {
+ // count the number of non-null values
+ int nonNullCount = 0;
+ for(int i = 0; i < length; ++i) {
+ boolean isNull = vector.isNull[i + offset];
+ if (!isNull) {
+ nonNullCount += 1;
+ }
+ if (isPresent != null) {
+ isPresent.write(isNull ? 0 : 1);
+ }
+ }
+ indexStatistics.increment(nonNullCount);
+ if (nonNullCount != length) {
+ foundNulls = true;
+ indexStatistics.setNull();
+ }
+ }
+ }
+ }
+
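+    /**
+     * Remove the isPresent stream's positions from every row index entry.
+     * Used when the stream is suppressed because the column had no nulls;
+     * the positions occupy the first 3 slots (4 if compressed) of each entry.
+     */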
+ private void removeIsPresentPositions() {
+ for(int i=0; i < rowIndex.getEntryCount(); ++i) {
+ OrcProto.RowIndexEntry.Builder entry = rowIndex.getEntryBuilder(i);
+ List<Long> positions = entry.getPositionsList();
+ // bit streams use 3 positions if uncompressed, 4 if compressed
+ positions = positions.subList(isCompressed ? 4 : 3, positions.size());
+ entry.clearPositions();
+ entry.addAllPositions(positions);
+ }
+ }
+
+ /**
+ * Write the stripe out to the file.
+ * @param builder the stripe footer that contains the information about the
+ * layout of the stripe. The TreeWriter is required to update
+ * the footer with its information.
+     * @param requiredIndexEntries the number of index entries that are
+     *                             required. This is used to check that the
+     *                             row index is well formed.
+ * @throws IOException
+ */
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ if (isPresent != null) {
+ isPresent.flush();
+
+ // if no nulls are found in a stream, then suppress the stream
+ if (!foundNulls) {
+ isPresentOutStream.suppress();
+ // since isPresent bitstream is suppressed, update the index to
+ // remove the positions of the isPresent stream
+ if (streamFactory.buildIndex()) {
+ removeIsPresentPositions();
+ }
+ }
+ }
+
+      // merge the stripe-level column statistics into the file statistics
+      // and record them in the stripe statistics
+ OrcProto.StripeStatistics.Builder stripeStatsBuilder = OrcProto.StripeStatistics.newBuilder();
+ writeStripeStatistics(stripeStatsBuilder, this);
+ stripeStatsBuilders.add(stripeStatsBuilder);
+
+ // reset the flag for next stripe
+ foundNulls = false;
+
+ builder.addColumns(getEncoding());
+ if (streamFactory.hasWriterTimeZone()) {
+ builder.setWriterTimezone(TimeZone.getDefault().getID());
+ }
+ if (streamFactory.buildIndex()) {
+ if (rowIndex.getEntryCount() != requiredIndexEntries) {
+ throw new IllegalArgumentException("Column has wrong number of " +
+ "index entries found: " + rowIndex.getEntryCount() + " expected: " +
+ requiredIndexEntries);
+ }
+ streamFactory.writeIndex(id, rowIndex);
+ }
+
+ rowIndex.clear();
+ rowIndexEntry.clear();
+
+ // write the bloom filter to out stream
+ if (createBloomFilter) {
+ streamFactory.writeBloomFilter(id, bloomFilterIndex);
+ bloomFilterIndex.clear();
+ bloomFilterEntry.clear();
+ }
+ }
+
+ private void writeStripeStatistics(OrcProto.StripeStatistics.Builder builder,
+ TreeWriter treeWriter) {
+ treeWriter.fileStatistics.merge(treeWriter.stripeColStatistics);
+ builder.addColStats(treeWriter.stripeColStatistics.serialize().build());
+ treeWriter.stripeColStatistics.reset();
+ for (TreeWriter child : treeWriter.getChildrenWriters()) {
+ writeStripeStatistics(builder, child);
+ }
+ }
+
+ TreeWriter[] getChildrenWriters() {
+ return childrenWriters;
+ }
+
+ /**
+ * Get the encoding for this column.
+ * @return the information about the encoding of this column
+ */
+ OrcProto.ColumnEncoding getEncoding() {
+ return OrcProto.ColumnEncoding.newBuilder().setKind(
+ OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
+
+ /**
+ * Create a row index entry with the previous location and the current
+ * index statistics. Also merges the index statistics into the file
+ * statistics before they are cleared. Finally, it records the start of the
+ * next index and ensures all of the children columns also create an entry.
+ * @throws IOException
+ */
+ void createRowIndexEntry() throws IOException {
+ stripeColStatistics.merge(indexStatistics);
+ rowIndexEntry.setStatistics(indexStatistics.serialize());
+ indexStatistics.reset();
+ rowIndex.addEntry(rowIndexEntry);
+ rowIndexEntry.clear();
+ addBloomFilterEntry();
+ recordPosition(rowIndexPosition);
+ for(TreeWriter child: childrenWriters) {
+ child.createRowIndexEntry();
+ }
+ }
+
+ void addBloomFilterEntry() {
+ if (createBloomFilter) {
+ bloomFilterEntry.setNumHashFunctions(bloomFilter.getNumHashFunctions());
+ bloomFilterEntry.addAllBitset(Longs.asList(bloomFilter.getBitSet()));
+ bloomFilterIndex.addBloomFilter(bloomFilterEntry.build());
+ bloomFilter.reset();
+ bloomFilterEntry.clear();
+ }
+ }
+
+ /**
+ * Record the current position in each of this column's streams.
+ * @param recorder where should the locations be recorded
+ * @throws IOException
+ */
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ if (isPresent != null) {
+ isPresent.getPosition(recorder);
+ }
+ }
+
+ /**
+ * Estimate how much memory the writer is consuming excluding the streams.
+ * @return the number of bytes.
+ */
+ long estimateMemory() {
+ long result = 0;
+ for (TreeWriter child: childrenWriters) {
+ result += child.estimateMemory();
+ }
+ return result;
+ }
+ }
+
+ private static class BooleanTreeWriter extends TreeWriter {
+ private final BitFieldWriter writer;
+
+ BooleanTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ PositionedOutputStream out = writer.createStream(id,
+ OrcProto.Stream.Kind.DATA);
+ this.writer = new BitFieldWriter(out, 1);
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ LongColumnVector vec = (LongColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ int value = vec.vector[0] == 0 ? 0 : 1;
+ indexStatistics.updateBoolean(value != 0, length);
+ for(int i=0; i < length; ++i) {
+ writer.write(value);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ int value = vec.vector[i + offset] == 0 ? 0 : 1;
+ writer.write(value);
+ indexStatistics.updateBoolean(value != 0, 1);
+ }
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ writer.flush();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ writer.getPosition(recorder);
+ }
+ }
+
+ private static class ByteTreeWriter extends TreeWriter {
+ private final RunLengthByteWriter writer;
+
+ ByteTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ this.writer = new RunLengthByteWriter(writer.createStream(id,
+ OrcProto.Stream.Kind.DATA));
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ LongColumnVector vec = (LongColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ byte value = (byte) vec.vector[0];
+ indexStatistics.updateInteger(value, length);
+ if (createBloomFilter) {
+ bloomFilter.addLong(value);
+ }
+ for(int i=0; i < length; ++i) {
+ writer.write(value);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ byte value = (byte) vec.vector[i + offset];
+ writer.write(value);
+ indexStatistics.updateInteger(value, 1);
+ if (createBloomFilter) {
+ bloomFilter.addLong(value);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ writer.flush();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ writer.getPosition(recorder);
+ }
+ }
+
+ private static class IntegerTreeWriter extends TreeWriter {
+ private final IntegerWriter writer;
+ private boolean isDirectV2 = true;
+
+ IntegerTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ OutStream out = writer.createStream(id,
+ OrcProto.Stream.Kind.DATA);
+ this.isDirectV2 = isNewWriteFormat(writer);
+ this.writer = createIntegerWriter(out, true, isDirectV2, writer);
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ OrcProto.ColumnEncoding getEncoding() {
+ if (isDirectV2) {
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
+ }
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ LongColumnVector vec = (LongColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ long value = vec.vector[0];
+ indexStatistics.updateInteger(value, length);
+ if (createBloomFilter) {
+ bloomFilter.addLong(value);
+ }
+ for(int i=0; i < length; ++i) {
+ writer.write(value);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ long value = vec.vector[i + offset];
+ writer.write(value);
+ indexStatistics.updateInteger(value, 1);
+ if (createBloomFilter) {
+ bloomFilter.addLong(value);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ writer.flush();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ writer.getPosition(recorder);
+ }
+ }
+
+ private static class FloatTreeWriter extends TreeWriter {
+ private final PositionedOutputStream stream;
+ private final SerializationUtils utils;
+
+ FloatTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ this.stream = writer.createStream(id,
+ OrcProto.Stream.Kind.DATA);
+ this.utils = new SerializationUtils();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ DoubleColumnVector vec = (DoubleColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ float value = (float) vec.vector[0];
+ indexStatistics.updateDouble(value);
+ if (createBloomFilter) {
+ bloomFilter.addDouble(value);
+ }
+ for(int i=0; i < length; ++i) {
+ utils.writeFloat(stream, value);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ float value = (float) vec.vector[i + offset];
+ utils.writeFloat(stream, value);
+ indexStatistics.updateDouble(value);
+ if (createBloomFilter) {
+ bloomFilter.addDouble(value);
+ }
+ }
+ }
+ }
+ }
+
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ stream.flush();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ stream.getPosition(recorder);
+ }
+ }
+
+ private static class DoubleTreeWriter extends TreeWriter {
+ private final PositionedOutputStream stream;
+ private final SerializationUtils utils;
+
+ DoubleTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ this.stream = writer.createStream(id,
+ OrcProto.Stream.Kind.DATA);
+ this.utils = new SerializationUtils();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ DoubleColumnVector vec = (DoubleColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ double value = vec.vector[0];
+ indexStatistics.updateDouble(value);
+ if (createBloomFilter) {
+ bloomFilter.addDouble(value);
+ }
+ for(int i=0; i < length; ++i) {
+ utils.writeDouble(stream, value);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ double value = vec.vector[i + offset];
+ utils.writeDouble(stream, value);
+ indexStatistics.updateDouble(value);
+ if (createBloomFilter) {
+ bloomFilter.addDouble(value);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ stream.flush();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ stream.getPosition(recorder);
+ }
+ }
+
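+  /**
+   * The base writer for string-like columns (string, char, varchar). Values
+   * are buffered in a red-black tree dictionary plus an array of row ids;
+   * when the fraction of distinct keys over non-null rows exceeds the
+   * configured threshold, the writer switches to direct encoding.
+   */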
+ private static abstract class StringBaseTreeWriter extends TreeWriter {
+ private static final int INITIAL_DICTIONARY_SIZE = 4096;
+ private final OutStream stringOutput;
+ private final IntegerWriter lengthOutput;
+ private final IntegerWriter rowOutput;
+ protected final StringRedBlackTree dictionary =
+ new StringRedBlackTree(INITIAL_DICTIONARY_SIZE);
+ protected final DynamicIntArray rows = new DynamicIntArray();
+ protected final PositionedOutputStream directStreamOutput;
+ protected final IntegerWriter directLengthOutput;
+ private final List<OrcProto.RowIndexEntry> savedRowIndex =
+ new ArrayList<OrcProto.RowIndexEntry>();
+ private final boolean buildIndex;
+ private final List<Long> rowIndexValueCount = new ArrayList<Long>();
+    // If the number of keys in a dictionary is greater than this fraction of
+    // the total number of non-null rows, turn off dictionary encoding.
+ private final double dictionaryKeySizeThreshold;
+ protected boolean useDictionaryEncoding;
+ private boolean isDirectV2 = true;
+ private boolean doneDictionaryCheck;
+ private final boolean strideDictionaryCheck;
+
+ StringBaseTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ this.isDirectV2 = isNewWriteFormat(writer);
+ stringOutput = writer.createStream(id,
+ OrcProto.Stream.Kind.DICTIONARY_DATA);
+ lengthOutput = createIntegerWriter(writer.createStream(id,
+ OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
+ rowOutput = createIntegerWriter(writer.createStream(id,
+ OrcProto.Stream.Kind.DATA), false, isDirectV2, writer);
+ recordPosition(rowIndexPosition);
+ rowIndexValueCount.add(0L);
+ buildIndex = writer.buildIndex();
+ directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA);
+ directLengthOutput = createIntegerWriter(writer.createStream(id,
+ OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
+ Configuration conf = writer.getConfiguration();
+ dictionaryKeySizeThreshold =
+ OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf);
+ strideDictionaryCheck =
+ OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getBoolean(conf);
+ useDictionaryEncoding = dictionaryKeySizeThreshold >= 0.000001; // Epsilon.
+ doneDictionaryCheck = !useDictionaryEncoding;
+ }
+
+ private boolean checkDictionaryEncoding() {
+ if (!doneDictionaryCheck) {
+ // Set the flag indicating whether or not to use dictionary encoding
+ // based on whether or not the fraction of distinct keys over number of
+ // non-null rows is less than the configured threshold
+ float ratio = rows.size() > 0 ? (float) (dictionary.size()) / rows.size() : 0.0f;
+ useDictionaryEncoding = !isDirectV2 || ratio <= dictionaryKeySizeThreshold;
+ doneDictionaryCheck = true;
+ }
+ return useDictionaryEncoding;
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+      // The dictionary check may not have happened yet for a small stripe,
+      // so do it again here.
+ checkDictionaryEncoding();
+
+ if (useDictionaryEncoding) {
+ flushDictionary();
+ } else {
+        // flush out any leftover entries from the dictionary
+ if (rows.size() > 0) {
+ flushDictionary();
+ }
+
+ // suppress the stream for every stripe if dictionary is disabled
+ stringOutput.suppress();
+ }
+
+      // we need to build the row index before calling super, since it
+      // writes it out.
+ super.writeStripe(builder, requiredIndexEntries);
+ stringOutput.flush();
+ lengthOutput.flush();
+ rowOutput.flush();
+ directStreamOutput.flush();
+ directLengthOutput.flush();
+ // reset all of the fields to be ready for the next stripe.
+ dictionary.clear();
+ savedRowIndex.clear();
+ rowIndexValueCount.clear();
+ recordPosition(rowIndexPosition);
+ rowIndexValueCount.add(0L);
+
+ if (!useDictionaryEncoding) {
+        // record the start positions of the first index stride of the next
+        // stripe, i.e. the beginning of the direct streams, since dictionary
+        // encoding is disabled
+ recordDirectStreamPosition();
+ }
+ }
+
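+    /**
+     * Write out the buffered rows. In dictionary mode the sorted dictionary
+     * is written first and each row is remapped to its sorted id; in direct
+     * mode the raw bytes and lengths are written and the dictionary stream
+     * is suppressed. Saved row index entries are finalized as the rows go out.
+     */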
+ private void flushDictionary() throws IOException {
+ final int[] dumpOrder = new int[dictionary.size()];
+
+ if (useDictionaryEncoding) {
+ // Write the dictionary by traversing the red-black tree writing out
+ // the bytes and lengths; and creating the map from the original order
+ // to the final sorted order.
+
+ dictionary.visit(new StringRedBlackTree.Visitor() {
+ private int currentId = 0;
+ @Override
+ public void visit(StringRedBlackTree.VisitorContext context
+ ) throws IOException {
+ context.writeBytes(stringOutput);
+ lengthOutput.write(context.getLength());
+ dumpOrder[context.getOriginalPosition()] = currentId++;
+ }
+ });
+ } else {
+ // for direct encoding, we don't want the dictionary data stream
+ stringOutput.suppress();
+ }
+ int length = rows.size();
+ int rowIndexEntry = 0;
+ OrcProto.RowIndex.Builder rowIndex = getRowIndex();
+ Text text = new Text();
+ // write the values translated into the dump order.
+ for(int i = 0; i <= length; ++i) {
+ // now that we are writing out the row values, we can finalize the
+ // row index
+ if (buildIndex) {
+ while (i == rowIndexValueCount.get(rowIndexEntry) &&
+ rowIndexEntry < savedRowIndex.size()) {
+ OrcProto.RowIndexEntry.Builder base =
+ savedRowIndex.get(rowIndexEntry++).toBuilder();
+ if (useDictionaryEncoding) {
+ rowOutput.getPosition(new RowIndexPositionRecorder(base));
+ } else {
+ PositionRecorder posn = new RowIndexPositionRecorder(base);
+ directStreamOutput.getPosition(posn);
+ directLengthOutput.getPosition(posn);
+ }
+ rowIndex.addEntry(base.build());
+ }
+ }
+ if (i != length) {
+ if (useDictionaryEncoding) {
+ rowOutput.write(dumpOrder[rows.get(i)]);
+ } else {
+ dictionary.getText(text, rows.get(i));
+ directStreamOutput.write(text.getBytes(), 0, text.getLength());
+ directLengthOutput.write(text.getLength());
+ }
+ }
+ }
+ rows.clear();
+ }
+
+ @Override
+ OrcProto.ColumnEncoding getEncoding() {
+ // Returns the encoding used for the last call to writeStripe
+ if (useDictionaryEncoding) {
+ if(isDirectV2) {
+ return OrcProto.ColumnEncoding.newBuilder().setKind(
+ OrcProto.ColumnEncoding.Kind.DICTIONARY_V2).
+ setDictionarySize(dictionary.size()).build();
+ }
+ return OrcProto.ColumnEncoding.newBuilder().setKind(
+ OrcProto.ColumnEncoding.Kind.DICTIONARY).
+ setDictionarySize(dictionary.size()).build();
+ } else {
+ if(isDirectV2) {
+ return OrcProto.ColumnEncoding.newBuilder().setKind(
+ OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
+ }
+ return OrcProto.ColumnEncoding.newBuilder().setKind(
+ OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
+ }
+
+ /**
+ * This method doesn't call the super method, because unlike most of the
+ * other TreeWriters, this one can't record the position in the streams
+ * until the stripe is being flushed. Therefore it saves all of the entries
+ * and augments them with the final information as the stripe is written.
+ * @throws IOException
+ */
+ @Override
+ void createRowIndexEntry() throws IOException {
+ getStripeStatistics().merge(indexStatistics);
+ OrcProto.RowIndexEntry.Builder rowIndexEntry = getRowIndexEntry();
+ rowIndexEntry.setStatistics(indexStatistics.serialize());
+ indexStatistics.reset();
+ OrcProto.RowIndexEntry base = rowIndexEntry.build();
+ savedRowIndex.add(base);
+ rowIndexEntry.clear();
+ addBloomFilterEntry();
+ recordPosition(rowIndexPosition);
+ rowIndexValueCount.add(Long.valueOf(rows.size()));
+ if (strideDictionaryCheck) {
+ checkDictionaryEncoding();
+ }
+ if (!useDictionaryEncoding) {
+ if (rows.size() > 0) {
+ flushDictionary();
+ // just record the start positions of next index stride
+ recordDirectStreamPosition();
+ } else {
+ // record the start positions of next index stride
+ recordDirectStreamPosition();
+ getRowIndex().addEntry(base);
+ }
+ }
+ }
+
+ private void recordDirectStreamPosition() throws IOException {
+ directStreamOutput.getPosition(rowIndexPosition);
+ directLengthOutput.getPosition(rowIndexPosition);
+ }
+
+ @Override
+ long estimateMemory() {
+ return rows.getSizeInBytes() + dictionary.getSizeInBytes();
+ }
+ }
+
+ private static class StringTreeWriter extends StringBaseTreeWriter {
+ StringTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ BytesColumnVector vec = (BytesColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ if (useDictionaryEncoding) {
+ int id = dictionary.add(vec.vector[0], vec.start[0], vec.length[0]);
+ for(int i=0; i < length; ++i) {
+ rows.add(id);
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ directStreamOutput.write(vec.vector[0], vec.start[0],
+ vec.length[0]);
+ directLengthOutput.write(vec.length[0]);
+ }
+ }
+ indexStatistics.updateString(vec.vector[0], vec.start[0],
+ vec.length[0], length);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ if (useDictionaryEncoding) {
+ rows.add(dictionary.add(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]));
+ } else {
+ directStreamOutput.write(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
+ directLengthOutput.write(vec.length[offset + i]);
+ }
+ indexStatistics.updateString(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i], 1);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Under the covers, char is written to ORC the same way as string.
+ */
+ private static class CharTreeWriter extends StringBaseTreeWriter {
+ private final int itemLength;
+ private final byte[] padding;
+
+ CharTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ itemLength = schema.getMaxLength();
+ padding = new byte[itemLength];
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ BytesColumnVector vec = (BytesColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ byte[] ptr;
+ int ptrOffset;
+ if (vec.length[0] >= itemLength) {
+ ptr = vec.vector[0];
+ ptrOffset = vec.start[0];
+ } else {
+ ptr = padding;
+ ptrOffset = 0;
+ System.arraycopy(vec.vector[0], vec.start[0], ptr, 0,
+ vec.length[0]);
+ Arrays.fill(ptr, vec.length[0], itemLength, (byte) ' ');
+ }
+ if (useDictionaryEncoding) {
+ int id = dictionary.add(ptr, ptrOffset, itemLength);
+ for(int i=0; i < length; ++i) {
+ rows.add(id);
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ directStreamOutput.write(ptr, ptrOffset, itemLength);
+ directLengthOutput.write(itemLength);
+ }
+ }
+ indexStatistics.updateString(ptr, ptrOffset, itemLength, length);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(ptr, ptrOffset, itemLength);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ byte[] ptr;
+ int ptrOffset;
+ if (vec.length[offset + i] >= itemLength) {
+ ptr = vec.vector[offset + i];
+ ptrOffset = vec.start[offset + i];
+ } else {
+            // the value is shorter than the declared length, so pad it
+ ptr = padding;
+ ptrOffset = 0;
+ System.arraycopy(vec.vector[offset + i], vec.start[offset + i],
+ ptr, 0, vec.length[offset + i]);
+ Arrays.fill(ptr, vec.length[offset + i], itemLength, (byte) ' ');
+ }
+ if (useDictionaryEncoding) {
+ rows.add(dictionary.add(ptr, ptrOffset, itemLength));
+ } else {
+ directStreamOutput.write(ptr, ptrOffset, itemLength);
+ directLengthOutput.write(itemLength);
+ }
+ indexStatistics.updateString(ptr, ptrOffset, itemLength, 1);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(ptr, ptrOffset, itemLength);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Under the covers, varchar is written to ORC the same way as string.
+ */
+ private static class VarcharTreeWriter extends StringBaseTreeWriter {
+ private final int maxLength;
+
+ VarcharTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ maxLength = schema.getMaxLength();
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ BytesColumnVector vec = (BytesColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ int itemLength = Math.min(vec.length[0], maxLength);
+ if (useDictionaryEncoding) {
+ int id = dictionary.add(vec.vector[0], vec.start[0], itemLength);
+ for(int i=0; i < length; ++i) {
+ rows.add(id);
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ directStreamOutput.write(vec.vector[0], vec.start[0],
+ itemLength);
+ directLengthOutput.write(itemLength);
+ }
+ }
+ indexStatistics.updateString(vec.vector[0], vec.start[0],
+ itemLength, length);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(vec.vector[0], vec.start[0], itemLength);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ int itemLength = Math.min(vec.length[offset + i], maxLength);
+ if (useDictionaryEncoding) {
+ rows.add(dictionary.add(vec.vector[offset + i],
+ vec.start[offset + i], itemLength));
+ } else {
+ directStreamOutput.write(vec.vector[offset + i],
+ vec.start[offset + i], itemLength);
+ directLengthOutput.write(itemLength);
+ }
+ indexStatistics.updateString(vec.vector[offset + i],
+ vec.start[offset + i], itemLength, 1);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(vec.vector[offset + i],
+ vec.start[offset + i], itemLength);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private static class BinaryTreeWriter extends TreeWriter {
+ private final PositionedOutputStream stream;
+ private final IntegerWriter length;
+ private boolean isDirectV2 = true;
+
+ BinaryTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ this.stream = writer.createStream(id,
+ OrcProto.Stream.Kind.DATA);
+ this.isDirectV2 = isNewWriteFormat(writer);
+ this.length = createIntegerWriter(writer.createStream(id,
+ OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ OrcProto.ColumnEncoding getEncoding() {
+ if (isDirectV2) {
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
+ }
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ BytesColumnVector vec = (BytesColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ for(int i=0; i < length; ++i) {
+ stream.write(vec.vector[0], vec.start[0],
+ vec.length[0]);
+ this.length.write(vec.length[0]);
+ }
+ indexStatistics.updateBinary(vec.vector[0], vec.start[0],
+ vec.length[0], length);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ stream.write(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
+ this.length.write(vec.length[offset + i]);
+ indexStatistics.updateBinary(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i], 1);
+ if (createBloomFilter) {
+ bloomFilter.addBytes(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
+ }
+ }
+ }
+ }
+ }
+
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ stream.flush();
+ length.flush();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ stream.getPosition(recorder);
+ length.getPosition(recorder);
+ }
+ }
+
+ public static long MILLIS_PER_DAY = 24 * 60 * 60 * 1000;
+ public static long NANOS_PER_MILLI = 1000000;
+ public static final int MILLIS_PER_SECOND = 1000;
+ static final int NANOS_PER_SECOND = 1000000000;
+ public static final String BASE_TIMESTAMP_STRING = "2015-01-01 00:00:00";
+
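+  /**
+   * Timestamps are written as two streams: DATA holds seconds relative to
+   * the 2015-01-01 00:00:00 base and SECONDARY holds the nanosecond part
+   * encoded by formatNanos.
+   */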
+ private static class TimestampTreeWriter extends TreeWriter {
+ private final IntegerWriter seconds;
+ private final IntegerWriter nanos;
+ private final boolean isDirectV2;
+ private final long base_timestamp;
+
+ TimestampTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ this.isDirectV2 = isNewWriteFormat(writer);
+ this.seconds = createIntegerWriter(writer.createStream(id,
+ OrcProto.Stream.Kind.DATA), true, isDirectV2, writer);
+ this.nanos = createIntegerWriter(writer.createStream(id,
+ OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer);
+ recordPosition(rowIndexPosition);
+ // for unit tests to set different time zones
+ this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND;
+ writer.useWriterTimeZone(true);
+ }
+
+ @Override
+ OrcProto.ColumnEncoding getEncoding() {
+ if (isDirectV2) {
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
+ }
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ TimestampColumnVector vec = (TimestampColumnVector) vector;
+ Timestamp val;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ val = vec.asScratchTimestamp(0);
+ long millis = val.getTime();
+ indexStatistics.updateTimestamp(millis);
+ if (createBloomFilter) {
+ bloomFilter.addLong(millis);
+ }
+ final long secs = millis / MILLIS_PER_SECOND - base_timestamp;
+ final long nano = formatNanos(val.getNanos());
+ for(int i=0; i < length; ++i) {
+ seconds.write(secs);
+ nanos.write(nano);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ val = vec.asScratchTimestamp(i + offset);
+ long millis = val.getTime();
+ long secs = millis / MILLIS_PER_SECOND - base_timestamp;
+ seconds.write(secs);
+ nanos.write(formatNanos(val.getNanos()));
+ indexStatistics.updateTimestamp(millis);
+ if (createBloomFilter) {
+ bloomFilter.addLong(millis);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ seconds.flush();
+ nanos.flush();
+ recordPosition(rowIndexPosition);
+ }
+
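+    /**
+     * Encode the nanoseconds for the SECONDARY stream: values with at least
+     * two trailing decimal zeros are divided down (first by 100, then by 10
+     * up to six more times) and the number of divisions (1-7) is stored in
+     * the low three bits so the reader can restore the removed zeros.
+     */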
+ private static long formatNanos(int nanos) {
+ if (nanos == 0) {
+ return 0;
+ } else if (nanos % 100 != 0) {
+ return ((long) nanos) << 3;
+ } else {
+ nanos /= 100;
+ int trailingZeros = 1;
+ while (nanos % 10 == 0 && trailingZeros < 7) {
+ nanos /= 10;
+ trailingZeros += 1;
+ }
+ return ((long) nanos) << 3 | trailingZeros;
+ }
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ seconds.getPosition(recorder);
+ nanos.getPosition(recorder);
+ }
+ }
+
+ private static class DateTreeWriter extends TreeWriter {
+ private final IntegerWriter writer;
+ private final boolean isDirectV2;
+
+ DateTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ OutStream out = writer.createStream(id,
+ OrcProto.Stream.Kind.DATA);
+ this.isDirectV2 = isNewWriteFormat(writer);
+ this.writer = createIntegerWriter(out, true, isDirectV2, writer);
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ LongColumnVector vec = (LongColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ int value = (int) vec.vector[0];
+ indexStatistics.updateDate(value);
+ if (createBloomFilter) {
+ bloomFilter.addLong(value);
+ }
+ for(int i=0; i < length; ++i) {
+ writer.write(value);
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ int value = (int) vec.vector[i + offset];
+ writer.write(value);
+ indexStatistics.updateDate(value);
+ if (createBloomFilter) {
+ bloomFilter.addLong(value);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ writer.flush();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ writer.getPosition(recorder);
+ }
+
+ @Override
+ OrcProto.ColumnEncoding getEncoding() {
+ if (isDirectV2) {
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
+ }
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
+ }
+
+ private static class DecimalTreeWriter extends TreeWriter {
+ private final PositionedOutputStream valueStream;
+
+ // These scratch buffers allow us to serialize decimals much faster.
+ private final long[] scratchLongs;
+ private final byte[] scratchBuffer;
+
+ private final IntegerWriter scaleStream;
+ private final boolean isDirectV2;
+
+ DecimalTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ this.isDirectV2 = isNewWriteFormat(writer);
+ valueStream = writer.createStream(id, OrcProto.Stream.Kind.DATA);
+ scratchLongs = new long[HiveDecimal.SCRATCH_LONGS_LEN];
+ scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES];
+ this.scaleStream = createIntegerWriter(writer.createStream(id,
+ OrcProto.Stream.Kind.SECONDARY), true, isDirectV2, writer);
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ OrcProto.ColumnEncoding getEncoding() {
+ if (isDirectV2) {
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
+ }
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ DecimalColumnVector vec = (DecimalColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ HiveDecimalWritable value = vec.vector[0];
+ indexStatistics.updateDecimal(value);
+ if (createBloomFilter) {
+
+            // Use the HiveDecimalWritable toString() method with a scratch buffer for good
+            // performance when creating the String. We need a String hash code, not a UTF-8
+            // byte[] hash code, in order to get the right hash.
+ bloomFilter.addString(value.toString(scratchBuffer));
+ }
+ for(int i=0; i < length; ++i) {
+
+            // Use HiveDecimalWritable's fast serialization method, which
+            // emulates SerializationUtils.writeBigInteger.
+ value.serializationUtilsWrite(
+ valueStream,
+ scratchLongs);
+ scaleStream.write(value.scale());
+ }
+ }
+ } else {
+ for(int i=0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ HiveDecimalWritable value = vec.vector[i + offset];
+ value.serializationUtilsWrite(
+ valueStream,
+ scratchLongs);
+ scaleStream.write(value.scale());
+ indexStatistics.updateDecimal(value);
+ if (createBloomFilter) {
+ bloomFilter.addString(value.toString(scratchBuffer));
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ valueStream.flush();
+ scaleStream.flush();
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ valueStream.getPosition(recorder);
+ scaleStream.getPosition(recorder);
+ }
+ }
+
+ private static class StructTreeWriter extends TreeWriter {
+ StructTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ List<TypeDescription> children = schema.getChildren();
+ childrenWriters = new TreeWriter[children.size()];
+ for(int i=0; i < childrenWriters.length; ++i) {
+ childrenWriters[i] = createTreeWriter(
+ children.get(i), writer,
+ true);
+ }
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void writeRootBatch(VectorizedRowBatch batch, int offset,
+ int length) throws IOException {
+ // update the statistics for the root column
+ indexStatistics.increment(length);
+ // I'm assuming that the root column isn't nullable so that I don't need
+ // to update isPresent.
+ for(int i=0; i < childrenWriters.length; ++i) {
+ childrenWriters[i].writeBatch(batch.cols[i], offset, length);
+ }
+ }
+
+ private static void writeFields(StructColumnVector vector,
+ TreeWriter[] childrenWriters,
+ int offset, int length) throws IOException {
+ for(int field=0; field < childrenWriters.length; ++field) {
+ childrenWriters[field].writeBatch(vector.fields[field], offset, length);
+ }
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ StructColumnVector vec = (StructColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ writeFields(vec, childrenWriters, offset, length);
+ }
+ } else if (vector.noNulls) {
+ writeFields(vec, childrenWriters, offset, length);
+ } else {
+ // write the records in runs
+ int currentRun = 0;
+ boolean started = false;
+ for(int i=0; i < length; ++i) {
+ if (!vec.isNull[i + offset]) {
+ if (!started) {
+ started = true;
+ currentRun = i;
+ }
+ } else if (started) {
+ started = false;
+ writeFields(vec, childrenWriters, offset + currentRun,
+ i - currentRun);
+ }
+ }
+ if (started) {
+ writeFields(vec, childrenWriters, offset + currentRun,
+ length - currentRun);
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ for(TreeWriter child: childrenWriters) {
+ child.writeStripe(builder, requiredIndexEntries);
+ }
+ recordPosition(rowIndexPosition);
+ }
+ }
+
+ private static class ListTreeWriter extends TreeWriter {
+ private final IntegerWriter lengths;
+ private final boolean isDirectV2;
+
+ ListTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ this.isDirectV2 = isNewWriteFormat(writer);
+ childrenWriters = new TreeWriter[1];
+ childrenWriters[0] =
+ createTreeWriter(schema.getChildren().get(0), writer, true);
+ lengths = createIntegerWriter(writer.createStream(columnId,
+ OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ OrcProto.ColumnEncoding getEncoding() {
+ if (isDirectV2) {
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
+ }
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ ListColumnVector vec = (ListColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ int childOffset = (int) vec.offsets[0];
+ int childLength = (int) vec.lengths[0];
+ for(int i=0; i < length; ++i) {
+ lengths.write(childLength);
+ childrenWriters[0].writeBatch(vec.child, childOffset, childLength);
+ }
+ if (createBloomFilter) {
+ bloomFilter.addLong(childLength);
+ }
+ }
+ } else {
+ // write the elements in runs
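+        // contiguous child ranges are coalesced and written to the child
+        // writer with a single call; a gap in offsets forces a write.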
+ int currentOffset = 0;
+ int currentLength = 0;
+ for(int i=0; i < length; ++i) {
+ if (!vec.isNull[i + offset]) {
+ int nextLength = (int) vec.lengths[offset + i];
+ int nextOffset = (int) vec.offsets[offset + i];
+ lengths.write(nextLength);
+ if (currentLength == 0) {
+ currentOffset = nextOffset;
+ currentLength = nextLength;
+ } else if (currentOffset + currentLength != nextOffset) {
+ childrenWriters[0].writeBatch(vec.child, currentOffset,
+ currentLength);
+ currentOffset = nextOffset;
+ currentLength = nextLength;
+ } else {
+ currentLength += nextLength;
+ }
+ }
+ }
+ if (currentLength != 0) {
+ childrenWriters[0].writeBatch(vec.child, currentOffset,
+ currentLength);
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ lengths.flush();
+ for(TreeWriter child: childrenWriters) {
+ child.writeStripe(builder, requiredIndexEntries);
+ }
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ lengths.getPosition(recorder);
+ }
+ }
+
+ private static class MapTreeWriter extends TreeWriter {
+ private final IntegerWriter lengths;
+ private final boolean isDirectV2;
+
+ MapTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ this.isDirectV2 = isNewWriteFormat(writer);
+ childrenWriters = new TreeWriter[2];
+ List<TypeDescription> children = schema.getChildren();
+ childrenWriters[0] =
+ createTreeWriter(children.get(0), writer, true);
+ childrenWriters[1] =
+ createTreeWriter(children.get(1), writer, true);
+ lengths = createIntegerWriter(writer.createStream(columnId,
+ OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ OrcProto.ColumnEncoding getEncoding() {
+ if (isDirectV2) {
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
+ }
+ return OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ MapColumnVector vec = (MapColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ int childOffset = (int) vec.offsets[0];
+ int childLength = (int) vec.lengths[0];
+ for(int i=0; i < length; ++i) {
+ lengths.write(childLength);
+ childrenWriters[0].writeBatch(vec.keys, childOffset, childLength);
+ childrenWriters[1].writeBatch(vec.values, childOffset, childLength);
+ }
+ if (createBloomFilter) {
+ bloomFilter.addLong(childLength);
+ }
+ }
+ } else {
+ // write the elements in runs
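+        // contiguous child ranges are coalesced and the keys and values are
+        // written with a single call per run; a gap in offsets forces a write.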
+ int currentOffset = 0;
+ int currentLength = 0;
+ for(int i=0; i < length; ++i) {
+ if (!vec.isNull[i + offset]) {
+ int nextLength = (int) vec.lengths[offset + i];
+ int nextOffset = (int) vec.offsets[offset + i];
+ lengths.write(nextLength);
+ if (currentLength == 0) {
+ currentOffset = nextOffset;
+ currentLength = nextLength;
+ } else if (currentOffset + currentLength != nextOffset) {
+ childrenWriters[0].writeBatch(vec.keys, currentOffset,
+ currentLength);
+ childrenWriters[1].writeBatch(vec.values, currentOffset,
+ currentLength);
+ currentOffset = nextOffset;
+ currentLength = nextLength;
+ } else {
+ currentLength += nextLength;
+ }
+ }
+ }
+ if (currentLength != 0) {
+ childrenWriters[0].writeBatch(vec.keys, currentOffset,
+ currentLength);
+ childrenWriters[1].writeBatch(vec.values, currentOffset,
+ currentLength);
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ lengths.flush();
+ for(TreeWriter child: childrenWriters) {
+ child.writeStripe(builder, requiredIndexEntries);
+ }
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ lengths.getPosition(recorder);
+ }
+ }
+
+ private static class UnionTreeWriter extends TreeWriter {
+ private final RunLengthByteWriter tags;
+
+ UnionTreeWriter(int columnId,
+ TypeDescription schema,
+ StreamFactory writer,
+ boolean nullable) throws IOException {
+ super(columnId, schema, writer, nullable);
+ List<TypeDescription> children = schema.getChildren();
+ childrenWriters = new TreeWriter[children.size()];
+ for(int i=0; i < childrenWriters.length; ++i) {
+ childrenWriters[i] =
+ createTreeWriter(children.get(i), writer, true);
+ }
+ tags =
+ new RunLengthByteWriter(writer.createStream(columnId,
+ OrcProto.Stream.Kind.DATA));
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ UnionColumnVector vec = (UnionColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ byte tag = (byte) vec.tags[0];
+ for(int i=0; i < length; ++i) {
+ tags.write(tag);
+ }
+ if (createBloomFilter) {
+ bloomFilter.addLong(tag);
+ }
+ childrenWriters[tag].writeBatch(vec.fields[tag], offset, length);
+ }
+ } else {
+ // write the records in runs of the same tag
+ int[] currentStart = new int[vec.fields.length];
+ int[] currentLength = new int[vec.fields.length];
+ for(int i=0; i < length; ++i) {
+ // only need to deal with the non-nulls, since the nulls were dealt
+ // with in the super method.
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ byte tag = (byte) vec.tags[offset + i];
+ tags.write(tag);
+ if (currentLength[tag] == 0) {
+ // start a new sequence
+ currentStart[tag] = i + offset;
+ currentLength[tag] = 1;
+ } else if (currentStart[tag] + currentLength[tag] == i + offset) {
+ // ok, we are extending the current run for that tag.
+ currentLength[tag] += 1;
+ } else {
+ // otherwise, we need to close off the old run and start a new one
+ childrenWriters[tag].writeBatch(vec.fields[tag],
+ currentStart[tag], currentLength[tag]);
+ currentStart[tag] = i + offset;
+ currentLength[tag] = 1;
+ }
+ }
+ }
+ // write out any left over sequences
+ for(int tag=0; tag < currentStart.length; ++tag) {
+ if (currentLength[tag] != 0) {
+ childrenWriters[tag].writeBatch(vec.fields[tag], currentStart[tag],
+ currentLength[tag]);
+ }
+ }
+ }
+ }
+
+ @Override
+ void writeStripe(OrcProto.StripeFooter.Builder builder,
+ int requiredIndexEntries) throws IOException {
+ super.writeStripe(builder, requiredIndexEntries);
+ tags.flush();
+ for(TreeWriter child: childrenWriters) {
+ child.writeStripe(builder, requiredIndexEntries);
+ }
+ recordPosition(rowIndexPosition);
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ tags.getPosition(recorder);
+ }
+ }
+
+ private static TreeWriter createTreeWriter(TypeDescription schema,
+ StreamFactory streamFactory,
+ boolean nullable) throws IOException {
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ return new BooleanTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case BYTE:
+ return new ByteTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case SHORT:
+ case INT:
+ case LONG:
+ return new IntegerTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case FLOAT:
+ return new FloatTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case DOUBLE:
+ return new DoubleTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case STRING:
+ return new StringTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case CHAR:
+ return new CharTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case VARCHAR:
+ return new VarcharTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case BINARY:
+ return new BinaryTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case TIMESTAMP:
+ return new TimestampTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case DATE:
+ return new DateTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case DECIMAL:
+ return new DecimalTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case STRUCT:
+ return new StructTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case MAP:
+ return new MapTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case LIST:
+ return new ListTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ case UNION:
+ return new UnionTreeWriter(streamFactory.getNextColumnId(),
+ schema, streamFactory, nullable);
+ default:
+ throw new IllegalArgumentException("Bad category: " +
+ schema.getCategory());
+ }
+ }
+
+ private static void writeTypes(OrcProto.Footer.Builder builder,
+ TypeDescription schema) {
+ OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
+ List<TypeDescription> children = OrcUtils.setTypeBuilderFromSchema(type, schema);
+ builder.addTypes(type);
+ if (children != null) {
+ for(TypeDescription child: children) {
+ writeTypes(builder, child);
+ }
+ }
+ }
+
+ @VisibleForTesting
+ public void ensureStream() throws IOException {
+ physWriter.initialize();
+ }
+
+ private void createRowIndexEntry() throws IOException {
+ treeWriter.createRowIndexEntry();
+ rowsInIndex = 0;
+ }
+
+ private void flushStripe() throws IOException {
+ ensureStream();
+ if (buildIndex && rowsInIndex != 0) {
+ createRowIndexEntry();
+ }
+ if (rowsInStripe != 0) {
+ if (callback != null) {
+ callback.preStripeWrite(callbackContext);
+ }
+ // finalize the data for the stripe
+ int requiredIndexEntries = rowIndexStride == 0 ? 0 :
+ (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
+ OrcProto.StripeFooter.Builder builder = OrcProto.StripeFooter.newBuilder();
+ OrcProto.StripeInformation.Builder dirEntry = OrcProto.StripeInformation
+ .newBuilder().setNumberOfRows(rowsInStripe);
+ treeWriter.writeStripe(builder, requiredIndexEntries);
+ physWriter.finalizeStripe(builder, dirEntry);
+ stripes.add(dirEntry.build());
+ rowCount += rowsInStripe;
+ rowsInStripe = 0;
+ }
+ }
+
+ private long computeRawDataSize() {
+ return getRawDataSize(treeWriter, schema);
+ }
+
+ private long getRawDataSize(TreeWriter child,
+ TypeDescription schema) {
+ long total = 0;
+ long numVals = child.fileStatistics.getNumberOfValues();
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case FLOAT:
+ return numVals * JavaDataModel.get().primitive1();
+ case LONG:
+ case DOUBLE:
+ return numVals * JavaDataModel.get().primitive2();
+ case STRING:
+ case VARCHAR:
+ case CHAR:
+ // ORC strings are converted to Java Strings, so use JavaDataModel to
+ // compute the overall size of the strings.
+ StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics;
+ numVals = numVals == 0 ? 1 : numVals;
+ int avgStringLen = (int) (scs.getSum() / numVals);
+ return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen);
+ case DECIMAL:
+ return numVals * JavaDataModel.get().lengthOfDecimal();
+ case DATE:
+ return numVals * JavaDataModel.get().lengthOfDate();
+ case BINARY:
+ // get total length of binary blob
+ BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics;
+ return bcs.getSum();
+ case TIMESTAMP:
+ return numVals * JavaDataModel.get().lengthOfTimestamp();
+ case LIST:
+ case MAP:
+ case UNION:
+ case STRUCT: {
+ TreeWriter[] childWriters = child.getChildrenWriters();
+ List<TypeDescription> childTypes = schema.getChildren();
+ for (int i=0; i < childWriters.length; ++i) {
+ total += getRawDataSize(childWriters[i], childTypes.get(i));
+ }
+ break;
+ }
+ default:
+ LOG.debug("Unknown object inspector category.");
+ break;
+ }
+ return total;
+ }
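As a worked example of the string estimate above (numbers illustrative only): if a string column's file statistics report 10,000 values with a total length of 1,000,000 characters, the average string length is 100, so the raw data size is estimated as 10,000 * JavaDataModel.lengthForStringOfLength(100), i.e. the in-memory java.lang.String footprint rather than the encoded on-disk size.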
+
+ private void writeFileStatistics(OrcProto.Footer.Builder builder,
+ TreeWriter writer) throws IOException {
+ builder.addStatistics(writer.fileStatistics.serialize());
+ for(TreeWriter child: writer.getChildrenWriters()) {
+ writeFileStatistics(builder, child);
+ }
+ }
+
+ private void writeMetadata() throws IOException {
+ ensureStream();
+ OrcProto.Metadata.Builder builder = OrcProto.Metadata.newBuilder();
+ for(OrcProto.StripeStatistics.Builder ssb : treeWriter.stripeStatsBuilders) {
+ builder.addStripeStats(ssb.build());
+ }
+
+ physWriter.writeFileMetadata(builder);
+ }
+
+ private void writeFooter() throws IOException {
+ ensureStream();
+ OrcProto.Footer.Builder builder = OrcProto.Footer.newBuilder();
+ builder.setNumberOfRows(rowCount);
+ builder.setRowIndexStride(rowIndexStride);
+ // populate raw data size
+ rawDataSize = computeRawDataSize();
+ // serialize the types
+ writeTypes(builder, schema);
+ // add the stripe information
+ for(OrcProto.StripeInformation stripe: stripes) {
+ builder.addStripes(stripe);
+ }
+ // add the column statistics
+ writeFileStatistics(builder, treeWriter);
+ // add all of the user metadata
+ for(Map.Entry<String, ByteString> entry: userMetadata.entrySet()) {
+ builder.addMetadata(OrcProto.UserMetadataItem.newBuilder()
+ .setName(entry.getKey()).setValue(entry.getValue()));
+ }
+ physWriter.writeFileFooter(builder);
+ }
+
+ private void writePostScript() throws IOException {
+ OrcProto.PostScript.Builder builder =
+ OrcProto.PostScript.newBuilder()
+ .setMagic(OrcFile.MAGIC)
+ .addVersion(version.getMajor())
+ .addVersion(version.getMinor())
+ .setWriterVersion(OrcFile.CURRENT_WRITER.getId());
+ physWriter.writePostScript(builder);
+ }
+
+ private long estimateStripeSize() {
+ return physWriter.estimateMemory() + treeWriter.estimateMemory();
+ }
+
+ @Override
+ public TypeDescription getSchema() {
+ return schema;
+ }
+
+ @Override
+ public void addUserMetadata(String name, ByteBuffer value) {
+ userMetadata.put(name, ByteString.copyFrom(value));
+ }
+
+ @Override
+ public void addRowBatch(VectorizedRowBatch batch) throws IOException {
+ if (buildIndex) {
+ // Batch the writes up to the rowIndexStride so that we can get the
+ // right size indexes.
+ int posn = 0;
+ while (posn < batch.size) {
+ int chunkSize = Math.min(batch.size - posn,
+ rowIndexStride - rowsInIndex);
+ treeWriter.writeRootBatch(batch, posn, chunkSize);
+ posn += chunkSize;
+ rowsInIndex += chunkSize;
+ rowsInStripe += chunkSize;
+ if (rowsInIndex >= rowIndexStride) {
+ createRowIndexEntry();
+ }
+ }
+ } else {
+ rowsInStripe += batch.size;
+ treeWriter.writeRootBatch(batch, 0, batch.size);
+ }
+ if (path != null) {
+ memoryManager.addedRow(batch.size);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (callback != null) {
+ callback.preFooterWrite(callbackContext);
+ }
+ // remove us from the memory manager so that we don't get any callbacks
+ if (path != null) {
+ memoryManager.removeWriter(path);
+ }
+ // actually close the file
+ flushStripe();
+ writeMetadata();
+ writeFooter();
+ writePostScript();
+ physWriter.close();
+ }
+
+ /**
+ * Raw data size is computed when the file footer is written, so the value
+ * is available only after the writer has been closed.
+ */
+ @Override
+ public long getRawDataSize() {
+ return rawDataSize;
+ }
+
+ /**
+ * The row count is updated when stripes are flushed. For an accurate count,
+ * call this method after the writer has been closed.
+ */
+ @Override
+ public long getNumberOfRows() {
+ return rowCount;
+ }
+
+ @Override
+ public long writeIntermediateFooter() throws IOException {
+ // flush any buffered rows
+ flushStripe();
+ // write a footer
+ if (stripesAtLastFlush != stripes.size()) {
+ if (callback != null) {
+ callback.preFooterWrite(callbackContext);
+ }
+ writeMetadata();
+ writeFooter();
+ writePostScript();
+ stripesAtLastFlush = stripes.size();
+ physWriter.flush();
+ }
+ return physWriter.getRawWriterPosition();
+ }
+
+ @Override
+ public void appendStripe(byte[] stripe, int offset, int length,
+ StripeInformation stripeInfo,
+ OrcProto.StripeStatistics stripeStatistics) throws IOException {
+ checkArgument(stripe != null, "Stripe must not be null");
+ checkArgument(length <= stripe.length,
+ "Specified length must not be greater specified array length");
+ checkArgument(stripeInfo != null, "Stripe information must not be null");
+ checkArgument(stripeStatistics != null,
+ "Stripe statistics must not be null");
+
+ ensureStream();
+ OrcProto.StripeInformation.Builder dirEntry = OrcProto.StripeInformation.newBuilder();
+ physWriter.appendRawStripe(stripe, offset, length, dirEntry);
+
+ rowsInStripe = stripeStatistics.getColStats(0).getNumberOfValues();
+ rowCount += rowsInStripe;
+
+ // since we have already written the stripe, just update stripe statistics
+ treeWriter.stripeStatsBuilders.add(stripeStatistics.toBuilder());
+
+ // update file level statistics
+ updateFileStatistics(stripeStatistics);
+
+ // update stripe information
+ stripes.add(dirEntry.setNumberOfRows(rowsInStripe)
+ .setIndexLength(stripeInfo.getIndexLength())
+ .setDataLength(stripeInfo.getDataLength())
+ .setFooterLength(stripeInfo.getFooterLength())
+ .build());
+
+ // reset it after writing the stripe
+ rowsInStripe = 0;
+ }
+
+ private void updateFileStatistics(OrcProto.StripeStatistics stripeStatistics) {
+ List<OrcProto.ColumnStatistics> cs = stripeStatistics.getColSt
<TRUNCATED>
[33/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/StripeInformation.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/StripeInformation.java b/orc/src/java/org/apache/hive/orc/StripeInformation.java
new file mode 100644
index 0000000..b8dfc60
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/StripeInformation.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+/**
+ * Information about the stripes in an ORC file that is provided by the Reader.
+ */
+public interface StripeInformation {
+ /**
+ * Get the byte offset of the start of the stripe.
+ * @return the bytes from the start of the file
+ */
+ long getOffset();
+
+ /**
+ * Get the total length of the stripe in bytes.
+ * @return the number of bytes in the stripe
+ */
+ long getLength();
+
+ /**
+ * Get the length of the stripe's indexes.
+ * @return the number of bytes in the index
+ */
+ long getIndexLength();
+
+ /**
+ * Get the length of the stripe's data.
+ * @return the number of bytes in the stripe
+ */
+ long getDataLength();
+
+ /**
+ * Get the length of the stripe's tail section, which contains the stripe footer.
+ * @return the number of bytes in the tail
+ */
+ long getFooterLength();
+
+ /**
+ * Get the number of rows in the stripe.
+ * @return a count of the number of rows
+ */
+ long getNumberOfRows();
+}
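A minimal sketch of how this interface is typically consumed; it assumes the relocated package keeps the usual OrcFile.createReader / Reader.getStripes entry points (those two are assumptions here, not defined in this file):

    // Hedged sketch: prints the layout of each stripe in an ORC file.
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hive.orc.OrcFile;
    import org.apache.hive.orc.Reader;
    import org.apache.hive.orc.StripeInformation;

    public class PrintStripes {
      public static void main(String[] args) throws Exception {
        Reader reader = OrcFile.createReader(new Path(args[0]),
            OrcFile.readerOptions(new Configuration()));
        for (StripeInformation stripe : reader.getStripes()) {
          System.out.println("offset=" + stripe.getOffset()
              + " rows=" + stripe.getNumberOfRows()
              + " index=" + stripe.getIndexLength()
              + " data=" + stripe.getDataLength()
              + " footer=" + stripe.getFooterLength());
        }
      }
    }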
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/StripeStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/StripeStatistics.java b/orc/src/java/org/apache/hive/orc/StripeStatistics.java
new file mode 100644
index 0000000..c704dd9
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/StripeStatistics.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import org.apache.hive.orc.impl.ColumnStatisticsImpl;
+
+import java.util.List;
+
+public class StripeStatistics {
+ private final List<OrcProto.ColumnStatistics> cs;
+
+ public StripeStatistics(List<OrcProto.ColumnStatistics> list) {
+ this.cs = list;
+ }
+
+ /**
+ * Return list of column statistics
+ *
+ * @return column stats
+ */
+ public ColumnStatistics[] getColumnStatistics() {
+ ColumnStatistics[] result = new ColumnStatistics[cs.size()];
+ for (int i = 0; i < result.length; ++i) {
+ result[i] = ColumnStatisticsImpl.deserialize(cs.get(i));
+ }
+ return result;
+ }
+}
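A short, hedged usage sketch; Reader.getStripeStatistics() is assumed to mirror the upstream reader API and is not defined in this file:

    // Prints the per-stripe value count of the root column.
    static void printStripeStats(org.apache.hive.orc.Reader reader) throws java.io.IOException {
      int stripe = 0;
      for (org.apache.hive.orc.StripeStatistics ss : reader.getStripeStatistics()) {
        org.apache.hive.orc.ColumnStatistics[] cols = ss.getColumnStatistics();
        System.out.println("stripe " + (stripe++) + ": " + cols[0].getNumberOfValues() + " values");
      }
    }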
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/TimestampColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/TimestampColumnStatistics.java b/orc/src/java/org/apache/hive/orc/TimestampColumnStatistics.java
new file mode 100644
index 0000000..55a7e42
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/TimestampColumnStatistics.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import java.sql.Timestamp;
+
+/**
+ * Statistics for Timestamp columns.
+ */
+public interface TimestampColumnStatistics extends ColumnStatistics {
+ /**
+ * Get the minimum value for the column.
+ * @return minimum value
+ */
+ Timestamp getMinimum();
+
+ /**
+ * Get the maximum value for the column.
+ * @return maximum value
+ */
+ Timestamp getMaximum();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/TypeDescription.java b/orc/src/java/org/apache/hive/orc/TypeDescription.java
new file mode 100644
index 0000000..f4ed908
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/TypeDescription.java
@@ -0,0 +1,870 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * This is the description of the types in an ORC file.
+ */
+public class TypeDescription
+ implements Comparable<TypeDescription>, Serializable {
+ private static final int MAX_PRECISION = 38;
+ private static final int MAX_SCALE = 38;
+ private static final int DEFAULT_PRECISION = 38;
+ private static final int DEFAULT_SCALE = 10;
+ private static final int DEFAULT_LENGTH = 256;
+
+ @Override
+ public int compareTo(TypeDescription other) {
+ if (this == other) {
+ return 0;
+ } else if (other == null) {
+ return -1;
+ } else {
+ int result = category.compareTo(other.category);
+ if (result == 0) {
+ switch (category) {
+ case CHAR:
+ case VARCHAR:
+ return maxLength - other.maxLength;
+ case DECIMAL:
+ if (precision != other.precision) {
+ return precision - other.precision;
+ }
+ return scale - other.scale;
+ case UNION:
+ case LIST:
+ case MAP:
+ if (children.size() != other.children.size()) {
+ return children.size() - other.children.size();
+ }
+ for(int c=0; result == 0 && c < children.size(); ++c) {
+ result = children.get(c).compareTo(other.children.get(c));
+ }
+ break;
+ case STRUCT:
+ if (children.size() != other.children.size()) {
+ return children.size() - other.children.size();
+ }
+ for(int c=0; result == 0 && c < children.size(); ++c) {
+ result = fieldNames.get(c).compareTo(other.fieldNames.get(c));
+ if (result == 0) {
+ result = children.get(c).compareTo(other.children.get(c));
+ }
+ }
+ break;
+ default:
+ // PASS
+ }
+ }
+ return result;
+ }
+ }
+
+ public enum Category {
+ BOOLEAN("boolean", true),
+ BYTE("tinyint", true),
+ SHORT("smallint", true),
+ INT("int", true),
+ LONG("bigint", true),
+ FLOAT("float", true),
+ DOUBLE("double", true),
+ STRING("string", true),
+ DATE("date", true),
+ TIMESTAMP("timestamp", true),
+ BINARY("binary", true),
+ DECIMAL("decimal", true),
+ VARCHAR("varchar", true),
+ CHAR("char", true),
+ LIST("array", false),
+ MAP("map", false),
+ STRUCT("struct", false),
+ UNION("uniontype", false);
+
+ Category(String name, boolean isPrimitive) {
+ this.name = name;
+ this.isPrimitive = isPrimitive;
+ }
+
+ final boolean isPrimitive;
+ final String name;
+
+ public boolean isPrimitive() {
+ return isPrimitive;
+ }
+
+ public String getName() {
+ return name;
+ }
+ }
+
+ public static TypeDescription createBoolean() {
+ return new TypeDescription(Category.BOOLEAN);
+ }
+
+ public static TypeDescription createByte() {
+ return new TypeDescription(Category.BYTE);
+ }
+
+ public static TypeDescription createShort() {
+ return new TypeDescription(Category.SHORT);
+ }
+
+ public static TypeDescription createInt() {
+ return new TypeDescription(Category.INT);
+ }
+
+ public static TypeDescription createLong() {
+ return new TypeDescription(Category.LONG);
+ }
+
+ public static TypeDescription createFloat() {
+ return new TypeDescription(Category.FLOAT);
+ }
+
+ public static TypeDescription createDouble() {
+ return new TypeDescription(Category.DOUBLE);
+ }
+
+ public static TypeDescription createString() {
+ return new TypeDescription(Category.STRING);
+ }
+
+ public static TypeDescription createDate() {
+ return new TypeDescription(Category.DATE);
+ }
+
+ public static TypeDescription createTimestamp() {
+ return new TypeDescription(Category.TIMESTAMP);
+ }
+
+ public static TypeDescription createBinary() {
+ return new TypeDescription(Category.BINARY);
+ }
+
+ public static TypeDescription createDecimal() {
+ return new TypeDescription(Category.DECIMAL);
+ }
+
+ static class StringPosition {
+ final String value;
+ int position;
+ final int length;
+
+ StringPosition(String value) {
+ this.value = value;
+ position = 0;
+ length = value.length();
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buffer = new StringBuilder();
+ buffer.append('\'');
+ buffer.append(value.substring(0, position));
+ buffer.append('^');
+ buffer.append(value.substring(position));
+ buffer.append('\'');
+ return buffer.toString();
+ }
+ }
+
+ static Category parseCategory(StringPosition source) {
+ int start = source.position;
+ while (source.position < source.length) {
+ char ch = source.value.charAt(source.position);
+ if (!Character.isLetter(ch)) {
+ break;
+ }
+ source.position += 1;
+ }
+ if (source.position != start) {
+ String word = source.value.substring(start, source.position).toLowerCase();
+ for (Category cat : Category.values()) {
+ if (cat.getName().equals(word)) {
+ return cat;
+ }
+ }
+ }
+ throw new IllegalArgumentException("Can't parse category at " + source);
+ }
+
+ static int parseInt(StringPosition source) {
+ int start = source.position;
+ int result = 0;
+ while (source.position < source.length) {
+ char ch = source.value.charAt(source.position);
+ if (!Character.isDigit(ch)) {
+ break;
+ }
+ result = result * 10 + (ch - '0');
+ source.position += 1;
+ }
+ if (source.position == start) {
+ throw new IllegalArgumentException("Missing integer at " + source);
+ }
+ return result;
+ }
+
+ static String parseName(StringPosition source) {
+ int start = source.position;
+ while (source.position < source.length) {
+ char ch = source.value.charAt(source.position);
+ if (!Character.isLetterOrDigit(ch) && ch != '.' && ch != '_') {
+ break;
+ }
+ source.position += 1;
+ }
+ if (source.position == start) {
+ throw new IllegalArgumentException("Missing name at " + source);
+ }
+ return source.value.substring(start, source.position);
+ }
+
+ static void requireChar(StringPosition source, char required) {
+ if (source.position >= source.length ||
+ source.value.charAt(source.position) != required) {
+ throw new IllegalArgumentException("Missing required char '" +
+ required + "' at " + source);
+ }
+ source.position += 1;
+ }
+
+ static boolean consumeChar(StringPosition source, char ch) {
+ boolean result = source.position < source.length &&
+ source.value.charAt(source.position) == ch;
+ if (result) {
+ source.position += 1;
+ }
+ return result;
+ }
+
+ static void parseUnion(TypeDescription type, StringPosition source) {
+ requireChar(source, '<');
+ do {
+ type.addUnionChild(parseType(source));
+ } while (consumeChar(source, ','));
+ requireChar(source, '>');
+ }
+
+ static void parseStruct(TypeDescription type, StringPosition source) {
+ requireChar(source, '<');
+ do {
+ String fieldName = parseName(source);
+ requireChar(source, ':');
+ type.addField(fieldName, parseType(source));
+ } while (consumeChar(source, ','));
+ requireChar(source, '>');
+ }
+
+ static TypeDescription parseType(StringPosition source) {
+ TypeDescription result = new TypeDescription(parseCategory(source));
+ switch (result.getCategory()) {
+ case BINARY:
+ case BOOLEAN:
+ case BYTE:
+ case DATE:
+ case DOUBLE:
+ case FLOAT:
+ case INT:
+ case LONG:
+ case SHORT:
+ case STRING:
+ case TIMESTAMP:
+ break;
+ case CHAR:
+ case VARCHAR:
+ requireChar(source, '(');
+ result.withMaxLength(parseInt(source));
+ requireChar(source, ')');
+ break;
+ case DECIMAL: {
+ requireChar(source, '(');
+ int precision = parseInt(source);
+ requireChar(source, ',');
+ result.withScale(parseInt(source));
+ result.withPrecision(precision);
+ requireChar(source, ')');
+ break;
+ }
+ case LIST:
+ requireChar(source, '<');
+ result.children.add(parseType(source));
+ requireChar(source, '>');
+ break;
+ case MAP:
+ requireChar(source, '<');
+ result.children.add(parseType(source));
+ requireChar(source, ',');
+ result.children.add(parseType(source));
+ requireChar(source, '>');
+ break;
+ case UNION:
+ parseUnion(result, source);
+ break;
+ case STRUCT:
+ parseStruct(result, source);
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown type " +
+ result.getCategory() + " at " + source);
+ }
+ return result;
+ }
+
+ /**
+ * Parse TypeDescription from the Hive type names. This is the inverse
+ * of TypeDescription.toString()
+ * @param typeName the name of the type
+ * @return a new TypeDescription or null if typeName was null
+ * @throws IllegalArgumentException if the string is badly formed
+ */
+ public static TypeDescription fromString(String typeName) {
+ if (typeName == null) {
+ return null;
+ }
+ StringPosition source = new StringPosition(typeName);
+ TypeDescription result = parseType(source);
+ if (source.position != source.length) {
+ throw new IllegalArgumentException("Extra characters at " + source);
+ }
+ return result;
+ }
+
+ /**
+ * For decimal types, set the precision.
+ * @param precision the new precision
+ * @return this
+ */
+ public TypeDescription withPrecision(int precision) {
+ if (category != Category.DECIMAL) {
+ throw new IllegalArgumentException("precision is only allowed on decimal"+
+ " and not " + category.name);
+ } else if (precision < 1 || precision > MAX_PRECISION || scale > precision){
+ throw new IllegalArgumentException("precision " + precision +
+ " is out of range 1 .. " + scale);
+ }
+ this.precision = precision;
+ return this;
+ }
+
+ /**
+ * For decimal types, set the scale.
+ * @param scale the new scale
+ * @return this
+ */
+ public TypeDescription withScale(int scale) {
+ if (category != Category.DECIMAL) {
+ throw new IllegalArgumentException("scale is only allowed on decimal"+
+ " and not " + category.name);
+ } else if (scale < 0 || scale > MAX_SCALE || scale > precision) {
+ throw new IllegalArgumentException("scale is out of range at " + scale);
+ }
+ this.scale = scale;
+ return this;
+ }
+
+ public static TypeDescription createVarchar() {
+ return new TypeDescription(Category.VARCHAR);
+ }
+
+ public static TypeDescription createChar() {
+ return new TypeDescription(Category.CHAR);
+ }
+
+ /**
+ * Set the maximum length for char and varchar types.
+ * @param maxLength the maximum value
+ * @return this
+ */
+ public TypeDescription withMaxLength(int maxLength) {
+ if (category != Category.VARCHAR && category != Category.CHAR) {
+ throw new IllegalArgumentException("maxLength is only allowed on char" +
+ " and varchar and not " + category.name);
+ }
+ this.maxLength = maxLength;
+ return this;
+ }
+
+ public static TypeDescription createList(TypeDescription childType) {
+ TypeDescription result = new TypeDescription(Category.LIST);
+ result.children.add(childType);
+ childType.parent = result;
+ return result;
+ }
+
+ public static TypeDescription createMap(TypeDescription keyType,
+ TypeDescription valueType) {
+ TypeDescription result = new TypeDescription(Category.MAP);
+ result.children.add(keyType);
+ result.children.add(valueType);
+ keyType.parent = result;
+ valueType.parent = result;
+ return result;
+ }
+
+ public static TypeDescription createUnion() {
+ return new TypeDescription(Category.UNION);
+ }
+
+ public static TypeDescription createStruct() {
+ return new TypeDescription(Category.STRUCT);
+ }
+
+ /**
+ * Add a child to a union type.
+ * @param child a new child type to add
+ * @return the union type.
+ */
+ public TypeDescription addUnionChild(TypeDescription child) {
+ if (category != Category.UNION) {
+ throw new IllegalArgumentException("Can only add types to union type" +
+ " and not " + category);
+ }
+ children.add(child);
+ child.parent = this;
+ return this;
+ }
+
+ /**
+ * Add a field to a struct type as it is built.
+ * @param field the field name
+ * @param fieldType the type of the field
+ * @return the struct type
+ */
+ public TypeDescription addField(String field, TypeDescription fieldType) {
+ if (category != Category.STRUCT) {
+ throw new IllegalArgumentException("Can only add fields to struct type" +
+ " and not " + category);
+ }
+ fieldNames.add(field);
+ children.add(fieldType);
+ fieldType.parent = this;
+ return this;
+ }
+
+ /**
+ * Get the id for this type.
+ * The first call will cause all of the ids in the tree to be assigned, so
+ * it should not be called before the type is completely built.
+ * @return the sequential id
+ */
+ public int getId() {
+ // if the id hasn't been assigned, assign all of the ids from the root
+ if (id == -1) {
+ TypeDescription root = this;
+ while (root.parent != null) {
+ root = root.parent;
+ }
+ root.assignIds(0);
+ }
+ return id;
+ }
+
+ public TypeDescription clone() {
+ TypeDescription result = new TypeDescription(category);
+ result.maxLength = maxLength;
+ result.precision = precision;
+ result.scale = scale;
+ if (fieldNames != null) {
+ result.fieldNames.addAll(fieldNames);
+ }
+ if (children != null) {
+ for(TypeDescription child: children) {
+ TypeDescription clone = child.clone();
+ clone.parent = result;
+ result.children.add(clone);
+ }
+ }
+ return result;
+ }
+
+ @Override
+ public int hashCode() {
+ long result = category.ordinal() * 4241 + maxLength + precision * 13 + scale;
+ if (children != null) {
+ for(TypeDescription child: children) {
+ result = result * 6959 + child.hashCode();
+ }
+ }
+ return (int) result;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == null || !(other instanceof TypeDescription)) {
+ return false;
+ }
+ if (other == this) {
+ return true;
+ }
+ TypeDescription castOther = (TypeDescription) other;
+ if (category != castOther.category ||
+ maxLength != castOther.maxLength ||
+ scale != castOther.scale ||
+ precision != castOther.precision) {
+ return false;
+ }
+ if (children != null) {
+ if (children.size() != castOther.children.size()) {
+ return false;
+ }
+ for (int i = 0; i < children.size(); ++i) {
+ if (!children.get(i).equals(castOther.children.get(i))) {
+ return false;
+ }
+ }
+ }
+ if (category == Category.STRUCT) {
+ for(int i=0; i < fieldNames.size(); ++i) {
+ if (!fieldNames.get(i).equals(castOther.fieldNames.get(i))) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Get the maximum id assigned to this type or its children.
+ * The first call will cause all of the ids in the tree to be assigned, so
+ * it should not be called before the type is completely built.
+ * @return the maximum id assigned under this type
+ */
+ public int getMaximumId() {
+ // if the id hasn't been assigned, assign all of the ids from the root
+ if (maxId == -1) {
+ TypeDescription root = this;
+ while (root.parent != null) {
+ root = root.parent;
+ }
+ root.assignIds(0);
+ }
+ return maxId;
+ }
+
+ private ColumnVector createColumn(int maxSize) {
+ switch (category) {
+ case BOOLEAN:
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ case DATE:
+ return new LongColumnVector(maxSize);
+ case TIMESTAMP:
+ return new TimestampColumnVector(maxSize);
+ case FLOAT:
+ case DOUBLE:
+ return new DoubleColumnVector(maxSize);
+ case DECIMAL:
+ return new DecimalColumnVector(maxSize, precision, scale);
+ case STRING:
+ case BINARY:
+ case CHAR:
+ case VARCHAR:
+ return new BytesColumnVector(maxSize);
+ case STRUCT: {
+ ColumnVector[] fieldVector = new ColumnVector[children.size()];
+ for(int i=0; i < fieldVector.length; ++i) {
+ fieldVector[i] = children.get(i).createColumn(maxSize);
+ }
+ return new StructColumnVector(maxSize,
+ fieldVector);
+ }
+ case UNION: {
+ ColumnVector[] fieldVector = new ColumnVector[children.size()];
+ for(int i=0; i < fieldVector.length; ++i) {
+ fieldVector[i] = children.get(i).createColumn(maxSize);
+ }
+ return new UnionColumnVector(maxSize,
+ fieldVector);
+ }
+ case LIST:
+ return new ListColumnVector(maxSize,
+ children.get(0).createColumn(maxSize));
+ case MAP:
+ return new MapColumnVector(maxSize,
+ children.get(0).createColumn(maxSize),
+ children.get(1).createColumn(maxSize));
+ default:
+ throw new IllegalArgumentException("Unknown type " + category);
+ }
+ }
+
+ public VectorizedRowBatch createRowBatch(int maxSize) {
+ VectorizedRowBatch result;
+ if (category == Category.STRUCT) {
+ result = new VectorizedRowBatch(children.size(), maxSize);
+ for(int i=0; i < result.cols.length; ++i) {
+ result.cols[i] = children.get(i).createColumn(maxSize);
+ }
+ } else {
+ result = new VectorizedRowBatch(1, maxSize);
+ result.cols[0] = createColumn(maxSize);
+ }
+ result.reset();
+ return result;
+ }
+
+ public VectorizedRowBatch createRowBatch() {
+ return createRowBatch(VectorizedRowBatch.DEFAULT_SIZE);
+ }
+
+ /**
+ * Get the kind of this type.
+ * @return get the category for this type.
+ */
+ public Category getCategory() {
+ return category;
+ }
+
+ /**
+ * Get the maximum length of the type. Only used for char and varchar types.
+ * @return the maximum length of the string type
+ */
+ public int getMaxLength() {
+ return maxLength;
+ }
+
+ /**
+ * Get the precision of the decimal type.
+ * @return the number of digits for the precision.
+ */
+ public int getPrecision() {
+ return precision;
+ }
+
+ /**
+ * Get the scale of the decimal type.
+ * @return the number of digits for the scale.
+ */
+ public int getScale() {
+ return scale;
+ }
+
+ /**
+ * For struct types, get the list of field names.
+ * @return the list of field names.
+ */
+ public List<String> getFieldNames() {
+ return Collections.unmodifiableList(fieldNames);
+ }
+
+ /**
+ * Get the subtypes of this type.
+ * @return the list of children types
+ */
+ public List<TypeDescription> getChildren() {
+ return children == null ? null : Collections.unmodifiableList(children);
+ }
+
+ /**
+ * Assign ids to all of the nodes under this one.
+ * @param startId the lowest id to assign
+ * @return the next available id
+ */
+ private int assignIds(int startId) {
+ id = startId++;
+ if (children != null) {
+ for (TypeDescription child : children) {
+ startId = child.assignIds(startId);
+ }
+ }
+ maxId = startId - 1;
+ return startId;
+ }
+
+ private TypeDescription(Category category) {
+ this.category = category;
+ if (category.isPrimitive) {
+ children = null;
+ } else {
+ children = new ArrayList<>();
+ }
+ if (category == Category.STRUCT) {
+ fieldNames = new ArrayList<>();
+ } else {
+ fieldNames = null;
+ }
+ }
+
+ private int id = -1;
+ private int maxId = -1;
+ private TypeDescription parent;
+ private final Category category;
+ private final List<TypeDescription> children;
+ private final List<String> fieldNames;
+ private int maxLength = DEFAULT_LENGTH;
+ private int precision = DEFAULT_PRECISION;
+ private int scale = DEFAULT_SCALE;
+
+ public void printToBuffer(StringBuilder buffer) {
+ buffer.append(category.name);
+ switch (category) {
+ case DECIMAL:
+ buffer.append('(');
+ buffer.append(precision);
+ buffer.append(',');
+ buffer.append(scale);
+ buffer.append(')');
+ break;
+ case CHAR:
+ case VARCHAR:
+ buffer.append('(');
+ buffer.append(maxLength);
+ buffer.append(')');
+ break;
+ case LIST:
+ case MAP:
+ case UNION:
+ buffer.append('<');
+ for(int i=0; i < children.size(); ++i) {
+ if (i != 0) {
+ buffer.append(',');
+ }
+ children.get(i).printToBuffer(buffer);
+ }
+ buffer.append('>');
+ break;
+ case STRUCT:
+ buffer.append('<');
+ for(int i=0; i < children.size(); ++i) {
+ if (i != 0) {
+ buffer.append(',');
+ }
+ buffer.append(fieldNames.get(i));
+ buffer.append(':');
+ children.get(i).printToBuffer(buffer);
+ }
+ buffer.append('>');
+ break;
+ default:
+ break;
+ }
+ }
+
+ public String toString() {
+ StringBuilder buffer = new StringBuilder();
+ printToBuffer(buffer);
+ return buffer.toString();
+ }
+
+ private void printJsonToBuffer(String prefix, StringBuilder buffer,
+ int indent) {
+ for(int i=0; i < indent; ++i) {
+ buffer.append(' ');
+ }
+ buffer.append(prefix);
+ buffer.append("{\"category\": \"");
+ buffer.append(category.name);
+ buffer.append("\", \"id\": ");
+ buffer.append(getId());
+ buffer.append(", \"max\": ");
+ buffer.append(maxId);
+ switch (category) {
+ case DECIMAL:
+ buffer.append(", \"precision\": ");
+ buffer.append(precision);
+ buffer.append(", \"scale\": ");
+ buffer.append(scale);
+ break;
+ case CHAR:
+ case VARCHAR:
+ buffer.append(", \"length\": ");
+ buffer.append(maxLength);
+ break;
+ case LIST:
+ case MAP:
+ case UNION:
+ buffer.append(", \"children\": [");
+ for(int i=0; i < children.size(); ++i) {
+ buffer.append('\n');
+ children.get(i).printJsonToBuffer("", buffer, indent + 2);
+ if (i != children.size() - 1) {
+ buffer.append(',');
+ }
+ }
+ buffer.append("]");
+ break;
+ case STRUCT:
+ buffer.append(", \"fields\": [");
+ for(int i=0; i < children.size(); ++i) {
+ buffer.append('\n');
+ children.get(i).printJsonToBuffer("\"" + fieldNames.get(i) + "\": ",
+ buffer, indent + 2);
+ if (i != children.size() - 1) {
+ buffer.append(',');
+ }
+ }
+ buffer.append(']');
+ break;
+ default:
+ break;
+ }
+ buffer.append('}');
+ }
+
+ public String toJson() {
+ StringBuilder buffer = new StringBuilder();
+ printJsonToBuffer("", buffer, 0);
+ return buffer.toString();
+ }
+
+ /**
+ * Locate a subtype by its id.
+ * @param goal the column id to look for
+ * @return the subtype
+ */
+ public TypeDescription findSubtype(int goal) {
+ // call getId method to make sure the ids are assigned
+ int id = getId();
+ if (goal < id || goal > maxId) {
+ throw new IllegalArgumentException("Unknown type id " + id + " in " +
+ toJson());
+ }
+ if (goal == id) {
+ return this;
+ } else {
+ TypeDescription prev = null;
+ for(TypeDescription next: children) {
+ if (next.id > goal) {
+ return prev.findSubtype(goal);
+ }
+ prev = next;
+ }
+ return prev.findSubtype(goal);
+ }
+ }
+}
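A brief sketch of the builder and parser APIs defined above; everything referenced here (createStruct, addField, withMaxLength, withPrecision/withScale, fromString, createRowBatch) comes from this file:

    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hive.orc.TypeDescription;

    public class SchemaDemo {
      public static void main(String[] args) {
        TypeDescription schema = TypeDescription.createStruct()
            .addField("id", TypeDescription.createLong())
            .addField("name", TypeDescription.createVarchar().withMaxLength(64))
            .addField("price", TypeDescription.createDecimal()
                .withPrecision(10).withScale(2));
        // toString() produces the Hive type name; fromString() is its inverse.
        System.out.println(schema);   // struct<id:bigint,name:varchar(64),price:decimal(10,2)>
        TypeDescription parsed = TypeDescription.fromString(schema.toString());
        System.out.println(parsed.equals(schema));   // true
        // createRowBatch() allocates one ColumnVector per top-level field.
        VectorizedRowBatch batch = schema.createRowBatch();
        System.out.println(batch.cols.length);       // 3
      }
    }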
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/Writer.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/Writer.java b/orc/src/java/org/apache/hive/orc/Writer.java
new file mode 100644
index 0000000..c4c2147
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/Writer.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+/**
+ * The interface for writing ORC files.
+ */
+public interface Writer {
+
+ /**
+ * Get the schema for this writer
+ * @return the file schema
+ */
+ TypeDescription getSchema();
+
+ /**
+ * Add arbitrary meta-data to the ORC file. This may be called at any point
+ * until the Writer is closed. If the same key is passed a second time, the
+ * second value will replace the first.
+ * @param key a key to label the data with.
+ * @param value the contents of the metadata.
+ */
+ void addUserMetadata(String key, ByteBuffer value);
+
+ /**
+ * Add a row batch to the ORC file.
+ * @param batch the rows to add
+ */
+ void addRowBatch(VectorizedRowBatch batch) throws IOException;
+
+ /**
+ * Flush all of the buffers and close the file. No methods on this writer
+ * should be called afterwards.
+ * @throws IOException
+ */
+ void close() throws IOException;
+
+ /**
+ * Return the deserialized data size. Raw data size is computed when the
+ * file footer is written, so the value is available only after the writer
+ * has been closed.
+ *
+ * @return raw data size
+ */
+ long getRawDataSize();
+
+ /**
+ * Return the number of rows in the file. The row count is updated when
+ * stripes are flushed, so for an accurate count this method should be
+ * called after the writer has been closed.
+ *
+ * @return row count
+ */
+ long getNumberOfRows();
+
+ /**
+ * Write an intermediate footer on the file such that if the file is
+ * truncated to the returned offset, it would be a valid ORC file.
+ * @return the offset that would be a valid end location for an ORC file
+ */
+ long writeIntermediateFooter() throws IOException;
+
+ /**
+ * Fast stripe append to an ORC file. This interface is used for fast ORC
+ * file merges with other ORC files. When merging, the caller should pass
+ * each stripe in binary form along with its stripe information and stripe
+ * statistics. After appending the last stripe of a file, use
+ * appendUserMetadata() to append any user metadata.
+ * @param stripe - stripe as byte array
+ * @param offset - offset within byte array
+ * @param length - length of stripe within byte array
+ * @param stripeInfo - stripe information
+ * @param stripeStatistics - stripe statistics (Protobuf objects can be
+ * merged directly)
+ * @throws IOException
+ */
+ public void appendStripe(byte[] stripe, int offset, int length,
+ StripeInformation stripeInfo,
+ OrcProto.StripeStatistics stripeStatistics) throws IOException;
+
+ /**
+ * When fast stripe append is used for merging ORC stripes, after appending
+ * the last stripe from a file, this interface must be used to merge any
+ * user metadata.
+ * @param userMetadata - user metadata
+ */
+ public void appendUserMetadata(List<OrcProto.UserMetadataItem> userMetadata);
+}
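A hedged end-to-end sketch of this interface. The Writer methods used are the ones declared above; OrcFile.createWriter and WriterOptions.setSchema are assumed to mirror the upstream ORC factory API and are not part of this file:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hive.orc.OrcFile;
    import org.apache.hive.orc.TypeDescription;
    import org.apache.hive.orc.Writer;

    public class WriteDemo {
      public static void main(String[] args) throws Exception {
        TypeDescription schema = TypeDescription.fromString("struct<x:bigint>");
        Writer writer = OrcFile.createWriter(new Path(args[0]),
            OrcFile.writerOptions(new Configuration()).setSchema(schema));
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector x = (LongColumnVector) batch.cols[0];
        for (long row = 0; row < 10000; ++row) {
          x.vector[batch.size++] = row;
          if (batch.size == batch.getMaxSize()) {   // batch is full; hand it to the writer
            writer.addRowBatch(batch);
            batch.reset();
          }
        }
        if (batch.size != 0) {
          writer.addRowBatch(batch);
        }
        writer.close();                             // row count is only accurate after close
        System.out.println(writer.getNumberOfRows());
      }
    }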
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/AcidStats.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/AcidStats.java b/orc/src/java/org/apache/hive/orc/impl/AcidStats.java
new file mode 100644
index 0000000..aff9659
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/AcidStats.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+/**
+ * Statistics about the ACID operations in an ORC file
+ */
+public class AcidStats {
+ public long inserts;
+ public long updates;
+ public long deletes;
+
+ public AcidStats() {
+ inserts = 0;
+ updates = 0;
+ deletes = 0;
+ }
+
+ public AcidStats(String serialized) {
+ String[] parts = serialized.split(",");
+ inserts = Long.parseLong(parts[0]);
+ updates = Long.parseLong(parts[1]);
+ deletes = Long.parseLong(parts[2]);
+ }
+
+ public String serialize() {
+ StringBuilder builder = new StringBuilder();
+ builder.append(inserts);
+ builder.append(",");
+ builder.append(updates);
+ builder.append(",");
+ builder.append(deletes);
+ return builder.toString();
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder builder = new StringBuilder();
+ builder.append(" inserts: ").append(inserts);
+ builder.append(" updates: ").append(updates);
+ builder.append(" deletes: ").append(deletes);
+ return builder.toString();
+ }
+}
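The serialize()/String-constructor pair above is a simple comma-separated round trip; a tiny self-contained check:

    import org.apache.hive.orc.impl.AcidStats;

    public class AcidStatsDemo {
      public static void main(String[] args) {
        AcidStats stats = new AcidStats();
        stats.inserts = 100;
        stats.updates = 20;
        stats.deletes = 3;
        String text = stats.serialize();          // "100,20,3"
        AcidStats copy = new AcidStats(text);     // parses the same three counters back
        System.out.println(copy);                 // " inserts: 100 updates: 20 deletes: 3"
      }
    }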
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/BitFieldReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/BitFieldReader.java b/orc/src/java/org/apache/hive/orc/impl/BitFieldReader.java
new file mode 100644
index 0000000..264061d
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/BitFieldReader.java
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.EOFException;
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+
+public class BitFieldReader {
+ private final RunLengthByteReader input;
+ /** The number of bits in one item. Non-test code always uses 1. */
+ private final int bitSize;
+ private int current;
+ private int bitsLeft;
+ private final int mask;
+
+ public BitFieldReader(InStream input,
+ int bitSize) throws IOException {
+ this.input = new RunLengthByteReader(input);
+ this.bitSize = bitSize;
+ mask = (1 << bitSize) - 1;
+ }
+
+ public void setInStream(InStream inStream) {
+ this.input.setInStream(inStream);
+ }
+
+ private void readByte() throws IOException {
+ if (input.hasNext()) {
+ current = 0xff & input.next();
+ bitsLeft = 8;
+ } else {
+ throw new EOFException("Read past end of bit field from " + this);
+ }
+ }
+
+ public int next() throws IOException {
+ int result = 0;
+ int bitsLeftToRead = bitSize;
+ while (bitsLeftToRead > bitsLeft) {
+ result <<= bitsLeft;
+ result |= current & ((1 << bitsLeft) - 1);
+ bitsLeftToRead -= bitsLeft;
+ readByte();
+ }
+ if (bitsLeftToRead > 0) {
+ result <<= bitsLeftToRead;
+ bitsLeft -= bitsLeftToRead;
+ result |= (current >>> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+ }
+ return result & mask;
+ }
+
+ /**
+ * Unlike integer readers, where runs are encoded explicitly, in this one we have to read ahead
+ * to figure out whether we have a run. Given that runs in booleans are likely, it's worth it.
+ * However, it means we'd need to keep track of how many bytes we read, and next/nextVector won't
+ * work anymore once this is called. This is trivial to fix, but the two are never interspersed.
+ */
+ private boolean lastRunValue;
+ private int lastRunLength = -1;
+ private void readNextRun(int maxRunLength) throws IOException {
+ assert bitSize == 1;
+ if (lastRunLength > 0) return; // last run is not exhausted yet
+ if (bitsLeft == 0) {
+ readByte();
+ }
+ // First take care of the partial bits.
+ boolean hasVal = false;
+ int runLength = 0;
+ if (bitsLeft != 8) {
+ int partialBitsMask = (1 << bitsLeft) - 1;
+ int partialBits = current & partialBitsMask;
+ if (partialBits == partialBitsMask || partialBits == 0) {
+ lastRunValue = (partialBits == partialBitsMask);
+ if (maxRunLength <= bitsLeft) {
+ lastRunLength = maxRunLength;
+ return;
+ }
+ maxRunLength -= bitsLeft;
+ hasVal = true;
+ runLength = bitsLeft;
+ bitsLeft = 0;
+ } else {
+ // There's no run in partial bits. Return whatever we have.
+ int prefixBitsCount = 32 - bitsLeft;
+ runLength = Integer.numberOfLeadingZeros(partialBits) - prefixBitsCount;
+ lastRunValue = (runLength > 0);
+ lastRunLength = Math.min(maxRunLength, lastRunValue ? runLength :
+ (Integer.numberOfLeadingZeros(~(partialBits | ~partialBitsMask)) - prefixBitsCount));
+ return;
+ }
+ assert bitsLeft == 0;
+ readByte();
+ }
+ if (!hasVal) {
+ lastRunValue = ((current >> 7) == 1);
+ hasVal = true;
+ }
+ // Read full bytes until the run ends.
+ assert bitsLeft == 8;
+ while (maxRunLength >= 8
+ && ((lastRunValue && (current == 0xff)) || (!lastRunValue && (current == 0)))) {
+ runLength += 8;
+ maxRunLength -= 8;
+ readByte();
+ }
+ if (maxRunLength > 0) {
+ int extraBits = Integer.numberOfLeadingZeros(
+ lastRunValue ? (~(current | ~255)) : current) - 24;
+ bitsLeft -= extraBits;
+ runLength += extraBits;
+ }
+ lastRunLength = runLength;
+ }
+
+ public void nextVector(LongColumnVector previous,
+ long previousLen) throws IOException {
+ previous.isRepeating = true;
+ for (int i = 0; i < previousLen; i++) {
+ if (previous.noNulls || !previous.isNull[i]) {
+ previous.vector[i] = next();
+ } else {
+ // The default value of null for int types in vectorized
+ // processing is 1, so set that if the value is null
+ previous.vector[i] = 1;
+ }
+
+ // The default value for nulls in Vectorization for int types is 1
+ // and given that a non-null value can also be 1, we need to check isNull as well
+ // when determining the isRepeating flag.
+ if (previous.isRepeating
+ && i > 0
+ && ((previous.vector[0] != previous.vector[i]) ||
+ (previous.isNull[0] != previous.isNull[i]))) {
+ previous.isRepeating = false;
+ }
+ }
+ }
+
+ public void seek(PositionProvider index) throws IOException {
+ input.seek(index);
+ int consumed = (int) index.getNext();
+ if (consumed > 8) {
+ throw new IllegalArgumentException("Seek past end of byte at " +
+ consumed + " in " + input);
+ } else if (consumed != 0) {
+ readByte();
+ bitsLeft = 8 - consumed;
+ } else {
+ bitsLeft = 0;
+ }
+ }
+
+ public void skip(long items) throws IOException {
+ long totalBits = bitSize * items;
+ if (bitsLeft >= totalBits) {
+ bitsLeft -= totalBits;
+ } else {
+ totalBits -= bitsLeft;
+ input.skip(totalBits / 8);
+ current = input.next();
+ bitsLeft = (int) (8 - (totalBits % 8));
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "bit reader current: " + current + " bits left: " + bitsLeft +
+ " bit size: " + bitSize + " from " + input;
+ }
+
+ boolean hasFullByte() {
+ return bitsLeft == 8 || bitsLeft == 0;
+ }
+
+ int peekOneBit() throws IOException {
+ assert bitSize == 1;
+ if (bitsLeft == 0) {
+ readByte();
+ }
+ return (current >>> (bitsLeft - 1)) & 1;
+ }
+
+ int peekFullByte() throws IOException {
+ assert bitSize == 1;
+ assert bitsLeft == 8 || bitsLeft == 0;
+ if (bitsLeft == 0) {
+ readByte();
+ }
+ return current;
+ }
+
+ void skipInCurrentByte(int bits) throws IOException {
+ assert bitsLeft >= bits;
+ bitsLeft -= bits;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/BitFieldWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/BitFieldWriter.java b/orc/src/java/org/apache/hive/orc/impl/BitFieldWriter.java
new file mode 100644
index 0000000..962f035
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/BitFieldWriter.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+
+public class BitFieldWriter {
+ private RunLengthByteWriter output;
+ private final int bitSize;
+ private byte current = 0;
+ private int bitsLeft = 8;
+
+ public BitFieldWriter(PositionedOutputStream output,
+ int bitSize) throws IOException {
+ this.output = new RunLengthByteWriter(output);
+ this.bitSize = bitSize;
+ }
+
+ private void writeByte() throws IOException {
+ output.write(current);
+ current = 0;
+ bitsLeft = 8;
+ }
+
+ public void flush() throws IOException {
+ if (bitsLeft != 8) {
+ writeByte();
+ }
+ output.flush();
+ }
+
+ public void write(int value) throws IOException {
+ int bitsToWrite = bitSize;
+ while (bitsToWrite > bitsLeft) {
+ // add the bits to the bottom of the current word
+ current |= value >>> (bitsToWrite - bitsLeft);
+ // subtract out the bits we just added
+ bitsToWrite -= bitsLeft;
+ // zero out the bits above bitsToWrite
+ value &= (1 << bitsToWrite) - 1;
+ writeByte();
+ }
+ bitsLeft -= bitsToWrite;
+ current |= value << bitsLeft;
+ if (bitsLeft == 0) {
+ writeByte();
+ }
+ }
+
+ public void getPosition(PositionRecorder recorder) throws IOException {
+ output.getPosition(recorder);
+ recorder.addPosition(8 - bitsLeft);
+ }
+}
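For readers new to this class: write() packs the low bitSize bits of each value, most significant bit first, into bytes. The standalone snippet below (no ORC classes; purely illustrative) performs the same packing arithmetic on a byte array so the layout can be checked by hand:

    import java.io.ByteArrayOutputStream;

    public class BitPackDemo {
      public static void main(String[] args) {
        int bitSize = 1;                              // one bit per value, as used for booleans
        int current = 0;
        int bitsLeft = 8;
        ByteArrayOutputStream packed = new ByteArrayOutputStream();
        int[] values = {1, 0, 1, 1, 1, 0, 0, 1, 1};   // nine bits -> two bytes
        for (int value : values) {
          int bitsToWrite = bitSize;
          while (bitsToWrite > bitsLeft) {            // value straddles a byte boundary
            current |= value >>> (bitsToWrite - bitsLeft);
            bitsToWrite -= bitsLeft;
            value &= (1 << bitsToWrite) - 1;
            packed.write(current);
            current = 0;
            bitsLeft = 8;
          }
          bitsLeft -= bitsToWrite;
          current |= value << bitsLeft;
          if (bitsLeft == 0) {
            packed.write(current);
            current = 0;
            bitsLeft = 8;
          }
        }
        if (bitsLeft != 8) {                          // flush the partially filled last byte
          packed.write(current);
        }
        for (byte b : packed.toByteArray()) {
          System.out.printf("%02x ", b & 0xff);       // prints: b9 80
        }
        System.out.println();
      }
    }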
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/BufferChunk.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/BufferChunk.java b/orc/src/java/org/apache/hive/orc/impl/BufferChunk.java
new file mode 100644
index 0000000..0f59b5d
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/BufferChunk.java
@@ -0,0 +1,85 @@
+package org.apache.hive.orc.impl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.ByteBuffer;
+
+/**
+ * The sections of a stripe that we have read.
+ * This might not match diskRange: one disk range can be split into multiple
+ * buffer chunks, depending on DFS block boundaries.
+ */
+public class BufferChunk extends DiskRangeList {
+
+ private static final Logger LOG =
+ LoggerFactory.getLogger(BufferChunk.class);
+ final ByteBuffer chunk;
+
+ public BufferChunk(ByteBuffer chunk, long offset) {
+ super(offset, offset + chunk.remaining());
+ this.chunk = chunk;
+ }
+
+ public ByteBuffer getChunk() {
+ return chunk;
+ }
+
+ @Override
+ public boolean hasData() {
+ return chunk != null;
+ }
+
+ @Override
+ public final String toString() {
+ boolean makesSense = chunk.remaining() == (end - offset);
+ return "data range [" + offset + ", " + end + "), size: " + chunk.remaining()
+ + (makesSense ? "" : "(!)") + " type: " +
+ (chunk.isDirect() ? "direct" : "array-backed");
+ }
+
+ @Override
+ public DiskRange sliceAndShift(long offset, long end, long shiftBy) {
+ assert offset <= end && offset >= this.offset && end <= this.end;
+ assert offset + shiftBy >= 0;
+ ByteBuffer sliceBuf = chunk.slice();
+ int newPos = (int) (offset - this.offset);
+ int newLimit = newPos + (int) (end - offset);
+ try {
+ sliceBuf.position(newPos);
+ sliceBuf.limit(newLimit);
+ } catch (Throwable t) {
+ LOG.error("Failed to slice buffer chunk with range" + " [" + this.offset + ", " + this.end
+ + "), position: " + chunk.position() + " limit: " + chunk.limit() + ", "
+ + (chunk.isDirect() ? "direct" : "array") + "; to [" + offset + ", " + end + ") "
+ + t.getClass());
+ throw new RuntimeException(t);
+ }
+ return new BufferChunk(sliceBuf, offset + shiftBy);
+ }
+
+ @Override
+ public ByteBuffer getData() {
+ return chunk;
+ }
+}
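A small sketch of the range arithmetic above (illustrative values; assumes hadoop-common's DiskRange types on the classpath, as this class already does):

    import java.nio.ByteBuffer;
    import org.apache.hadoop.hive.common.io.DiskRange;
    import org.apache.hive.orc.impl.BufferChunk;

    public class BufferChunkDemo {
      public static void main(String[] args) {
        // A 100-byte buffer that represents file bytes [1000, 1100).
        BufferChunk chunk = new BufferChunk(ByteBuffer.wrap(new byte[100]), 1000);
        System.out.println(chunk);   // data range [1000, 1100), size: 100 ...
        // Re-window bytes [1010, 1020) of the file, shifted down to a cache-local offset.
        DiskRange slice = chunk.sliceAndShift(1010, 1020, -1000);
        System.out.println(slice);   // data range [10, 20), size: 10 ...
      }
    }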
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/ColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/ColumnStatisticsImpl.java b/orc/src/java/org/apache/hive/orc/impl/ColumnStatisticsImpl.java
new file mode 100644
index 0000000..9cf725d
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/ColumnStatisticsImpl.java
@@ -0,0 +1,1101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.sql.Date;
+import java.sql.Timestamp;
+
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hive.orc.BinaryColumnStatistics;
+import org.apache.hive.orc.BooleanColumnStatistics;
+import org.apache.hive.orc.ColumnStatistics;
+import org.apache.hive.orc.DecimalColumnStatistics;
+import org.apache.hive.orc.DoubleColumnStatistics;
+import org.apache.hive.orc.IntegerColumnStatistics;
+import org.apache.hive.orc.TimestampColumnStatistics;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.DateColumnStatistics;
+import org.apache.hive.orc.OrcProto;
+import org.apache.hive.orc.StringColumnStatistics;
+
+public class ColumnStatisticsImpl implements ColumnStatistics {
+
+ private static final class BooleanStatisticsImpl extends ColumnStatisticsImpl
+ implements BooleanColumnStatistics {
+ private long trueCount = 0;
+
+ BooleanStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.BucketStatistics bkt = stats.getBucketStatistics();
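+ // Boolean columns store the true count as the first (and only) bucket count.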
+ trueCount = bkt.getCount(0);
+ }
+
+ BooleanStatisticsImpl() {
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ trueCount = 0;
+ }
+
+ @Override
+ public void updateBoolean(boolean value, int repetitions) {
+ if (value) {
+ trueCount += repetitions;
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof BooleanStatisticsImpl) {
+ BooleanStatisticsImpl bkt = (BooleanStatisticsImpl) other;
+ trueCount += bkt.trueCount;
+ } else {
+ if (isStatsExists() && trueCount != 0) {
+ throw new IllegalArgumentException("Incompatible merging of boolean column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder builder = super.serialize();
+ OrcProto.BucketStatistics.Builder bucket =
+ OrcProto.BucketStatistics.newBuilder();
+ bucket.addCount(trueCount);
+ builder.setBucketStatistics(bucket);
+ return builder;
+ }
+
+ @Override
+ public long getFalseCount() {
+ return getNumberOfValues() - trueCount;
+ }
+
+ @Override
+ public long getTrueCount() {
+ return trueCount;
+ }
+
+ @Override
+ public String toString() {
+ return super.toString() + " true: " + trueCount;
+ }
+ }
+
+ private static final class IntegerStatisticsImpl extends ColumnStatisticsImpl
+ implements IntegerColumnStatistics {
+
+ private long minimum = Long.MAX_VALUE;
+ private long maximum = Long.MIN_VALUE;
+ private long sum = 0;
+ private boolean hasMinimum = false;
+ private boolean overflow = false;
+
+ IntegerStatisticsImpl() {
+ }
+
+ IntegerStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.IntegerStatistics intStat = stats.getIntStatistics();
+ if (intStat.hasMinimum()) {
+ hasMinimum = true;
+ minimum = intStat.getMinimum();
+ }
+ if (intStat.hasMaximum()) {
+ maximum = intStat.getMaximum();
+ }
+ if (intStat.hasSum()) {
+ sum = intStat.getSum();
+ } else {
+ overflow = true;
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ hasMinimum = false;
+ minimum = Long.MAX_VALUE;
+ maximum = Long.MIN_VALUE;
+ sum = 0;
+ overflow = false;
+ }
+
+ @Override
+ public void updateInteger(long value, int repetitions) {
+ if (!hasMinimum) {
+ hasMinimum = true;
+ minimum = value;
+ maximum = value;
+ } else if (value < minimum) {
+ minimum = value;
+ } else if (value > maximum) {
+ maximum = value;
+ }
+ if (!overflow) {
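+ // Signed overflow can only happen when both operands have the same sign;
+ // detect it by checking whether the sign of the sum flipped.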
+ boolean wasPositive = sum >= 0;
+ sum += value * repetitions;
+ if ((value >= 0) == wasPositive) {
+ overflow = (sum >= 0) != wasPositive;
+ }
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof IntegerStatisticsImpl) {
+ IntegerStatisticsImpl otherInt = (IntegerStatisticsImpl) other;
+ if (!hasMinimum) {
+ hasMinimum = otherInt.hasMinimum;
+ minimum = otherInt.minimum;
+ maximum = otherInt.maximum;
+ } else if (otherInt.hasMinimum) {
+ if (otherInt.minimum < minimum) {
+ minimum = otherInt.minimum;
+ }
+ if (otherInt.maximum > maximum) {
+ maximum = otherInt.maximum;
+ }
+ }
+
+ overflow |= otherInt.overflow;
+ if (!overflow) {
+ boolean wasPositive = sum >= 0;
+ sum += otherInt.sum;
+ if ((otherInt.sum >= 0) == wasPositive) {
+ overflow = (sum >= 0) != wasPositive;
+ }
+ }
+ } else {
+ if (isStatsExists() && hasMinimum) {
+ throw new IllegalArgumentException("Incompatible merging of integer column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder builder = super.serialize();
+ OrcProto.IntegerStatistics.Builder intb =
+ OrcProto.IntegerStatistics.newBuilder();
+ if (hasMinimum) {
+ intb.setMinimum(minimum);
+ intb.setMaximum(maximum);
+ }
+ if (!overflow) {
+ intb.setSum(sum);
+ }
+ builder.setIntStatistics(intb);
+ return builder;
+ }
+
+ @Override
+ public long getMinimum() {
+ return minimum;
+ }
+
+ @Override
+ public long getMaximum() {
+ return maximum;
+ }
+
+ @Override
+ public boolean isSumDefined() {
+ return !overflow;
+ }
+
+ @Override
+ public long getSum() {
+ return sum;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (hasMinimum) {
+ buf.append(" min: ");
+ buf.append(minimum);
+ buf.append(" max: ");
+ buf.append(maximum);
+ }
+ if (!overflow) {
+ buf.append(" sum: ");
+ buf.append(sum);
+ }
+ return buf.toString();
+ }
+ }
+
+ private static final class DoubleStatisticsImpl extends ColumnStatisticsImpl
+ implements DoubleColumnStatistics {
+ private boolean hasMinimum = false;
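+ // Placeholder bounds; they are only meaningful once hasMinimum is set.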
+ private double minimum = Double.MAX_VALUE;
+ private double maximum = Double.MIN_VALUE;
+ private double sum = 0;
+
+ DoubleStatisticsImpl() {
+ }
+
+ DoubleStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.DoubleStatistics dbl = stats.getDoubleStatistics();
+ if (dbl.hasMinimum()) {
+ hasMinimum = true;
+ minimum = dbl.getMinimum();
+ }
+ if (dbl.hasMaximum()) {
+ maximum = dbl.getMaximum();
+ }
+ if (dbl.hasSum()) {
+ sum = dbl.getSum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ hasMinimum = false;
+ minimum = Double.MAX_VALUE;
+ maximum = Double.MIN_VALUE;
+ sum = 0;
+ }
+
+ @Override
+ public void updateDouble(double value) {
+ if (!hasMinimum) {
+ hasMinimum = true;
+ minimum = value;
+ maximum = value;
+ } else if (value < minimum) {
+ minimum = value;
+ } else if (value > maximum) {
+ maximum = value;
+ }
+ sum += value;
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof DoubleStatisticsImpl) {
+ DoubleStatisticsImpl dbl = (DoubleStatisticsImpl) other;
+ if (!hasMinimum) {
+ hasMinimum = dbl.hasMinimum;
+ minimum = dbl.minimum;
+ maximum = dbl.maximum;
+ } else if (dbl.hasMinimum) {
+ if (dbl.minimum < minimum) {
+ minimum = dbl.minimum;
+ }
+ if (dbl.maximum > maximum) {
+ maximum = dbl.maximum;
+ }
+ }
+ sum += dbl.sum;
+ } else {
+ if (isStatsExists() && hasMinimum) {
+ throw new IllegalArgumentException("Incompatible merging of double column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder builder = super.serialize();
+ OrcProto.DoubleStatistics.Builder dbl =
+ OrcProto.DoubleStatistics.newBuilder();
+ if (hasMinimum) {
+ dbl.setMinimum(minimum);
+ dbl.setMaximum(maximum);
+ }
+ dbl.setSum(sum);
+ builder.setDoubleStatistics(dbl);
+ return builder;
+ }
+
+ @Override
+ public double getMinimum() {
+ return minimum;
+ }
+
+ @Override
+ public double getMaximum() {
+ return maximum;
+ }
+
+ @Override
+ public double getSum() {
+ return sum;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (hasMinimum) {
+ buf.append(" min: ");
+ buf.append(minimum);
+ buf.append(" max: ");
+ buf.append(maximum);
+ }
+ buf.append(" sum: ");
+ buf.append(sum);
+ return buf.toString();
+ }
+ }
+
+ protected static final class StringStatisticsImpl extends ColumnStatisticsImpl
+ implements StringColumnStatistics {
+ private Text minimum = null;
+ private Text maximum = null;
+ private long sum = 0;
+
+ StringStatisticsImpl() {
+ }
+
+ StringStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.StringStatistics str = stats.getStringStatistics();
+ if (str.hasMaximum()) {
+ maximum = new Text(str.getMaximum());
+ }
+ if (str.hasMinimum()) {
+ minimum = new Text(str.getMinimum());
+ }
+ if (str.hasSum()) {
+ sum = str.getSum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ minimum = null;
+ maximum = null;
+ sum = 0;
+ }
+
+ @Override
+ public void updateString(Text value) {
+ if (minimum == null) {
+ maximum = minimum = new Text(value);
+ } else if (minimum.compareTo(value) > 0) {
+ minimum = new Text(value);
+ } else if (maximum.compareTo(value) < 0) {
+ maximum = new Text(value);
+ }
+ sum += value.getLength();
+ }
+
+ @Override
+ public void updateString(byte[] bytes, int offset, int length,
+ int repetitions) {
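+ // Text.set copies the bytes, so min/max never alias the caller's buffer.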
+ if (minimum == null) {
+ maximum = minimum = new Text();
+ maximum.set(bytes, offset, length);
+ } else if (WritableComparator.compareBytes(minimum.getBytes(), 0,
+ minimum.getLength(), bytes, offset, length) > 0) {
+ minimum = new Text();
+ minimum.set(bytes, offset, length);
+ } else if (WritableComparator.compareBytes(maximum.getBytes(), 0,
+ maximum.getLength(), bytes, offset, length) < 0) {
+ maximum = new Text();
+ maximum.set(bytes, offset, length);
+ }
+ sum += (long) length * repetitions;
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof StringStatisticsImpl) {
+ StringStatisticsImpl str = (StringStatisticsImpl) other;
+ if (minimum == null) {
+ if (str.minimum != null) {
+ maximum = new Text(str.getMaximum());
+ minimum = new Text(str.getMinimum());
+ } else {
+ /* both are empty */
+ maximum = minimum = null;
+ }
+ } else if (str.minimum != null) {
+ if (minimum.compareTo(str.minimum) > 0) {
+ minimum = new Text(str.getMinimum());
+ }
+ if (maximum.compareTo(str.maximum) < 0) {
+ maximum = new Text(str.getMaximum());
+ }
+ }
+ sum += str.sum;
+ } else {
+ if (isStatsExists() && minimum != null) {
+ throw new IllegalArgumentException("Incompatible merging of string column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.StringStatistics.Builder str =
+ OrcProto.StringStatistics.newBuilder();
+ if (getNumberOfValues() != 0) {
+ str.setMinimum(getMinimum());
+ str.setMaximum(getMaximum());
+ str.setSum(sum);
+ }
+ result.setStringStatistics(str);
+ return result;
+ }
+
+ @Override
+ public String getMinimum() {
+ return minimum == null ? null : minimum.toString();
+ }
+
+ @Override
+ public String getMaximum() {
+ return maximum == null ? null : maximum.toString();
+ }
+
+ @Override
+ public long getSum() {
+ return sum;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" min: ");
+ buf.append(getMinimum());
+ buf.append(" max: ");
+ buf.append(getMaximum());
+ buf.append(" sum: ");
+ buf.append(sum);
+ }
+ return buf.toString();
+ }
+ }
+
+ protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements
+ BinaryColumnStatistics {
+
+ private long sum = 0;
+
+ BinaryStatisticsImpl() {
+ }
+
+ BinaryStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.BinaryStatistics binStats = stats.getBinaryStatistics();
+ if (binStats.hasSum()) {
+ sum = binStats.getSum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ sum = 0;
+ }
+
+ @Override
+ public void updateBinary(BytesWritable value) {
+ sum += value.getLength();
+ }
+
+ @Override
+ public void updateBinary(byte[] bytes, int offset, int length,
+ int repetitions) {
+ sum += (long) length * repetitions;
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof BinaryStatisticsImpl) {
+ BinaryStatisticsImpl bin = (BinaryStatisticsImpl) other;
+ sum += bin.sum;
+ } else {
+ if (isStatsExists() && sum != 0) {
+ throw new IllegalArgumentException("Incompatible merging of binary column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public long getSum() {
+ return sum;
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.BinaryStatistics.Builder bin = OrcProto.BinaryStatistics.newBuilder();
+ bin.setSum(sum);
+ result.setBinaryStatistics(bin);
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" sum: ");
+ buf.append(sum);
+ }
+ return buf.toString();
+ }
+ }
+
+ private static final class DecimalStatisticsImpl extends ColumnStatisticsImpl
+ implements DecimalColumnStatistics {
+
+ // These objects are mutable for better performance.
+ private HiveDecimalWritable minimum = null;
+ private HiveDecimalWritable maximum = null;
+ private HiveDecimalWritable sum = new HiveDecimalWritable(0);
+
+ DecimalStatisticsImpl() {
+ }
+
+ DecimalStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.DecimalStatistics dec = stats.getDecimalStatistics();
+ if (dec.hasMaximum()) {
+ maximum = new HiveDecimalWritable(dec.getMaximum());
+ }
+ if (dec.hasMinimum()) {
+ minimum = new HiveDecimalWritable(dec.getMinimum());
+ }
+ if (dec.hasSum()) {
+ sum = new HiveDecimalWritable(dec.getSum());
+ } else {
+ sum = null;
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ minimum = null;
+ maximum = null;
+ sum = new HiveDecimalWritable(0);
+ }
+
+ @Override
+ public void updateDecimal(HiveDecimalWritable value) {
+ if (minimum == null) {
+ minimum = new HiveDecimalWritable(value);
+ maximum = new HiveDecimalWritable(value);
+ } else if (minimum.compareTo(value) > 0) {
+ minimum.set(value);
+ } else if (maximum.compareTo(value) < 0) {
+ maximum.set(value);
+ }
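+ // sum is null when the total is unknown (e.g. it overflowed upstream), so skip it.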
+ if (sum != null) {
+ sum.mutateAdd(value);
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof DecimalStatisticsImpl) {
+ DecimalStatisticsImpl dec = (DecimalStatisticsImpl) other;
+ if (minimum == null) {
+ minimum = (dec.minimum != null ? new HiveDecimalWritable(dec.minimum) : null);
+ maximum = (dec.maximum != null ? new HiveDecimalWritable(dec.maximum) : null);
+ sum = (dec.sum != null ? new HiveDecimalWritable(dec.sum) : null);
+ } else if (dec.minimum != null) {
+ if (minimum.compareTo(dec.minimum) > 0) {
+ minimum.set(dec.minimum);
+ }
+ if (maximum.compareTo(dec.maximum) < 0) {
+ maximum.set(dec.maximum);
+ }
+ if (sum == null || dec.sum == null) {
+ sum = null;
+ } else {
+ sum.mutateAdd(dec.sum);
+ }
+ }
+ } else {
+ if (isStatsExists() && minimum != null) {
+ throw new IllegalArgumentException("Incompatible merging of decimal column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.DecimalStatistics.Builder dec =
+ OrcProto.DecimalStatistics.newBuilder();
+ if (getNumberOfValues() != 0 && minimum != null) {
+ dec.setMinimum(minimum.toString());
+ dec.setMaximum(maximum.toString());
+ }
+ // Check isSet for overflow.
+ if (sum != null && sum.isSet()) {
+ dec.setSum(sum.toString());
+ }
+ result.setDecimalStatistics(dec);
+ return result;
+ }
+
+ @Override
+ public HiveDecimal getMinimum() {
+ return minimum == null ? null : minimum.getHiveDecimal();
+ }
+
+ @Override
+ public HiveDecimal getMaximum() {
+ return maximum == null ? null : maximum.getHiveDecimal();
+ }
+
+ @Override
+ public HiveDecimal getSum() {
+ return sum == null ? null : sum.getHiveDecimal();
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" min: ");
+ buf.append(minimum);
+ buf.append(" max: ");
+ buf.append(maximum);
+ if (sum != null) {
+ buf.append(" sum: ");
+ buf.append(sum);
+ }
+ }
+ return buf.toString();
+ }
+ }
+
+ private static final class DateStatisticsImpl extends ColumnStatisticsImpl
+ implements DateColumnStatistics {
+ private Integer minimum = null;
+ private Integer maximum = null;
+
+ DateStatisticsImpl() {
+ }
+
+ DateStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.DateStatistics dateStats = stats.getDateStatistics();
+ // min,max values serialized/deserialized as int (days since epoch)
+ if (dateStats.hasMaximum()) {
+ maximum = dateStats.getMaximum();
+ }
+ if (dateStats.hasMinimum()) {
+ minimum = dateStats.getMinimum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ minimum = null;
+ maximum = null;
+ }
+
+ @Override
+ public void updateDate(DateWritable value) {
+ if (minimum == null) {
+ minimum = value.getDays();
+ maximum = value.getDays();
+ } else if (minimum > value.getDays()) {
+ minimum = value.getDays();
+ } else if (maximum < value.getDays()) {
+ maximum = value.getDays();
+ }
+ }
+
+ @Override
+ public void updateDate(int value) {
+ if (minimum == null) {
+ minimum = value;
+ maximum = value;
+ } else if (minimum > value) {
+ minimum = value;
+ } else if (maximum < value) {
+ maximum = value;
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof DateStatisticsImpl) {
+ DateStatisticsImpl dateStats = (DateStatisticsImpl) other;
+ if (minimum == null) {
+ minimum = dateStats.minimum;
+ maximum = dateStats.maximum;
+ } else if (dateStats.minimum != null) {
+ if (minimum > dateStats.minimum) {
+ minimum = dateStats.minimum;
+ }
+ if (maximum < dateStats.maximum) {
+ maximum = dateStats.maximum;
+ }
+ }
+ } else {
+ if (isStatsExists() && minimum != null) {
+ throw new IllegalArgumentException("Incompatible merging of date column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.DateStatistics.Builder dateStats =
+ OrcProto.DateStatistics.newBuilder();
+ if (getNumberOfValues() != 0 && minimum != null) {
+ dateStats.setMinimum(minimum);
+ dateStats.setMaximum(maximum);
+ }
+ result.setDateStatistics(dateStats);
+ return result;
+ }
+
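+ // Reused wrappers so getMinimum()/getMaximum() don't allocate per call.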
+ private transient final DateWritable minDate = new DateWritable();
+ private transient final DateWritable maxDate = new DateWritable();
+
+ @Override
+ public Date getMinimum() {
+ if (minimum == null) {
+ return null;
+ }
+ minDate.set(minimum);
+ return minDate.get();
+ }
+
+ @Override
+ public Date getMaximum() {
+ if (maximum == null) {
+ return null;
+ }
+ maxDate.set(maximum);
+ return maxDate.get();
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" min: ");
+ buf.append(getMinimum());
+ buf.append(" max: ");
+ buf.append(getMaximum());
+ }
+ return buf.toString();
+ }
+ }
+
+ private static final class TimestampStatisticsImpl extends ColumnStatisticsImpl
+ implements TimestampColumnStatistics {
+ private Long minimum = null;
+ private Long maximum = null;
+
+ TimestampStatisticsImpl() {
+ }
+
+ TimestampStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ OrcProto.TimestampStatistics timestampStats = stats.getTimestampStatistics();
+ // min,max values serialized/deserialized as long (milliseconds since epoch)
+ if (timestampStats.hasMaximum()) {
+ maximum = timestampStats.getMaximum();
+ }
+ if (timestampStats.hasMinimum()) {
+ minimum = timestampStats.getMinimum();
+ }
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ minimum = null;
+ maximum = null;
+ }
+
+ @Override
+ public void updateTimestamp(Timestamp value) {
+ if (minimum == null) {
+ minimum = value.getTime();
+ maximum = value.getTime();
+ } else if (minimum > value.getTime()) {
+ minimum = value.getTime();
+ } else if (maximum < value.getTime()) {
+ maximum = value.getTime();
+ }
+ }
+
+ @Override
+ public void updateTimestamp(long value) {
+ if (minimum == null) {
+ minimum = value;
+ maximum = value;
+ } else if (minimum > value) {
+ minimum = value;
+ } else if (maximum < value) {
+ maximum = value;
+ }
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof TimestampStatisticsImpl) {
+ TimestampStatisticsImpl timestampStats = (TimestampStatisticsImpl) other;
+ if (minimum == null) {
+ minimum = timestampStats.minimum;
+ maximum = timestampStats.maximum;
+ } else if (timestampStats.minimum != null) {
+ if (minimum > timestampStats.minimum) {
+ minimum = timestampStats.minimum;
+ }
+ if (maximum < timestampStats.maximum) {
+ maximum = timestampStats.maximum;
+ }
+ }
+ } else {
+ if (isStatsExists() && minimum != null) {
+ throw new IllegalArgumentException("Incompatible merging of timestamp column statistics");
+ }
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder result = super.serialize();
+ OrcProto.TimestampStatistics.Builder timestampStats = OrcProto.TimestampStatistics
+ .newBuilder();
+ if (getNumberOfValues() != 0 && minimum != null) {
+ timestampStats.setMinimum(minimum);
+ timestampStats.setMaximum(maximum);
+ }
+ result.setTimestampStatistics(timestampStats);
+ return result;
+ }
+
+ @Override
+ public Timestamp getMinimum() {
+ return minimum == null ? null : new Timestamp(minimum);
+ }
+
+ @Override
+ public Timestamp getMaximum() {
+ return maximum == null ? null : new Timestamp(maximum);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (getNumberOfValues() != 0) {
+ buf.append(" min: ");
+ buf.append(getMinimum());
+ buf.append(" max: ");
+ buf.append(getMaximum());
+ }
+ return buf.toString();
+ }
+ }
+
+ private long count = 0;
+ private boolean hasNull = false;
+
+ ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ if (stats.hasNumberOfValues()) {
+ count = stats.getNumberOfValues();
+ }
+
+ if (stats.hasHasNull()) {
+ hasNull = stats.getHasNull();
+ } else {
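+ // Files written before hasNull existed don't record it; assume nulls may be present.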
+ hasNull = true;
+ }
+ }
+
+ ColumnStatisticsImpl() {
+ }
+
+ public void increment() {
+ count += 1;
+ }
+
+ public void increment(int count) {
+ this.count += count;
+ }
+
+ public void setNull() {
+ hasNull = true;
+ }
+
+ public void updateBoolean(boolean value, int repetitions) {
+ throw new UnsupportedOperationException("Can't update boolean");
+ }
+
+ public void updateInteger(long value, int repetitions) {
+ throw new UnsupportedOperationException("Can't update integer");
+ }
+
+ public void updateDouble(double value) {
+ throw new UnsupportedOperationException("Can't update double");
+ }
+
+ public void updateString(Text value) {
+ throw new UnsupportedOperationException("Can't update string");
+ }
+
+ public void updateString(byte[] bytes, int offset, int length,
+ int repetitions) {
+ throw new UnsupportedOperationException("Can't update string");
+ }
+
+ public void updateBinary(BytesWritable value) {
+ throw new UnsupportedOperationException("Can't update binary");
+ }
+
+ public void updateBinary(byte[] bytes, int offset, int length,
+ int repetitions) {
+ throw new UnsupportedOperationException("Can't update string");
+ }
+
+ public void updateDecimal(HiveDecimalWritable value) {
+ throw new UnsupportedOperationException("Can't update decimal");
+ }
+
+ public void updateDate(DateWritable value) {
+ throw new UnsupportedOperationException("Can't update date");
+ }
+
+ public void updateDate(int value) {
+ throw new UnsupportedOperationException("Can't update date");
+ }
+
+ public void updateTimestamp(Timestamp value) {
+ throw new UnsupportedOperationException("Can't update timestamp");
+ }
+
+ public void updateTimestamp(long value) {
+ throw new UnsupportedOperationException("Can't update timestamp");
+ }
+
+ public boolean isStatsExists() {
+ return count > 0 || hasNull;
+ }
+
+ public void merge(ColumnStatisticsImpl stats) {
+ count += stats.count;
+ hasNull |= stats.hasNull;
+ }
+
+ public void reset() {
+ count = 0;
+ hasNull = false;
+ }
+
+ @Override
+ public long getNumberOfValues() {
+ return count;
+ }
+
+ @Override
+ public boolean hasNull() {
+ return hasNull;
+ }
+
+ @Override
+ public String toString() {
+ return "count: " + count + " hasNull: " + hasNull;
+ }
+
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder builder =
+ OrcProto.ColumnStatistics.newBuilder();
+ builder.setNumberOfValues(count);
+ builder.setHasNull(hasNull);
+ return builder;
+ }
+
+ public static ColumnStatisticsImpl create(TypeDescription schema) {
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ return new BooleanStatisticsImpl();
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new IntegerStatisticsImpl();
+ case FLOAT:
+ case DOUBLE:
+ return new DoubleStatisticsImpl();
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return new StringStatisticsImpl();
+ case DECIMAL:
+ return new DecimalStatisticsImpl();
+ case DATE:
+ return new DateStatisticsImpl();
+ case TIMESTAMP:
+ return new TimestampStatisticsImpl();
+ case BINARY:
+ return new BinaryStatisticsImpl();
+ default:
+ return new ColumnStatisticsImpl();
+ }
+ }
+
+ public static ColumnStatisticsImpl deserialize(OrcProto.ColumnStatistics stats) {
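+ // Choose the implementation based on which type-specific field is populated.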
+ if (stats.hasBucketStatistics()) {
+ return new BooleanStatisticsImpl(stats);
+ } else if (stats.hasIntStatistics()) {
+ return new IntegerStatisticsImpl(stats);
+ } else if (stats.hasDoubleStatistics()) {
+ return new DoubleStatisticsImpl(stats);
+ } else if (stats.hasStringStatistics()) {
+ return new StringStatisticsImpl(stats);
+ } else if (stats.hasDecimalStatistics()) {
+ return new DecimalStatisticsImpl(stats);
+ } else if (stats.hasDateStatistics()) {
+ return new DateStatisticsImpl(stats);
+ } else if (stats.hasTimestampStatistics()) {
+ return new TimestampStatisticsImpl(stats);
+ } else if (stats.hasBinaryStatistics()) {
+ return new BinaryStatisticsImpl(stats);
+ } else {
+ return new ColumnStatisticsImpl(stats);
+ }
+ }
+}
[03/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestBitFieldReader.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestBitFieldReader.java b/orc/src/test/org/apache/orc/impl/TestBitFieldReader.java
deleted file mode 100644
index e4c6f6b..0000000
--- a/orc/src/test/org/apache/orc/impl/TestBitFieldReader.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.nio.ByteBuffer;
-
-import org.apache.orc.CompressionCodec;
-import org.junit.Test;
-
-public class TestBitFieldReader {
-
- public void runSeekTest(CompressionCodec codec) throws Exception {
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- final int COUNT = 16384;
- BitFieldWriter out = new BitFieldWriter(
- new OutStream("test", 500, codec, collect), 1);
- TestInStream.PositionCollector[] positions =
- new TestInStream.PositionCollector[COUNT];
- for(int i=0; i < COUNT; ++i) {
- positions[i] = new TestInStream.PositionCollector();
- out.getPosition(positions[i]);
- // test runs, non-runs
- if (i < COUNT / 2) {
- out.write(i & 1);
- } else {
- out.write((i/3) & 1);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- BitFieldReader in = new BitFieldReader(InStream.create("test",
- new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(),
- codec, 500), 1);
- for(int i=0; i < COUNT; ++i) {
- int x = in.next();
- if (i < COUNT / 2) {
- assertEquals(i & 1, x);
- } else {
- assertEquals((i/3) & 1, x);
- }
- }
- for(int i=COUNT-1; i >= 0; --i) {
- in.seek(positions[i]);
- int x = in.next();
- if (i < COUNT / 2) {
- assertEquals(i & 1, x);
- } else {
- assertEquals((i/3) & 1, x);
- }
- }
- }
-
- @Test
- public void testUncompressedSeek() throws Exception {
- runSeekTest(null);
- }
-
- @Test
- public void testCompressedSeek() throws Exception {
- runSeekTest(new ZlibCodec());
- }
-
- @Test
- public void testBiggerItems() throws Exception {
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- final int COUNT = 16384;
- BitFieldWriter out = new BitFieldWriter(
- new OutStream("test", 500, null, collect), 3);
- for(int i=0; i < COUNT; ++i) {
- // test runs, non-runs
- if (i < COUNT / 2) {
- out.write(i & 7);
- } else {
- out.write((i/3) & 7);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- BitFieldReader in = new BitFieldReader(InStream.create("test",
- new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(),
- null, 500), 3);
- for(int i=0; i < COUNT; ++i) {
- int x = in.next();
- if (i < COUNT / 2) {
- assertEquals(i & 7, x);
- } else {
- assertEquals((i/3) & 7, x);
- }
- }
- }
-
- @Test
- public void testSkips() throws Exception {
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- BitFieldWriter out = new BitFieldWriter(
- new OutStream("test", 100, null, collect), 1);
- final int COUNT = 16384;
- for(int i=0; i < COUNT; ++i) {
- if (i < COUNT/2) {
- out.write(i & 1);
- } else {
- out.write((i/3) & 1);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- BitFieldReader in = new BitFieldReader(InStream.create("test", new ByteBuffer[]{inBuf},
- new long[]{0}, inBuf.remaining(), null, 100), 1);
- for(int i=0; i < COUNT; i += 5) {
- int x = (int) in.next();
- if (i < COUNT/2) {
- assertEquals(i & 1, x);
- } else {
- assertEquals((i/3) & 1, x);
- }
- if (i < COUNT - 5) {
- in.skip(4);
- }
- in.skip(0);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestBitPack.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestBitPack.java b/orc/src/test/org/apache/orc/impl/TestBitPack.java
deleted file mode 100644
index f2d3d64..0000000
--- a/orc/src/test/org/apache/orc/impl/TestBitPack.java
+++ /dev/null
@@ -1,279 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import static org.junit.Assert.assertArrayEquals;
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-import com.google.common.primitives.Longs;
-
-public class TestBitPack {
-
- private static final int SIZE = 100;
- private static Random rand = new Random(100);
- Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
- + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- private long[] deltaEncode(long[] inp) {
- long[] output = new long[inp.length];
- SerializationUtils utils = new SerializationUtils();
- for (int i = 0; i < inp.length; i++) {
- output[i] = utils.zigzagEncode(inp[i]);
- }
- return output;
- }
-
- private long nextLong(Random rng, long n) {
- long bits, val;
- do {
- bits = (rng.nextLong() << 1) >>> 1;
- val = bits % n;
- } while (bits - val + (n - 1) < 0L);
- return val;
- }
-
- private void runTest(int numBits) throws IOException {
- long[] inp = new long[SIZE];
- for (int i = 0; i < SIZE; i++) {
- long val = 0;
- if (numBits <= 32) {
- if (numBits == 1) {
- val = -1 * rand.nextInt(2);
- } else {
- val = rand.nextInt((int) Math.pow(2, numBits - 1));
- }
- } else {
- val = nextLong(rand, (long) Math.pow(2, numBits - 2));
- }
- if (val % 2 == 0) {
- val = -val;
- }
- inp[i] = val;
- }
- long[] deltaEncoded = deltaEncode(inp);
- long minInput = Collections.min(Longs.asList(deltaEncoded));
- long maxInput = Collections.max(Longs.asList(deltaEncoded));
- long rangeInput = maxInput - minInput;
- SerializationUtils utils = new SerializationUtils();
- int fixedWidth = utils.findClosestNumBits(rangeInput);
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- OutStream output = new OutStream("test", SIZE, null, collect);
- utils.writeInts(deltaEncoded, 0, deltaEncoded.length, fixedWidth, output);
- output.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- long[] buff = new long[SIZE];
- utils.readInts(buff, 0, SIZE, fixedWidth, InStream.create("test", new ByteBuffer[] { inBuf },
- new long[] { 0 }, inBuf.remaining(), null, SIZE));
- for (int i = 0; i < SIZE; i++) {
- buff[i] = utils.zigzagDecode(buff[i]);
- }
- assertEquals(numBits, fixedWidth);
- assertArrayEquals(inp, buff);
- }
-
- @Test
- public void test01BitPacking1Bit() throws IOException {
- runTest(1);
- }
-
- @Test
- public void test02BitPacking2Bit() throws IOException {
- runTest(2);
- }
-
- @Test
- public void test03BitPacking3Bit() throws IOException {
- runTest(3);
- }
-
- @Test
- public void test04BitPacking4Bit() throws IOException {
- runTest(4);
- }
-
- @Test
- public void test05BitPacking5Bit() throws IOException {
- runTest(5);
- }
-
- @Test
- public void test06BitPacking6Bit() throws IOException {
- runTest(6);
- }
-
- @Test
- public void test07BitPacking7Bit() throws IOException {
- runTest(7);
- }
-
- @Test
- public void test08BitPacking8Bit() throws IOException {
- runTest(8);
- }
-
- @Test
- public void test09BitPacking9Bit() throws IOException {
- runTest(9);
- }
-
- @Test
- public void test10BitPacking10Bit() throws IOException {
- runTest(10);
- }
-
- @Test
- public void test11BitPacking11Bit() throws IOException {
- runTest(11);
- }
-
- @Test
- public void test12BitPacking12Bit() throws IOException {
- runTest(12);
- }
-
- @Test
- public void test13BitPacking13Bit() throws IOException {
- runTest(13);
- }
-
- @Test
- public void test14BitPacking14Bit() throws IOException {
- runTest(14);
- }
-
- @Test
- public void test15BitPacking15Bit() throws IOException {
- runTest(15);
- }
-
- @Test
- public void test16BitPacking16Bit() throws IOException {
- runTest(16);
- }
-
- @Test
- public void test17BitPacking17Bit() throws IOException {
- runTest(17);
- }
-
- @Test
- public void test18BitPacking18Bit() throws IOException {
- runTest(18);
- }
-
- @Test
- public void test19BitPacking19Bit() throws IOException {
- runTest(19);
- }
-
- @Test
- public void test20BitPacking20Bit() throws IOException {
- runTest(20);
- }
-
- @Test
- public void test21BitPacking21Bit() throws IOException {
- runTest(21);
- }
-
- @Test
- public void test22BitPacking22Bit() throws IOException {
- runTest(22);
- }
-
- @Test
- public void test23BitPacking23Bit() throws IOException {
- runTest(23);
- }
-
- @Test
- public void test24BitPacking24Bit() throws IOException {
- runTest(24);
- }
-
- @Test
- public void test26BitPacking26Bit() throws IOException {
- runTest(26);
- }
-
- @Test
- public void test28BitPacking28Bit() throws IOException {
- runTest(28);
- }
-
- @Test
- public void test30BitPacking30Bit() throws IOException {
- runTest(30);
- }
-
- @Test
- public void test32BitPacking32Bit() throws IOException {
- runTest(32);
- }
-
- @Test
- public void test40BitPacking40Bit() throws IOException {
- runTest(40);
- }
-
- @Test
- public void test48BitPacking48Bit() throws IOException {
- runTest(48);
- }
-
- @Test
- public void test56BitPacking56Bit() throws IOException {
- runTest(56);
- }
-
- @Test
- public void test64BitPacking64Bit() throws IOException {
- runTest(64);
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/orc/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
deleted file mode 100644
index 6165526..0000000
--- a/orc/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.orc.OrcProto;
-import org.apache.orc.TypeDescription;
-import org.junit.Test;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-public class TestColumnStatisticsImpl {
-
- @Test
- public void testUpdateDate() throws Exception {
- ColumnStatisticsImpl stat = ColumnStatisticsImpl.create(TypeDescription.createDate());
- DateWritable date = new DateWritable(16400);
- stat.increment();
- stat.updateDate(date);
- assertDateStatistics(stat, 1, 16400, 16400);
-
- date.set(16410);
- stat.increment();
- stat.updateDate(date);
- assertDateStatistics(stat, 2, 16400, 16410);
-
- date.set(16420);
- stat.increment();
- stat.updateDate(date);
- assertDateStatistics(stat, 3, 16400, 16420);
- }
-
- private void assertDateStatistics(ColumnStatisticsImpl stat, int count, int minimum, int maximum) {
- OrcProto.ColumnStatistics.Builder builder = stat.serialize();
-
- assertEquals(count, builder.getNumberOfValues());
- assertTrue(builder.hasDateStatistics());
- assertFalse(builder.hasStringStatistics());
-
- OrcProto.DateStatistics protoStat = builder.getDateStatistics();
- assertTrue(protoStat.hasMinimum());
- assertEquals(minimum, protoStat.getMinimum());
- assertTrue(protoStat.hasMaximum());
- assertEquals(maximum, protoStat.getMaximum());
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestDataReaderProperties.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestDataReaderProperties.java b/orc/src/test/org/apache/orc/impl/TestDataReaderProperties.java
deleted file mode 100644
index 46546b0..0000000
--- a/orc/src/test/org/apache/orc/impl/TestDataReaderProperties.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.orc.CompressionCodec;
-import org.apache.orc.CompressionKind;
-import org.junit.Test;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
-import static org.mockito.Mockito.mock;
-
-public class TestDataReaderProperties {
-
- private FileSystem mockedFileSystem = mock(FileSystem.class);
- private Path mockedPath = mock(Path.class);
- private boolean mockedZeroCopy = false;
-
- @Test
- public void testCompleteBuild() {
- DataReaderProperties properties = DataReaderProperties.builder()
- .withFileSystem(mockedFileSystem)
- .withPath(mockedPath)
- .withCompression(CompressionKind.ZLIB)
- .withZeroCopy(mockedZeroCopy)
- .build();
- assertEquals(mockedFileSystem, properties.getFileSystem());
- assertEquals(mockedPath, properties.getPath());
- assertEquals(CompressionKind.ZLIB, properties.getCompression());
- assertEquals(mockedZeroCopy, properties.getZeroCopy());
- }
-
- @Test
- public void testMissingNonRequiredArgs() {
- DataReaderProperties properties = DataReaderProperties.builder()
- .withFileSystem(mockedFileSystem)
- .withPath(mockedPath)
- .build();
- assertEquals(mockedFileSystem, properties.getFileSystem());
- assertEquals(mockedPath, properties.getPath());
- assertNull(properties.getCompression());
- assertFalse(properties.getZeroCopy());
- }
-
- @Test(expected = java.lang.NullPointerException.class)
- public void testEmptyBuild() {
- DataReaderProperties.builder().build();
- }
-
- @Test(expected = java.lang.NullPointerException.class)
- public void testMissingPath() {
- DataReaderProperties.builder()
- .withFileSystem(mockedFileSystem)
- .withCompression(CompressionKind.NONE)
- .withZeroCopy(mockedZeroCopy)
- .build();
- }
-
- @Test(expected = java.lang.NullPointerException.class)
- public void testMissingFileSystem() {
- DataReaderProperties.builder()
- .withPath(mockedPath)
- .withCompression(CompressionKind.NONE)
- .withZeroCopy(mockedZeroCopy)
- .build();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestDynamicArray.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestDynamicArray.java b/orc/src/test/org/apache/orc/impl/TestDynamicArray.java
deleted file mode 100644
index af583f7..0000000
--- a/orc/src/test/org/apache/orc/impl/TestDynamicArray.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.util.Random;
-
-import org.apache.orc.impl.DynamicByteArray;
-import org.apache.orc.impl.DynamicIntArray;
-import org.junit.Test;
-
-import static org.junit.Assert.assertEquals;
-
-public class TestDynamicArray {
-
- @Test
- public void testByteArray() throws Exception {
- DynamicByteArray dba = new DynamicByteArray(3, 10);
- dba.add((byte) 0);
- dba.add((byte) 1);
- dba.set(3, (byte) 3);
- dba.set(2, (byte) 2);
- dba.add((byte) 4);
- assertEquals("{0,1,2,3,4}", dba.toString());
- assertEquals(5, dba.size());
- byte[] val;
- val = new byte[0];
- assertEquals(0, dba.compare(val, 0, 0, 2, 0));
- assertEquals(-1, dba.compare(val, 0, 0, 2, 1));
- val = new byte[]{3,42};
- assertEquals(1, dba.compare(val, 0, 1, 2, 0));
- assertEquals(1, dba.compare(val, 0, 1, 2, 1));
- assertEquals(0, dba.compare(val, 0, 1, 3, 1));
- assertEquals(-1, dba.compare(val, 0, 1, 3, 2));
- assertEquals(1, dba.compare(val, 0, 2, 3, 1));
- val = new byte[256];
- for(int b=-128; b < 128; ++b) {
- dba.add((byte) b);
- val[b+128] = (byte) b;
- }
- assertEquals(0, dba.compare(val, 0, 256, 5, 256));
- assertEquals(1, dba.compare(val, 0, 1, 0, 1));
- assertEquals(1, dba.compare(val, 254, 1, 0, 1));
- assertEquals(1, dba.compare(val, 120, 1, 64, 1));
- val = new byte[1024];
- Random rand = new Random(1701);
- for(int i = 0; i < val.length; ++i) {
- rand.nextBytes(val);
- }
- dba.add(val, 0, 1024);
- assertEquals(1285, dba.size());
- assertEquals(0, dba.compare(val, 0, 1024, 261, 1024));
- }
-
- @Test
- public void testIntArray() throws Exception {
- DynamicIntArray dia = new DynamicIntArray(10);
- for(int i=0; i < 10000; ++i) {
- dia.add(2*i);
- }
- assertEquals(10000, dia.size());
- for(int i=0; i < 10000; ++i) {
- assertEquals(2*i, dia.get(i));
- }
- dia.clear();
- assertEquals(0, dia.size());
- dia.add(3);
- dia.add(12);
- dia.add(65);
- assertEquals("{3,12,65}", dia.toString());
- for(int i=0; i < 5; ++i) {
- dia.increment(i, 3);
- }
- assertEquals("{6,15,68,3,3}", dia.toString());
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestInStream.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestInStream.java b/orc/src/test/org/apache/orc/impl/TestInStream.java
deleted file mode 100644
index 9e65345..0000000
--- a/orc/src/test/org/apache/orc/impl/TestInStream.java
+++ /dev/null
@@ -1,314 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.fail;
-
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.orc.CompressionCodec;
-import org.junit.Test;
-
-public class TestInStream {
-
- static class OutputCollector implements OutStream.OutputReceiver {
- DynamicByteArray buffer = new DynamicByteArray();
-
- @Override
- public void output(ByteBuffer buffer) throws IOException {
- this.buffer.add(buffer.array(), buffer.arrayOffset() + buffer.position(),
- buffer.remaining());
- }
- }
-
- static class PositionCollector
- implements PositionProvider, PositionRecorder {
- private List<Long> positions = new ArrayList<Long>();
- private int index = 0;
-
- @Override
- public long getNext() {
- return positions.get(index++);
- }
-
- @Override
- public void addPosition(long offset) {
- positions.add(offset);
- }
-
- public void reset() {
- index = 0;
- }
-
- @Override
- public String toString() {
- StringBuilder builder = new StringBuilder("position: ");
- for(int i=0; i < positions.size(); ++i) {
- if (i != 0) {
- builder.append(", ");
- }
- builder.append(positions.get(i));
- }
- return builder.toString();
- }
- }
-
- @Test
- public void testUncompressed() throws Exception {
- OutputCollector collect = new OutputCollector();
- OutStream out = new OutStream("test", 100, null, collect);
- PositionCollector[] positions = new PositionCollector[1024];
- for(int i=0; i < 1024; ++i) {
- positions[i] = new PositionCollector();
- out.getPosition(positions[i]);
- out.write(i);
- }
- out.flush();
- assertEquals(1024, collect.buffer.size());
- for(int i=0; i < 1024; ++i) {
- assertEquals((byte) i, collect.buffer.get(i));
- }
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- InStream in = InStream.create("test", new ByteBuffer[]{inBuf},
- new long[]{0}, inBuf.remaining(), null, 100);
- assertEquals("uncompressed stream test position: 0 length: 1024" +
- " range: 0 offset: 0 limit: 0",
- in.toString());
- for(int i=0; i < 1024; ++i) {
- int x = in.read();
- assertEquals(i & 0xff, x);
- }
- for(int i=1023; i >= 0; --i) {
- in.seek(positions[i]);
- assertEquals(i & 0xff, in.read());
- }
- }
-
- @Test
- public void testCompressed() throws Exception {
- OutputCollector collect = new OutputCollector();
- CompressionCodec codec = new ZlibCodec();
- OutStream out = new OutStream("test", 300, codec, collect);
- PositionCollector[] positions = new PositionCollector[1024];
- for(int i=0; i < 1024; ++i) {
- positions[i] = new PositionCollector();
- out.getPosition(positions[i]);
- out.write(i);
- }
- out.flush();
- assertEquals("test", out.toString());
- assertEquals(961, collect.buffer.size());
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- InStream in = InStream.create("test", new ByteBuffer[]{inBuf},
- new long[]{0}, inBuf.remaining(), codec, 300);
- assertEquals("compressed stream test position: 0 length: 961 range: 0" +
- " offset: 0 limit: 0 range 0 = 0 to 961",
- in.toString());
- for(int i=0; i < 1024; ++i) {
- int x = in.read();
- assertEquals(i & 0xff, x);
- }
- assertEquals(0, in.available());
- for(int i=1023; i >= 0; --i) {
- in.seek(positions[i]);
- assertEquals(i & 0xff, in.read());
- }
- }
-
- @Test
- public void testCorruptStream() throws Exception {
- OutputCollector collect = new OutputCollector();
- CompressionCodec codec = new ZlibCodec();
- OutStream out = new OutStream("test", 500, codec, collect);
- PositionCollector[] positions = new PositionCollector[1024];
- for(int i=0; i < 1024; ++i) {
- positions[i] = new PositionCollector();
- out.getPosition(positions[i]);
- out.write(i);
- }
- out.flush();
-
- // now try to read the stream with a buffer that is too small
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- InStream in = InStream.create("test", new ByteBuffer[]{inBuf},
- new long[]{0}, inBuf.remaining(), codec, 100);
- byte[] contents = new byte[1024];
- try {
- in.read(contents);
- fail();
- } catch(IllegalArgumentException iae) {
- // EXPECTED
- }
-
- // make a corrupted header
- inBuf.clear();
- inBuf.put((byte) 32);
- inBuf.put((byte) 0);
- inBuf.flip();
- in = InStream.create("test2", new ByteBuffer[]{inBuf}, new long[]{0},
- inBuf.remaining(), codec, 300);
- try {
- in.read();
- fail();
- } catch (IllegalStateException ise) {
- // EXPECTED
- }
- }
-
- @Test
- public void testDisjointBuffers() throws Exception {
- OutputCollector collect = new OutputCollector();
- CompressionCodec codec = new ZlibCodec();
- OutStream out = new OutStream("test", 400, codec, collect);
- PositionCollector[] positions = new PositionCollector[1024];
- DataOutput stream = new DataOutputStream(out);
- for(int i=0; i < 1024; ++i) {
- positions[i] = new PositionCollector();
- out.getPosition(positions[i]);
- stream.writeInt(i);
- }
- out.flush();
- assertEquals("test", out.toString());
- assertEquals(1674, collect.buffer.size());
- ByteBuffer[] inBuf = new ByteBuffer[3];
- inBuf[0] = ByteBuffer.allocate(500);
- inBuf[1] = ByteBuffer.allocate(1200);
- inBuf[2] = ByteBuffer.allocate(500);
- collect.buffer.setByteBuffer(inBuf[0], 0, 483);
- collect.buffer.setByteBuffer(inBuf[1], 483, 1625 - 483);
- collect.buffer.setByteBuffer(inBuf[2], 1625, 1674 - 1625);
-
- for(int i=0; i < inBuf.length; ++i) {
- inBuf[i].flip();
- }
- InStream in = InStream.create("test", inBuf,
- new long[]{0,483, 1625}, 1674, codec, 400);
- assertEquals("compressed stream test position: 0 length: 1674 range: 0" +
- " offset: 0 limit: 0 range 0 = 0 to 483;" +
- " range 1 = 483 to 1142; range 2 = 1625 to 49",
- in.toString());
- DataInputStream inStream = new DataInputStream(in);
- for(int i=0; i < 1024; ++i) {
- int x = inStream.readInt();
- assertEquals(i, x);
- }
- assertEquals(0, in.available());
- for(int i=1023; i >= 0; --i) {
- in.seek(positions[i]);
- assertEquals(i, inStream.readInt());
- }
-
- in = InStream.create("test", new ByteBuffer[]{inBuf[1], inBuf[2]},
- new long[]{483, 1625}, 1674, codec, 400);
- inStream = new DataInputStream(in);
- positions[303].reset();
- in.seek(positions[303]);
- for(int i=303; i < 1024; ++i) {
- assertEquals(i, inStream.readInt());
- }
-
- in = InStream.create("test", new ByteBuffer[]{inBuf[0], inBuf[2]},
- new long[]{0, 1625}, 1674, codec, 400);
- inStream = new DataInputStream(in);
- positions[1001].reset();
- for(int i=0; i < 300; ++i) {
- assertEquals(i, inStream.readInt());
- }
- in.seek(positions[1001]);
- for(int i=1001; i < 1024; ++i) {
- assertEquals(i, inStream.readInt());
- }
- }
-
- @Test
- public void testUncompressedDisjointBuffers() throws Exception {
- OutputCollector collect = new OutputCollector();
- OutStream out = new OutStream("test", 400, null, collect);
- PositionCollector[] positions = new PositionCollector[1024];
- DataOutput stream = new DataOutputStream(out);
- for(int i=0; i < 1024; ++i) {
- positions[i] = new PositionCollector();
- out.getPosition(positions[i]);
- stream.writeInt(i);
- }
- out.flush();
- assertEquals("test", out.toString());
- assertEquals(4096, collect.buffer.size());
- ByteBuffer[] inBuf = new ByteBuffer[3];
- inBuf[0] = ByteBuffer.allocate(1100);
- inBuf[1] = ByteBuffer.allocate(2200);
- inBuf[2] = ByteBuffer.allocate(1100);
- collect.buffer.setByteBuffer(inBuf[0], 0, 1024);
- collect.buffer.setByteBuffer(inBuf[1], 1024, 2048);
- collect.buffer.setByteBuffer(inBuf[2], 3072, 1024);
-
- for(int i=0; i < inBuf.length; ++i) {
- inBuf[i].flip();
- }
- InStream in = InStream.create("test", inBuf,
- new long[]{0, 1024, 3072}, 4096, null, 400);
- assertEquals("uncompressed stream test position: 0 length: 4096" +
- " range: 0 offset: 0 limit: 0",
- in.toString());
- DataInputStream inStream = new DataInputStream(in);
- for(int i=0; i < 1024; ++i) {
- int x = inStream.readInt();
- assertEquals(i, x);
- }
- assertEquals(0, in.available());
- for(int i=1023; i >= 0; --i) {
- in.seek(positions[i]);
- assertEquals(i, inStream.readInt());
- }
-
- in = InStream.create("test", new ByteBuffer[]{inBuf[1], inBuf[2]},
- new long[]{1024, 3072}, 4096, null, 400);
- inStream = new DataInputStream(in);
- positions[256].reset();
- in.seek(positions[256]);
- for(int i=256; i < 1024; ++i) {
- assertEquals(i, inStream.readInt());
- }
-
- in = InStream.create("test", new ByteBuffer[]{inBuf[0], inBuf[2]},
- new long[]{0, 3072}, 4096, null, 400);
- inStream = new DataInputStream(in);
- positions[768].reset();
- for(int i=0; i < 256; ++i) {
- assertEquals(i, inStream.readInt());
- }
- in.seek(positions[768]);
- for(int i=768; i < 1024; ++i) {
- assertEquals(i, inStream.readInt());
- }
- }
-}
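
The disjoint-buffer tests above exercise the InStream.create(name, buffers, offsets, length, codec, bufferSize) overload with ranges that need not cover the whole stream. Below is a minimal read-side sketch of that call shape, kept in the same package as the removed tests so that InStream resolves; the helper class is invented for illustration, and its gap comment only restates what the positions[303] and positions[1001] cases already demonstrate.

package org.apache.orc.impl;

import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.orc.CompressionCodec;

final class DisjointRangeReaderSketch {
  // ranges[i] holds the stream bytes that start at stream offset offsets[i]; the ranges
  // may leave gaps, as long as no read or seek ever lands inside a gap.
  static DataInputStream open(String name, ByteBuffer[] ranges, long[] offsets,
                              long totalLength, CompressionCodec codec,
                              int bufferSize) throws IOException {
    return new DataInputStream(
        InStream.create(name, ranges, offsets, totalLength, codec, bufferSize));
  }
}
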
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestIntegerCompressionReader.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestIntegerCompressionReader.java b/orc/src/test/org/apache/orc/impl/TestIntegerCompressionReader.java
deleted file mode 100644
index 399f35e..0000000
--- a/orc/src/test/org/apache/orc/impl/TestIntegerCompressionReader.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.nio.ByteBuffer;
-import java.util.Random;
-
-import org.apache.orc.CompressionCodec;
-import org.junit.Test;
-
-public class TestIntegerCompressionReader {
-
- public void runSeekTest(CompressionCodec codec) throws Exception {
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- RunLengthIntegerWriterV2 out = new RunLengthIntegerWriterV2(
- new OutStream("test", 1000, codec, collect), true);
- TestInStream.PositionCollector[] positions =
- new TestInStream.PositionCollector[4096];
- Random random = new Random(99);
- int[] junk = new int[2048];
- for(int i=0; i < junk.length; ++i) {
- junk[i] = random.nextInt();
- }
- for(int i=0; i < 4096; ++i) {
- positions[i] = new TestInStream.PositionCollector();
- out.getPosition(positions[i]);
- // test runs, incrementing runs, non-runs
- if (i < 1024) {
- out.write(i/4);
- } else if (i < 2048) {
- out.write(2*i);
- } else {
- out.write(junk[i-2048]);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- RunLengthIntegerReaderV2 in =
- new RunLengthIntegerReaderV2(InStream.create
- ("test", new ByteBuffer[]{inBuf},
- new long[]{0}, inBuf.remaining(),
- codec, 1000), true, false);
- for(int i=0; i < 2048; ++i) {
- int x = (int) in.next();
- if (i < 1024) {
- assertEquals(i/4, x);
- } else if (i < 2048) {
- assertEquals(2*i, x);
- } else {
- assertEquals(junk[i-2048], x);
- }
- }
- for(int i=2047; i >= 0; --i) {
- in.seek(positions[i]);
- int x = (int) in.next();
- if (i < 1024) {
- assertEquals(i/4, x);
- } else if (i < 2048) {
- assertEquals(2*i, x);
- } else {
- assertEquals(junk[i-2048], x);
- }
- }
- }
-
- @Test
- public void testUncompressedSeek() throws Exception {
- runSeekTest(null);
- }
-
- @Test
- public void testCompressedSeek() throws Exception {
- runSeekTest(new ZlibCodec());
- }
-
- @Test
- public void testSkips() throws Exception {
- TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
- RunLengthIntegerWriterV2 out = new RunLengthIntegerWriterV2(
- new OutStream("test", 100, null, collect), true);
- for(int i=0; i < 2048; ++i) {
- if (i < 1024) {
- out.write(i);
- } else {
- out.write(256 * i);
- }
- }
- out.flush();
- ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
- collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
- inBuf.flip();
- RunLengthIntegerReaderV2 in =
- new RunLengthIntegerReaderV2(InStream.create("test",
- new ByteBuffer[]{inBuf},
- new long[]{0},
- inBuf.remaining(),
- null, 100), true, false);
- for(int i=0; i < 2048; i += 10) {
- int x = (int) in.next();
- if (i < 1024) {
- assertEquals(i, x);
- } else {
- assertEquals(256 * i, x);
- }
- if (i < 2038) {
- in.skip(9);
- }
- in.skip(0);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestMemoryManager.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestMemoryManager.java b/orc/src/test/org/apache/orc/impl/TestMemoryManager.java
deleted file mode 100644
index f48c545..0000000
--- a/orc/src/test/org/apache/orc/impl/TestMemoryManager.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.orc.impl.MemoryManager;
-import org.hamcrest.BaseMatcher;
-import org.hamcrest.Description;
-import org.junit.Test;
-import org.mockito.Matchers;
-import org.mockito.Mockito;
-
-import java.lang.management.ManagementFactory;
-
-import static junit.framework.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-
-/**
- * Test the ORC memory manager.
- */
-public class TestMemoryManager {
- private static final double ERROR = 0.000001;
-
- private static class NullCallback implements MemoryManager.Callback {
- public boolean checkMemory(double newScale) {
- return false;
- }
- }
-
- @Test
- public void testBasics() throws Exception {
- Configuration conf = new Configuration();
- MemoryManager mgr = new MemoryManager(conf);
- NullCallback callback = new NullCallback();
- long poolSize = mgr.getTotalMemoryPool();
- assertEquals(Math.round(ManagementFactory.getMemoryMXBean().
- getHeapMemoryUsage().getMax() * 0.5d), poolSize);
- assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
- mgr.addWriter(new Path("p1"), 1000, callback);
- assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
- mgr.addWriter(new Path("p1"), poolSize / 2, callback);
- assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
- mgr.addWriter(new Path("p2"), poolSize / 2, callback);
- assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
- mgr.addWriter(new Path("p3"), poolSize / 2, callback);
- assertEquals(0.6666667, mgr.getAllocationScale(), 0.00001);
- mgr.addWriter(new Path("p4"), poolSize / 2, callback);
- assertEquals(0.5, mgr.getAllocationScale(), 0.000001);
- mgr.addWriter(new Path("p4"), 3 * poolSize / 2, callback);
- assertEquals(0.3333333, mgr.getAllocationScale(), 0.000001);
- mgr.removeWriter(new Path("p1"));
- mgr.removeWriter(new Path("p2"));
- assertEquals(0.5, mgr.getAllocationScale(), 0.00001);
- mgr.removeWriter(new Path("p4"));
- assertEquals(1.0, mgr.getAllocationScale(), 0.00001);
- }
-
- @Test
- public void testConfig() throws Exception {
- Configuration conf = new Configuration();
- conf.set("hive.exec.orc.memory.pool", "0.9");
- MemoryManager mgr = new MemoryManager(conf);
- long mem =
- ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getMax();
- System.err.print("Memory = " + mem);
- long pool = mgr.getTotalMemoryPool();
- assertTrue("Pool too small: " + pool, mem * 0.899 < pool);
- assertTrue("Pool too big: " + pool, pool < mem * 0.901);
- }
-
- private static class DoubleMatcher extends BaseMatcher<Double> {
- final double expected;
- final double error;
- DoubleMatcher(double expected, double error) {
- this.expected = expected;
- this.error = error;
- }
-
- @Override
- public boolean matches(Object val) {
- double dbl = (Double) val;
- return Math.abs(dbl - expected) <= error;
- }
-
- @Override
- public void describeTo(Description description) {
- description.appendText("not sufficiently close to ");
- description.appendText(Double.toString(expected));
- }
- }
-
- private static DoubleMatcher closeTo(double value, double error) {
- return new DoubleMatcher(value, error);
- }
-
- @Test
- public void testCallback() throws Exception {
- Configuration conf = new Configuration();
- MemoryManager mgr = new MemoryManager(conf);
- long pool = mgr.getTotalMemoryPool();
- MemoryManager.Callback[] calls = new MemoryManager.Callback[20];
- for(int i=0; i < calls.length; ++i) {
- calls[i] = Mockito.mock(MemoryManager.Callback.class);
- mgr.addWriter(new Path(Integer.toString(i)), pool/4, calls[i]);
- }
- // add enough rows to get the memory manager to check the limits
- for(int i=0; i < 10000; ++i) {
- mgr.addedRow(1);
- }
- for(int call=0; call < calls.length; ++call) {
- Mockito.verify(calls[call], Mockito.times(2))
- .checkMemory(Matchers.doubleThat(closeTo(0.2, ERROR)));
- }
- }
-}
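
The scale values asserted in testBasics follow a single rule that the assertions pin down: each writer is granted totalPool / totalRequested of its request, capped at 1.0. A small sketch of that arithmetic for illustration (it reproduces the expected values; it is not the MemoryManager implementation):

final class AllocationScaleSketch {
  static double expectedAllocationScale(long totalPool, long... requestedBytes) {
    long totalRequested = 0;
    for (long request : requestedBytes) {
      totalRequested += request;
    }
    // Full allocations while the requests fit in the pool, proportional scaling afterwards.
    return totalRequested <= totalPool ? 1.0 : (double) totalPool / totalRequested;
  }
  // With pool size p: expectedAllocationScale(p, p / 2, p / 2, p / 2) is 0.666...,
  // matching the assertion after the third half-pool writer is registered.
}
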
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java b/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java
deleted file mode 100644
index efa3ffb..0000000
--- a/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-
-import org.junit.Test;
-
-public class TestOrcWideTable {
-
- @Test
- public void testBufferSizeFor1Col() throws IOException {
- assertEquals(128 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
- 1, 128*1024));
- }
-
- @Test
- public void testBufferSizeFor50Col() throws IOException {
- assertEquals(256 * 1024, PhysicalFsWriter.getEstimatedBufferSize(256 * 1024 * 1024,
- 50, 256*1024));
- }
-
- @Test
- public void testBufferSizeFor1000Col() throws IOException {
- assertEquals(32 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
- 1000, 128*1024));
- }
-
- @Test
- public void testBufferSizeFor2000Col() throws IOException {
- assertEquals(16 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
- 2000, 256*1024));
- }
-
- @Test
- public void testBufferSizeFor4000Col() throws IOException {
- assertEquals(8 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
- 4000, 256*1024));
- }
-
- @Test
- public void testBufferSizeFor25000Col() throws IOException {
- assertEquals(4 * 1024, PhysicalFsWriter.getEstimatedBufferSize(512 * 1024 * 1024,
- 25000, 256*1024));
- }
-}
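
The six assertions above are consistent with one reading of PhysicalFsWriter.getEstimatedBufferSize: roughly stripeSize / (20 * columns), snapped to the nearest power of two, floored at 4 KiB and capped at the caller-supplied buffer size. That reading, including the factor of 20 streams per column and the 256 KiB ceiling, is inferred from the asserted values rather than taken from the implementation; the sketch below merely reproduces those values.

final class BufferSizeEstimateSketch {
  static int estimate(long stripeSize, int columns, int maxBufferSize) {
    long raw = stripeSize / (20L * columns);      // assumed ~20 streams per column
    int size = 4 * 1024;                          // assumed 4 KiB floor
    while (size * 2L <= 256 * 1024                // assumed 256 KiB ceiling
        && Math.abs(size * 2L - raw) < Math.abs(size - raw)) {
      size *= 2;                                  // snap to the nearest power of two
    }
    return Math.min(size, maxBufferSize);
  }
  // estimate(512L * 1024 * 1024, 1000, 128 * 1024) == 32 * 1024, matching testBufferSizeFor1000Col.
}
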
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestOutStream.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestOutStream.java b/orc/src/test/org/apache/orc/impl/TestOutStream.java
deleted file mode 100644
index e9614d5..0000000
--- a/orc/src/test/org/apache/orc/impl/TestOutStream.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.orc.CompressionCodec;
-import org.junit.Test;
-import org.mockito.Mockito;
-
-import java.nio.ByteBuffer;
-
-import static org.junit.Assert.assertEquals;
-
-public class TestOutStream {
-
- @Test
- public void testFlush() throws Exception {
- OutStream.OutputReceiver receiver =
- Mockito.mock(OutStream.OutputReceiver.class);
- CompressionCodec codec = new ZlibCodec();
- OutStream stream = new OutStream("test", 128*1024, codec, receiver);
- assertEquals(0L, stream.getBufferSize());
- stream.write(new byte[]{0, 1, 2});
- stream.flush();
- Mockito.verify(receiver).output(Mockito.any(ByteBuffer.class));
- assertEquals(0L, stream.getBufferSize());
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestRLEv2.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestRLEv2.java b/orc/src/test/org/apache/orc/impl/TestRLEv2.java
deleted file mode 100644
index e139619..0000000
--- a/orc/src/test/org/apache/orc/impl/TestRLEv2.java
+++ /dev/null
@@ -1,307 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.PrintStream;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcFile;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.apache.orc.tools.FileDump;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-public class TestRLEv2 {
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
- Path testFilePath;
- Configuration conf;
- FileSystem fs;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem () throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestRLEv2." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- void appendInt(VectorizedRowBatch batch, int i) {
- ((LongColumnVector) batch.cols[0]).vector[batch.size++] = i;
- }
-
- @Test
- public void testFixedDeltaZero() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .setSchema(schema)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for (int i = 0; i < 5120; ++i) {
- appendInt(batch, 123);
- }
- w.addRowBatch(batch);
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 123,
- // zigzag encoded varint) and 1 byte delta (delta = 0). In total, 5 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
- System.setOut(origOut);
- }
-
- @Test
- public void testFixedDeltaOne() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .setSchema(schema)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for (int i = 0; i < 5120; ++i) {
- appendInt(batch, i % 512);
- }
- w.addRowBatch(batch);
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
- // and 1 byte delta (delta = 1). In total, 4 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 40"));
- System.setOut(origOut);
- }
-
- @Test
- public void testFixedDeltaOneDescending() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .setSchema(schema)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for (int i = 0; i < 5120; ++i) {
- appendInt(batch, 512 - (i % 512));
- }
- w.addRowBatch(batch);
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 2 byte base (base = 512, zigzag + varint)
- // and 1 byte delta (delta = 1). In total, 5 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
- System.setOut(origOut);
- }
-
- @Test
- public void testFixedDeltaLarge() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .setSchema(schema)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for (int i = 0; i < 5120; ++i) {
- appendInt(batch, i % 512 + ((i % 512) * 100));
- }
- w.addRowBatch(batch);
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
- // and 2 bytes delta (delta = 100, zigzag encoded varint). In total, 5 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
- System.setOut(origOut);
- }
-
- @Test
- public void testFixedDeltaLargeDescending() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .setSchema(schema)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for (int i = 0; i < 5120; ++i) {
- appendInt(batch, (512 - i % 512) + ((i % 512) * 100));
- }
- w.addRowBatch(batch);
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 10 runs of 512 elements. Each run has 2 bytes header, 2 byte base (base = 512, zigzag + varint)
- // and 2 bytes delta (delta = 100, zigzag encoded varint). In total, 6 bytes per run.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 60"));
- System.setOut(origOut);
- }
-
- @Test
- public void testShortRepeat() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .setSchema(schema)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for (int i = 0; i < 5; ++i) {
- appendInt(batch, 10);
- }
- w.addRowBatch(batch);
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // 1 byte header + 1 byte value
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 2"));
- System.setOut(origOut);
- }
-
- @Test
- public void testDeltaUnknownSign() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .setSchema(schema)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- appendInt(batch, 0);
- for (int i = 0; i < 511; ++i) {
- appendInt(batch, i);
- }
- w.addRowBatch(batch);
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // monotonicity will be undetermined for this sequence 0,0,1,2,3,...510. Hence DIRECT encoding
- // will be used. 2 bytes for header and 640 bytes for data (512 values at a fixed width of
- // 10 bits each, 5120/8 = 640). Total: 642 bytes.
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 642"));
- System.setOut(origOut);
- }
-
- @Test
- public void testPatchedBase() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
- Writer w = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .compress(CompressionKind.NONE)
- .setSchema(schema)
- .rowIndexStride(0)
- .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
- .version(OrcFile.Version.V_0_12)
- );
-
- Random rand = new Random(123);
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- appendInt(batch, 10000000);
- for (int i = 0; i < 511; ++i) {
- appendInt(batch, rand.nextInt(i+1));
- }
- w.addRowBatch(batch);
- w.close();
-
- PrintStream origOut = System.out;
- ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toUri().toString()});
- System.out.flush();
- String outDump = new String(myOut.toByteArray());
- // use PATCHED_BASE encoding
- assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 583"));
- System.setOut(origOut);
- }
-}
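
The byte counts in the comments above all come from the same accounting for a FIXED_DELTA run: a 2-byte header plus the base and the delta written as zigzag-encoded varints, for 10 runs of 512 values in each 5120-row file. The sketch below reproduces that arithmetic; it is an illustration of the comments, not the RLEv2 writer itself.

final class FixedDeltaSizeSketch {
  static long zigzag(long value) {
    return (value << 1) ^ (value >> 63);
  }

  static int varintLength(long unsignedValue) {
    int length = 1;
    while ((unsignedValue >>>= 7) != 0) {
      ++length;
    }
    return length;
  }

  static int fixedDeltaRunBytes(long base, long delta) {
    return 2 + varintLength(zigzag(base)) + varintLength(zigzag(delta));
  }
  // 10 * fixedDeltaRunBytes(123, 0) == 50, 10 * fixedDeltaRunBytes(0, 1) == 40 and
  // 10 * fixedDeltaRunBytes(512, 100) == 60, matching the asserted DATA stream lengths.
}
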
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/impl/TestReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestReaderImpl.java b/orc/src/test/org/apache/orc/impl/TestReaderImpl.java
deleted file mode 100644
index 23d0dab..0000000
--- a/orc/src/test/org/apache/orc/impl/TestReaderImpl.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright 2016 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.ByteArrayInputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PositionedReadable;
-import org.apache.hadoop.fs.Seekable;
-import org.apache.orc.FileFormatException;
-import org.apache.hadoop.io.Text;
-import org.apache.orc.OrcFile;
-import org.junit.Test;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.rules.ExpectedException;
-
-public class TestReaderImpl {
-
- @Rule
- public ExpectedException thrown = ExpectedException.none();
-
- private final Path path = new Path("test-file.orc");
- private FSDataInputStream in;
- private int psLen;
- private ByteBuffer buffer;
-
- @Before
- public void setup() {
- in = null;
- }
-
- @Test
- public void testEnsureOrcFooterSmallTextFile() throws IOException {
- prepareTestCase("1".getBytes());
- thrown.expect(FileFormatException.class);
- ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
- }
-
- @Test
- public void testEnsureOrcFooterLargeTextFile() throws IOException {
- prepareTestCase("This is Some Text File".getBytes());
- thrown.expect(FileFormatException.class);
- ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
- }
-
- @Test
- public void testEnsureOrcFooter011ORCFile() throws IOException {
- prepareTestCase(composeContent(OrcFile.MAGIC, "FOOTER"));
- ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
- }
-
- @Test
- public void testEnsureOrcFooterCorrectORCFooter() throws IOException {
- prepareTestCase(composeContent("", OrcFile.MAGIC));
- ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
- }
-
- private void prepareTestCase(byte[] bytes) {
- buffer = ByteBuffer.wrap(bytes);
- psLen = buffer.get(bytes.length - 1) & 0xff;
- in = new FSDataInputStream(new SeekableByteArrayInputStream(bytes));
- }
-
- private byte[] composeContent(String headerStr, String footerStr) throws CharacterCodingException {
- ByteBuffer header = Text.encode(headerStr);
- ByteBuffer footer = Text.encode(footerStr);
- int headerLen = header.remaining();
- int footerLen = footer.remaining() + 1;
-
- ByteBuffer buf = ByteBuffer.allocate(headerLen + footerLen);
-
- buf.put(header);
- buf.put(footer);
- buf.put((byte) footerLen);
- return buf.array();
- }
-
- private static final class SeekableByteArrayInputStream extends ByteArrayInputStream
- implements Seekable, PositionedReadable {
-
- public SeekableByteArrayInputStream(byte[] buf) {
- super(buf);
- }
-
- @Override
- public void seek(long pos) throws IOException {
- this.reset();
- this.skip(pos);
- }
-
- @Override
- public long getPos() throws IOException {
- return pos;
- }
-
- @Override
- public boolean seekToNewSource(long targetPos) throws IOException {
- return false;
- }
-
- @Override
- public int read(long position, byte[] buffer, int offset, int length)
- throws IOException {
- long oldPos = getPos();
- int nread = -1;
- try {
- seek(position);
- nread = read(buffer, offset, length);
- } finally {
- seek(oldPos);
- }
- return nread;
- }
-
- @Override
- public void readFully(long position, byte[] buffer, int offset, int length)
- throws IOException {
- int nread = 0;
- while (nread < length) {
- int nbytes = read(position + nread, buffer, offset + nread, length - nread);
- if (nbytes < 0) {
- throw new EOFException("End of file reached before reading fully.");
- }
- nread += nbytes;
- }
- }
-
- @Override
- public void readFully(long position, byte[] buffer)
- throws IOException {
- readFully(position, buffer, 0, buffer.length);
- }
- }
-}
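
The four ensureOrcFooter cases above pin down the tail layout being validated: the last byte of a file is the postscript length, a current file carries the magic immediately before that byte, and a file written by old writers (the 0.11 case) may carry the magic at offset 0 instead. The check below restates that layout for illustration; it is not ReaderImpl.ensureOrcFooter, and the psLen lower bound is an assumption.

import java.nio.charset.StandardCharsets;

final class OrcTailSketch {
  static boolean looksLikeOrcTail(byte[] file, String magic) {
    byte[] m = magic.getBytes(StandardCharsets.UTF_8);
    if (file.length < m.length + 1) {
      return false;                               // cannot hold the magic plus a length byte
    }
    int psLen = file[file.length - 1] & 0xff;     // last byte = postscript length
    return psLen >= m.length + 1
        && (matches(file, file.length - 1 - m.length, m)  // magic just before the length byte
            || matches(file, 0, m));                      // or at offset 0 (old files)
  }

  private static boolean matches(byte[] data, int offset, byte[] expected) {
    for (int i = 0; i < expected.length; ++i) {
      if (data[offset + i] != expected[i]) {
        return false;
      }
    }
    return true;
  }
}
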
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestRecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestRecordReaderImpl.java b/orc/src/test/org/apache/hive/orc/impl/TestRecordReaderImpl.java
new file mode 100644
index 0000000..1849d96
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestRecordReaderImpl.java
@@ -0,0 +1,1708 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import static junit.framework.Assert.assertEquals;
+import static org.hamcrest.core.Is.is;
+import static org.junit.Assert.*;
+import static org.mockito.Mockito.any;
+import static org.mockito.Mockito.atLeastOnce;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.List;
+
+import junit.framework.Assert;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.fs.Seekable;
+import org.apache.hadoop.hive.common.io.DiskRangeList;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
+import org.apache.hive.orc.ColumnStatistics;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.BloomFilterIO;
+import org.apache.hive.orc.DataReader;
+import org.apache.hive.orc.RecordReader;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.Writer;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hive.orc.Reader;
+import org.apache.hive.orc.OrcProto;
+
+import org.junit.Test;
+import org.mockito.MockSettings;
+import org.mockito.Mockito;
+
+public class TestRecordReaderImpl {
+ /**
+ * Create a predicate leaf. This is used by another test.
+ */
+ public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator,
+ PredicateLeaf.Type type,
+ String columnName,
+ Object literal,
+ List<Object> literalList) {
+ return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
+ literal, literalList, null);
+ }
+
+ // can add .verboseLogging() to cause Mockito to log invocations
+ private final MockSettings settings = Mockito.withSettings().verboseLogging();
+
+ static class BufferInStream
+ extends InputStream implements PositionedReadable, Seekable {
+ private final byte[] buffer;
+ private final int length;
+ private int position = 0;
+
+ BufferInStream(byte[] bytes, int length) {
+ this.buffer = bytes;
+ this.length = length;
+ }
+
+ @Override
+ public int read() {
+ if (position < length) {
+ return buffer[position++];
+ }
+ return -1;
+ }
+
+ @Override
+ public int read(byte[] bytes, int offset, int length) {
+ int lengthToRead = Math.min(length, this.length - this.position);
+ if (lengthToRead >= 0) {
+ for(int i=0; i < lengthToRead; ++i) {
+ bytes[offset + i] = buffer[position++];
+ }
+ return lengthToRead;
+ } else {
+ return -1;
+ }
+ }
+
+ @Override
+ public int read(long position, byte[] bytes, int offset, int length) {
+ this.position = (int) position;
+ return read(bytes, offset, length);
+ }
+
+ @Override
+ public void readFully(long position, byte[] bytes, int offset,
+ int length) throws IOException {
+ this.position = (int) position;
+ while (length > 0) {
+ int result = read(bytes, offset, length);
+ offset += result;
+ length -= result;
+ if (result < 0) {
+ throw new IOException("Read past end of buffer at " + offset);
+ }
+ }
+ }
+
+ @Override
+ public void readFully(long position, byte[] bytes) throws IOException {
+ readFully(position, bytes, 0, bytes.length);
+ }
+
+ @Override
+ public void seek(long position) {
+ this.position = (int) position;
+ }
+
+ @Override
+ public long getPos() {
+ return position;
+ }
+
+ @Override
+ public boolean seekToNewSource(long position) throws IOException {
+ this.position = (int) position;
+ return false;
+ }
+ }
+
+ @Test
+ public void testMaxLengthToReader() throws Exception {
+ Configuration conf = new Configuration();
+ OrcProto.Type rowType = OrcProto.Type.newBuilder()
+ .setKind(OrcProto.Type.Kind.STRUCT).build();
+ OrcProto.Footer footer = OrcProto.Footer.newBuilder()
+ .setHeaderLength(0).setContentLength(0).setNumberOfRows(0)
+ .setRowIndexStride(0).addTypes(rowType).build();
+ OrcProto.PostScript ps = OrcProto.PostScript.newBuilder()
+ .setCompression(OrcProto.CompressionKind.NONE)
+ .setFooterLength(footer.getSerializedSize())
+ .setMagic("ORC").addVersion(0).addVersion(11).build();
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ footer.writeTo(buffer);
+ ps.writeTo(buffer);
+ buffer.write(ps.getSerializedSize());
+ FileSystem fs = mock(FileSystem.class, settings);
+ FSDataInputStream file =
+ new FSDataInputStream(new BufferInStream(buffer.getData(),
+ buffer.getLength()));
+ Path p = new Path("/dir/file.orc");
+ when(fs.open(p)).thenReturn(file);
+ OrcFile.ReaderOptions options = OrcFile.readerOptions(conf);
+ options.filesystem(fs);
+ options.maxLength(buffer.getLength());
+ when(fs.getFileStatus(p))
+ .thenReturn(new FileStatus(10, false, 3, 3000, 0, p));
+ Reader reader = OrcFile.createReader(p, options);
+ }
+
+ @Test
+ public void testCompareToRangeInt() throws Exception {
+ Assert.assertEquals(RecordReaderImpl.Location.BEFORE,
+ RecordReaderImpl.compareToRange(19L, 20L, 40L));
+ Assert.assertEquals(RecordReaderImpl.Location.AFTER,
+ RecordReaderImpl.compareToRange(41L, 20L, 40L));
+ Assert.assertEquals(RecordReaderImpl.Location.MIN,
+ RecordReaderImpl.compareToRange(20L, 20L, 40L));
+ Assert.assertEquals(RecordReaderImpl.Location.MIDDLE,
+ RecordReaderImpl.compareToRange(21L, 20L, 40L));
+ Assert.assertEquals(RecordReaderImpl.Location.MAX,
+ RecordReaderImpl.compareToRange(40L, 20L, 40L));
+ Assert.assertEquals(RecordReaderImpl.Location.BEFORE,
+ RecordReaderImpl.compareToRange(0L, 1L, 1L));
+ Assert.assertEquals(RecordReaderImpl.Location.MIN,
+ RecordReaderImpl.compareToRange(1L, 1L, 1L));
+ Assert.assertEquals(RecordReaderImpl.Location.AFTER,
+ RecordReaderImpl.compareToRange(2L, 1L, 1L));
+ }
+
+ @Test
+ public void testCompareToRangeString() throws Exception {
+ Assert.assertEquals(RecordReaderImpl.Location.BEFORE,
+ RecordReaderImpl.compareToRange("a", "b", "c"));
+ Assert.assertEquals(RecordReaderImpl.Location.AFTER,
+ RecordReaderImpl.compareToRange("d", "b", "c"));
+ Assert.assertEquals(RecordReaderImpl.Location.MIN,
+ RecordReaderImpl.compareToRange("b", "b", "c"));
+ Assert.assertEquals(RecordReaderImpl.Location.MIDDLE,
+ RecordReaderImpl.compareToRange("bb", "b", "c"));
+ Assert.assertEquals(RecordReaderImpl.Location.MAX,
+ RecordReaderImpl.compareToRange("c", "b", "c"));
+ Assert.assertEquals(RecordReaderImpl.Location.BEFORE,
+ RecordReaderImpl.compareToRange("a", "b", "b"));
+ Assert.assertEquals(RecordReaderImpl.Location.MIN,
+ RecordReaderImpl.compareToRange("b", "b", "b"));
+ Assert.assertEquals(RecordReaderImpl.Location.AFTER,
+ RecordReaderImpl.compareToRange("c", "b", "b"));
+ }
+
+ @Test
+ public void testCompareToCharNeedConvert() throws Exception {
+ Assert.assertEquals(RecordReaderImpl.Location.BEFORE,
+ RecordReaderImpl.compareToRange("apple", "hello", "world"));
+ Assert.assertEquals(RecordReaderImpl.Location.AFTER,
+ RecordReaderImpl.compareToRange("zombie", "hello", "world"));
+ Assert.assertEquals(RecordReaderImpl.Location.MIN,
+ RecordReaderImpl.compareToRange("hello", "hello", "world"));
+ Assert.assertEquals(RecordReaderImpl.Location.MIDDLE,
+ RecordReaderImpl.compareToRange("pilot", "hello", "world"));
+ Assert.assertEquals(RecordReaderImpl.Location.MAX,
+ RecordReaderImpl.compareToRange("world", "hello", "world"));
+ Assert.assertEquals(RecordReaderImpl.Location.BEFORE,
+ RecordReaderImpl.compareToRange("apple", "hello", "hello"));
+ Assert.assertEquals(RecordReaderImpl.Location.MIN,
+ RecordReaderImpl.compareToRange("hello", "hello", "hello"));
+ Assert.assertEquals(RecordReaderImpl.Location.AFTER,
+ RecordReaderImpl.compareToRange("zombie", "hello", "hello"));
+ }
+
+ @Test
+ public void testGetMin() throws Exception {
+ assertEquals(10L, RecordReaderImpl.getMin(
+ ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
+ assertEquals(10.0d, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
+ .setMinimum(10.0d).setMaximum(100.0d).build()).build())));
+ assertEquals(null, RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
+ .build())));
+ assertEquals("a", RecordReaderImpl.getMin(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder()
+ .setMinimum("a").setMaximum("b").build()).build())));
+ assertEquals("hello", RecordReaderImpl.getMin(ColumnStatisticsImpl
+ .deserialize(createStringStats("hello", "world"))));
+ assertEquals(HiveDecimal.create("111.1"), RecordReaderImpl.getMin(ColumnStatisticsImpl
+ .deserialize(createDecimalStats("111.1", "112.1"))));
+ }
+
+ private static OrcProto.ColumnStatistics createIntStats(Long min,
+ Long max) {
+ OrcProto.IntegerStatistics.Builder intStats =
+ OrcProto.IntegerStatistics.newBuilder();
+ if (min != null) {
+ intStats.setMinimum(min);
+ }
+ if (max != null) {
+ intStats.setMaximum(max);
+ }
+ return OrcProto.ColumnStatistics.newBuilder()
+ .setIntStatistics(intStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createBooleanStats(int n, int trueCount) {
+ OrcProto.BucketStatistics.Builder boolStats = OrcProto.BucketStatistics.newBuilder();
+ boolStats.addCount(trueCount);
+ return OrcProto.ColumnStatistics.newBuilder().setNumberOfValues(n).setBucketStatistics(
+ boolStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createIntStats(int min, int max) {
+ OrcProto.IntegerStatistics.Builder intStats = OrcProto.IntegerStatistics.newBuilder();
+ intStats.setMinimum(min);
+ intStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setIntStatistics(intStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createDoubleStats(double min, double max) {
+ OrcProto.DoubleStatistics.Builder dblStats = OrcProto.DoubleStatistics.newBuilder();
+ dblStats.setMinimum(min);
+ dblStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setDoubleStatistics(dblStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createStringStats(String min, String max,
+ boolean hasNull) {
+ OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
+ strStats.setMinimum(min);
+ strStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
+ .setHasNull(hasNull).build();
+ }
+
+ private static OrcProto.ColumnStatistics createStringStats(String min, String max) {
+ OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
+ strStats.setMinimum(min);
+ strStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createDateStats(int min, int max) {
+ OrcProto.DateStatistics.Builder dateStats = OrcProto.DateStatistics.newBuilder();
+ dateStats.setMinimum(min);
+ dateStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setDateStatistics(dateStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createTimestampStats(long min, long max) {
+ OrcProto.TimestampStatistics.Builder tsStats = OrcProto.TimestampStatistics.newBuilder();
+ tsStats.setMinimum(min);
+ tsStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setTimestampStatistics(tsStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createDecimalStats(String min, String max) {
+ OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder();
+ decStats.setMinimum(min);
+ decStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build()).build();
+ }
+
+ private static OrcProto.ColumnStatistics createDecimalStats(String min, String max,
+ boolean hasNull) {
+ OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder();
+ decStats.setMinimum(min);
+ decStats.setMaximum(max);
+ return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build())
+ .setHasNull(hasNull).build();
+ }
+
+ @Test
+ public void testGetMax() throws Exception {
+ assertEquals(100L, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(createIntStats(10L, 100L))));
+ assertEquals(100.0d, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
+ .setMinimum(10.0d).setMaximum(100.0d).build()).build())));
+ assertEquals(null, RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
+ .build())));
+ assertEquals("b", RecordReaderImpl.getMax(ColumnStatisticsImpl.deserialize(
+ OrcProto.ColumnStatistics.newBuilder()
+ .setStringStatistics(OrcProto.StringStatistics.newBuilder()
+ .setMinimum("a").setMaximum("b").build()).build())));
+ assertEquals("world", RecordReaderImpl.getMax(ColumnStatisticsImpl
+ .deserialize(createStringStats("hello", "world"))));
+ assertEquals(HiveDecimal.create("112.1"), RecordReaderImpl.getMax(ColumnStatisticsImpl
+ .deserialize(createDecimalStats("111.1", "112.1"))));
+ }
+
+ @Test
+ public void testPredEvalWithBooleanStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+
+ pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+
+ pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", false, null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 10), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createBooleanStats(10, 0), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithIntStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+ // Stats get converted to the column type. "15" is outside of "10" and "100"
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+ // Integer stats will not be converted to date because of days/seconds/millis ambiguity
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10, 100), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithDoubleStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ // Stats get converted to the column type. "15.0" is outside of "10.0" and "100.0"
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ // Double is not converted to date type because of days/seconds/millis ambiguity
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15*1000L), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150*1000L), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDoubleStats(10.0, 100.0), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithStringStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 100L, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 100.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "100", null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+
+ // IllegalArgumentException is thrown when converting String to Date, hence YES_NO
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(100).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 1000), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("100"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(100), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("10", "1000"), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithDateStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ // Date to Integer conversion is not possible.
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ // Date to Float conversion is also not possible.
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "1970-01-11", null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15.1", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "__a15__1", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "2000-01-16", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "1970-01-16", null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(150).get(), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ // Date to Decimal conversion is also not possible.
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15L * 24L * 60L * 60L * 1000L), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDateStats(10, 100), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithDecimalStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ // "15" out of range of "10.0" and "100.0"
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ // Decimal to Date conversion is not possible.
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15 * 1000L), null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150 * 1000L), null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createDecimalStats("10.0", "100.0"), pred, null));
+ }
+
+ @Test
+ public void testPredEvalWithTimestampStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", "15", null);
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.STRING, "x", new Timestamp(15).toString(), null);
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10 * 24L * 60L * 60L * 1000L,
+ 100 * 24L * 60L * 60L * 1000L), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+
+ pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
+ PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10, 100), pred, null));
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createTimestampStats(10000, 100000), pred, null));
+ }
+
+ @Test
+ public void testEquals() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG,
+ "x", 15L, null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
+ }
+
+ @Test
+ public void testNullSafeEquals() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG,
+ "x", 15L, null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 15L), pred, null));
+ }
+
+ @Test
+ public void testLessThan() throws Exception {
+ PredicateLeaf lessThan = createPredicateLeaf
+ (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.LONG,
+ "x", 15L, null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), lessThan, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), lessThan, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), lessThan, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), lessThan, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), lessThan, null));
+ }
+
+ @Test
+ public void testLessThanEquals() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.LONG,
+ "x", 15L, null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 15L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 10L), pred, null));
+ }
+
+ @Test
+ public void testIn() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(10L);
+ args.add(20L);
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
+ "x", null, args);
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 20L), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 30L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 30L), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
+ }
+
+ @Test
+ public void testBetween() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(10L);
+ args.add(20L);
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG,
+ "x", null, args);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(0L, 5L), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(30L, 40L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 15L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(15L, 25L), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(5L, 25L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(10L, 20L), pred, null));
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(12L, 18L), pred, null));
+ }
+
+ @Test
+ public void testIsNull() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.LONG,
+ "x", null, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createIntStats(20L, 30L), pred, null));
+ }
+
+
+ @Test
+ public void testEqualsWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING,
+ "x", "c", null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ }
+
+ @Test
+ public void testNullSafeEqualsWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING,
+ "x", "c", null);
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ }
+
+ @Test
+ public void testLessThanWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.STRING,
+ "x", "c", null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.NO_NULL, // min, same stats
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
+ }
+
+ @Test
+ public void testLessThanEqualsWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.STRING,
+ "x", "c", null);
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null)); // before
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "c", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ }
+
+ @Test
+ public void testInWithNullInStats() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add("c");
+ args.add("f");
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
+ "x", null, args);
+ assertEquals(TruthValue.NO_NULL, // before & after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null)); // min
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null)); // same
+ }
+
+ @Test
+ public void testBetweenWithNullInStats() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add("c");
+ args.add("f");
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.STRING,
+ "x", null, args);
+ assertEquals(TruthValue.YES_NULL, // before & after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("d", "e", true), pred, null));
+ assertEquals(TruthValue.YES_NULL, // before & max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "f", true), pred, null));
+ assertEquals(TruthValue.NO_NULL, // before & before
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("h", "g", true), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL, // before & min
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("f", "g", true), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL, // before & middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("e", "g", true), pred, null));
+
+ assertEquals(TruthValue.YES_NULL, // min & after
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "e", true), pred, null));
+ assertEquals(TruthValue.YES_NULL, // min & max
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "f", true), pred, null));
+ assertEquals(TruthValue.YES_NO_NULL, // min & middle
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "g", true), pred, null));
+
+ assertEquals(TruthValue.NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "b", true), pred, null)); // after
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("a", "c", true), pred, null)); // max
+ assertEquals(TruthValue.YES_NO_NULL,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("b", "d", true), pred, null)); // middle
+ assertEquals(TruthValue.YES_NULL, // min & after, same stats
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "c", true), pred, null));
+ }
+
+ @Test
+ public void testIsNullWithNullInStats() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.STRING,
+ "x", null, null);
+ assertEquals(TruthValue.YES_NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", true), pred, null));
+ assertEquals(TruthValue.NO,
+ RecordReaderImpl.evaluatePredicateProto(createStringStats("c", "d", false), pred, null));
+ }
+
+ @Test
+ public void testOverlap() throws Exception {
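+ // Ranges are treated as inclusive, so ranges that merely touch at an endpoint still overlap.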
+ assertTrue(!RecordReaderUtils.overlap(0, 10, -10, -1));
+ assertTrue(RecordReaderUtils.overlap(0, 10, -1, 0));
+ assertTrue(RecordReaderUtils.overlap(0, 10, -1, 1));
+ assertTrue(RecordReaderUtils.overlap(0, 10, 2, 8));
+ assertTrue(RecordReaderUtils.overlap(0, 10, 5, 10));
+ assertTrue(RecordReaderUtils.overlap(0, 10, 10, 11));
+ assertTrue(RecordReaderUtils.overlap(0, 10, 0, 10));
+ assertTrue(RecordReaderUtils.overlap(0, 10, -1, 11));
+ assertTrue(!RecordReaderUtils.overlap(0, 10, 11, 12));
+ }
+
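+ // Helper: builds a linked DiskRangeList from consecutive (start, end) offset pairs,
+ // used below to describe the expected read plans.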
+ private static DiskRangeList diskRanges(Integer... points) {
+ DiskRangeList head = null, tail = null;
+ for(int i = 0; i < points.length; i += 2) {
+ DiskRangeList range = new DiskRangeList(points[i], points[i+1]);
+ if (tail == null) {
+ head = tail = range;
+ } else {
+ tail = tail.insertAfter(range);
+ }
+ }
+ return head;
+ }
+
+ @Test
+ public void testGetIndexPosition() throws Exception {
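+ // getIndexPosition(encoding, type, stream, isCompressed, hasNulls) gives the offset of a stream's
+ // entries in a row-group index position list; compression and a preceding PRESENT stream each add
+ // extra positions, which is what the expected values below reflect.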
+ assertEquals(0, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
+ OrcProto.Stream.Kind.PRESENT, true, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(3, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
+ OrcProto.Stream.Kind.DATA, false, true));
+ assertEquals(0, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.INT,
+ OrcProto.Stream.Kind.DATA, true, false));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DICTIONARY, OrcProto.Type.Kind.STRING,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(3, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
+ OrcProto.Stream.Kind.DATA, false, true));
+ assertEquals(6, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
+ OrcProto.Stream.Kind.LENGTH, true, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.BINARY,
+ OrcProto.Stream.Kind.LENGTH, false, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(3, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
+ OrcProto.Stream.Kind.DATA, false, true));
+ assertEquals(6, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
+ OrcProto.Stream.Kind.SECONDARY, true, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.DECIMAL,
+ OrcProto.Stream.Kind.SECONDARY, false, true));
+ assertEquals(4, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
+ OrcProto.Stream.Kind.DATA, true, true));
+ assertEquals(3, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
+ OrcProto.Stream.Kind.DATA, false, true));
+ assertEquals(7, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
+ OrcProto.Stream.Kind.SECONDARY, true, true));
+ assertEquals(5, RecordReaderUtils.getIndexPosition
+ (OrcProto.ColumnEncoding.Kind.DIRECT, OrcProto.Type.Kind.TIMESTAMP,
+ OrcProto.Stream.Kind.SECONDARY, false, true));
+ }
+
+ @Test
+ public void testPartialPlan() throws Exception {
+ DiskRangeList result;
+
+ // set the streams
+ List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(1).setLength(1000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(1).setLength(99000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(2).setLength(2000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(2).setLength(98000).build());
+
+ boolean[] columns = new boolean[]{true, true, false};
+ boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
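+ // Read the root struct and column x only; of the six row groups, only 0, 1 and 4 are selected.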
+
+ // set the index
+ OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
+ indexes[1] = OrcProto.RowIndex.newBuilder()
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(0).addPositions(-1).addPositions(-1)
+ .addPositions(0)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(100).addPositions(-1).addPositions(-1)
+ .addPositions(10000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(200).addPositions(-1).addPositions(-1)
+ .addPositions(20000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(300).addPositions(-1).addPositions(-1)
+ .addPositions(30000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(400).addPositions(-1).addPositions(-1)
+ .addPositions(40000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(500).addPositions(-1).addPositions(-1)
+ .addPositions(50000)
+ .build())
+ .build();
+
+ // set encodings
+ List<OrcProto.ColumnEncoding> encodings =
+ new ArrayList<OrcProto.ColumnEncoding>();
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+
+ // set types struct{x: int, y: int}
+ List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
+ .addSubtypes(1).addSubtypes(2).addFieldNames("x")
+ .addFieldNames("y").build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+
+ // filter by rows and groups
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(0, 1000, 100, 1000, 400, 1000,
+ 1000, 11000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, true);
+ assertThat(result, is(diskRanges(0, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
+
+ // if we read no rows, don't read any bytes
+ rowGroups = new boolean[]{false, false, false, false, false, false};
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
+ assertNull(result);
+
+ // all rows, but only columns 0 and 2.
+ rowGroups = null;
+ columns = new boolean[]{true, false, true};
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, null, false, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(100000, 102000, 102000, 200000)));
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, null, false, encodings, types, 32768, true);
+ assertThat(result, is(diskRanges(100000, 200000)));
+
+ rowGroups = new boolean[]{false, true, false, false, false, false};
+ indexes[2] = indexes[1];
+ indexes[1] = null;
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(100100, 102000,
+ 112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, true);
+ assertThat(result, is(diskRanges(100100, 102000,
+ 112000, 122000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP)));
+
+ rowGroups = new boolean[]{false, false, false, false, false, true};
+ indexes[1] = indexes[2];
+ columns = new boolean[]{true, true, true};
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(500, 1000, 51000, 100000, 100500, 102000,
+ 152000, 200000)));
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, true);
+ assertThat(result, is(diskRanges(500, 1000, 51000, 100000, 100500, 102000,
+ 152000, 200000)));
+ }
+
+
+ @Test
+ public void testPartialPlanCompressed() throws Exception {
+ DiskRangeList result;
+
+ // set the streams
+ List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(1).setLength(1000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(1).setLength(99000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(2).setLength(2000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(2).setLength(98000).build());
+
+ boolean[] columns = new boolean[]{true, true, false};
+ boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
+
+ // set the index
+ OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
+ indexes[1] = OrcProto.RowIndex.newBuilder()
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(0).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(0)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(100).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(10000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(200).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(20000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(300).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(30000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(400).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(40000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(500).addPositions(-1).addPositions(-1).addPositions(-1)
+ .addPositions(50000)
+ .build())
+ .build();
+
+ // set encodings
+ List<OrcProto.ColumnEncoding> encodings =
+ new ArrayList<OrcProto.ColumnEncoding>();
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+
+ // set types struct{x: int, y: int}
+ List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
+ .addSubtypes(1).addSubtypes(2).addFieldNames("x")
+ .addFieldNames("y").build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+
+ // filter by rows and groups
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, true, encodings, types, 32768, false);
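+ // 32771 = the 32768 compression buffer plus its 3-byte chunk header; the plan allows two such
+ // chunks of slop at the end of each selected range.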
+ assertThat(result, is(diskRanges(0, 1000, 100, 1000,
+ 400, 1000, 1000, 11000+(2*32771),
+ 11000, 21000+(2*32771), 41000, 100000)));
+
+ rowGroups = new boolean[]{false, false, false, false, false, true};
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, true, encodings, types, 32768, false);
+ assertThat(result, is(diskRanges(500, 1000, 51000, 100000)));
+ }
+
+ @Test
+ public void testPartialPlanString() throws Exception {
+ DiskRangeList result;
+
+ // set the streams
+ List<OrcProto.Stream> streams = new ArrayList<OrcProto.Stream>();
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(1).setLength(1000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(1).setLength(94000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.LENGTH)
+ .setColumn(1).setLength(2000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DICTIONARY_DATA)
+ .setColumn(1).setLength(3000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.PRESENT)
+ .setColumn(2).setLength(2000).build());
+ streams.add(OrcProto.Stream.newBuilder()
+ .setKind(OrcProto.Stream.Kind.DATA)
+ .setColumn(2).setLength(98000).build());
+
+ boolean[] columns = new boolean[]{true, true, false};
+ boolean[] rowGroups = new boolean[]{false, true, false, false, true, true};
+
+ // set the index
+ OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[columns.length];
+ indexes[1] = OrcProto.RowIndex.newBuilder()
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(0).addPositions(-1).addPositions(-1)
+ .addPositions(0)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(100).addPositions(-1).addPositions(-1)
+ .addPositions(10000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(200).addPositions(-1).addPositions(-1)
+ .addPositions(20000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(300).addPositions(-1).addPositions(-1)
+ .addPositions(30000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(400).addPositions(-1).addPositions(-1)
+ .addPositions(40000)
+ .build())
+ .addEntry(OrcProto.RowIndexEntry.newBuilder()
+ .addPositions(500).addPositions(-1).addPositions(-1)
+ .addPositions(50000)
+ .build())
+ .build();
+
+ // set encodings
+ List<OrcProto.ColumnEncoding> encodings =
+ new ArrayList<OrcProto.ColumnEncoding>();
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DICTIONARY).build());
+ encodings.add(OrcProto.ColumnEncoding.newBuilder()
+ .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
+
+ // set types struct{x: string, y: int}
+ List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
+ .addSubtypes(1).addSubtypes(2).addFieldNames("x")
+ .addFieldNames("y").build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).build());
+ types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
+
+ // filter by rows and groups
+ result = RecordReaderImpl.planReadPartialDataStreams(streams, indexes,
+ columns, rowGroups, false, encodings, types, 32768, false);
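+ // For the dictionary-encoded column, the LENGTH and DICTIONARY_DATA streams are planned in full
+ // regardless of which row groups are selected.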
+ assertThat(result, is(diskRanges(100, 1000, 400, 1000, 500, 1000,
+ 11000, 21000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 41000, 51000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
+ 51000, 95000, 95000, 97000, 97000, 100000)));
+ }
+
+ @Test
+ public void testIntNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong(i);
+ }
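+ // 15 is inside the stats range [10, 100], so only the bloom filter can rule this row group out.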
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong(15);
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testIntEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong(15);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testIntInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(15L);
+ args.add(19L);
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong(19);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong(15);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDoubleNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addDouble(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addDouble(15.0);
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDoubleEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addDouble(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addDouble(15.0);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDoubleInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(15.0);
+ args.add(19.0);
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.FLOAT,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addDouble(i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addDouble(19.0);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addDouble(15.0);
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testStringNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString("str_" + i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString("str_15");
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testStringEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString("str_" + i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString("str_15");
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testStringInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add("str_15");
+ args.add("str_19");
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString("str_" + i);
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString("str_19");
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString("str_15");
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDateWritableNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x",
+ new DateWritable(15).get(), null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
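+ // Dates are added to the bloom filter as their epoch-day values.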
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new DateWritable(i)).getDays());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new DateWritable(15)).getDays());
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDateWritableEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DATE, "x",
+ new DateWritable(15).get(), null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new DateWritable(i)).getDays());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new DateWritable(15)).getDays());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDateWritableInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(new DateWritable(15).get());
+ args.add(new DateWritable(19).get());
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DATE,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new DateWritable(i)).getDays());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDateStats(10, 100));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new DateWritable(19)).getDays());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new DateWritable(15)).getDays());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testTimestampNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
+ new Timestamp(15),
+ null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new Timestamp(i)).getTime());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new Timestamp(15)).getTime());
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testTimestampEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new Timestamp(i)).getTime());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new Timestamp(15)).getTime());
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testTimestampInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(new Timestamp(15));
+ args.add(new Timestamp(19));
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.TIMESTAMP,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addLong((new Timestamp(i)).getTime());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new Timestamp(19)).getTime());
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addLong((new Timestamp(15)).getTime());
+ // timestamp PPD is disabled until ORC-135
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDecimalNullSafeEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DECIMAL, "x",
+ new HiveDecimalWritable("15"),
+ null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
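+ // Decimals are added to the bloom filter in their string form.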
+ for (int i = 20; i < 1000; i++) {
+ bf.addString(HiveDecimal.create(i).toString());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(15).toString());
+ assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDecimalEqualsBloomFilter() throws Exception {
+ PredicateLeaf pred = createPredicateLeaf(
+ PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DECIMAL, "x",
+ new HiveDecimalWritable("15"),
+ null);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString(HiveDecimal.create(i).toString());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(15).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testDecimalInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(new HiveDecimalWritable("15"));
+ args.add(new HiveDecimalWritable("19"));
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString(HiveDecimal.create(i).toString());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
+ assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(19).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(15).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testNullsInBloomFilter() throws Exception {
+ List<Object> args = new ArrayList<Object>();
+ args.add(new HiveDecimalWritable("15"));
+ args.add(null);
+ args.add(new HiveDecimalWritable("19"));
+ PredicateLeaf pred = createPredicateLeaf
+ (PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
+ "x", null, args);
+ BloomFilterIO bf = new BloomFilterIO(10000);
+ for (int i = 20; i < 1000; i++) {
+ bf.addString(HiveDecimal.create(i).toString());
+ }
+ ColumnStatistics cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", false));
+ // hasNull is false, so bloom filter should return NO
+ assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ cs = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", true));
+ // hasNull is true, so bloom filter should return YES_NO_NULL
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(19).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+
+ bf.addString(HiveDecimal.create(15).toString());
+ assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
+ }
+
+ @Test
+ public void testClose() throws Exception {
+ DataReader mockedDataReader = mock(DataReader.class);
+ closeMockedRecordReader(mockedDataReader);
+
+ verify(mockedDataReader, atLeastOnce()).close();
+ }
+
+ @Test
+ public void testCloseWithException() throws Exception {
+ DataReader mockedDataReader = mock(DataReader.class);
+ doThrow(IOException.class).when(mockedDataReader).close();
+
+ try {
+ closeMockedRecordReader(mockedDataReader);
+ fail("Exception should have been thrown when Record Reader was closed");
+ } catch (IOException expected) {
+
+ }
+
+ verify(mockedDataReader, atLeastOnce()).close();
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ private void closeMockedRecordReader(DataReader mockedDataReader) throws IOException {
+ Configuration conf = new Configuration();
+ Path path = new Path(workDir, "empty.orc");
+ FileSystem.get(conf).delete(path, true);
+ Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf)
+ .setSchema(TypeDescription.createLong()));
+ writer.close();
+ Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+
+ RecordReader recordReader = reader.rows(new Reader.Options()
+ .dataReader(mockedDataReader));
+
+ recordReader.close();
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/impl/TestRunLengthByteReader.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/impl/TestRunLengthByteReader.java b/orc/src/test/org/apache/hive/orc/impl/TestRunLengthByteReader.java
new file mode 100644
index 0000000..ac3f93e
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/impl/TestRunLengthByteReader.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.nio.ByteBuffer;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.junit.Test;
+
+public class TestRunLengthByteReader {
+
+ @Test
+ public void testUncompressedSeek() throws Exception {
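+ // Write 2048 bytes while recording the stream position before each one, read them all back,
+ // then seek to every recorded position in reverse order and re-check the value.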
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ RunLengthByteWriter out = new RunLengthByteWriter(new OutStream("test", 100,
+ null, collect));
+ TestInStream.PositionCollector[] positions =
+ new TestInStream.PositionCollector[2048];
+ for(int i=0; i < 2048; ++i) {
+ positions[i] = new TestInStream.PositionCollector();
+ out.getPosition(positions[i]);
+ if (i < 1024) {
+ out.write((byte) (i/4));
+ } else {
+ out.write((byte) i);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ RunLengthByteReader in = new RunLengthByteReader(InStream.create("test",
+ new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(), null, 100));
+ for(int i=0; i < 2048; ++i) {
+ int x = in.next() & 0xff;
+ if (i < 1024) {
+ assertEquals((i/4) & 0xff, x);
+ } else {
+ assertEquals(i & 0xff, x);
+ }
+ }
+ for(int i=2047; i >= 0; --i) {
+ in.seek(positions[i]);
+ int x = in.next() & 0xff;
+ if (i < 1024) {
+ assertEquals((i/4) & 0xff, x);
+ } else {
+ assertEquals(i & 0xff, x);
+ }
+ }
+ }
+
+ @Test
+ public void testCompressedSeek() throws Exception {
+ CompressionCodec codec = new SnappyCodec();
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ RunLengthByteWriter out = new RunLengthByteWriter(new OutStream("test", 500,
+ codec, collect));
+ TestInStream.PositionCollector[] positions =
+ new TestInStream.PositionCollector[2048];
+ for(int i=0; i < 2048; ++i) {
+ positions[i] = new TestInStream.PositionCollector();
+ out.getPosition(positions[i]);
+ if (i < 1024) {
+ out.write((byte) (i/4));
+ } else {
+ out.write((byte) i);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ RunLengthByteReader in = new RunLengthByteReader(InStream.create("test",
+ new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(), codec, 500));
+ for(int i=0; i < 2048; ++i) {
+ int x = in.next() & 0xff;
+ if (i < 1024) {
+ assertEquals((i/4) & 0xff, x);
+ } else {
+ assertEquals(i & 0xff, x);
+ }
+ }
+ for(int i=2047; i >= 0; --i) {
+ in.seek(positions[i]);
+ int x = in.next() & 0xff;
+ if (i < 1024) {
+ assertEquals((i/4) & 0xff, x);
+ } else {
+ assertEquals(i & 0xff, x);
+ }
+ }
+ }
+
+ @Test
+ public void testSkips() throws Exception {
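+ // Read every tenth value and skip the nine in between; the trailing skip(0) must be a no-op.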
+ TestInStream.OutputCollector collect = new TestInStream.OutputCollector();
+ RunLengthByteWriter out = new RunLengthByteWriter(new OutStream("test", 100,
+ null, collect));
+ for(int i=0; i < 2048; ++i) {
+ if (i < 1024) {
+ out.write((byte) (i/16));
+ } else {
+ out.write((byte) i);
+ }
+ }
+ out.flush();
+ ByteBuffer inBuf = ByteBuffer.allocate(collect.buffer.size());
+ collect.buffer.setByteBuffer(inBuf, 0, collect.buffer.size());
+ inBuf.flip();
+ RunLengthByteReader in = new RunLengthByteReader(InStream.create("test",
+ new ByteBuffer[]{inBuf}, new long[]{0}, inBuf.remaining(), null, 100));
+ for(int i=0; i < 2048; i += 10) {
+ int x = in.next() & 0xff;
+ if (i < 1024) {
+ assertEquals((i/16) & 0xff, x);
+ } else {
+ assertEquals(i & 0xff, x);
+ }
+ if (i < 2038) {
+ in.skip(9);
+ }
+ in.skip(0);
+ }
+ }
+}
[12/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestNewIntegerEncoding.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestNewIntegerEncoding.java b/orc/src/test/org/apache/hive/orc/TestNewIntegerEncoding.java
new file mode 100644
index 0000000..7f598c7
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestNewIntegerEncoding.java
@@ -0,0 +1,1373 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.File;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import com.google.common.collect.Lists;
+import com.google.common.primitives.Longs;
+
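+/**
+ * Parameterized round-trip tests for ORC long columns: repeated runs, deltas,
+ * random values, and patched-base sequences are written and read back under
+ * both the COMPRESSION and SPEED encoding strategies.
+ */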
+@RunWith(value = Parameterized.class)
+public class TestNewIntegerEncoding {
+
+ private OrcFile.EncodingStrategy encodingStrategy;
+
+ public TestNewIntegerEncoding(OrcFile.EncodingStrategy es) {
+ this.encodingStrategy = es;
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { OrcFile.EncodingStrategy.COMPRESSION },
+ { OrcFile.EncodingStrategy.SPEED } };
+ return Arrays.asList(data);
+ }
+
+ public static class TSRow {
+ Timestamp ts;
+
+ public TSRow(Timestamp ts) {
+ this.ts = ts;
+ }
+ }
+
+ public static TypeDescription getRowSchema() {
+ return TypeDescription.createStruct()
+ .addField("int1", TypeDescription.createInt())
+ .addField("long1", TypeDescription.createLong());
+ }
+
+ public static void appendRow(VectorizedRowBatch batch,
+ int int1, long long1) {
+ int row = batch.size++;
+ ((LongColumnVector) batch.cols[0]).vector[row] = int1;
+ ((LongColumnVector) batch.cols[1]).vector[row] = long1;
+ }
+
+ public static void appendLong(VectorizedRowBatch batch,
+ long long1) {
+ int row = batch.size++;
+ ((LongColumnVector) batch.cols[0]).vector[row] = long1;
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target"
+ + File.separator + "test" + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile."
+ + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testBasicRow() throws Exception {
+ TypeDescription schema = getRowSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ appendRow(batch, 111, 1111L);
+ appendRow(batch, 111, 1111L);
+ appendRow(batch, 111, 1111L);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(111, ((LongColumnVector) batch.cols[0]).vector[r]);
+ assertEquals(1111, ((LongColumnVector) batch.cols[1]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicOld() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+ long[] inp = new long[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1,
+ 2, 5, 1, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1,
+ 9, 2, 6, 3, 7, 1, 9, 2, 6, 2000, 2, 1, 1, 1, 1, 1, 3, 7, 1, 9, 2, 6, 1,
+ 1, 1, 1, 1 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .version(OrcFile.Version.V_0_11)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ batch = reader.getSchema().createRowBatch();
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicNew() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1,
+ 2, 5, 1, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1,
+ 9, 2, 6, 3, 7, 1, 9, 2, 6, 2000, 2, 1, 1, 1, 1, 1, 3, 7, 1, 9, 2, 6, 1,
+ 1, 1, 1, 1 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ batch = reader.getSchema().createRowBatch();
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicDelta1() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { -500, -400, -350, -325, -310 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicDelta2() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { -500, -600, -650, -675, -710 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicDelta3() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 500, 400, 350, 325, 310 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testBasicDelta4() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 500, 600, 650, 675, 710 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testDeltaOverflow() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[]{4513343538618202719L, 4513343538618202711L,
+ 2911390882471569739L,
+ -9181829309989854913L};
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile
+ .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testDeltaOverflow2() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[]{Long.MAX_VALUE, 4513343538618202711L,
+ 2911390882471569739L,
+ Long.MIN_VALUE};
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile
+ .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testDeltaOverflow3() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[]{-4513343538618202711L, -2911390882471569739L, -2,
+ Long.MAX_VALUE};
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(
+ testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile
+ .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testIntegerMin() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add((long) Integer.MIN_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testIntegerMax() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add((long) Integer.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testLongMin() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add(Long.MIN_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testLongMax() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add(Long.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testRandomInt() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 100000; i++) {
+ input.add((long) rand.nextInt());
+ }
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(100000);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testRandomLong() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 100000; i++) {
+ input.add(rand.nextLong());
+ }
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(100000);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseNegativeMin() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
+ 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
+ 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
+ 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
+ 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, -13, 1, 2, 3,
+ 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
+ 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
+ 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
+ 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
+ 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
+ 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
+ 2, 16 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseNegativeMin2() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
+ 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
+ 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
+ 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
+ 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, -1, 1, 2, 3,
+ 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
+ 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
+ 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
+ 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
+ 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
+ 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
+ 2, 16 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseNegativeMin3() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
+ 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
+ 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
+ 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
+ 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, 0, 1, 2, 3,
+ 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
+ 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
+ 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
+ 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
+ 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
+ 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
+ 2, 16 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseNegativeMin4() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { 13, 13, 11, 8, 13, 10, 10, 11, 11, 14, 11, 7, 13,
+ 12, 12, 11, 15, 12, 12, 9, 8, 10, 13, 11, 8, 6, 5, 6, 11, 7, 15, 10, 7,
+ 6, 8, 7, 9, 9, 11, 33, 11, 3, 7, 4, 6, 10, 14, 12, 5, 14, 7, 6 };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseAt0() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(0, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseAt1() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(1, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseAt255() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(255, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseAt256() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(256, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBase510() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(510, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBase511() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(100));
+ }
+ input.set(511, 20000L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseMax1() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for (int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(60));
+ }
+ input.set(511, Long.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseMax2() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for (int i = 0; i < 5120; i++) {
+ input.add((long) rand.nextInt(60));
+ }
+ input.set(128, Long.MAX_VALUE);
+ input.set(256, Long.MAX_VALUE);
+ input.set(511, Long.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(5120);
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseMax3() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ input.add(371946367L);
+ input.add(11963367L);
+ input.add(68639400007L);
+ input.add(100233367L);
+ input.add(6367L);
+ input.add(10026367L);
+ input.add(3670000L);
+ input.add(3602367L);
+ input.add(4719226367L);
+ input.add(7196367L);
+ input.add(444442L);
+ input.add(210267L);
+ input.add(21033L);
+ input.add(160267L);
+ input.add(400267L);
+ input.add(23634347L);
+ input.add(16027L);
+ input.add(46026367L);
+ input.add(Long.MAX_VALUE);
+ input.add(33333L);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseMax4() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ for (int i = 0; i < 25; i++) {
+ input.add(371292224226367L);
+ input.add(119622332222267L);
+ input.add(686329400222007L);
+ input.add(100233333222367L);
+ input.add(636272333322222L);
+ input.add(10202633223267L);
+ input.add(36700222022230L);
+ input.add(36023226224227L);
+ input.add(47192226364427L);
+ input.add(71963622222447L);
+ input.add(22244444222222L);
+ input.add(21220263327442L);
+ input.add(21032233332232L);
+ input.add(16026322232227L);
+ input.add(40022262272212L);
+ input.add(23634342227222L);
+ input.add(16022222222227L);
+ input.add(46026362222227L);
+ input.add(46026362222227L);
+ input.add(33322222222323L);
+ }
+ input.add(Long.MAX_VALUE);
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+
+ @Test
+ public void testPatchedBaseTimestamp() throws Exception {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("ts", TypeDescription.createTimestamp());
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+
+ List<Timestamp> tslist = Lists.newArrayList();
+ tslist.add(Timestamp.valueOf("2099-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1999-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1995-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2010-03-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1996-08-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("1998-11-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("1993-08-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2008-01-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2007-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
+ tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2008-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1994-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2001-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2000-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2000-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2011-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
+ tslist.add(Timestamp.valueOf("1974-01-01 00:00:00"));
+ int idx = 0;
+ // write one timestamp per row and grow the batch accordingly
+ for (Timestamp ts : tslist) {
+ ((TimestampColumnVector) batch.cols[0]).set(idx++, ts);
+ }
+ batch.size = tslist.size();
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(tslist.get(idx++),
+ ((TimestampColumnVector) batch.cols[0]).asScratchTimestamp(r));
+ }
+ }
+ }
+
+ @Test
+ public void testDirectLargeNegatives() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch();
+
+ appendLong(batch, -7486502418706614742L);
+ appendLong(batch, 0L);
+ appendLong(batch, 1L);
+ appendLong(batch, 1L);
+ appendLong(batch, -5535739865598783616L);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(5, batch.size);
+ assertEquals(-7486502418706614742L,
+ ((LongColumnVector) batch.cols[0]).vector[0]);
+ assertEquals(0L,
+ ((LongColumnVector) batch.cols[0]).vector[1]);
+ assertEquals(1L,
+ ((LongColumnVector) batch.cols[0]).vector[2]);
+ assertEquals(1L,
+ ((LongColumnVector) batch.cols[0]).vector[3]);
+ assertEquals(-5535739865598783616L,
+ ((LongColumnVector) batch.cols[0]).vector[4]);
+ assertEquals(false, rows.nextBatch(batch));
+ }
+
+ @Test
+ public void testSeek() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ List<Long> input = Lists.newArrayList();
+ Random rand = new Random();
+ for(int i = 0; i < 100000; i++) {
+ input.add((long) rand.nextInt());
+ }
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .compress(CompressionKind.NONE)
+ .stripeSize(100000)
+ .bufferSize(10000)
+ .version(OrcFile.Version.V_0_11)
+ .encodingStrategy(encodingStrategy));
+ VectorizedRowBatch batch = schema.createRowBatch(100000);
+ for(Long l : input) {
+ appendLong(batch, l);
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 55555;
+ rows.seekToRow(idx);
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(),
+ ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestOrcNullOptimization.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestOrcNullOptimization.java b/orc/src/test/org/apache/hive/orc/TestOrcNullOptimization.java
new file mode 100644
index 0000000..bf9b902
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestOrcNullOptimization.java
@@ -0,0 +1,415 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.Random;
+
+import junit.framework.Assert;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import org.apache.hive.orc.impl.RecordReaderImpl;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+import com.google.common.collect.Lists;
+
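+/**
+ * Verifies that ORC omits the PRESENT stream for stripes whose columns contain
+ * no nulls, and that column statistics and values read back as written.
+ */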
+public class TestOrcNullOptimization {
+
+ TypeDescription createMyStruct() {
+ return TypeDescription.createStruct()
+ .addField("a", TypeDescription.createInt())
+ .addField("b", TypeDescription.createString())
+ .addField("c", TypeDescription.createBoolean())
+ .addField("d", TypeDescription.createList(
+ TypeDescription.createStruct()
+ .addField("z", TypeDescription.createInt())));
+ }
+
+ void addRow(Writer writer, VectorizedRowBatch batch,
+ Integer a, String b, Boolean c,
+ Integer... d) throws IOException {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ int row = batch.size++;
+ LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
+ BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
+ LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
+ ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
+ StructColumnVector dStruct = (StructColumnVector) dColumn.child;
+ LongColumnVector dInt = (LongColumnVector) dStruct.fields[0];
+ if (a == null) {
+ aColumn.noNulls = false;
+ aColumn.isNull[row] = true;
+ } else {
+ aColumn.vector[row] = a;
+ }
+ if (b == null) {
+ bColumn.noNulls = false;
+ bColumn.isNull[row] = true;
+ } else {
+ bColumn.setVal(row, b.getBytes());
+ }
+ if (c == null) {
+ cColumn.noNulls = false;
+ cColumn.isNull[row] = true;
+ } else {
+ cColumn.vector[row] = c ? 1 : 0;
+ }
+ if (d == null) {
+ dColumn.noNulls = false;
+ dColumn.isNull[row] = true;
+ } else {
+ dColumn.offsets[row] = dColumn.childCount;
+ dColumn.lengths[row] = d.length;
+ dColumn.childCount += d.length;
+ for(int e=0; e < d.length; ++e) {
+ dInt.vector[(int) dColumn.offsets[row] + e] = d[e];
+ }
+ }
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcNullOptimization." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testMultiStripeWithNull() throws Exception {
+ TypeDescription schema = createMyStruct();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(100);
+ VectorizedRowBatch batch = schema.createRowBatch();
+ addRow(writer, batch, null, null, true, 100);
+ for (int i = 2; i < 20000; i++) {
+ addRow(writer, batch, rand.nextInt(1), "a", true, 100);
+ }
+ addRow(writer, batch, null, null, true, 100);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(20000, reader.getNumberOfRows());
+ assertEquals(20000, stats[0].getNumberOfValues());
+
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 19998 hasNull: true min: 0 max: 0 sum: 0",
+ stats[1].toString());
+
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals(19998, stats[2].getNumberOfValues());
+ assertEquals("count: 19998 hasNull: true min: a max: a sum: 19998",
+ stats[2].toString());
+
+ // check the inspectors
+ assertEquals("struct<a:int,b:string,c:boolean,d:array<struct<z:int>>>",
+ reader.getSchema().toString());
+
+ RecordReader rows = reader.rows();
+
+ List<Boolean> expected = Lists.newArrayList();
+ for (StripeInformation sinfo : reader.getStripes()) {
+ expected.add(false);
+ }
+ // only the first and last stripes will have a PRESENT stream
+ expected.set(0, true);
+ expected.set(expected.size() - 1, true);
+
+ List<Boolean> got = Lists.newArrayList();
+ // check if the stripe footer contains a PRESENT stream
+ for (StripeInformation sinfo : reader.getStripes()) {
+ OrcProto.StripeFooter sf =
+ ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+ got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
+ != -1);
+ }
+ assertEquals(expected, got);
+
+ batch = reader.getSchema().createRowBatch();
+ LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
+ BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
+ LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
+ ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
+ LongColumnVector dElements =
+ (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]);
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1024, batch.size);
+
+ // row 1
+ assertEquals(true, aColumn.isNull[0]);
+ assertEquals(true, bColumn.isNull[0]);
+ assertEquals(1, cColumn.vector[0]);
+ assertEquals(0, dColumn.offsets[0]);
+ assertEquals(1, dColumn.lengths[0]);
+ assertEquals(100, dElements.vector[0]);
+
+ rows.seekToRow(19998);
+ rows.nextBatch(batch);
+ assertEquals(2, batch.size);
+
+ // last-1 row
+ assertEquals(0, aColumn.vector[0]);
+ assertEquals("a", bColumn.toString(0));
+ assertEquals(1, cColumn.vector[0]);
+ assertEquals(0, dColumn.offsets[0]);
+ assertEquals(1, dColumn.lengths[0]);
+ assertEquals(100, dElements.vector[0]);
+
+ // last row
+ assertEquals(true, aColumn.isNull[1]);
+ assertEquals(true, bColumn.isNull[1]);
+ assertEquals(1, cColumn.vector[1]);
+ assertEquals(1, dColumn.offsets[1]);
+ assertEquals(1, dColumn.lengths[1]);
+ assertEquals(100, dElements.vector[1]);
+
+ assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+
+ @Test
+ public void testMultiStripeWithoutNull() throws Exception {
+ TypeDescription schema = createMyStruct();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .compress(CompressionKind.NONE)
+ .bufferSize(10000));
+ Random rand = new Random(100);
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (int i = 1; i < 20000; i++) {
+ addRow(writer, batch, rand.nextInt(1), "a", true, 100);
+ }
+ addRow(writer, batch, 0, "b", true, 100);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(20000, reader.getNumberOfRows());
+ assertEquals(20000, stats[0].getNumberOfValues());
+
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+ assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 20000 hasNull: false min: 0 max: 0 sum: 0",
+ stats[1].toString());
+
+ assertEquals("b", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals(20000, stats[2].getNumberOfValues());
+ assertEquals("count: 20000 hasNull: false min: a max: b sum: 20000",
+ stats[2].toString());
+
+ // check the inspectors
+ Assert.assertEquals("struct<a:int,b:string,c:boolean,d:array<struct<z:int>>>",
+ reader.getSchema().toString());
+
+ RecordReader rows = reader.rows();
+
+ // none of the stripes will have a PRESENT stream
+ List<Boolean> expected = Lists.newArrayList();
+ for (StripeInformation sinfo : reader.getStripes()) {
+ expected.add(false);
+ }
+
+ List<Boolean> got = Lists.newArrayList();
+ // check if the stripe footer contains a PRESENT stream
+ for (StripeInformation sinfo : reader.getStripes()) {
+ OrcProto.StripeFooter sf =
+ ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+ got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
+ != -1);
+ }
+ assertEquals(expected, got);
+
+ rows.seekToRow(19998);
+
+ batch = reader.getSchema().createRowBatch();
+ LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
+ BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
+ LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
+ ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
+ LongColumnVector dElements =
+ (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]);
+
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(2, batch.size);
+
+ // last-1 row
+ assertEquals(0, aColumn.vector[0]);
+ assertEquals("a", bColumn.toString(0));
+ assertEquals(1, cColumn.vector[0]);
+ assertEquals(0, dColumn.offsets[0]);
+ assertEquals(1, dColumn.lengths[0]);
+ assertEquals(100, dElements.vector[0]);
+
+ // last row
+ assertEquals(0, aColumn.vector[1]);
+ assertEquals("b", bColumn.toString(1));
+ assertEquals(1, cColumn.vector[1]);
+ assertEquals(1, dColumn.offsets[1]);
+ assertEquals(1, dColumn.lengths[1]);
+ assertEquals(100, dElements.vector[1]);
+ rows.close();
+ }
+
+ @Test
+ public void testColumnsWithNullAndCompression() throws Exception {
+ TypeDescription schema = createMyStruct();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(100000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ addRow(writer, batch, 3, "a", true, 100);
+ addRow(writer, batch, null, "b", true, 100);
+ addRow(writer, batch, 3, null, false, 100);
+ addRow(writer, batch, 3, "d", true, 100);
+ addRow(writer, batch, 2, "e", true, 100);
+ addRow(writer, batch, 2, "f", true, 100);
+ addRow(writer, batch, 2, "g", true, 100);
+ addRow(writer, batch, 2, "h", true, 100);
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ // check the stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(8, reader.getNumberOfRows());
+ assertEquals(8, stats[0].getNumberOfValues());
+
+ assertEquals(3, ((IntegerColumnStatistics) stats[1]).getMaximum());
+ assertEquals(2, ((IntegerColumnStatistics) stats[1]).getMinimum());
+ assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
+ assertEquals(17, ((IntegerColumnStatistics) stats[1]).getSum());
+ assertEquals("count: 7 hasNull: true min: 2 max: 3 sum: 17",
+ stats[1].toString());
+
+ assertEquals("h", ((StringColumnStatistics) stats[2]).getMaximum());
+ assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
+ assertEquals(7, stats[2].getNumberOfValues());
+ assertEquals("count: 7 hasNull: true min: a max: h sum: 7",
+ stats[2].toString());
+
+ // check the inspectors
+ batch = reader.getSchema().createRowBatch();
+ LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
+ BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
+ LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
+ ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
+ LongColumnVector dElements =
+ (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]);
+ Assert.assertEquals("struct<a:int,b:string,c:boolean,d:array<struct<z:int>>>",
+ reader.getSchema().toString());
+
+ RecordReader rows = reader.rows();
+ // only the last stripe will have a PRESENT stream
+ List<Boolean> expected = Lists.newArrayList();
+ for (StripeInformation sinfo : reader.getStripes()) {
+ expected.add(false);
+ }
+ expected.set(expected.size() - 1, true);
+
+ List<Boolean> got = Lists.newArrayList();
+ // check if the stripe footer contains a PRESENT stream
+ for (StripeInformation sinfo : reader.getStripes()) {
+ OrcProto.StripeFooter sf =
+ ((RecordReaderImpl) rows).readStripeFooter(sinfo);
+ got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
+ != -1);
+ }
+ assertEquals(expected, got);
+
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(8, batch.size);
+
+ // row 1
+ assertEquals(3, aColumn.vector[0]);
+ assertEquals("a", bColumn.toString(0));
+ assertEquals(1, cColumn.vector[0]);
+ assertEquals(0, dColumn.offsets[0]);
+ assertEquals(1, dColumn.lengths[0]);
+ assertEquals(100, dElements.vector[0]);
+
+ // row 2
+ assertEquals(true, aColumn.isNull[1]);
+ assertEquals("b", bColumn.toString(1));
+ assertEquals(1, cColumn.vector[1]);
+ assertEquals(1, dColumn.offsets[1]);
+ assertEquals(1, dColumn.lengths[1]);
+ assertEquals(100, dElements.vector[1]);
+
+ // row 3
+ assertEquals(3, aColumn.vector[2]);
+ assertEquals(true, bColumn.isNull[2]);
+ assertEquals(0, cColumn.vector[2]);
+ assertEquals(2, dColumn.offsets[2]);
+ assertEquals(1, dColumn.lengths[2]);
+ assertEquals(100, dElements.vector[2]);
+
+ rows.close();
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestOrcTimezone1.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestOrcTimezone1.java b/orc/src/test/org/apache/hive/orc/TestOrcTimezone1.java
new file mode 100644
index 0000000..6c65c74
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestOrcTimezone1.java
@@ -0,0 +1,189 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+
+import java.io.File;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.TimeZone;
+
+import junit.framework.Assert;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Tests reading ORC timestamps back when the reader's default time zone differs from the writer's.
+ */
+@RunWith(Parameterized.class)
+public class TestOrcTimezone1 {
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+ String writerTimeZone;
+ String readerTimeZone;
+ static TimeZone defaultTimeZone = TimeZone.getDefault();
+
+ public TestOrcTimezone1(String writerTZ, String readerTZ) {
+ this.writerTimeZone = writerTZ;
+ this.readerTimeZone = readerTZ;
+ }
+
+ @Parameterized.Parameters
+ public static Collection<Object[]> data() {
+ List<Object[]> result = Arrays.asList(new Object[][]{
+ /* Extreme timezones */
+ {"GMT-12:00", "GMT+14:00"},
+ /* No difference in DST */
+ {"America/Los_Angeles", "America/Los_Angeles"}, /* same timezone both with DST */
+ {"Europe/Berlin", "Europe/Berlin"}, /* same as above but europe */
+ {"America/Phoenix", "Asia/Kolkata"} /* Writer no DST, Reader no DST */,
+ {"Europe/Berlin", "America/Los_Angeles"} /* Writer DST, Reader DST */,
+ {"Europe/Berlin", "America/Chicago"} /* Writer DST, Reader DST */,
+ /* With DST difference */
+ {"Europe/Berlin", "UTC"},
+ {"UTC", "Europe/Berlin"} /* Writer no DST, Reader DST */,
+ {"America/Los_Angeles", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
+ {"Europe/Berlin", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
+ /* Timezone offsets for the reader have changed historically */
+ {"Asia/Saigon", "Pacific/Enderbury"},
+ {"UTC", "Asia/Jerusalem"},
+
+ // NOTE:
+ // "1995-01-01 03:00:00.688888888" is not a valid time in the Pacific/Enderbury timezone.
+ // On 1995-01-01 00:00:00 the GMT offset moved from -11:00 to +13:00, which makes all
+ // values on 1995-01-01 invalid. Try this with joda time:
+ // new MutableDateTime("1995-01-01", DateTimeZone.forTimeZone(readerTimeZone));
+ // (a standalone java.time sketch of this offset jump follows after this file)
+ });
+ return result;
+ }
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @After
+ public void restoreTimeZone() {
+ TimeZone.setDefault(defaultTimeZone);
+ }
+
+ @Test
+ public void testTimestampWriter() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .bufferSize(10000));
+ assertEquals(writerTimeZone, TimeZone.getDefault().getID());
+ List<String> ts = Lists.newArrayList();
+ ts.add("2003-01-01 01:00:00.000000222");
+ ts.add("1996-08-02 09:00:00.723100809");
+ ts.add("1999-01-01 02:00:00.999999999");
+ ts.add("1995-01-02 03:00:00.688888888");
+ ts.add("2002-01-01 04:00:00.1");
+ ts.add("2010-03-02 05:00:00.000009001");
+ ts.add("2005-01-01 06:00:00.000002229");
+ ts.add("2006-01-01 07:00:00.900203003");
+ ts.add("2003-01-01 08:00:00.800000007");
+ ts.add("1998-11-02 10:00:00.857340643");
+ ts.add("2008-10-02 11:00:00.0");
+ ts.add("2037-01-01 00:00:00.000999");
+ ts.add("2014-03-28 00:00:00.0");
+ VectorizedRowBatch batch = schema.createRowBatch();
+ TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+ for (String t : ts) {
+ times.set(batch.size++, Timestamp.valueOf(t));
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(readerTimeZone, TimeZone.getDefault().getID());
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ times = (TimestampColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
+ }
+ }
+ rows.close();
+ }
+
+ @Test
+ public void testReadTimestampFormat_0_11() throws Exception {
+ TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+ Path oldFilePath = new Path(getClass().getClassLoader().
+ getSystemResource("orc-file-11-format.orc").getPath());
+ Reader reader = OrcFile.createReader(oldFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ TypeDescription schema = reader.getSchema();
+ int col = schema.getFieldNames().indexOf("ts");
+ VectorizedRowBatch batch = schema.createRowBatch(10);
+ TimestampColumnVector ts = (TimestampColumnVector) batch.cols[col];
+
+ boolean[] include = new boolean[schema.getMaximumId() + 1];
+ include[schema.getChildren().get(col).getId()] = true;
+ RecordReader rows = reader.rows
+ (new Reader.Options().include(include));
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
+ ts.asScratchTimestamp(0));
+
+ // seek to the last row (index 7499) and check its contents
+ rows.seekToRow(7499);
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1, batch.size);
+ assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"),
+ ts.asScratchTimestamp(0));
+
+ // verify there are no more batches, then close the reader
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+}
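
The NOTE in data() above describes a 24-hour offset jump in Pacific/Enderbury
around the 1994/1995 boundary; the skipped local day and the exact transition
instant come from the JVM's tzdata, which is presumably why the test data uses
"1995-01-02 ..." rather than "1995-01-01 ...". A small java.time sketch
(illustrative only, not part of this patch) that locates the jump:

    import java.time.Instant;
    import java.time.LocalDateTime;
    import java.time.ZoneId;
    import java.time.zone.ZoneOffsetTransition;
    import java.time.zone.ZoneRules;

    public class EnderburyOffsetJump {
      public static void main(String[] args) {
        ZoneRules rules = ZoneId.of("Pacific/Enderbury").getRules();
        // Find the first offset transition after December 1994 and show which
        // local times disappear in the gap.
        ZoneOffsetTransition jump =
            rules.nextTransition(Instant.parse("1994-12-01T00:00:00Z"));
        System.out.println("offset " + jump.getOffsetBefore() + " -> "
            + jump.getOffsetAfter() + " at " + jump.getInstant()
            + ", gap from " + jump.getDateTimeBefore()
            + " to " + jump.getDateTimeAfter());
        // An empty list means 1995-01-01 03:00 falls inside the gap; tzdata
        // releases that place the skipped day elsewhere print one offset instead.
        System.out.println(rules.getValidOffsets(
            LocalDateTime.of(1995, 1, 1, 3, 0)));
      }
    }
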
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestOrcTimezone2.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestOrcTimezone2.java b/orc/src/test/org/apache/hive/orc/TestOrcTimezone2.java
new file mode 100644
index 0000000..67d3f16
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestOrcTimezone2.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.File;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+import java.util.TimeZone;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Same timestamp round-trip check as TestOrcTimezone1, but over randomly paired writer/reader time zones.
+ */
+@RunWith(Parameterized.class)
+public class TestOrcTimezone2 {
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+ String writerTimeZone;
+ String readerTimeZone;
+ static TimeZone defaultTimeZone = TimeZone.getDefault();
+
+ public TestOrcTimezone2(String writerTZ, String readerTZ) {
+ this.writerTimeZone = writerTZ;
+ this.readerTimeZone = readerTZ;
+ }
+
+ @Parameterized.Parameters
+ public static Collection<Object[]> data() {
+ String[] allTimeZones = TimeZone.getAvailableIDs();
+ Random rand = new Random(123);
+ int len = allTimeZones.length;
+ int n = 500;
+ Object[][] data = new Object[n][];
+ for (int i = 0; i < n; i++) {
+ int wIdx = rand.nextInt(len);
+ int rIdx = rand.nextInt(len);
+ data[i] = new Object[2];
+ data[i][0] = allTimeZones[wIdx];
+ data[i][1] = allTimeZones[rIdx];
+ }
+ return Arrays.asList(data);
+ }
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @After
+ public void restoreTimeZone() {
+ TimeZone.setDefault(defaultTimeZone);
+ }
+
+ @Test
+ public void testTimestampWriter() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema)
+ .stripeSize(100000).bufferSize(10000));
+ assertEquals(writerTimeZone, TimeZone.getDefault().getID());
+ List<String> ts = Lists.newArrayList();
+ ts.add("2003-01-01 01:00:00.000000222");
+ ts.add("1999-01-01 02:00:00.999999999");
+ ts.add("1995-01-02 03:00:00.688888888");
+ ts.add("2002-01-01 04:00:00.1");
+ ts.add("2010-03-02 05:00:00.000009001");
+ ts.add("2005-01-01 06:00:00.000002229");
+ ts.add("2006-01-01 07:00:00.900203003");
+ ts.add("2003-01-01 08:00:00.800000007");
+ ts.add("1996-08-02 09:00:00.723100809");
+ ts.add("1998-11-02 10:00:00.857340643");
+ ts.add("2008-10-02 11:00:00.0");
+ ts.add("2037-01-01 00:00:00.000999");
+ VectorizedRowBatch batch = schema.createRowBatch();
+ TimestampColumnVector tsc = (TimestampColumnVector) batch.cols[0];
+ for (String t : ts) {
+ tsc.set(batch.size++, Timestamp.valueOf(t));
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(readerTimeZone, TimeZone.getDefault().getID());
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ batch = reader.getSchema().createRowBatch();
+ tsc = (TimestampColumnVector) batch.cols[0];
+ while (rows.nextBatch(batch)) {
+ for (int r=0; r < batch.size; ++r) {
+ assertEquals(ts.get(idx++), tsc.asScratchTimestamp(r).toString());
+ }
+ }
+ rows.close();
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestOrcTimezone3.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestOrcTimezone3.java b/orc/src/test/org/apache/hive/orc/TestOrcTimezone3.java
new file mode 100644
index 0000000..25d9720
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestOrcTimezone3.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.File;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.TimeZone;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Timestamp round-trip check for values just before the Unix epoch, with differing writer and reader time zones.
+ */
+@RunWith(Parameterized.class)
+public class TestOrcTimezone3 {
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+ String writerTimeZone;
+ String readerTimeZone;
+ static TimeZone defaultTimeZone = TimeZone.getDefault();
+
+ public TestOrcTimezone3(String writerTZ, String readerTZ) {
+ this.writerTimeZone = writerTZ;
+ this.readerTimeZone = readerTZ;
+ }
+
+ @Parameterized.Parameters
+ public static Collection<Object[]> data() {
+ List<Object[]> result = Arrays.asList(new Object[][]{
+ {"America/Chicago", "America/Los_Angeles"},
+ });
+ return result;
+ }
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcTimezone3." +
+ testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @After
+ public void restoreTimeZone() {
+ TimeZone.setDefault(defaultTimeZone);
+ }
+
+ @Test
+ public void testTimestampWriter() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .bufferSize(10000));
+ assertEquals(writerTimeZone, TimeZone.getDefault().getID());
+ List<String> ts = Lists.newArrayList();
+ ts.add("1969-12-31 16:00:14.007");
+ ts.add("1969-12-31 16:00:06.021");
+ ts.add("1969-12-31 16:00:03.963");
+ VectorizedRowBatch batch = schema.createRowBatch();
+ TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
+ for (String t : ts) {
+ times.set(batch.size++, Timestamp.valueOf(t));
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(readerTimeZone, TimeZone.getDefault().getID());
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ times = (TimestampColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
+ }
+ }
+ rows.close();
+ }
+}
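
All three TestOrcTimezone suites lean on the same underlying behaviour:
java.sql.Timestamp.valueOf interprets its text in the JVM's default time zone,
so the same wall-clock string denotes different instants under the writer's and
the reader's zones, and the ORC reader has to compensate to hand back the
original text. A minimal sketch of the effect the tests guard against (class
name illustrative, not part of this patch):

    import java.sql.Timestamp;
    import java.util.TimeZone;

    public class DefaultZoneDemo {
      public static void main(String[] args) {
        TimeZone saved = TimeZone.getDefault();
        try {
          TimeZone.setDefault(TimeZone.getTimeZone("America/Chicago"));
          long writerMillis = Timestamp.valueOf("1969-12-31 16:00:14.007").getTime();
          TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
          long readerMillis = Timestamp.valueOf("1969-12-31 16:00:14.007").getTime();
          // Same wall-clock text, two different instants (two hours apart here);
          // the reader must correct for the writer's zone to reproduce the text.
          System.out.println(writerMillis + " vs " + readerMillis);
        } finally {
          TimeZone.setDefault(saved);
        }
      }
    }
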
[30/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/PhysicalFsWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/PhysicalFsWriter.java b/orc/src/java/org/apache/hive/orc/impl/PhysicalFsWriter.java
new file mode 100644
index 0000000..47c33bb
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/PhysicalFsWriter.java
@@ -0,0 +1,529 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hive.orc.CompressionCodec;
+import org.apache.hive.orc.CompressionCodec.Modifier;
+import org.apache.hive.orc.CompressionKind;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.OrcFile.CompressionStrategy;
+import org.apache.hive.orc.OrcProto;
+import org.apache.hive.orc.OrcProto.BloomFilterIndex;
+import org.apache.hive.orc.OrcProto.Footer;
+import org.apache.hive.orc.OrcProto.Metadata;
+import org.apache.hive.orc.OrcProto.PostScript;
+import org.apache.hive.orc.OrcProto.Stream.Kind;
+import org.apache.hive.orc.OrcProto.StripeFooter;
+import org.apache.hive.orc.OrcProto.StripeInformation;
+import org.apache.hive.orc.OrcProto.RowIndex.Builder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.protobuf.CodedOutputStream;
+
+public class PhysicalFsWriter implements PhysicalWriter {
+ private static final Logger LOG = LoggerFactory.getLogger(PhysicalFsWriter.class);
+
+ private static final int HDFS_BUFFER_SIZE = 256 * 1024;
+
+ private FSDataOutputStream rawWriter = null;
+ // the compressed metadata information outStream
+ private OutStream writer = null;
+ // a protobuf outStream around streamFactory
+ private CodedOutputStream protobufWriter = null;
+
+ private final FileSystem fs;
+ private final Path path;
+ private final long blockSize;
+ private final int bufferSize;
+ private final CompressionCodec codec;
+ private final double paddingTolerance;
+ private final long defaultStripeSize;
+ private final CompressionKind compress;
+ private final boolean addBlockPadding;
+ private final CompressionStrategy compressionStrategy;
+
+ // the streams that make up the current stripe
+ private final Map<StreamName, BufferedStream> streams =
+ new TreeMap<StreamName, BufferedStream>();
+
+ private long adjustedStripeSize;
+ private long headerLength;
+ private long stripeStart;
+ private int metadataLength;
+ private int footerLength;
+
+ public PhysicalFsWriter(FileSystem fs, Path path, int numColumns, OrcFile.WriterOptions opts) {
+ this.fs = fs;
+ this.path = path;
+ this.defaultStripeSize = this.adjustedStripeSize = opts.getStripeSize();
+ this.addBlockPadding = opts.getBlockPadding();
+ if (opts.isEnforceBufferSize()) {
+ this.bufferSize = opts.getBufferSize();
+ } else {
+ this.bufferSize = getEstimatedBufferSize(defaultStripeSize, numColumns, opts.getBufferSize());
+ }
+ this.compress = opts.getCompress();
+ this.compressionStrategy = opts.getCompressionStrategy();
+ codec = createCodec(compress);
+ this.paddingTolerance = opts.getPaddingTolerance();
+ this.blockSize = opts.getBlockSize();
+ LOG.info("ORC writer created for path: {} with stripeSize: {} blockSize: {}" +
+ " compression: {} bufferSize: {}", path, defaultStripeSize, blockSize,
+ compress, bufferSize);
+ }
+
+ @Override
+ public void initialize() throws IOException {
+ if (rawWriter != null) return;
+ rawWriter = fs.create(path, false, HDFS_BUFFER_SIZE,
+ fs.getDefaultReplication(path), blockSize);
+ rawWriter.writeBytes(OrcFile.MAGIC);
+ headerLength = rawWriter.getPos();
+ writer = new OutStream("metadata", bufferSize, codec,
+ new DirectStream(rawWriter));
+ protobufWriter = CodedOutputStream.newInstance(writer);
+ }
+
+ private void padStripe(long indexSize, long dataSize, int footerSize) throws IOException {
+ this.stripeStart = rawWriter.getPos();
+ final long currentStripeSize = indexSize + dataSize + footerSize;
+ final long available = blockSize - (stripeStart % blockSize);
+ final long overflow = currentStripeSize - adjustedStripeSize;
+ final float availRatio = (float) available / (float) defaultStripeSize;
+
+ if (availRatio > 0.0f && availRatio < 1.0f
+ && availRatio > paddingTolerance) {
+ // Adjust the default stripe size to fit into the remaining space, and also
+ // adjust the next stripe using a correction based on the current stripe size
+ // and the user-specified padding tolerance. Since a stripe can overflow the
+ // default stripe size, applying this correction avoids writing a portion of
+ // the last stripe into the next HDFS block.
+ double correction = overflow > 0 ? (double) overflow
+ / (double) adjustedStripeSize : 0.0;
+
+ // correction should not be greater than user specified padding
+ // tolerance
+ correction = correction > paddingTolerance ? paddingTolerance
+ : correction;
+
+ // adjust next stripe size based on current stripe estimate correction
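+ // e.g. with a 64 MiB default stripe, 48 MiB left in the block (availRatio 0.75)
+ // and a 3.2 MiB overflow over a 64 MiB adjusted size (correction 0.05), the
+ // next stripe target becomes 0.95 * 0.75 * 64 MiB = 45.6 MiB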
+ adjustedStripeSize = (long) ((1.0f - correction) * (availRatio * defaultStripeSize));
+ } else if (availRatio >= 1.0) {
+ adjustedStripeSize = defaultStripeSize;
+ }
+
+ if (availRatio < paddingTolerance && addBlockPadding) {
+ long padding = blockSize - (stripeStart % blockSize);
+ byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, padding)];
+ LOG.info(String.format("Padding ORC by %d bytes (<= %.2f * %d)",
+ padding, availRatio, defaultStripeSize));
+ stripeStart += padding;
+ while (padding > 0) {
+ int writeLen = (int) Math.min(padding, pad.length);
+ rawWriter.write(pad, 0, writeLen);
+ padding -= writeLen;
+ }
+ adjustedStripeSize = defaultStripeSize;
+ } else if (currentStripeSize < blockSize
+ && (stripeStart % blockSize) + currentStripeSize > blockSize) {
+ // even if you don't pad, reset the default stripe size when crossing a
+ // block boundary
+ adjustedStripeSize = defaultStripeSize;
+ }
+ }
+
+ /**
+ * An output receiver that writes the ByteBuffers to the output stream
+ * as they are received.
+ */
+ private class DirectStream implements OutStream.OutputReceiver {
+ private final FSDataOutputStream output;
+
+ DirectStream(FSDataOutputStream output) {
+ this.output = output;
+ }
+
+ @Override
+ public void output(ByteBuffer buffer) throws IOException {
+ output.write(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
+ }
+ }
+
+ @Override
+ public long getPhysicalStripeSize() {
+ return adjustedStripeSize;
+ }
+
+ @Override
+ public boolean isCompressed() {
+ return codec != null;
+ }
+
+
+ public static CompressionCodec createCodec(CompressionKind kind) {
+ switch (kind) {
+ case NONE:
+ return null;
+ case ZLIB:
+ return new ZlibCodec();
+ case SNAPPY:
+ return new SnappyCodec();
+ case LZO:
+ try {
+ ClassLoader loader = Thread.currentThread().getContextClassLoader();
+ if (loader == null) {
+ loader = WriterImpl.class.getClassLoader();
+ }
+ @SuppressWarnings("unchecked")
+ Class<? extends CompressionCodec> lzo =
+ (Class<? extends CompressionCodec>)
+ loader.loadClass("org.apache.hadoop.hive.ql.io.orc.LzoCodec");
+ return lzo.newInstance();
+ } catch (ClassNotFoundException e) {
+ throw new IllegalArgumentException("LZO is not available.", e);
+ } catch (InstantiationException e) {
+ throw new IllegalArgumentException("Problem initializing LZO", e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalArgumentException("Insufficient access to LZO", e);
+ }
+ default:
+ throw new IllegalArgumentException("Unknown compression codec: " +
+ kind);
+ }
+ }
+
+ private void writeStripeFooter(StripeFooter footer, long dataSize, long indexSize,
+ StripeInformation.Builder dirEntry) throws IOException {
+ footer.writeTo(protobufWriter);
+ protobufWriter.flush();
+ writer.flush();
+ dirEntry.setOffset(stripeStart);
+ dirEntry.setFooterLength(rawWriter.getPos() - stripeStart - dataSize - indexSize);
+ }
+
+ @VisibleForTesting
+ public static int getEstimatedBufferSize(long stripeSize, int numColumns,
+ int bs) {
+ // The worst case is that there are 2 big streams per column and
+ // we want to guarantee that each stream gets ~10 buffers.
+ // This keeps buffers small enough that we don't get really small stripe
+ // sizes.
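+ // For example, a 64 MiB stripe with 20 columns gives 67,108,864 / (20 * 20)
+ // ~= 164 KiB, which getClosestBufferSize rounds up to 256 KiB before the
+ // result is capped at the requested buffer size.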
+ int estBufferSize = (int) (stripeSize / (20 * numColumns));
+ estBufferSize = getClosestBufferSize(estBufferSize);
+ return estBufferSize > bs ? bs : estBufferSize;
+ }
+
+ private static int getClosestBufferSize(int estBufferSize) {
+ final int kb4 = 4 * 1024;
+ final int kb8 = 8 * 1024;
+ final int kb16 = 16 * 1024;
+ final int kb32 = 32 * 1024;
+ final int kb64 = 64 * 1024;
+ final int kb128 = 128 * 1024;
+ final int kb256 = 256 * 1024;
+ if (estBufferSize <= kb4) {
+ return kb4;
+ } else if (estBufferSize > kb4 && estBufferSize <= kb8) {
+ return kb8;
+ } else if (estBufferSize > kb8 && estBufferSize <= kb16) {
+ return kb16;
+ } else if (estBufferSize > kb16 && estBufferSize <= kb32) {
+ return kb32;
+ } else if (estBufferSize > kb32 && estBufferSize <= kb64) {
+ return kb64;
+ } else if (estBufferSize > kb64 && estBufferSize <= kb128) {
+ return kb128;
+ } else {
+ return kb256;
+ }
+ }
+
+ @Override
+ public void writeFileMetadata(Metadata.Builder builder) throws IOException {
+ long startPosn = rawWriter.getPos();
+ Metadata metadata = builder.build();
+ metadata.writeTo(protobufWriter);
+ protobufWriter.flush();
+ writer.flush();
+ this.metadataLength = (int) (rawWriter.getPos() - startPosn);
+ }
+
+ @Override
+ public void writeFileFooter(Footer.Builder builder) throws IOException {
+ long bodyLength = rawWriter.getPos() - metadataLength;
+ builder.setContentLength(bodyLength);
+ builder.setHeaderLength(headerLength);
+ long startPosn = rawWriter.getPos();
+ Footer footer = builder.build();
+ footer.writeTo(protobufWriter);
+ protobufWriter.flush();
+ writer.flush();
+ this.footerLength = (int) (rawWriter.getPos() - startPosn);
+ }
+
+ @Override
+ public void writePostScript(PostScript.Builder builder) throws IOException {
+ builder.setCompression(writeCompressionKind(compress));
+ builder.setFooterLength(footerLength);
+ builder.setMetadataLength(metadataLength);
+ if (compress != CompressionKind.NONE) {
+ builder.setCompressionBlockSize(bufferSize);
+ }
+ PostScript ps = builder.build();
+ // need to write this uncompressed
+ long startPosn = rawWriter.getPos();
+ ps.writeTo(rawWriter);
+ long length = rawWriter.getPos() - startPosn;
+ if (length > 255) {
+ throw new IllegalArgumentException("PostScript too large at " + length);
+ }
+ rawWriter.writeByte((int)length);
+ }
+
+ @Override
+ public void close() throws IOException {
+ rawWriter.close();
+ }
+
+ private OrcProto.CompressionKind writeCompressionKind(CompressionKind kind) {
+ switch (kind) {
+ case NONE: return OrcProto.CompressionKind.NONE;
+ case ZLIB: return OrcProto.CompressionKind.ZLIB;
+ case SNAPPY: return OrcProto.CompressionKind.SNAPPY;
+ case LZO: return OrcProto.CompressionKind.LZO;
+ default:
+ throw new IllegalArgumentException("Unknown compression " + kind);
+ }
+ }
+
+ @Override
+ public void flush() throws IOException {
+ rawWriter.hflush();
+ // TODO: reset?
+ }
+
+ @Override
+ public long getRawWriterPosition() throws IOException {
+ return rawWriter.getPos();
+ }
+
+ @Override
+ public void appendRawStripe(byte[] stripe, int offset, int length,
+ StripeInformation.Builder dirEntry) throws IOException {
+ long start = rawWriter.getPos();
+ long availBlockSpace = blockSize - (start % blockSize);
+
+ // see if stripe can fit in the current hdfs block, else pad the remaining
+ // space in the block
+ if (length < blockSize && length > availBlockSpace &&
+ addBlockPadding) {
+ byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, availBlockSpace)];
+ LOG.info(String.format("Padding ORC by %d bytes while merging..",
+ availBlockSpace));
+ start += availBlockSpace;
+ while (availBlockSpace > 0) {
+ int writeLen = (int) Math.min(availBlockSpace, pad.length);
+ rawWriter.write(pad, 0, writeLen);
+ availBlockSpace -= writeLen;
+ }
+ }
+
+ rawWriter.write(stripe);
+ dirEntry.setOffset(start);
+ }
+
+
+ /**
+ * This class is used to hold the contents of streams as they are buffered.
+ * The TreeWriters write to the outStream; as buffers fill up, the codec
+ * compresses the data and stores the resulting buffers in the output list.
+ * When the stripe is being written, the whole stream is written to the file.
+ */
+ private class BufferedStream implements OutStream.OutputReceiver {
+ private final OutStream outStream;
+ private final List<ByteBuffer> output = new ArrayList<ByteBuffer>();
+
+ BufferedStream(String name, int bufferSize,
+ CompressionCodec codec) throws IOException {
+ outStream = new OutStream(name, bufferSize, codec, this);
+ }
+
+ /**
+ * Receive a buffer from the compression codec.
+ * @param buffer the buffer to save
+ */
+ @Override
+ public void output(ByteBuffer buffer) {
+ output.add(buffer);
+ }
+
+ /**
+ * @return the number of bytes in buffers that are allocated to this stream.
+ */
+ public long getBufferSize() {
+ long result = 0;
+ for (ByteBuffer buf: output) {
+ result += buf.capacity();
+ }
+ return outStream.getBufferSize() + result;
+ }
+
+ /**
+ * Write any saved buffers to the OutputStream if needed, and clears all the buffers.
+ */
+ public void spillToDiskAndClear() throws IOException {
+ if (!outStream.isSuppressed()) {
+ for (ByteBuffer buffer: output) {
+ rawWriter.write(buffer.array(), buffer.arrayOffset() + buffer.position(),
+ buffer.remaining());
+ }
+ }
+ outStream.clear();
+ output.clear();
+ }
+
+ /**
+ * @return The number of bytes that will be written to the output. Assumes the stream writing
+ * into this receiver has already been flushed.
+ */
+ public long getOutputSize() {
+ long result = 0;
+ for (ByteBuffer buffer: output) {
+ result += buffer.remaining();
+ }
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return outStream.toString();
+ }
+ }
+
+ @Override
+ public OutStream getOrCreatePhysicalStream(StreamName name) throws IOException {
+ BufferedStream result = streams.get(name);
+ if (result == null) {
+ EnumSet<Modifier> modifiers = createCompressionModifiers(name.getKind());
+ result = new BufferedStream(name.toString(), bufferSize,
+ codec == null ? null : codec.modify(modifiers));
+ streams.put(name, result);
+ }
+ return result.outStream;
+ }
+
+ private EnumSet<Modifier> createCompressionModifiers(Kind kind) {
+ switch (kind) {
+ case BLOOM_FILTER:
+ case DATA:
+ case DICTIONARY_DATA:
+ return EnumSet.of(Modifier.TEXT,
+ compressionStrategy == CompressionStrategy.SPEED ? Modifier.FAST : Modifier.DEFAULT);
+ case LENGTH:
+ case DICTIONARY_COUNT:
+ case PRESENT:
+ case ROW_INDEX:
+ case SECONDARY:
+ // easily compressed using the fastest modes
+ return EnumSet.of(CompressionCodec.Modifier.FASTEST, CompressionCodec.Modifier.BINARY);
+ default:
+ LOG.warn("Missing ORC compression modifiers for " + kind);
+ return null;
+ }
+ }
+
+ @Override
+ public void finalizeStripe(StripeFooter.Builder footerBuilder,
+ StripeInformation.Builder dirEntry) throws IOException {
+ long indexSize = 0;
+ long dataSize = 0;
+ for (Map.Entry<StreamName, BufferedStream> pair: streams.entrySet()) {
+ BufferedStream receiver = pair.getValue();
+ OutStream outStream = receiver.outStream;
+ if (!outStream.isSuppressed()) {
+ outStream.flush();
+ long streamSize = receiver.getOutputSize();
+ StreamName name = pair.getKey();
+ footerBuilder.addStreams(OrcProto.Stream.newBuilder().setColumn(name.getColumn())
+ .setKind(name.getKind()).setLength(streamSize));
+ if (StreamName.Area.INDEX == name.getArea()) {
+ indexSize += streamSize;
+ } else {
+ dataSize += streamSize;
+ }
+ }
+ }
+ dirEntry.setIndexLength(indexSize).setDataLength(dataSize);
+
+ OrcProto.StripeFooter footer = footerBuilder.build();
+ // Do we need to pad the file so the stripe doesn't straddle a block boundary?
+ padStripe(indexSize, dataSize, footer.getSerializedSize());
+
+ // write out the data streams
+ for (Map.Entry<StreamName, BufferedStream> pair : streams.entrySet()) {
+ pair.getValue().spillToDiskAndClear();
+ }
+ // Write out the footer.
+ writeStripeFooter(footer, dataSize, indexSize, dirEntry);
+ }
+
+ @Override
+ public long estimateMemory() {
+ long result = 0;
+ for (BufferedStream stream: streams.values()) {
+ result += stream.getBufferSize();
+ }
+ return result;
+ }
+
+ @Override
+ public void writeIndexStream(StreamName name, Builder rowIndex) throws IOException {
+ OutStream stream = getOrCreatePhysicalStream(name);
+ rowIndex.build().writeTo(stream);
+ stream.flush();
+ }
+
+ @Override
+ public void writeBloomFilterStream(
+ StreamName name, BloomFilterIndex.Builder bloomFilterIndex) throws IOException {
+ OutStream stream = getOrCreatePhysicalStream(name);
+ bloomFilterIndex.build().writeTo(stream);
+ stream.flush();
+ }
+
+ @VisibleForTesting
+ public OutputStream getStream() throws IOException {
+ initialize();
+ return rawWriter;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/PhysicalWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/PhysicalWriter.java b/orc/src/java/org/apache/hive/orc/impl/PhysicalWriter.java
new file mode 100644
index 0000000..dc0089a
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/PhysicalWriter.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+
+import org.apache.hive.orc.OrcProto.BloomFilterIndex;
+import org.apache.hive.orc.OrcProto.Footer;
+import org.apache.hive.orc.OrcProto.Metadata;
+import org.apache.hive.orc.OrcProto.PostScript;
+import org.apache.hive.orc.OrcProto.RowIndex;
+import org.apache.hive.orc.OrcProto.StripeFooter;
+import org.apache.hive.orc.OrcProto.StripeInformation;
+
+public interface PhysicalWriter {
+
+ /**
+ * Creates all the streams/connections/etc. necessary to write.
+ */
+ void initialize() throws IOException;
+
+ /**
+ * Writes out the file metadata.
+ * @param builder Metadata builder to finalize and write.
+ */
+ void writeFileMetadata(Metadata.Builder builder) throws IOException;
+
+ /**
+ * Writes out the file footer.
+ * @param builder Footer builder to finalize and write.
+ */
+ void writeFileFooter(Footer.Builder builder) throws IOException;
+
+ /**
+ * Writes out the postscript (including the size byte if needed).
+ * @param builder Postscript builder to finalize and write.
+ */
+ void writePostScript(PostScript.Builder builder) throws IOException;
+
+ /**
+ * Creates physical stream to write data to.
+ * @param name Stream name.
+ * @return The output stream.
+ */
+ OutStream getOrCreatePhysicalStream(StreamName name) throws IOException;
+
+ /**
+ * Flushes the data in all the streams, spills them to disk, and writes out the stripe footer.
+ * @param footer Stripe footer to be updated with relevant data and written out.
+ * @param dirEntry File metadata entry for the stripe, to be updated with relevant data.
+ */
+ void finalizeStripe(StripeFooter.Builder footer,
+ StripeInformation.Builder dirEntry) throws IOException;
+
+ /**
+ * Writes out the index for the stripe column.
+ * @param name Stream name.
+ * @param rowIndex Row index entries to write.
+ */
+ void writeIndexStream(StreamName name, RowIndex.Builder rowIndex) throws IOException;
+
+ /**
+ * Writes out the bloom filter index for the stripe column.
+ * @param streamName Stream name.
+ * @param bloomFilterIndex Bloom filter index to write.
+ */
+ void writeBloomFilterStream(StreamName streamName,
+ BloomFilterIndex.Builder bloomFilterIndex) throws IOException;
+
+ /**
+ * Closes the writer.
+ */
+ void close() throws IOException;
+
+ /**
+ * Force-flushes the writer.
+ */
+ void flush() throws IOException;
+
+ /**
+ * @return the physical writer position (e.g. for updater).
+ */
+ long getRawWriterPosition() throws IOException;
+
+ /** @return physical stripe size, taking padding into account. */
+ long getPhysicalStripeSize();
+
+ /** @return whether the writer is compressed. */
+ boolean isCompressed();
+
+ /**
+ * Appends raw stripe data (e.g. for file merger).
+ * @param stripe Stripe data buffer.
+ * @param offset Stripe data buffer offset.
+ * @param length Stripe data buffer length.
+ * @param dirEntry File metadata entry for the stripe, to be updated with relevant data.
+ * @throws IOException
+ */
+ void appendRawStripe(byte[] stripe, int offset, int length,
+ StripeInformation.Builder dirEntry) throws IOException;
+
+ /**
+ * @return the estimated memory usage for the stripe.
+ */
+ long estimateMemory();
+}
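
Taken together, the methods above imply a write lifecycle: initialize the file,
create buffered streams per column, finalize each stripe, then write the file
metadata, footer, and postscript before closing. A compile-level sketch of that
ordering (the StreamName constructor arguments and the bare protobuf builders
are illustrative assumptions; the real caller is the ORC writer implementation,
which is not shown in this part of the diff and fills the builders with actual
statistics and stream descriptions):

    import java.io.IOException;
    import org.apache.hive.orc.OrcProto;
    import org.apache.hive.orc.impl.OutStream;
    import org.apache.hive.orc.impl.PhysicalWriter;
    import org.apache.hive.orc.impl.StreamName;

    public class PhysicalWriterLifecycle {
      static void writeOneStripe(PhysicalWriter pw) throws IOException {
        pw.initialize();                                    // open the file, write the header
        OutStream data = pw.getOrCreatePhysicalStream(
            new StreamName(1, OrcProto.Stream.Kind.DATA));  // buffered per-column stream
        // ... column writers would push their encoded bytes into 'data' here ...
        pw.finalizeStripe(OrcProto.StripeFooter.newBuilder(),
            OrcProto.StripeInformation.newBuilder());       // flush streams + stripe footer
        pw.writeFileMetadata(OrcProto.Metadata.newBuilder());
        pw.writeFileFooter(OrcProto.Footer.newBuilder());
        pw.writePostScript(OrcProto.PostScript.newBuilder());
        pw.close();
      }
    }
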
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/PositionProvider.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/PositionProvider.java b/orc/src/java/org/apache/hive/orc/impl/PositionProvider.java
new file mode 100644
index 0000000..36c2654
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/PositionProvider.java
@@ -0,0 +1,26 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+/**
+ * An interface used for seeking to a row index.
+ */
+public interface PositionProvider {
+ long getNext();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/PositionRecorder.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/PositionRecorder.java b/orc/src/java/org/apache/hive/orc/impl/PositionRecorder.java
new file mode 100644
index 0000000..11eb9cc
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/PositionRecorder.java
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+/**
+ * An interface for recording positions in a stream.
+ */
+public interface PositionRecorder {
+ void addPosition(long offset);
+}
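
PositionRecorder is the write-side counterpart of the PositionProvider
interface above: while the row-group index is being built, each stream records
the offsets needed to get back to a row group, and a reader later replays
exactly those numbers through a PositionProvider when seeking. A toy in-memory
pairing of the two (illustrative only; the real implementations in this patch
live elsewhere in the writer and reader code):

    import java.util.ArrayList;
    import java.util.List;
    import org.apache.hive.orc.impl.PositionProvider;
    import org.apache.hive.orc.impl.PositionRecorder;

    public class ListPositions implements PositionRecorder, PositionProvider {
      private final List<Long> positions = new ArrayList<>();
      private int next = 0;

      @Override
      public void addPosition(long offset) {   // filled in while writing the index
        positions.add(offset);
      }

      @Override
      public long getNext() {                  // consumed while seeking
        return positions.get(next++);
      }
    }
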
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/PositionedOutputStream.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/PositionedOutputStream.java b/orc/src/java/org/apache/hive/orc/impl/PositionedOutputStream.java
new file mode 100644
index 0000000..1b8f7c2
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/PositionedOutputStream.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+public abstract class PositionedOutputStream extends OutputStream {
+
+ /**
+ * Record the current position to the recorder.
+ * @param recorder the object that receives the position
+ * @throws IOException
+ */
+ public abstract void getPosition(PositionRecorder recorder
+ ) throws IOException;
+
+ /**
+ * Get the memory size currently allocated as buffers associated with this
+ * stream.
+ * @return the number of bytes used by buffers.
+ */
+ public abstract long getBufferSize();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/ReaderImpl.java b/orc/src/java/org/apache/hive/orc/impl/ReaderImpl.java
new file mode 100644
index 0000000..8ab9e92
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/ReaderImpl.java
@@ -0,0 +1,763 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hive.orc.ColumnStatistics;
+import org.apache.hive.orc.CompressionCodec;
+import org.apache.hive.orc.FileFormatException;
+import org.apache.hive.orc.FileMetadata;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.CompressionKind;
+import org.apache.hive.orc.OrcUtils;
+import org.apache.hive.orc.Reader;
+import org.apache.hive.orc.RecordReader;
+import org.apache.hive.orc.TypeDescription;
+import org.apache.hive.orc.StripeInformation;
+import org.apache.hive.orc.StripeStatistics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.io.DiskRange;
+import org.apache.hadoop.hive.ql.util.JavaDataModel;
+import org.apache.hadoop.io.Text;
+import org.apache.hive.orc.OrcProto;
+
+import com.google.common.collect.Lists;
+import com.google.protobuf.CodedInputStream;
+
+public class ReaderImpl implements Reader {
+
+ private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class);
+
+ private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;
+
+ protected final FileSystem fileSystem;
+ private final long maxLength;
+ protected final Path path;
+ protected final CompressionKind compressionKind;
+ protected CompressionCodec codec;
+ protected int bufferSize;
+ protected OrcProto.Metadata metadata;
+ private List<OrcProto.StripeStatistics> stripeStats;
+ private final int metadataSize;
+ protected final List<OrcProto.Type> types;
+ private TypeDescription schema;
+ private final List<OrcProto.UserMetadataItem> userMetadata;
+ private final List<OrcProto.ColumnStatistics> fileStats;
+ private final List<StripeInformation> stripes;
+ protected final int rowIndexStride;
+ private final long contentLength, numberOfRows;
+
+ private long deserializedSize = -1;
+ protected final Configuration conf;
+ private final List<Integer> versionList;
+ private final OrcFile.WriterVersion writerVersion;
+
+ protected OrcTail tail;
+
+ public static class StripeInformationImpl
+ implements StripeInformation {
+ private final OrcProto.StripeInformation stripe;
+
+ public StripeInformationImpl(OrcProto.StripeInformation stripe) {
+ this.stripe = stripe;
+ }
+
+ @Override
+ public long getOffset() {
+ return stripe.getOffset();
+ }
+
+ @Override
+ public long getLength() {
+ return stripe.getDataLength() + getIndexLength() + getFooterLength();
+ }
+
+ @Override
+ public long getDataLength() {
+ return stripe.getDataLength();
+ }
+
+ @Override
+ public long getFooterLength() {
+ return stripe.getFooterLength();
+ }
+
+ @Override
+ public long getIndexLength() {
+ return stripe.getIndexLength();
+ }
+
+ @Override
+ public long getNumberOfRows() {
+ return stripe.getNumberOfRows();
+ }
+
+ @Override
+ public String toString() {
+ return "offset: " + getOffset() + " data: " + getDataLength() +
+ " rows: " + getNumberOfRows() + " tail: " + getFooterLength() +
+ " index: " + getIndexLength();
+ }
+ }
+
+ @Override
+ public long getNumberOfRows() {
+ return numberOfRows;
+ }
+
+ @Override
+ public List<String> getMetadataKeys() {
+ List<String> result = new ArrayList<String>();
+ for(OrcProto.UserMetadataItem item: userMetadata) {
+ result.add(item.getName());
+ }
+ return result;
+ }
+
+ @Override
+ public ByteBuffer getMetadataValue(String key) {
+ for(OrcProto.UserMetadataItem item: userMetadata) {
+ if (item.hasName() && item.getName().equals(key)) {
+ return item.getValue().asReadOnlyByteBuffer();
+ }
+ }
+ throw new IllegalArgumentException("Can't find user metadata " + key);
+ }
+
+ public boolean hasMetadataValue(String key) {
+ for(OrcProto.UserMetadataItem item: userMetadata) {
+ if (item.hasName() && item.getName().equals(key)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public CompressionKind getCompressionKind() {
+ return compressionKind;
+ }
+
+ @Override
+ public int getCompressionSize() {
+ return bufferSize;
+ }
+
+ @Override
+ public List<StripeInformation> getStripes() {
+ return stripes;
+ }
+
+ @Override
+ public long getContentLength() {
+ return contentLength;
+ }
+
+ @Override
+ public List<OrcProto.Type> getTypes() {
+ return types;
+ }
+
+ @Override
+ public OrcFile.Version getFileVersion() {
+ for (OrcFile.Version version: OrcFile.Version.values()) {
+ if ((versionList != null && !versionList.isEmpty()) &&
+ version.getMajor() == versionList.get(0) &&
+ version.getMinor() == versionList.get(1)) {
+ return version;
+ }
+ }
+ return OrcFile.Version.V_0_11;
+ }
+
+ @Override
+ public OrcFile.WriterVersion getWriterVersion() {
+ return writerVersion;
+ }
+
+ @Override
+ public OrcProto.FileTail getFileTail() {
+ return tail.getFileTail();
+ }
+
+ @Override
+ public int getRowIndexStride() {
+ return rowIndexStride;
+ }
+
+ @Override
+ public ColumnStatistics[] getStatistics() {
+ ColumnStatistics[] result = new ColumnStatistics[types.size()];
+ for(int i=0; i < result.length; ++i) {
+ result[i] = ColumnStatisticsImpl.deserialize(fileStats.get(i));
+ }
+ return result;
+ }
+
+ @Override
+ public TypeDescription getSchema() {
+ return schema;
+ }
+
+ /**
+ * Ensure this is an ORC file to prevent users from trying to read text
+ * files or RC files as ORC files.
+ * @param in the file being read
+ * @param path the filename for error messages
+ * @param psLen the postscript length
+ * @param buffer the tail of the file
+ * @throws IOException
+ */
+ protected static void ensureOrcFooter(FSDataInputStream in,
+ Path path,
+ int psLen,
+ ByteBuffer buffer) throws IOException {
+ int magicLength = OrcFile.MAGIC.length();
+ int fullLength = magicLength + 1;
+ if (psLen < fullLength || buffer.remaining() < fullLength) {
+ throw new FileFormatException("Malformed ORC file " + path +
+ ". Invalid postscript length " + psLen);
+ }
+ int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
+ byte[] array = buffer.array();
+ // now look for the magic string at the end of the postscript.
+ if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
+ // If it isn't there, this may be the 0.11.0 version of ORC.
+ // Read the first 3 bytes of the file to check for the header
+ byte[] header = new byte[magicLength];
+ in.readFully(0, header, 0, magicLength);
+ // if it isn't there, this isn't an ORC file
+ if (!Text.decode(header, 0 , magicLength).equals(OrcFile.MAGIC)) {
+ throw new FileFormatException("Malformed ORC file " + path +
+ ". Invalid postscript.");
+ }
+ }
+ }
+
+ /**
+ * Ensure this is an ORC file to prevent users from trying to read text
+ * files or RC files as ORC files.
+ * @param psLen the postscript length
+ * @param buffer the tail of the file
+ * @throws IOException
+ */
+ protected static void ensureOrcFooter(ByteBuffer buffer, int psLen) throws IOException {
+ int magicLength = OrcFile.MAGIC.length();
+ int fullLength = magicLength + 1;
+ if (psLen < fullLength || buffer.remaining() < fullLength) {
+ throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);
+ }
+
+ int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
+ byte[] array = buffer.array();
+ // now look for the magic string at the end of the postscript.
+ if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
+ // if it isn't there, this may be the 0.11.0 version of ORC.
+ // Read the first 3 bytes from the buffer to check for the header
+ if (!Text.decode(buffer.array(), 0, magicLength).equals(OrcFile.MAGIC)) {
+ throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);
+ }
+ }
+ }
+
+ /**
+ * Build a version string out of an array.
+ * @param version the version number as a list
+ * @return the human readable form of the version string
+ */
+ private static String versionString(List<Integer> version) {
+ StringBuilder buffer = new StringBuilder();
+ for(int i=0; i < version.size(); ++i) {
+ if (i != 0) {
+ buffer.append('.');
+ }
+ buffer.append(version.get(i));
+ }
+ return buffer.toString();
+ }
+
+ /**
+ * Check to see if this ORC file is from a future version and if so,
+ * warn the user that we may not be able to read all of the column encodings.
+ * @param log the logger to write any error message to
+ * @param path the data source path for error messages
+ * @param version the version of hive that wrote the file.
+ */
+ protected static void checkOrcVersion(Logger log, Path path,
+ List<Integer> version) {
+ if (version.size() >= 1) {
+ int major = version.get(0);
+ int minor = 0;
+ if (version.size() >= 2) {
+ minor = version.get(1);
+ }
+ if (major > OrcFile.Version.CURRENT.getMajor() ||
+ (major == OrcFile.Version.CURRENT.getMajor() &&
+ minor > OrcFile.Version.CURRENT.getMinor())) {
+ log.warn(path + " was written by a future Hive version " +
+ versionString(version) +
+ ". This file may not be readable by this version of Hive.");
+ }
+ }
+ }
+
+ /**
+ * Constructor that lets the user specify additional options.
+ * @param path pathname for file
+ * @param options options for reading
+ * @throws IOException
+ */
+ public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
+ FileSystem fs = options.getFilesystem();
+ if (fs == null) {
+ fs = path.getFileSystem(options.getConfiguration());
+ }
+ this.fileSystem = fs;
+ this.path = path;
+ this.conf = options.getConfiguration();
+ this.maxLength = options.getMaxLength();
+ FileMetadata fileMetadata = options.getFileMetadata();
+ if (fileMetadata != null) {
+ this.compressionKind = fileMetadata.getCompressionKind();
+ this.bufferSize = fileMetadata.getCompressionBufferSize();
+ this.codec = PhysicalFsWriter.createCodec(compressionKind);
+ this.metadataSize = fileMetadata.getMetadataSize();
+ this.stripeStats = fileMetadata.getStripeStats();
+ this.versionList = fileMetadata.getVersionList();
+ this.writerVersion =
+ OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum());
+ this.types = fileMetadata.getTypes();
+ this.rowIndexStride = fileMetadata.getRowIndexStride();
+ this.contentLength = fileMetadata.getContentLength();
+ this.numberOfRows = fileMetadata.getNumberOfRows();
+ this.fileStats = fileMetadata.getFileStats();
+ this.stripes = fileMetadata.getStripes();
+ this.userMetadata = null; // not cached and not needed here
+ } else {
+ OrcTail orcTail = options.getOrcTail();
+ if (orcTail == null) {
+ tail = extractFileTail(fs, path, options.getMaxLength());
+ options.orcTail(tail);
+ } else {
+ tail = orcTail;
+ }
+ this.compressionKind = tail.getCompressionKind();
+ this.codec = tail.getCompressionCodec();
+ this.bufferSize = tail.getCompressionBufferSize();
+ this.metadataSize = tail.getMetadataSize();
+ this.versionList = tail.getPostScript().getVersionList();
+ this.types = tail.getFooter().getTypesList();
+ this.rowIndexStride = tail.getFooter().getRowIndexStride();
+ this.contentLength = tail.getFooter().getContentLength();
+ this.numberOfRows = tail.getFooter().getNumberOfRows();
+ this.userMetadata = tail.getFooter().getMetadataList();
+ this.fileStats = tail.getFooter().getStatisticsList();
+ this.writerVersion = tail.getWriterVersion();
+ this.stripes = tail.getStripes();
+ this.stripeStats = tail.getStripeStatisticsProto();
+ }
+ this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0);
+ }
+
+ /**
+ * Get the WriterVersion based on the ORC file postscript.
+ * @param writerVersion the integer writer version
+ * @return the version of the software that produced the file
+ */
+ public static OrcFile.WriterVersion getWriterVersion(int writerVersion) {
+ for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) {
+ if (version.getId() == writerVersion) {
+ return version;
+ }
+ }
+ return OrcFile.WriterVersion.FUTURE;
+ }
+
+ private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos,
+ int footerSize, CompressionCodec codec, int bufferSize) throws IOException {
+ bb.position(footerAbsPos);
+ bb.limit(footerAbsPos + footerSize);
+ return OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer",
+ Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize));
+ }
+
+ public static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
+ int metadataSize, CompressionCodec codec, int bufferSize) throws IOException {
+ bb.position(metadataAbsPos);
+ bb.limit(metadataAbsPos + metadataSize);
+ return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata",
+ Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize));
+ }
+
+ private static OrcProto.PostScript extractPostScript(ByteBuffer bb, Path path,
+ int psLen, int psAbsOffset) throws IOException {
+ // TODO: when PB is upgraded to 2.6, newInstance(ByteBuffer) method should be used here.
+ assert bb.hasArray();
+ CodedInputStream in = CodedInputStream.newInstance(
+ bb.array(), bb.arrayOffset() + psAbsOffset, psLen);
+ OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);
+ checkOrcVersion(LOG, path, ps.getVersionList());
+
+ // Check compression codec.
+ switch (ps.getCompression()) {
+ case NONE:
+ break;
+ case ZLIB:
+ break;
+ case SNAPPY:
+ break;
+ case LZO:
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown compression");
+ }
+ return ps;
+ }
+
+ public static OrcTail extractFileTail(ByteBuffer buffer)
+ throws IOException {
+ return extractFileTail(buffer, -1, -1);
+ }
+
+ public static OrcTail extractFileTail(ByteBuffer buffer, long fileLength, long modificationTime)
+ throws IOException {
+ int readSize = buffer.limit();
+ int psLen = buffer.get(readSize - 1) & 0xff;
+ int psOffset = readSize - 1 - psLen;
+ ensureOrcFooter(buffer, psLen);
+ byte[] psBuffer = new byte[psLen];
+ System.arraycopy(buffer.array(), psOffset, psBuffer, 0, psLen);
+ OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(psBuffer);
+ int footerSize = (int) ps.getFooterLength();
+ CompressionCodec codec = PhysicalFsWriter
+ .createCodec(CompressionKind.valueOf(ps.getCompression().name()));
+ OrcProto.Footer footer = extractFooter(buffer,
+ (int) (buffer.position() + ps.getMetadataLength()),
+ footerSize, codec, (int) ps.getCompressionBlockSize());
+ OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder()
+ .setPostscriptLength(psLen)
+ .setPostscript(ps)
+ .setFooter(footer)
+ .setFileLength(fileLength);
+ // clear does not clear the contents but sets position to 0 and limit = capacity
+ buffer.clear();
+ return new OrcTail(fileTailBuilder.build(), buffer.slice(), modificationTime);
+ }
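
Both extractFileTail overloads rely on the fixed ORC tail layout: metadata, then footer, then postscript, then a single byte holding the postscript length. A worked sketch of the offset arithmetic, with illustrative sizes:

// for a 100-byte tail buffer with psLen = 19, footerSize = 30, metadataSize = 40:
//   psLen          = buffer.get(99) & 0xff   -> 19
//   psOffset       = 100 - 1 - 19            -> 80 (postscript starts here)
//   footerOffset   = 80 - 30                 -> 50 (footer precedes postscript)
//   metadataOffset = 50 - 40                 -> 10 (metadata precedes footer)
// tailSize = 1 + 19 + 30 + 40 = 90, so bytes 10..99 hold the complete tail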
+
+ protected OrcTail extractFileTail(FileSystem fs, Path path,
+ long maxFileLength) throws IOException {
+ FSDataInputStream file = fs.open(path);
+ ByteBuffer buffer;
+ OrcProto.PostScript ps;
+ OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();
+ long modificationTime;
+ try {
+ // figure out the size of the file using the option or filesystem
+ long size;
+ if (maxFileLength == Long.MAX_VALUE) {
+ FileStatus fileStatus = fs.getFileStatus(path);
+ size = fileStatus.getLen();
+ modificationTime = fileStatus.getModificationTime();
+ } else {
+ size = maxFileLength;
+ modificationTime = -1;
+ }
+ fileTailBuilder.setFileLength(size);
+
+ //read last bytes into buffer to get PostScript
+ int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
+ buffer = ByteBuffer.allocate(readSize);
+ assert buffer.position() == 0;
+ file.readFully((size - readSize),
+ buffer.array(), buffer.arrayOffset(), readSize);
+ buffer.position(0);
+
+ //read the PostScript
+ //get length of PostScript
+ int psLen = buffer.get(readSize - 1) & 0xff;
+ ensureOrcFooter(file, path, psLen, buffer);
+ int psOffset = readSize - 1 - psLen;
+ ps = extractPostScript(buffer, path, psLen, psOffset);
+ bufferSize = (int) ps.getCompressionBlockSize();
+ codec = PhysicalFsWriter.createCodec(CompressionKind.valueOf(ps.getCompression().name()));
+ fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps);
+
+ int footerSize = (int) ps.getFooterLength();
+ int metadataSize = (int) ps.getMetadataLength();
+
+ //check if extra bytes need to be read
+ int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
+ int tailSize = 1 + psLen + footerSize + metadataSize;
+ if (extra > 0) {
+ //more bytes need to be read, seek back to the right place and read extra bytes
+ ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
+ file.readFully((size - readSize - extra), extraBuf.array(),
+ extraBuf.arrayOffset() + extraBuf.position(), extra);
+ extraBuf.position(extra);
+ //append with already read bytes
+ extraBuf.put(buffer);
+ buffer = extraBuf;
+ buffer.position(0);
+ buffer.limit(tailSize);
+ readSize += extra;
+ psOffset = readSize - 1 - psLen;
+ } else {
+ //footer is already in the bytes in buffer, just adjust position, length
+ buffer.position(psOffset - footerSize - metadataSize);
+ buffer.limit(buffer.position() + tailSize);
+ }
+
+ buffer.mark();
+ int footerOffset = psOffset - footerSize;
+ buffer.position(footerOffset);
+ ByteBuffer footerBuffer = buffer.slice();
+ buffer.reset();
+ OrcProto.Footer footer = extractFooter(footerBuffer, 0, footerSize,
+ codec, bufferSize);
+ fileTailBuilder.setFooter(footer);
+ } finally {
+ try {
+ file.close();
+ } catch (IOException ex) {
+ LOG.error("Failed to close the file after another error", ex);
+ }
+ }
+
+ ByteBuffer serializedTail = ByteBuffer.allocate(buffer.remaining());
+ serializedTail.put(buffer.slice());
+ serializedTail.rewind();
+ return new OrcTail(fileTailBuilder.build(), serializedTail, modificationTime);
+ }
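
A round-trip sketch of the static overload above: the serialized tail a reader exposes has exactly the [metadata][footer][postscript][length byte] layout that extractFileTail(ByteBuffer) expects. This assumes Reader declares getSerializedFileFooter() (as the override below indicates) and that OrcTail lives in org.apache.orc.impl; the path is hypothetical.

import java.nio.ByteBuffer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.impl.OrcTail;
import org.apache.orc.impl.ReaderImpl;

public class TailRoundTripSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"),
        OrcFile.readerOptions(conf));
    // duplicate() so the parse below cannot disturb the reader's own buffer
    ByteBuffer serialized = reader.getSerializedFileFooter().duplicate();
    OrcTail tail = ReaderImpl.extractFileTail(serialized);
    System.out.println("stripes = " + tail.getStripes().size());
  }
}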
+
+ @Override
+ public ByteBuffer getSerializedFileFooter() {
+ return tail.getSerializedTail();
+ }
+
+ @Override
+ public RecordReader rows() throws IOException {
+ return rows(new Options());
+ }
+
+ @Override
+ public RecordReader rows(Options options) throws IOException {
+ LOG.info("Reading ORC rows from " + path + " with " + options);
+ return new RecordReaderImpl(this, options);
+ }
+
+
+ @Override
+ public long getRawDataSize() {
+ // compute the deserialized size lazily and cache it; since it is
+ // derived from the footer statistics, it never changes and does not
+ // need to be recomputed
+ if (deserializedSize == -1) {
+ List<Integer> indices = Lists.newArrayList();
+ for (int i = 0; i < fileStats.size(); ++i) {
+ indices.add(i);
+ }
+ deserializedSize = getRawDataSizeFromColIndices(indices);
+ }
+ return deserializedSize;
+ }
+
+ @Override
+ public long getRawDataSizeFromColIndices(List<Integer> colIndices) {
+ return getRawDataSizeFromColIndices(colIndices, types, fileStats);
+ }
+
+ public static long getRawDataSizeFromColIndices(
+ List<Integer> colIndices, List<OrcProto.Type> types,
+ List<OrcProto.ColumnStatistics> stats) {
+ long result = 0;
+ for (int colIdx : colIndices) {
+ result += getRawDataSizeOfColumn(colIdx, types, stats);
+ }
+ return result;
+ }
+
+ private static long getRawDataSizeOfColumn(int colIdx, List<OrcProto.Type> types,
+ List<OrcProto.ColumnStatistics> stats) {
+ OrcProto.ColumnStatistics colStat = stats.get(colIdx);
+ long numVals = colStat.getNumberOfValues();
+ OrcProto.Type type = types.get(colIdx);
+
+ switch (type.getKind()) {
+ case BINARY:
+ // the old ORC format doesn't support binary statistics, but checking
+ // for their presence is not required since protocol buffers supplies a default
+ return colStat.getBinaryStatistics().getSum();
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ // the old ORC format doesn't support sum for string statistics, but
+ // checking for its presence is not required since protocol buffers
+ // supplies a default
+
+ // ORC strings are deserialized to Java strings, so use the Java data
+ // model's string size
+ numVals = numVals == 0 ? 1 : numVals;
+ int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals);
+ return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen);
+ case TIMESTAMP:
+ return numVals * JavaDataModel.get().lengthOfTimestamp();
+ case DATE:
+ return numVals * JavaDataModel.get().lengthOfDate();
+ case DECIMAL:
+ return numVals * JavaDataModel.get().lengthOfDecimal();
+ case DOUBLE:
+ case LONG:
+ return numVals * JavaDataModel.get().primitive2();
+ case FLOAT:
+ case INT:
+ case SHORT:
+ case BOOLEAN:
+ case BYTE:
+ return numVals * JavaDataModel.get().primitive1();
+ default:
+ LOG.debug("Unknown primitive category: " + type.getKind());
+ break;
+ }
+
+ return 0;
+ }
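
A worked instance of the STRING branch above, where the statistics sum is the total character count across all values (numbers illustrative):

// stringStatistics.sum = 1000 characters over numVals = 100 values
// avgStrLen = 1000 / 100 = 10
// raw size  = 100 * JavaDataModel.get().lengthForStringOfLength(10)
// i.e. per value, the in-memory cost of a ten-character java.lang.String,
// not its encoded on-disk size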
+
+ @Override
+ public long getRawDataSizeOfColumns(List<String> colNames) {
+ List<Integer> colIndices = getColumnIndicesFromNames(colNames);
+ return getRawDataSizeFromColIndices(colIndices);
+ }
+
+ private List<Integer> getColumnIndicesFromNames(List<String> colNames) {
+ // top level struct
+ OrcProto.Type type = types.get(0);
+ List<Integer> colIndices = Lists.newArrayList();
+ List<String> fieldNames = type.getFieldNamesList();
+ int fieldIdx;
+ for (String colName : colNames) {
+ if (fieldNames.contains(colName)) {
+ fieldIdx = fieldNames.indexOf(colName);
+ } else {
+ String s = "Cannot find field for: " + colName + " in ";
+ for (String fn : fieldNames) {
+ s += fn + ", ";
+ }
+ LOG.warn(s);
+ continue;
+ }
+
+ // a single field may span multiple columns. find start and end column
+ // index for the requested field
+ int idxStart = type.getSubtypes(fieldIdx);
+
+ int idxEnd;
+
+ // if the specified field is the last one, the end index will be the
+ // last column index
+ if (fieldIdx + 1 > fieldNames.size() - 1) {
+ idxEnd = getLastIdx() + 1;
+ } else {
+ idxEnd = type.getSubtypes(fieldIdx + 1);
+ }
+
+ // if the start and end index are the same, the field is a primitive
+ // field; otherwise it is a complex field (map, list, struct, or union)
+ if (idxStart == idxEnd) {
+ // simple field
+ colIndices.add(idxStart);
+ } else {
+ // a complex field spans multiple columns
+ for (int i = idxStart; i < idxEnd; i++) {
+ colIndices.add(i);
+ }
+ }
+ }
+ return colIndices;
+ }
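
To make the subtype arithmetic concrete, a worked example using ORC's pre-order column ids (see getLastIdx below):

// schema: struct<a:int, b:map<string,int>, c:string>
//   column 0: the struct itself (subtypes = [1, 2, 5])
//   column 1: a        column 2: b (map)
//   column 3: b.key    column 4: b.value
//   column 5: c
// "a" -> idxStart = 1, idxEnd = 2                 -> columns {1}
// "b" -> idxStart = 2, idxEnd = 5                 -> columns {2, 3, 4}
// "c" -> idxStart = 5, idxEnd = getLastIdx() + 1  -> columns {5}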
+
+ private int getLastIdx() {
+ Set<Integer> indices = new HashSet<>();
+ for (OrcProto.Type type : types) {
+ indices.addAll(type.getSubtypesList());
+ }
+ return Collections.max(indices);
+ }
+
+ @Override
+ public List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics() {
+ return stripeStats;
+ }
+
+ @Override
+ public List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics() {
+ return fileStats;
+ }
+
+ @Override
+ public List<StripeStatistics> getStripeStatistics() throws IOException {
+ if (stripeStats == null && metadata == null) {
+ metadata = extractMetadata(tail.getSerializedTail(), 0, metadataSize, codec, bufferSize);
+ stripeStats = metadata.getStripeStatsList();
+ }
+ List<StripeStatistics> result = new ArrayList<>();
+ for (OrcProto.StripeStatistics ss : stripeStats) {
+ result.add(new StripeStatistics(ss.getColStatsList()));
+ }
+ return result;
+ }
+
+ public List<OrcProto.UserMetadataItem> getOrcProtoUserMetadata() {
+ return userMetadata;
+ }
+
+ @Override
+ public List<Integer> getVersionList() {
+ return versionList;
+ }
+
+ @Override
+ public int getMetadataSize() {
+ return metadataSize;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buffer = new StringBuilder();
+ buffer.append("ORC Reader(");
+ buffer.append(path);
+ if (maxLength != -1) {
+ buffer.append(", ");
+ buffer.append(maxLength);
+ }
+ buffer.append(")");
+ return buffer.toString();
+ }
+}
[13/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java b/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java
deleted file mode 100644
index de02c8b..0000000
--- a/orc/src/java/org/apache/orc/impl/ZeroCopyShims.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.EnumSet;
-
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.ReadOption;
-import org.apache.hadoop.io.ByteBufferPool;
-
-class ZeroCopyShims {
- private static final class ByteBufferPoolAdapter implements ByteBufferPool {
- private HadoopShims.ByteBufferPoolShim pool;
-
- public ByteBufferPoolAdapter(HadoopShims.ByteBufferPoolShim pool) {
- this.pool = pool;
- }
-
- @Override
- public final ByteBuffer getBuffer(boolean direct, int length) {
- return this.pool.getBuffer(direct, length);
- }
-
- @Override
- public final void putBuffer(ByteBuffer buffer) {
- this.pool.putBuffer(buffer);
- }
- }
-
- private static final class ZeroCopyAdapter implements HadoopShims.ZeroCopyReaderShim {
- private final FSDataInputStream in;
- private final ByteBufferPoolAdapter pool;
- private final static EnumSet<ReadOption> CHECK_SUM = EnumSet
- .noneOf(ReadOption.class);
- private final static EnumSet<ReadOption> NO_CHECK_SUM = EnumSet
- .of(ReadOption.SKIP_CHECKSUMS);
-
- public ZeroCopyAdapter(FSDataInputStream in,
- HadoopShims.ByteBufferPoolShim poolshim) {
- this.in = in;
- if (poolshim != null) {
- pool = new ByteBufferPoolAdapter(poolshim);
- } else {
- pool = null;
- }
- }
-
- public final ByteBuffer readBuffer(int maxLength, boolean verifyChecksums)
- throws IOException {
- EnumSet<ReadOption> options = NO_CHECK_SUM;
- if (verifyChecksums) {
- options = CHECK_SUM;
- }
- return this.in.read(this.pool, maxLength, options);
- }
-
- public final void releaseBuffer(ByteBuffer buffer) {
- this.in.releaseBuffer(buffer);
- }
-
- @Override
- public final void close() throws IOException {
- this.in.close();
- }
- }
-
- public static HadoopShims.ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
- HadoopShims.ByteBufferPoolShim pool) throws IOException {
- return new ZeroCopyAdapter(in, pool);
- }
-
-}
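
A usage sketch for the adapter above; ZeroCopyShims is package-private, so this assumes caller code living in org.apache.orc.impl, with an already-open stream and a caller-provided buffer pool.

package org.apache.orc.impl;

import java.nio.ByteBuffer;
import org.apache.hadoop.fs.FSDataInputStream;

class ZeroCopySketch {
  static void readOnce(FSDataInputStream in,
                       HadoopShims.ByteBufferPoolShim pool) throws Exception {
    HadoopShims.ZeroCopyReaderShim zcr =
        ZeroCopyShims.getZeroCopyReader(in, pool);
    ByteBuffer buf = zcr.readBuffer(1 << 20, false); // up to 1 MB, skip checksums
    try {
      System.out.println("read " + buf.remaining() + " bytes without a copy");
    } finally {
      zcr.releaseBuffer(buf); // always hand the buffer back to the pool
    }
  }
}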
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/ZlibCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ZlibCodec.java b/orc/src/java/org/apache/orc/impl/ZlibCodec.java
deleted file mode 100644
index 5f648a8..0000000
--- a/orc/src/java/org/apache/orc/impl/ZlibCodec.java
+++ /dev/null
@@ -1,169 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.EnumSet;
-import java.util.zip.DataFormatException;
-import java.util.zip.Deflater;
-import java.util.zip.Inflater;
-
-import javax.annotation.Nullable;
-
-import org.apache.hadoop.io.compress.DirectDecompressor;
-import org.apache.orc.CompressionCodec;
-
-public class ZlibCodec implements CompressionCodec, DirectDecompressionCodec {
- private static final HadoopShims SHIMS = HadoopShims.Factory.get();
- private Boolean direct = null;
-
- private final int level;
- private final int strategy;
-
- public ZlibCodec() {
- level = Deflater.DEFAULT_COMPRESSION;
- strategy = Deflater.DEFAULT_STRATEGY;
- }
-
- private ZlibCodec(int level, int strategy) {
- this.level = level;
- this.strategy = strategy;
- }
-
- @Override
- public boolean compress(ByteBuffer in, ByteBuffer out,
- ByteBuffer overflow) throws IOException {
- Deflater deflater = new Deflater(level, true);
- deflater.setStrategy(strategy);
- int length = in.remaining();
- deflater.setInput(in.array(), in.arrayOffset() + in.position(), length);
- deflater.finish();
- int outSize = 0;
- int offset = out.arrayOffset() + out.position();
- while (!deflater.finished() && (length > outSize)) {
- int size = deflater.deflate(out.array(), offset, out.remaining());
- out.position(size + out.position());
- outSize += size;
- offset += size;
- // if we run out of space in the out buffer, use the overflow
- if (out.remaining() == 0) {
- if (overflow == null) {
- deflater.end();
- return false;
- }
- out = overflow;
- offset = out.arrayOffset() + out.position();
- }
- }
- deflater.end();
- return length > outSize;
- }
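
A sketch of the contract compress() implements above: it deflates in's remaining bytes into out, spilling into overflow when out fills, and returns true only if compression actually saved space (on false, the caller stores the block uncompressed).

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import org.apache.orc.impl.ZlibCodec;

public class ZlibCompressSketch {
  public static void main(String[] args) throws Exception {
    // 256 repeated bytes: highly compressible
    byte[] src = new String(new char[256]).replace('\0', 'a')
        .getBytes(StandardCharsets.US_ASCII);
    ByteBuffer in = ByteBuffer.wrap(src);
    ByteBuffer out = ByteBuffer.allocate(src.length);
    boolean smaller = new ZlibCodec().compress(in, out, null); // no overflow
    System.out.println(smaller + ": " + out.position() + " of " + src.length);
  }
}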
-
- @Override
- public void decompress(ByteBuffer in, ByteBuffer out) throws IOException {
-
- if(in.isDirect() && out.isDirect()) {
- directDecompress(in, out);
- return;
- }
-
- Inflater inflater = new Inflater(true);
- inflater.setInput(in.array(), in.arrayOffset() + in.position(),
- in.remaining());
- while (!(inflater.finished() || inflater.needsDictionary() ||
- inflater.needsInput())) {
- try {
- int count = inflater.inflate(out.array(),
- out.arrayOffset() + out.position(),
- out.remaining());
- out.position(count + out.position());
- } catch (DataFormatException dfe) {
- throw new IOException("Bad compression data", dfe);
- }
- }
- out.flip();
- inflater.end();
- in.position(in.limit());
- }
-
- @Override
- public boolean isAvailable() {
- if (direct == null) {
- // see nowrap option in new Inflater(boolean) which disables zlib headers
- try {
- if (SHIMS.getDirectDecompressor(
- HadoopShims.DirectCompressionType.ZLIB_NOHEADER) != null) {
- direct = Boolean.valueOf(true);
- } else {
- direct = Boolean.valueOf(false);
- }
- } catch (UnsatisfiedLinkError ule) {
- direct = Boolean.valueOf(false);
- }
- }
- return direct.booleanValue();
- }
-
- @Override
- public void directDecompress(ByteBuffer in, ByteBuffer out)
- throws IOException {
- HadoopShims.DirectDecompressor decompressShim =
- SHIMS.getDirectDecompressor(HadoopShims.DirectCompressionType.ZLIB_NOHEADER);
- decompressShim.decompress(in, out);
- out.flip(); // flip for read
- }
-
- @Override
- public CompressionCodec modify(@Nullable EnumSet<Modifier> modifiers) {
-
- if (modifiers == null) {
- return this;
- }
-
- int l = this.level;
- int s = this.strategy;
-
- for (Modifier m : modifiers) {
- switch (m) {
- case BINARY:
- /* filtered == less LZ77, more huffman */
- s = Deflater.FILTERED;
- break;
- case TEXT:
- s = Deflater.DEFAULT_STRATEGY;
- break;
- case FASTEST:
- // deflate_fast looking for 8 byte patterns
- l = Deflater.BEST_SPEED;
- break;
- case FAST:
- // deflate_fast looking for 16 byte patterns
- l = Deflater.BEST_SPEED + 1;
- break;
- case DEFAULT:
- // deflate_slow looking for 128 byte patterns
- l = Deflater.DEFAULT_COMPRESSION;
- break;
- default:
- break;
- }
- }
- return new ZlibCodec(l, s);
- }
-}
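
A sketch of deriving a tuned codec with modify(), assuming Modifier is the nested enum on CompressionCodec (consistent with the imports above); per the switch, FASTEST maps to BEST_SPEED and BINARY to the FILTERED strategy.

import java.util.EnumSet;
import org.apache.orc.CompressionCodec;
import org.apache.orc.impl.ZlibCodec;

public class ZlibModifySketch {
  public static void main(String[] args) {
    CompressionCodec fast = new ZlibCodec().modify(
        EnumSet.of(CompressionCodec.Modifier.FASTEST,
                   CompressionCodec.Modifier.BINARY));
    // 'fast' is a new ZlibCodec(Deflater.BEST_SPEED, Deflater.FILTERED)
    System.out.println("derived codec: " + fast);
  }
}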
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/tools/FileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/tools/FileDump.java b/orc/src/java/org/apache/orc/tools/FileDump.java
deleted file mode 100644
index 1a1d8ab..0000000
--- a/orc/src/java/org/apache/orc/tools/FileDump.java
+++ /dev/null
@@ -1,946 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.tools;
-
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintStream;
-import java.text.DecimalFormat;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.orc.BloomFilterIO;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
-import org.apache.orc.RecordReader;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.apache.orc.impl.AcidStats;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.impl.OrcAcidUtils;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.OrcProto;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.StripeStatistics;
-import org.apache.orc.impl.RecordReaderImpl;
-import org.codehaus.jettison.json.JSONException;
-import org.codehaus.jettison.json.JSONWriter;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Strings;
-import com.google.common.collect.Lists;
-
-/**
- * A tool for printing out the file structure of ORC files.
- */
-public final class FileDump {
- public static final String UNKNOWN = "UNKNOWN";
- public static final String SEPARATOR = Strings.repeat("_", 120) + "\n";
- public static final int DEFAULT_BLOCK_SIZE = 256 * 1024 * 1024;
- public static final String DEFAULT_BACKUP_PATH = System.getProperty("java.io.tmpdir");
- public static final PathFilter HIDDEN_AND_SIDE_FILE_FILTER = new PathFilter() {
- public boolean accept(Path p) {
- String name = p.getName();
- return !name.startsWith("_") && !name.startsWith(".") && !name.endsWith(
- OrcAcidUtils.DELTA_SIDE_FILE_SUFFIX);
- }
- };
-
- // not used
- private FileDump() {
- }
-
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
-
- List<Integer> rowIndexCols = new ArrayList<Integer>(0);
- Options opts = createOptions();
- CommandLine cli = new GnuParser().parse(opts, args);
-
- if (cli.hasOption('h')) {
- HelpFormatter formatter = new HelpFormatter();
- formatter.printHelp("orcfiledump", opts);
- return;
- }
-
- boolean dumpData = cli.hasOption('d');
- boolean recover = cli.hasOption("recover");
- boolean skipDump = cli.hasOption("skip-dump");
- String backupPath = DEFAULT_BACKUP_PATH;
- if (cli.hasOption("backup-path")) {
- backupPath = cli.getOptionValue("backup-path");
- }
-
- if (cli.hasOption("r")) {
- String val = cli.getOptionValue("r");
- if (val != null && val.trim().equals("*")) {
- rowIndexCols = null; // All the columns
- } else {
- String[] colStrs = cli.getOptionValue("r").split(",");
- rowIndexCols = new ArrayList<Integer>(colStrs.length);
- for (String colStr : colStrs) {
- rowIndexCols.add(Integer.parseInt(colStr));
- }
- }
- }
-
- boolean printTimeZone = cli.hasOption('t');
- boolean jsonFormat = cli.hasOption('j');
- String[] files = cli.getArgs();
- if (files.length == 0) {
- System.err.println("Error : ORC files are not specified");
- return;
- }
-
- // if the specified path is a directory, iterate through all files and print the file dump
- List<String> filesInPath = Lists.newArrayList();
- for (String filename : files) {
- Path path = new Path(filename);
- filesInPath.addAll(getAllFilesInPath(path, conf));
- }
-
- if (dumpData) {
- printData(filesInPath, conf);
- } else if (recover && skipDump) {
- recoverFiles(filesInPath, conf, backupPath);
- } else {
- if (jsonFormat) {
- boolean prettyPrint = cli.hasOption('p');
- JsonFileDump.printJsonMetaData(filesInPath, conf, rowIndexCols, prettyPrint, printTimeZone);
- } else {
- printMetaData(filesInPath, conf, rowIndexCols, printTimeZone, recover, backupPath);
- }
- }
- }
-
- /**
- * This method returns an ORC reader object if the specified file is readable. If the specified
- * file has a side file (_flush_length), the max footer offset is read from the side file and
- * the ORC reader is created from that offset. Since both the data file and the side file are
- * flushed with hflush(), the two can be out of sync. Null is returned in the following cases:
- *
- * 1) If the file specified by path or its side file is still open for writes
- * 2) If *_flush_length file does not return any footer offset
- * 3) If *_flush_length returns a valid footer offset but the data file is not readable at that
- * position (incomplete data file)
- * 4) If the *_flush_length file length is not a multiple of 8, the reader is created from the
- * previous valid footer. If there is no such footer (0 < file length < 8), null is returned
- *
- * Also, if this method detects any file corruption (mismatch between data file and side file)
- * then it will add the corresponding file to the specified input list for corrupted files.
- *
- * In all other cases, where the file is readable, this method returns a reader object.
- *
- * @param path - file to get reader for
- * @param conf - configuration object
- * @param corruptFiles - fills this list with all possible corrupted files
- * @return - reader for the specified file or null
- * @throws IOException
- */
- static Reader getReader(final Path path, final Configuration conf,
- final List<String> corruptFiles) throws IOException {
- FileSystem fs = path.getFileSystem(conf);
- long dataFileLen = fs.getFileStatus(path).getLen();
- System.err.println("Processing data file " + path + " [length: " + dataFileLen + "]");
- Path sideFile = OrcAcidUtils.getSideFile(path);
- final boolean sideFileExists = fs.exists(sideFile);
- boolean openDataFile = false;
- boolean openSideFile = false;
- if (fs instanceof DistributedFileSystem) {
- DistributedFileSystem dfs = (DistributedFileSystem) fs;
- openDataFile = !dfs.isFileClosed(path);
- openSideFile = sideFileExists && !dfs.isFileClosed(sideFile);
- }
-
- if (openDataFile || openSideFile) {
- if (openDataFile && openSideFile) {
- System.err.println("Unable to perform file dump as " + path + " and " + sideFile +
- " are still open for writes.");
- } else if (openSideFile) {
- System.err.println("Unable to perform file dump as " + sideFile +
- " is still open for writes.");
- } else {
- System.err.println("Unable to perform file dump as " + path +
- " is still open for writes.");
- }
-
- return null;
- }
-
- Reader reader = null;
- if (sideFileExists) {
- final long maxLen = OrcAcidUtils.getLastFlushLength(fs, path);
- final long sideFileLen = fs.getFileStatus(sideFile).getLen();
- System.err.println("Found flush length file " + sideFile
- + " [length: " + sideFileLen + ", maxFooterOffset: " + maxLen + "]");
- // no offsets read from side file
- if (maxLen == -1) {
-
- // the side file recorded no footer, so any data in the file could potentially be recovered
- if (dataFileLen > maxLen) {
- System.err.println("Data file has more data than max footer offset:" + maxLen +
- ". Adding data file to recovery list.");
- if (corruptFiles != null) {
- corruptFiles.add(path.toUri().toString());
- }
- }
- return null;
- }
-
- try {
- reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).maxLength(maxLen));
-
- // if data file is larger than last flush length, then additional data could be recovered
- if (dataFileLen > maxLen) {
- System.err.println("Data file has more data than max footer offset:" + maxLen +
- ". Adding data file to recovery list.");
- if (corruptFiles != null) {
- corruptFiles.add(path.toUri().toString());
- }
- }
- } catch (Exception e) {
- if (corruptFiles != null) {
- corruptFiles.add(path.toUri().toString());
- }
- System.err.println("Unable to read data from max footer offset." +
- " Adding data file to recovery list.");
- return null;
- }
- } else {
- reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
- }
-
- return reader;
- }
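
A calling sketch for this helper; it is package-private, so the caller is assumed to live in org.apache.orc.tools, and the path is hypothetical.

package org.apache.orc.tools;

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.Reader;

class GetReaderSketch {
  public static void main(String[] args) throws Exception {
    List<String> corrupt = new ArrayList<>();
    Reader reader = FileDump.getReader(
        new Path("/tmp/streaming/bucket_00000"), new Configuration(), corrupt);
    if (reader == null) {
      // file still open for writes, unreadable, or queued in 'corrupt'
      System.err.println("not readable; recovery candidates: " + corrupt);
    }
  }
}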
-
- public static Collection<String> getAllFilesInPath(final Path path,
- final Configuration conf) throws IOException {
- List<String> filesInPath = Lists.newArrayList();
- FileSystem fs = path.getFileSystem(conf);
- FileStatus fileStatus = fs.getFileStatus(path);
- if (fileStatus.isDir()) {
- FileStatus[] fileStatuses = fs.listStatus(path, HIDDEN_AND_SIDE_FILE_FILTER);
- for (FileStatus fileInPath : fileStatuses) {
- if (fileInPath.isDir()) {
- filesInPath.addAll(getAllFilesInPath(fileInPath.getPath(), conf));
- } else {
- filesInPath.add(fileInPath.getPath().toString());
- }
- }
- } else {
- filesInPath.add(path.toString());
- }
-
- return filesInPath;
- }
-
- private static void printData(List<String> files,
- Configuration conf) throws IOException,
- JSONException {
- for (String file : files) {
- try {
- Path path = new Path(file);
- Reader reader = getReader(path, conf, Lists.<String>newArrayList());
- if (reader == null) {
- continue;
- }
- printJsonData(reader);
- System.out.println(SEPARATOR);
- } catch (Exception e) {
- System.err.println("Unable to dump data for file: " + file);
- continue;
- }
- }
- }
-
- private static void printMetaData(List<String> files, Configuration conf,
- List<Integer> rowIndexCols, boolean printTimeZone, final boolean recover,
- final String backupPath)
- throws IOException {
- List<String> corruptFiles = Lists.newArrayList();
- for (String filename : files) {
- printMetaDataImpl(filename, conf, rowIndexCols, printTimeZone, corruptFiles);
- System.out.println(SEPARATOR);
- }
-
- if (!corruptFiles.isEmpty()) {
- if (recover) {
- recoverFiles(corruptFiles, conf, backupPath);
- } else {
- System.err.println(corruptFiles.size() + " file(s) are corrupted." +
- " Run the following command to recover corrupted files.\n");
- String fileNames = Joiner.on(" ").skipNulls().join(corruptFiles);
- System.err.println("hive --orcfiledump --recover --skip-dump " + fileNames);
- System.out.println(SEPARATOR);
- }
- }
- }
-
- private static void printMetaDataImpl(final String filename,
- final Configuration conf, List<Integer> rowIndexCols, final boolean printTimeZone,
- final List<String> corruptFiles) throws IOException {
- Path file = new Path(filename);
- Reader reader = getReader(file, conf, corruptFiles);
- // if we can create a reader, the footer is not corrupt and the file is readable
- if (reader == null) {
- return;
- }
-
- System.out.println("Structure for " + filename);
- System.out.println("File Version: " + reader.getFileVersion().getName() +
- " with " + reader.getWriterVersion());
- RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
- System.out.println("Rows: " + reader.getNumberOfRows());
- System.out.println("Compression: " + reader.getCompressionKind());
- if (reader.getCompressionKind() != CompressionKind.NONE) {
- System.out.println("Compression size: " + reader.getCompressionSize());
- }
- System.out.println("Type: " + reader.getSchema().toString());
- System.out.println("\nStripe Statistics:");
- List<StripeStatistics> stripeStats = reader.getStripeStatistics();
- for (int n = 0; n < stripeStats.size(); n++) {
- System.out.println(" Stripe " + (n + 1) + ":");
- StripeStatistics ss = stripeStats.get(n);
- for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
- System.out.println(" Column " + i + ": " +
- ss.getColumnStatistics()[i].toString());
- }
- }
- ColumnStatistics[] stats = reader.getStatistics();
- int colCount = stats.length;
- if (rowIndexCols == null) {
- rowIndexCols = new ArrayList<>(colCount);
- for (int i = 0; i < colCount; ++i) {
- rowIndexCols.add(i);
- }
- }
- System.out.println("\nFile Statistics:");
- for (int i = 0; i < stats.length; ++i) {
- System.out.println(" Column " + i + ": " + stats[i].toString());
- }
- System.out.println("\nStripes:");
- int stripeIx = -1;
- for (StripeInformation stripe : reader.getStripes()) {
- ++stripeIx;
- long stripeStart = stripe.getOffset();
- OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
- if (printTimeZone) {
- String tz = footer.getWriterTimezone();
- if (tz == null || tz.isEmpty()) {
- tz = UNKNOWN;
- }
- System.out.println(" Stripe: " + stripe.toString() + " timezone: " + tz);
- } else {
- System.out.println(" Stripe: " + stripe.toString());
- }
- long sectionStart = stripeStart;
- for (OrcProto.Stream section : footer.getStreamsList()) {
- String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
- System.out.println(" Stream: column " + section.getColumn() +
- " section " + kind + " start: " + sectionStart +
- " length " + section.getLength());
- sectionStart += section.getLength();
- }
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- StringBuilder buf = new StringBuilder();
- buf.append(" Encoding column ");
- buf.append(i);
- buf.append(": ");
- buf.append(encoding.getKind());
- if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- buf.append("[");
- buf.append(encoding.getDictionarySize());
- buf.append("]");
- }
- System.out.println(buf);
- }
- if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
- // restrict the index read to the specified columns; bloom filters are
- // read only for the included columns
- boolean[] sargColumns = new boolean[colCount];
- for (int colIdx : rowIndexCols) {
- sargColumns[colIdx] = true;
- }
- OrcIndex indices = rows
- .readRowIndex(stripeIx, null, null, null, sargColumns);
- for (int col : rowIndexCols) {
- StringBuilder buf = new StringBuilder();
- String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
- buf.append(rowIdxString);
- String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
- buf.append(bloomFilString);
- System.out.println(buf);
- }
- }
- }
-
- FileSystem fs = file.getFileSystem(conf);
- long fileLen = fs.getFileStatus(file).getLen();
- long paddedBytes = getTotalPaddingSize(reader);
- // an empty ORC file is ~45 bytes, so the file length is assumed to always be > 0
- double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
- DecimalFormat format = new DecimalFormat("##.##");
- System.out.println("\nFile length: " + fileLen + " bytes");
- System.out.println("Padding length: " + paddedBytes + " bytes");
- System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
- AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
- if (acidStats != null) {
- System.out.println("ACID stats:" + acidStats);
- }
- rows.close();
- }
-
- private static void recoverFiles(final List<String> corruptFiles, final Configuration conf,
- final String backup)
- throws IOException {
- for (String corruptFile : corruptFiles) {
- System.err.println("Recovering file " + corruptFile);
- Path corruptPath = new Path(corruptFile);
- FileSystem fs = corruptPath.getFileSystem(conf);
- FSDataInputStream fdis = fs.open(corruptPath);
- try {
- long corruptFileLen = fs.getFileStatus(corruptPath).getLen();
- long remaining = corruptFileLen;
- List<Long> footerOffsets = Lists.newArrayList();
-
- // start reading the data file from top to bottom and record the valid footers
- while (remaining > 0) {
- int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
- byte[] data = new byte[toRead];
- long startPos = corruptFileLen - remaining;
- fdis.readFully(startPos, data, 0, toRead);
-
- // find all MAGIC string and see if the file is readable from there
- int index = 0;
- long nextFooterOffset;
-
- while (index != -1) {
- index = indexOf(data, OrcFile.MAGIC.getBytes(), index + 1);
- if (index != -1) {
- nextFooterOffset = startPos + index + OrcFile.MAGIC.length() + 1;
- if (isReadable(corruptPath, conf, nextFooterOffset)) {
- footerOffsets.add(nextFooterOffset);
- }
- }
- }
-
- System.err.println("Scanning for valid footers - startPos: " + startPos +
- " toRead: " + toRead + " remaining: " + remaining);
- remaining = remaining - toRead;
- }
-
- System.err.println("Readable footerOffsets: " + footerOffsets);
- recoverFile(corruptPath, fs, conf, footerOffsets, backup);
- } catch (Exception e) {
- Path recoveryFile = getRecoveryFile(corruptPath);
- if (fs.exists(recoveryFile)) {
- fs.delete(recoveryFile, false);
- }
- System.err.println("Unable to recover file " + corruptFile);
- e.printStackTrace();
- System.err.println(SEPARATOR);
- continue;
- } finally {
- fdis.close();
- }
- System.err.println(corruptFile + " recovered successfully!");
- System.err.println(SEPARATOR);
- }
- }
-
- private static void recoverFile(final Path corruptPath, final FileSystem fs,
- final Configuration conf, final List<Long> footerOffsets, final String backup)
- throws IOException {
-
- // first recover to a .recovered file, then rename it to the actual file once successful
- Path recoveredPath = getRecoveryFile(corruptPath);
-
- // make sure that file does not exist
- if (fs.exists(recoveredPath)) {
- fs.delete(recoveredPath, false);
- }
-
- // if there are no valid footers, still produce a readable result by creating an empty orc file
- if (footerOffsets == null || footerOffsets.isEmpty()) {
- System.err.println("No readable footers found. Creating empty orc file.");
- TypeDescription schema = TypeDescription.createStruct();
- Writer writer = OrcFile.createWriter(recoveredPath,
- OrcFile.writerOptions(conf).setSchema(schema));
- writer.close();
- } else {
- FSDataInputStream fdis = fs.open(corruptPath);
- FileStatus fileStatus = fs.getFileStatus(corruptPath);
- // read corrupt file and copy it to recovered file until last valid footer
- FSDataOutputStream fdos = fs.create(recoveredPath, true,
- conf.getInt("io.file.buffer.size", 4096),
- fileStatus.getReplication(),
- fileStatus.getBlockSize());
- try {
- long fileLen = footerOffsets.get(footerOffsets.size() - 1);
- long remaining = fileLen;
-
- while (remaining > 0) {
- int toRead = (int) Math.min(DEFAULT_BLOCK_SIZE, remaining);
- byte[] data = new byte[toRead];
- long startPos = fileLen - remaining;
- fdis.readFully(startPos, data, 0, toRead);
- fdos.write(data);
- System.err.println("Copying data to recovery file - startPos: " + startPos +
- " toRead: " + toRead + " remaining: " + remaining);
- remaining = remaining - toRead;
- }
- } catch (Exception e) {
- fs.delete(recoveredPath, false);
- throw new IOException(e);
- } finally {
- fdis.close();
- fdos.close();
- }
- }
-
- // validate the recovered file once again and start moving corrupt files to backup folder
- if (isReadable(recoveredPath, conf, Long.MAX_VALUE)) {
- Path backupDataPath;
- String scheme = corruptPath.toUri().getScheme();
- String authority = corruptPath.toUri().getAuthority();
- String filePath = corruptPath.toUri().getPath();
-
- // use the same filesystem as corrupt file if backup-path is not explicitly specified
- if (backup.equals(DEFAULT_BACKUP_PATH)) {
- backupDataPath = new Path(scheme, authority, DEFAULT_BACKUP_PATH + filePath);
- } else {
- backupDataPath = Path.mergePaths(new Path(backup), corruptPath);
- }
-
- // Move data file to backup path
- moveFiles(fs, corruptPath, backupDataPath);
-
- // Move side file to backup path
- Path sideFilePath = OrcAcidUtils.getSideFile(corruptPath);
- Path backupSideFilePath = new Path(backupDataPath.getParent(), sideFilePath.getName());
- moveFiles(fs, sideFilePath, backupSideFilePath);
-
- // finally move recovered file to actual file
- moveFiles(fs, recoveredPath, corruptPath);
-
- // we are done recovering, backing up and validating
- System.err.println("Validation of recovered file successful!");
- }
- }
-
- private static void moveFiles(final FileSystem fs, final Path src, final Path dest)
- throws IOException {
- try {
- // create the dest directory if it does not exist
- if (!fs.exists(dest.getParent())) {
- fs.mkdirs(dest.getParent());
- }
-
- // if the destination file exists for some reason delete it
- fs.delete(dest, false);
-
- if (fs.rename(src, dest)) {
- System.err.println("Moved " + src + " to " + dest);
- } else {
- throw new IOException("Unable to move " + src + " to " + dest);
- }
-
- } catch (Exception e) {
- throw new IOException("Unable to move " + src + " to " + dest, e);
- }
- }
-
- private static Path getRecoveryFile(final Path corruptPath) {
- return new Path(corruptPath.getParent(), corruptPath.getName() + ".recovered");
- }
-
- private static boolean isReadable(final Path corruptPath, final Configuration conf,
- final long maxLen) {
- try {
- OrcFile.createReader(corruptPath, OrcFile.readerOptions(conf).maxLength(maxLen));
- return true;
- } catch (Exception e) {
- // ignore this exception as maxLen is unreadable
- return false;
- }
- }
-
- // search for byte pattern in another byte array
- private static int indexOf(final byte[] data, final byte[] pattern, final int index) {
- if (data == null || data.length == 0 || pattern == null || pattern.length == 0 ||
- index > data.length || index < 0) {
- return -1;
- }
-
- int j = 0;
- for (int i = index; i < data.length; i++) {
- if (pattern[j] == data[i]) {
- j++;
- } else {
- // back up so the scan resumes one byte past the start of the failed
- // partial match; plain "j = 0" would skip overlapping occurrences
- // such as the magic in "OORC"
- i -= j;
- j = 0;
- }
-
- if (j == pattern.length) {
- return i - pattern.length + 1;
- }
- }
-
- return -1;
- }
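
A sketch of how the recovery scan above drives this helper, searching a block for each occurrence of the ORC magic string (byte values illustrative):

// from within FileDump, where indexOf is visible:
byte[] data  = "..ORC....ORC..".getBytes();
byte[] magic = OrcFile.MAGIC.getBytes();      // "ORC"
int first  = indexOf(data, magic, 0);         // -> 2
int second = indexOf(data, magic, first + 1); // -> 9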
-
- private static String getFormattedBloomFilters(int col,
- OrcProto.BloomFilterIndex[] bloomFilterIndex) {
- StringBuilder buf = new StringBuilder();
- BloomFilterIO stripeLevelBF = null;
- if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
- int idx = 0;
- buf.append("\n Bloom filters for column ").append(col).append(":");
- for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
- BloomFilterIO toMerge = new BloomFilterIO(bf);
- buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge));
- if (stripeLevelBF == null) {
- stripeLevelBF = toMerge;
- } else {
- stripeLevelBF.merge(toMerge);
- }
- }
- String bloomFilterStats = getBloomFilterStats(stripeLevelBF);
- buf.append("\n Stripe level merge:").append(bloomFilterStats);
- }
- return buf.toString();
- }
-
- private static String getBloomFilterStats(BloomFilterIO bf) {
- StringBuilder sb = new StringBuilder();
- int bitCount = bf.getBitSize();
- int popCount = 0;
- for (long l : bf.getBitSet()) {
- popCount += Long.bitCount(l);
- }
- int k = bf.getNumHashFunctions();
- float loadFactor = (float) popCount / (float) bitCount;
- float expectedFpp = (float) Math.pow(loadFactor, k);
- DecimalFormat df = new DecimalFormat("###.####");
- sb.append(" numHashFunctions: ").append(k);
- sb.append(" bitCount: ").append(bitCount);
- sb.append(" popCount: ").append(popCount);
- sb.append(" loadFactor: ").append(df.format(loadFactor));
- sb.append(" expectedFpp: ").append(expectedFpp);
- return sb.toString();
- }
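
A worked instance of the statistics computed above (numbers illustrative):

// bitCount = 1024, popCount = 256, numHashFunctions k = 3
// loadFactor  = 256 / 1024 = 0.25
// expectedFpp = 0.25^3     = 0.015625
// i.e. an absent key is falsely reported only when all k probed bits are set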
-
- private static String getFormattedRowIndices(int col,
- OrcProto.RowIndex[] rowGroupIndex) {
- StringBuilder buf = new StringBuilder();
- OrcProto.RowIndex index;
- buf.append(" Row group indices for column ").append(col).append(":");
- if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
- ((index = rowGroupIndex[col]) == null)) {
- buf.append(" not found\n");
- return buf.toString();
- }
-
- for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
- buf.append("\n Entry ").append(entryIx).append(": ");
- OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
- if (entry == null) {
- buf.append("unknown\n");
- continue;
- }
- OrcProto.ColumnStatistics colStats = entry.getStatistics();
- if (colStats == null) {
- buf.append("no stats at ");
- } else {
- ColumnStatistics cs = ColumnStatisticsImpl.deserialize(colStats);
- buf.append(cs.toString());
- }
- buf.append(" positions: ");
- for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
- if (posIx != 0) {
- buf.append(",");
- }
- buf.append(entry.getPositions(posIx));
- }
- }
- return buf.toString();
- }
-
- public static long getTotalPaddingSize(Reader reader) throws IOException {
- long paddedBytes = 0;
- List<StripeInformation> stripes = reader.getStripes();
- for (int i = 1; i < stripes.size(); i++) {
- long prevStripeOffset = stripes.get(i - 1).getOffset();
- long prevStripeLen = stripes.get(i - 1).getLength();
- paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
- }
- return paddedBytes;
- }
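
A worked instance of the padding computation (offsets illustrative):

// stripes at offsets 0, 70, 150, each 64 bytes long:
//   gap after stripe 0: 70  - (0  + 64) = 6
//   gap after stripe 1: 150 - (70 + 64) = 16
//   total padding = 22 bytes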
-
- @SuppressWarnings("static-access")
- static Options createOptions() {
- Options result = new Options();
-
- // add -d and --data to print the rows
- result.addOption(OptionBuilder
- .withLongOpt("data")
- .withDescription("Should the data be printed")
- .create('d'));
-
- // to avoid breaking unit tests (when run in different time zones) for file dump, printing
- // of timezone is made optional
- result.addOption(OptionBuilder
- .withLongOpt("timezone")
- .withDescription("Print writer's time zone")
- .create('t'));
-
- result.addOption(OptionBuilder
- .withLongOpt("help")
- .withDescription("print help message")
- .create('h'));
-
- result.addOption(OptionBuilder
- .withLongOpt("rowindex")
- .withArgName("comma separated list of column ids for which row index should be printed")
- .withDescription("Dump stats for column number(s)")
- .hasArg()
- .create('r'));
-
- result.addOption(OptionBuilder
- .withLongOpt("json")
- .withDescription("Print metadata in JSON format")
- .create('j'));
-
- result.addOption(OptionBuilder
- .withLongOpt("pretty")
- .withDescription("Pretty print json metadata output")
- .create('p'));
-
- result.addOption(OptionBuilder
- .withLongOpt("recover")
- .withDescription("recover corrupted orc files generated by streaming")
- .create());
-
- result.addOption(OptionBuilder
- .withLongOpt("skip-dump")
- .withDescription("used along with --recover to directly recover files without dumping")
- .create());
-
- result.addOption(OptionBuilder
- .withLongOpt("backup-path")
- .withDescription("specify a backup path to store the corrupted files (default: /tmp)")
- .hasArg()
- .create());
- return result;
- }
-
- private static void printMap(JSONWriter writer,
- MapColumnVector vector,
- TypeDescription schema,
- int row) throws JSONException {
- writer.array();
- TypeDescription keyType = schema.getChildren().get(0);
- TypeDescription valueType = schema.getChildren().get(1);
- int offset = (int) vector.offsets[row];
- for (int i = 0; i < vector.lengths[row]; ++i) {
- writer.object();
- writer.key("_key");
- printValue(writer, vector.keys, keyType, offset + i);
- writer.key("_value");
- printValue(writer, vector.values, valueType, offset + i);
- writer.endObject();
- }
- writer.endArray();
- }
-
- private static void printList(JSONWriter writer,
- ListColumnVector vector,
- TypeDescription schema,
- int row) throws JSONException {
- writer.array();
- int offset = (int) vector.offsets[row];
- TypeDescription childType = schema.getChildren().get(0);
- for (int i = 0; i < vector.lengths[row]; ++i) {
- printValue(writer, vector.child, childType, offset + i);
- }
- writer.endArray();
- }
-
- private static void printUnion(JSONWriter writer,
- UnionColumnVector vector,
- TypeDescription schema,
- int row) throws JSONException {
- int tag = vector.tags[row];
- printValue(writer, vector.fields[tag], schema.getChildren().get(tag), row);
- }
-
- static void printStruct(JSONWriter writer,
- StructColumnVector batch,
- TypeDescription schema,
- int row) throws JSONException {
- writer.object();
- List<String> fieldNames = schema.getFieldNames();
- List<TypeDescription> fieldTypes = schema.getChildren();
- for (int i = 0; i < fieldTypes.size(); ++i) {
- writer.key(fieldNames.get(i));
- printValue(writer, batch.fields[i], fieldTypes.get(i), row);
- }
- writer.endObject();
- }
-
- static void printBinary(JSONWriter writer, BytesColumnVector vector,
- int row) throws JSONException {
- writer.array();
- int offset = vector.start[row];
- for(int i=0; i < vector.length[row]; ++i) {
- writer.value(0xff & (int) vector.vector[row][offset + i]);
- }
- writer.endArray();
- }
- static void printValue(JSONWriter writer, ColumnVector vector,
- TypeDescription schema, int row) throws JSONException {
- if (vector.isRepeating) {
- row = 0;
- }
- if (vector.noNulls || !vector.isNull[row]) {
- switch (schema.getCategory()) {
- case BOOLEAN:
- writer.value(((LongColumnVector) vector).vector[row] != 0);
- break;
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- writer.value(((LongColumnVector) vector).vector[row]);
- break;
- case FLOAT:
- case DOUBLE:
- writer.value(((DoubleColumnVector) vector).vector[row]);
- break;
- case STRING:
- case CHAR:
- case VARCHAR:
- writer.value(((BytesColumnVector) vector).toString(row));
- break;
- case BINARY:
- printBinary(writer, (BytesColumnVector) vector, row);
- break;
- case DECIMAL:
- writer.value(((DecimalColumnVector) vector).vector[row].toString());
- break;
- case DATE:
- writer.value(new DateWritable(
- (int) ((LongColumnVector) vector).vector[row]).toString());
- break;
- case TIMESTAMP:
- writer.value(((TimestampColumnVector) vector)
- .asScratchTimestamp(row).toString());
- break;
- case LIST:
- printList(writer, (ListColumnVector) vector, schema, row);
- break;
- case MAP:
- printMap(writer, (MapColumnVector) vector, schema, row);
- break;
- case STRUCT:
- printStruct(writer, (StructColumnVector) vector, schema, row);
- break;
- case UNION:
- printUnion(writer, (UnionColumnVector) vector, schema, row);
- break;
- default:
- throw new IllegalArgumentException("Unknown type " +
- schema.toString());
- }
- } else {
- writer.value(null);
- }
- }
-
- static void printRow(JSONWriter writer,
- VectorizedRowBatch batch,
- TypeDescription schema,
- int row) throws JSONException {
- if (schema.getCategory() == TypeDescription.Category.STRUCT) {
- List<TypeDescription> fieldTypes = schema.getChildren();
- List<String> fieldNames = schema.getFieldNames();
- writer.object();
- for (int c = 0; c < batch.cols.length; ++c) {
- writer.key(fieldNames.get(c));
- printValue(writer, batch.cols[c], fieldTypes.get(c), row);
- }
- writer.endObject();
- } else {
- printValue(writer, batch.cols[0], schema, row);
- }
- }
-
- static void printJsonData(final Reader reader) throws IOException, JSONException {
- PrintStream printStream = System.out;
- OutputStreamWriter out = new OutputStreamWriter(printStream, "UTF-8");
- RecordReader rows = reader.rows();
- try {
- TypeDescription schema = reader.getSchema();
- VectorizedRowBatch batch = schema.createRowBatch();
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- JSONWriter writer = new JSONWriter(out);
- printRow(writer, batch, schema, r);
- out.write("\n");
- out.flush();
- if (printStream.checkError()) {
- throw new IOException("Error encountered when writing to stdout.");
- }
- }
- }
- } finally {
- rows.close();
- }
- }
-}
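
For reference, printJsonData above emits one JSON object per row, newline-separated; e.g. for a struct<x:int,y:string> schema the output would look like this (values illustrative):

{"x":1,"y":"a"}
{"x":2,"y":"b"}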
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/tools/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/tools/JsonFileDump.java b/orc/src/java/org/apache/orc/tools/JsonFileDump.java
deleted file mode 100644
index e2048ea..0000000
--- a/orc/src/java/org/apache/orc/tools/JsonFileDump.java
+++ /dev/null
@@ -1,412 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.tools;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.Reader;
-import org.apache.orc.impl.AcidStats;
-import org.apache.orc.impl.OrcAcidUtils;
-import org.apache.orc.impl.RecordReaderImpl;
-import org.codehaus.jettison.json.JSONArray;
-import org.apache.orc.BloomFilterIO;
-import org.apache.orc.BinaryColumnStatistics;
-import org.apache.orc.BooleanColumnStatistics;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.DateColumnStatistics;
-import org.apache.orc.DecimalColumnStatistics;
-import org.apache.orc.DoubleColumnStatistics;
-import org.apache.orc.IntegerColumnStatistics;
-import org.apache.orc.impl.OrcIndex;
-import org.apache.orc.OrcProto;
-import org.apache.orc.StringColumnStatistics;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.StripeStatistics;
-import org.apache.orc.TimestampColumnStatistics;
-import org.codehaus.jettison.json.JSONException;
-import org.codehaus.jettison.json.JSONObject;
-import org.codehaus.jettison.json.JSONStringer;
-import org.codehaus.jettison.json.JSONWriter;
-
-/**
- * File dump tool with json formatted output.
- */
-public class JsonFileDump {
-
- public static void printJsonMetaData(List<String> files,
- Configuration conf,
- List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone)
- throws JSONException, IOException {
- if (files.isEmpty()) {
- return;
- }
- JSONStringer writer = new JSONStringer();
- boolean multiFile = files.size() > 1;
- if (multiFile) {
- writer.array();
- } else {
- writer.object();
- }
- for (String filename : files) {
- try {
- if (multiFile) {
- writer.object();
- }
- writer.key("fileName").value(filename);
- Path path = new Path(filename);
- Reader reader = FileDump.getReader(path, conf, null);
- if (reader == null) {
- writer.key("status").value("FAILED");
- continue;
- }
- writer.key("fileVersion").value(reader.getFileVersion().getName());
- writer.key("writerVersion").value(reader.getWriterVersion());
- RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
- writer.key("numberOfRows").value(reader.getNumberOfRows());
- writer.key("compression").value(reader.getCompressionKind());
- if (reader.getCompressionKind() != CompressionKind.NONE) {
- writer.key("compressionBufferSize").value(reader.getCompressionSize());
- }
- writer.key("schemaString").value(reader.getSchema().toString());
- writer.key("schema").array();
- writeSchema(writer, reader.getTypes());
- writer.endArray();
-
- writer.key("stripeStatistics").array();
- List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
- for (int n = 0; n < stripeStatistics.size(); n++) {
- writer.object();
- writer.key("stripeNumber").value(n + 1);
- StripeStatistics ss = stripeStatistics.get(n);
- writer.key("columnStatistics").array();
- for (int i = 0; i < ss.getColumnStatistics().length; i++) {
- writer.object();
- writer.key("columnId").value(i);
- writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
- writer.endObject();
- }
- writer.endArray();
- writer.endObject();
- }
- writer.endArray();
-
- ColumnStatistics[] stats = reader.getStatistics();
- int colCount = stats.length;
- if (rowIndexCols == null) {
- rowIndexCols = new ArrayList<>(colCount);
- for (int i = 0; i < colCount; ++i) {
- rowIndexCols.add(i);
- }
- }
- writer.key("fileStatistics").array();
- for (int i = 0; i < stats.length; ++i) {
- writer.object();
- writer.key("columnId").value(i);
- writeColumnStatistics(writer, stats[i]);
- writer.endObject();
- }
- writer.endArray();
-
- writer.key("stripes").array();
- int stripeIx = -1;
- for (StripeInformation stripe : reader.getStripes()) {
- ++stripeIx;
- long stripeStart = stripe.getOffset();
- OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
- writer.object(); // start of stripe information
- writer.key("stripeNumber").value(stripeIx + 1);
- writer.key("stripeInformation");
- writeStripeInformation(writer, stripe);
- if (printTimeZone) {
- writer.key("writerTimezone").value(
- footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN);
- }
- long sectionStart = stripeStart;
-
- writer.key("streams").array();
- for (OrcProto.Stream section : footer.getStreamsList()) {
- writer.object();
- String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN;
- writer.key("columnId").value(section.getColumn());
- writer.key("section").value(kind);
- writer.key("startOffset").value(sectionStart);
- writer.key("length").value(section.getLength());
- sectionStart += section.getLength();
- writer.endObject();
- }
- writer.endArray();
-
- writer.key("encodings").array();
- for (int i = 0; i < footer.getColumnsCount(); ++i) {
- writer.object();
- OrcProto.ColumnEncoding encoding = footer.getColumns(i);
- writer.key("columnId").value(i);
- writer.key("kind").value(encoding.getKind());
- if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
- encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
- writer.key("dictionarySize").value(encoding.getDictionarySize());
- }
- writer.endObject();
- }
- writer.endArray();
- if (!rowIndexCols.isEmpty()) {
- // include the columns that are specified, only if the columns are included, bloom filter
- // will be read
- boolean[] sargColumns = new boolean[colCount];
- for (int colIdx : rowIndexCols) {
- sargColumns[colIdx] = true;
- }
- OrcIndex indices = rows.readRowIndex(stripeIx, null, sargColumns);
- writer.key("indexes").array();
- for (int col : rowIndexCols) {
- writer.object();
- writer.key("columnId").value(col);
- writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
- writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
- writer.endObject();
- }
- writer.endArray();
- }
- writer.endObject(); // end of stripe information
- }
- writer.endArray();
-
- FileSystem fs = path.getFileSystem(conf);
- long fileLen = fs.getContentSummary(path).getLength();
- long paddedBytes = FileDump.getTotalPaddingSize(reader);
- // empty ORC file is ~45 bytes. Assumption here is file length always >0
- double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
- writer.key("fileLength").value(fileLen);
- writer.key("paddingLength").value(paddedBytes);
- writer.key("paddingRatio").value(percentPadding);
- AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
- if (acidStats != null) {
- writer.key("numInserts").value(acidStats.inserts);
- writer.key("numDeletes").value(acidStats.deletes);
- writer.key("numUpdates").value(acidStats.updates);
- }
- writer.key("status").value("OK");
- rows.close();
-
- writer.endObject();
- } catch (Exception e) {
- writer.key("status").value("FAILED");
- throw e;
- }
- }
- if (multiFile) {
- writer.endArray();
- }
-
- if (prettyPrint) {
- final String prettyJson;
- if (multiFile) {
- JSONArray jsonArray = new JSONArray(writer.toString());
- prettyJson = jsonArray.toString(2);
- } else {
- JSONObject jsonObject = new JSONObject(writer.toString());
- prettyJson = jsonObject.toString(2);
- }
- System.out.println(prettyJson);
- } else {
- System.out.println(writer.toString());
- }
- }
-
- private static void writeSchema(JSONStringer writer, List<OrcProto.Type> types)
- throws JSONException {
- int i = 0;
- for(OrcProto.Type type : types) {
- writer.object();
- writer.key("columnId").value(i++);
- writer.key("columnType").value(type.getKind());
- if (type.getFieldNamesCount() > 0) {
- writer.key("childColumnNames").array();
- for (String field : type.getFieldNamesList()) {
- writer.value(field);
- }
- writer.endArray();
- writer.key("childColumnIds").array();
- for (Integer colId : type.getSubtypesList()) {
- writer.value(colId);
- }
- writer.endArray();
- }
- if (type.hasPrecision()) {
- writer.key("precision").value(type.getPrecision());
- }
-
- if (type.hasScale()) {
- writer.key("scale").value(type.getScale());
- }
-
- if (type.hasMaximumLength()) {
- writer.key("maxLength").value(type.getMaximumLength());
- }
- writer.endObject();
- }
- }
-
- private static void writeStripeInformation(JSONWriter writer, StripeInformation stripe)
- throws JSONException {
- writer.object();
- writer.key("offset").value(stripe.getOffset());
- writer.key("indexLength").value(stripe.getIndexLength());
- writer.key("dataLength").value(stripe.getDataLength());
- writer.key("footerLength").value(stripe.getFooterLength());
- writer.key("rowCount").value(stripe.getNumberOfRows());
- writer.endObject();
- }
-
- private static void writeColumnStatistics(JSONWriter writer, ColumnStatistics cs)
- throws JSONException {
- if (cs != null) {
- writer.key("count").value(cs.getNumberOfValues());
- writer.key("hasNull").value(cs.hasNull());
- if (cs instanceof BinaryColumnStatistics) {
- writer.key("totalLength").value(((BinaryColumnStatistics) cs).getSum());
- writer.key("type").value(OrcProto.Type.Kind.BINARY);
- } else if (cs instanceof BooleanColumnStatistics) {
- writer.key("trueCount").value(((BooleanColumnStatistics) cs).getTrueCount());
- writer.key("falseCount").value(((BooleanColumnStatistics) cs).getFalseCount());
- writer.key("type").value(OrcProto.Type.Kind.BOOLEAN);
- } else if (cs instanceof IntegerColumnStatistics) {
- writer.key("min").value(((IntegerColumnStatistics) cs).getMinimum());
- writer.key("max").value(((IntegerColumnStatistics) cs).getMaximum());
- if (((IntegerColumnStatistics) cs).isSumDefined()) {
- writer.key("sum").value(((IntegerColumnStatistics) cs).getSum());
- }
- writer.key("type").value(OrcProto.Type.Kind.LONG);
- } else if (cs instanceof DoubleColumnStatistics) {
- writer.key("min").value(((DoubleColumnStatistics) cs).getMinimum());
- writer.key("max").value(((DoubleColumnStatistics) cs).getMaximum());
- writer.key("sum").value(((DoubleColumnStatistics) cs).getSum());
- writer.key("type").value(OrcProto.Type.Kind.DOUBLE);
- } else if (cs instanceof StringColumnStatistics) {
- writer.key("min").value(((StringColumnStatistics) cs).getMinimum());
- writer.key("max").value(((StringColumnStatistics) cs).getMaximum());
- writer.key("totalLength").value(((StringColumnStatistics) cs).getSum());
- writer.key("type").value(OrcProto.Type.Kind.STRING);
- } else if (cs instanceof DateColumnStatistics) {
- if (((DateColumnStatistics) cs).getMaximum() != null) {
- writer.key("min").value(((DateColumnStatistics) cs).getMinimum());
- writer.key("max").value(((DateColumnStatistics) cs).getMaximum());
- }
- writer.key("type").value(OrcProto.Type.Kind.DATE);
- } else if (cs instanceof TimestampColumnStatistics) {
- if (((TimestampColumnStatistics) cs).getMaximum() != null) {
- writer.key("min").value(((TimestampColumnStatistics) cs).getMinimum());
- writer.key("max").value(((TimestampColumnStatistics) cs).getMaximum());
- }
- writer.key("type").value(OrcProto.Type.Kind.TIMESTAMP);
- } else if (cs instanceof DecimalColumnStatistics) {
- if (((DecimalColumnStatistics) cs).getMaximum() != null) {
- writer.key("min").value(((DecimalColumnStatistics) cs).getMinimum());
- writer.key("max").value(((DecimalColumnStatistics) cs).getMaximum());
- writer.key("sum").value(((DecimalColumnStatistics) cs).getSum());
- }
- writer.key("type").value(OrcProto.Type.Kind.DECIMAL);
- }
- }
- }
-
- private static void writeBloomFilterIndexes(JSONWriter writer, int col,
- OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException {
-
- BloomFilterIO stripeLevelBF = null;
- if (bloomFilterIndex != null && bloomFilterIndex[col] != null) {
- int entryIx = 0;
- writer.key("bloomFilterIndexes").array();
- for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) {
- writer.object();
- writer.key("entryId").value(entryIx++);
- BloomFilterIO toMerge = new BloomFilterIO(bf);
- writeBloomFilterStats(writer, toMerge);
- if (stripeLevelBF == null) {
- stripeLevelBF = toMerge;
- } else {
- stripeLevelBF.merge(toMerge);
- }
- writer.endObject();
- }
- writer.endArray();
- }
- if (stripeLevelBF != null) {
- writer.key("stripeLevelBloomFilter");
- writer.object();
- writeBloomFilterStats(writer, stripeLevelBF);
- writer.endObject();
- }
- }
-
- private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf)
- throws JSONException {
- int bitCount = bf.getBitSize();
- int popCount = 0;
- for (long l : bf.getBitSet()) {
- popCount += Long.bitCount(l);
- }
- int k = bf.getNumHashFunctions();
- float loadFactor = (float) popCount / (float) bitCount;
- float expectedFpp = (float) Math.pow(loadFactor, k);
- writer.key("numHashFunctions").value(k);
- writer.key("bitCount").value(bitCount);
- writer.key("popCount").value(popCount);
- writer.key("loadFactor").value(loadFactor);
- writer.key("expectedFpp").value(expectedFpp);
- }
-
- private static void writeRowGroupIndexes(JSONWriter writer, int col,
- OrcProto.RowIndex[] rowGroupIndex)
- throws JSONException {
-
- OrcProto.RowIndex index;
- if (rowGroupIndex == null || (col >= rowGroupIndex.length) ||
- ((index = rowGroupIndex[col]) == null)) {
- return;
- }
-
- writer.key("rowGroupIndexes").array();
- for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) {
- writer.object();
- writer.key("entryId").value(entryIx);
- OrcProto.RowIndexEntry entry = index.getEntry(entryIx);
- if (entry == null) {
- continue;
- }
- OrcProto.ColumnStatistics colStats = entry.getStatistics();
- writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize(colStats));
- writer.key("positions").array();
- for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) {
- writer.value(entry.getPositions(posIx));
- }
- writer.endArray();
- writer.endObject();
- }
- writer.endArray();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/protobuf/orc_proto.proto
----------------------------------------------------------------------
diff --git a/orc/src/protobuf/orc_proto.proto b/orc/src/protobuf/orc_proto.proto
index ae5945b..73f68b1 100644
--- a/orc/src/protobuf/orc_proto.proto
+++ b/orc/src/protobuf/orc_proto.proto
@@ -18,7 +18,7 @@
package orc.proto;
-option java_package = "org.apache.orc";
+option java_package = "org.apache.hive.orc";
message IntegerStatistics {
optional sint64 minimum = 1;
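The java_package change above is the crux of HIVE-17118: protoc keeps the message definitions identical and only relocates the generated outer class, so downstream Java code switches from org.apache.orc.OrcProto to org.apache.hive.orc.OrcProto. A rough sketch of what that means for callers, assuming the protobuf classes have been regenerated from this file (the class name ProtoPackageSketch is illustrative, not from the patch):

import org.apache.hive.orc.OrcProto;

public class ProtoPackageSketch {
  public static void main(String[] args) {
    // Same generated builder API as before the rename; only the Java package
    // of the OrcProto outer class has moved.
    OrcProto.IntegerStatistics stats = OrcProto.IntegerStatistics.newBuilder()
        .setMinimum(1)
        .setMaximum(1000)
        .build();
    System.out.println(stats.getMinimum() + ".." + stats.getMaximum());
  }
}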
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/hive/orc/TestColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/hive/orc/TestColumnStatistics.java b/orc/src/test/org/apache/hive/orc/TestColumnStatistics.java
new file mode 100644
index 0000000..87c12c9
--- /dev/null
+++ b/orc/src/test/org/apache/hive/orc/TestColumnStatistics.java
@@ -0,0 +1,364 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc;
+
+import static junit.framework.Assert.assertEquals;
+import static org.junit.Assume.assumeTrue;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.PrintStream;
+import java.sql.Timestamp;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hive.orc.impl.ColumnStatisticsImpl;
+import org.apache.hive.orc.tools.FileDump;
+import org.apache.hive.orc.tools.TestFileDump;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+/**
+ * Test ColumnStatisticsImpl for ORC.
+ */
+public class TestColumnStatistics {
+
+ @Test
+ public void testLongMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createInt();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateInteger(10, 2);
+ stats2.updateInteger(1, 1);
+ stats2.updateInteger(1000, 1);
+ stats1.merge(stats2);
+ IntegerColumnStatistics typed = (IntegerColumnStatistics) stats1;
+ assertEquals(1, typed.getMinimum());
+ assertEquals(1000, typed.getMaximum());
+ stats1.reset();
+ stats1.updateInteger(-10, 1);
+ stats1.updateInteger(10000, 1);
+ stats1.merge(stats2);
+ assertEquals(-10, typed.getMinimum());
+ assertEquals(10000, typed.getMaximum());
+ }
+
+ @Test
+ public void testDoubleMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createDouble();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateDouble(10.0);
+ stats1.updateDouble(100.0);
+ stats2.updateDouble(1.0);
+ stats2.updateDouble(1000.0);
+ stats1.merge(stats2);
+ DoubleColumnStatistics typed = (DoubleColumnStatistics) stats1;
+ assertEquals(1.0, typed.getMinimum(), 0.001);
+ assertEquals(1000.0, typed.getMaximum(), 0.001);
+ stats1.reset();
+ stats1.updateDouble(-10);
+ stats1.updateDouble(10000);
+ stats1.merge(stats2);
+ assertEquals(-10, typed.getMinimum(), 0.001);
+ assertEquals(10000, typed.getMaximum(), 0.001);
+ }
+
+
+ @Test
+ public void testStringMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateString(new Text("bob"));
+ stats1.updateString(new Text("david"));
+ stats1.updateString(new Text("charles"));
+ stats2.updateString(new Text("anne"));
+ byte[] erin = new byte[]{0, 1, 2, 3, 4, 5, 101, 114, 105, 110};
+ stats2.updateString(erin, 6, 4, 5);
+ assertEquals(24, ((StringColumnStatistics)stats2).getSum());
+ stats1.merge(stats2);
+ StringColumnStatistics typed = (StringColumnStatistics) stats1;
+ assertEquals("anne", typed.getMinimum());
+ assertEquals("erin", typed.getMaximum());
+ assertEquals(39, typed.getSum());
+ stats1.reset();
+ stats1.updateString(new Text("aaa"));
+ stats1.updateString(new Text("zzz"));
+ stats1.merge(stats2);
+ assertEquals("aaa", typed.getMinimum());
+ assertEquals("zzz", typed.getMaximum());
+ }
+
+ @Test
+ public void testDateMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createDate();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateDate(new DateWritable(1000));
+ stats1.updateDate(new DateWritable(100));
+ stats2.updateDate(new DateWritable(10));
+ stats2.updateDate(new DateWritable(2000));
+ stats1.merge(stats2);
+ DateColumnStatistics typed = (DateColumnStatistics) stats1;
+ assertEquals(new DateWritable(10).get(), typed.getMinimum());
+ assertEquals(new DateWritable(2000).get(), typed.getMaximum());
+ stats1.reset();
+ stats1.updateDate(new DateWritable(-10));
+ stats1.updateDate(new DateWritable(10000));
+ stats1.merge(stats2);
+ assertEquals(new DateWritable(-10).get(), typed.getMinimum());
+ assertEquals(new DateWritable(10000).get(), typed.getMaximum());
+ }
+
+ @Test
+ public void testTimestampMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createTimestamp();
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateTimestamp(new Timestamp(10));
+ stats1.updateTimestamp(new Timestamp(100));
+ stats2.updateTimestamp(new Timestamp(1));
+ stats2.updateTimestamp(new Timestamp(1000));
+ stats1.merge(stats2);
+ TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
+ assertEquals(1, typed.getMinimum().getTime());
+ assertEquals(1000, typed.getMaximum().getTime());
+ stats1.reset();
+ stats1.updateTimestamp(new Timestamp(-10));
+ stats1.updateTimestamp(new Timestamp(10000));
+ stats1.merge(stats2);
+ assertEquals(-10, typed.getMinimum().getTime());
+ assertEquals(10000, typed.getMaximum().getTime());
+ }
+
+ @Test
+ public void testDecimalMerge() throws Exception {
+ TypeDescription schema = TypeDescription.createDecimal()
+ .withPrecision(38).withScale(16);
+
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
+ ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
+ stats1.updateDecimal(new HiveDecimalWritable(10));
+ stats1.updateDecimal(new HiveDecimalWritable(100));
+ stats2.updateDecimal(new HiveDecimalWritable(1));
+ stats2.updateDecimal(new HiveDecimalWritable(1000));
+ stats1.merge(stats2);
+ DecimalColumnStatistics typed = (DecimalColumnStatistics) stats1;
+ assertEquals(1, typed.getMinimum().longValue());
+ assertEquals(1000, typed.getMaximum().longValue());
+ stats1.reset();
+ stats1.updateDecimal(new HiveDecimalWritable(-10));
+ stats1.updateDecimal(new HiveDecimalWritable(10000));
+ stats1.merge(stats2);
+ assertEquals(-10, typed.getMinimum().longValue());
+ assertEquals(10000, typed.getMaximum().longValue());
+ }
+
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before
+ public void openFileSystem() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.getLocal(conf);
+ fs.setWorkingDirectory(workDir);
+ testFilePath = new Path("TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ private static BytesWritable bytes(int... items) {
+ BytesWritable result = new BytesWritable();
+ result.setSize(items.length);
+ for (int i = 0; i < items.length; ++i) {
+ result.getBytes()[i] = (byte) items[i];
+ }
+ return result;
+ }
+
+ void appendRow(VectorizedRowBatch batch, BytesWritable bytes,
+ String str) {
+ int row = batch.size++;
+ if (bytes == null) {
+ batch.cols[0].noNulls = false;
+ batch.cols[0].isNull[row] = true;
+ } else {
+ ((BytesColumnVector) batch.cols[0]).setVal(row, bytes.getBytes(),
+ 0, bytes.getLength());
+ }
+ if (str == null) {
+ batch.cols[1].noNulls = false;
+ batch.cols[1].isNull[row] = true;
+ } else {
+ ((BytesColumnVector) batch.cols[1]).setVal(row, str.getBytes());
+ }
+ }
+
+ @Test
+ public void testHasNull() throws Exception {
+ TypeDescription schema =
+ TypeDescription.createStruct()
+ .addField("bytes1", TypeDescription.createBinary())
+ .addField("string1", TypeDescription.createString());
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .rowIndexStride(1000)
+ .stripeSize(10000)
+ .bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch(5000);
+ // STRIPE 1
+ // RG1
+ for(int i=0; i<1000; i++) {
+ appendRow(batch, bytes(1, 2, 3), "RG1");
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // RG2
+ for(int i=0; i<1000; i++) {
+ appendRow(batch, bytes(1, 2, 3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // RG3
+ for(int i=0; i<1000; i++) {
+ appendRow(batch, bytes(1, 2, 3), "RG3");
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // RG4
+ for (int i = 0; i < 1000; i++) {
+ appendRow(batch, bytes(1,2,3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // RG5
+ for(int i=0; i<1000; i++) {
+ appendRow(batch, bytes(1, 2, 3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // STRIPE 2
+ for (int i = 0; i < 5000; i++) {
+ appendRow(batch, bytes(1,2,3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // STRIPE 3
+ for (int i = 0; i < 5000; i++) {
+ appendRow(batch, bytes(1,2,3), "STRIPE-3");
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ // STRIPE 4
+ for (int i = 0; i < 5000; i++) {
+ appendRow(batch, bytes(1,2,3), null);
+ }
+ writer.addRowBatch(batch);
+ batch.reset();
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+
+ // check the file level stats
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(20000, stats[0].getNumberOfValues());
+ assertEquals(20000, stats[1].getNumberOfValues());
+ assertEquals(7000, stats[2].getNumberOfValues());
+ assertEquals(false, stats[0].hasNull());
+ assertEquals(false, stats[1].hasNull());
+ assertEquals(true, stats[2].hasNull());
+
+ // check the stripe level stats
+ List<StripeStatistics> stripeStats = reader.getStripeStatistics();
+ // stripe 1 stats
+ StripeStatistics ss1 = stripeStats.get(0);
+ ColumnStatistics ss1_cs1 = ss1.getColumnStatistics()[0];
+ ColumnStatistics ss1_cs2 = ss1.getColumnStatistics()[1];
+ ColumnStatistics ss1_cs3 = ss1.getColumnStatistics()[2];
+ assertEquals(false, ss1_cs1.hasNull());
+ assertEquals(false, ss1_cs2.hasNull());
+ assertEquals(true, ss1_cs3.hasNull());
+
+ // stripe 2 stats
+ StripeStatistics ss2 = stripeStats.get(1);
+ ColumnStatistics ss2_cs1 = ss2.getColumnStatistics()[0];
+ ColumnStatistics ss2_cs2 = ss2.getColumnStatistics()[1];
+ ColumnStatistics ss2_cs3 = ss2.getColumnStatistics()[2];
+ assertEquals(false, ss2_cs1.hasNull());
+ assertEquals(false, ss2_cs2.hasNull());
+ assertEquals(true, ss2_cs3.hasNull());
+
+ // stripe 3 stats
+ StripeStatistics ss3 = stripeStats.get(2);
+ ColumnStatistics ss3_cs1 = ss3.getColumnStatistics()[0];
+ ColumnStatistics ss3_cs2 = ss3.getColumnStatistics()[1];
+ ColumnStatistics ss3_cs3 = ss3.getColumnStatistics()[2];
+ assertEquals(false, ss3_cs1.hasNull());
+ assertEquals(false, ss3_cs2.hasNull());
+ assertEquals(false, ss3_cs3.hasNull());
+
+ // stripe 4 stats
+ StripeStatistics ss4 = stripeStats.get(3);
+ ColumnStatistics ss4_cs1 = ss4.getColumnStatistics()[0];
+ ColumnStatistics ss4_cs2 = ss4.getColumnStatistics()[1];
+ ColumnStatistics ss4_cs3 = ss4.getColumnStatistics()[2];
+ assertEquals(false, ss4_cs1.hasNull());
+ assertEquals(false, ss4_cs2.hasNull());
+ assertEquals(true, ss4_cs3.hasNull());
+
+ // Test file dump
+ PrintStream origOut = System.out;
+ String outputFilename = "orc-file-has-null.out";
+ FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
+
+ // replace stdout and run command
+ System.setOut(new PrintStream(myOut));
+ FileDump.main(new String[]{testFilePath.toString(), "--rowindex=2"});
+ System.out.flush();
+ System.setOut(origOut);
+ // If called with an expression evaluating to false, the test will halt
+ // and be ignored.
+ assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
+ TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
+ }
+}
[31/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/DataReaderProperties.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/DataReaderProperties.java b/orc/src/java/org/apache/hive/orc/impl/DataReaderProperties.java
new file mode 100644
index 0000000..adef538
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/DataReaderProperties.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hive.orc.CompressionKind;
+
+public final class DataReaderProperties {
+
+ private final FileSystem fileSystem;
+ private final Path path;
+ private final CompressionKind compression;
+ private final boolean zeroCopy;
+ private final int typeCount;
+ private final int bufferSize;
+
+ private DataReaderProperties(Builder builder) {
+ this.fileSystem = builder.fileSystem;
+ this.path = builder.path;
+ this.compression = builder.compression;
+ this.zeroCopy = builder.zeroCopy;
+ this.typeCount = builder.typeCount;
+ this.bufferSize = builder.bufferSize;
+ }
+
+ public FileSystem getFileSystem() {
+ return fileSystem;
+ }
+
+ public Path getPath() {
+ return path;
+ }
+
+ public CompressionKind getCompression() {
+ return compression;
+ }
+
+ public boolean getZeroCopy() {
+ return zeroCopy;
+ }
+
+ public int getTypeCount() {
+ return typeCount;
+ }
+
+ public int getBufferSize() {
+ return bufferSize;
+ }
+
+ public static Builder builder() {
+ return new Builder();
+ }
+
+ public static class Builder {
+
+ private FileSystem fileSystem;
+ private Path path;
+ private CompressionKind compression;
+ private boolean zeroCopy;
+ private int typeCount;
+ private int bufferSize;
+
+ private Builder() {
+
+ }
+
+ public Builder withFileSystem(FileSystem fileSystem) {
+ this.fileSystem = fileSystem;
+ return this;
+ }
+
+ public Builder withPath(Path path) {
+ this.path = path;
+ return this;
+ }
+
+ public Builder withCompression(CompressionKind value) {
+ this.compression = value;
+ return this;
+ }
+
+ public Builder withZeroCopy(boolean zeroCopy) {
+ this.zeroCopy = zeroCopy;
+ return this;
+ }
+
+ public Builder withTypeCount(int value) {
+ this.typeCount = value;
+ return this;
+ }
+
+ public Builder withBufferSize(int value) {
+ this.bufferSize = value;
+ return this;
+ }
+
+ public DataReaderProperties build() {
+ Preconditions.checkNotNull(fileSystem);
+ Preconditions.checkNotNull(path);
+
+ return new DataReaderProperties(this);
+ }
+
+ }
+}
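DataReaderProperties above is a straightforward builder-backed value object; only fileSystem and path are enforced by the Preconditions checks in build(). A minimal, purely illustrative construction (the file name and sizes are made up, not taken from this patch) might look like:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hive.orc.CompressionKind;
import org.apache.hive.orc.impl.DataReaderProperties;

public class DataReaderPropertiesSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    DataReaderProperties props = DataReaderProperties.builder()
        .withFileSystem(fs)                      // required
        .withPath(new Path("example.orc"))       // required; illustrative name
        .withCompression(CompressionKind.ZLIB)   // optional
        .withZeroCopy(false)                     // optional
        .withTypeCount(10)                       // optional
        .withBufferSize(256 * 1024)              // optional
        .build();
    System.out.println(props.getPath() + " " + props.getCompression());
  }
}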
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/DirectDecompressionCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/DirectDecompressionCodec.java b/orc/src/java/org/apache/hive/orc/impl/DirectDecompressionCodec.java
new file mode 100644
index 0000000..e6cb84e
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/DirectDecompressionCodec.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import org.apache.hive.orc.CompressionCodec;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+public interface DirectDecompressionCodec extends CompressionCodec {
+ public boolean isAvailable();
+ public void directDecompress(ByteBuffer in, ByteBuffer out) throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/DynamicByteArray.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/DynamicByteArray.java b/orc/src/java/org/apache/hive/orc/impl/DynamicByteArray.java
new file mode 100644
index 0000000..004f69a
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/DynamicByteArray.java
@@ -0,0 +1,303 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * A class that is a growable array of bytes. Growth is managed in terms of
+ * chunks that are allocated when needed.
+ */
+public final class DynamicByteArray {
+ static final int DEFAULT_CHUNKSIZE = 32 * 1024;
+ static final int DEFAULT_NUM_CHUNKS = 128;
+
+ private final int chunkSize; // our allocation sizes
+ private byte[][] data; // the real data
+ private int length; // max set element index +1
+ private int initializedChunks = 0; // the number of chunks created
+
+ public DynamicByteArray() {
+ this(DEFAULT_NUM_CHUNKS, DEFAULT_CHUNKSIZE);
+ }
+
+ public DynamicByteArray(int numChunks, int chunkSize) {
+ if (chunkSize == 0) {
+ throw new IllegalArgumentException("bad chunksize");
+ }
+ this.chunkSize = chunkSize;
+ data = new byte[numChunks][];
+ }
+
+ /**
+ * Ensure that the given index is valid.
+ */
+ private void grow(int chunkIndex) {
+ if (chunkIndex >= initializedChunks) {
+ if (chunkIndex >= data.length) {
+ int newSize = Math.max(chunkIndex + 1, 2 * data.length);
+ byte[][] newChunk = new byte[newSize][];
+ System.arraycopy(data, 0, newChunk, 0, data.length);
+ data = newChunk;
+ }
+ for(int i=initializedChunks; i <= chunkIndex; ++i) {
+ data[i] = new byte[chunkSize];
+ }
+ initializedChunks = chunkIndex + 1;
+ }
+ }
+
+ public byte get(int index) {
+ if (index >= length) {
+ throw new IndexOutOfBoundsException("Index " + index +
+ " is outside of 0.." +
+ (length - 1));
+ }
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ return data[i][j];
+ }
+
+ public void set(int index, byte value) {
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ grow(i);
+ if (index >= length) {
+ length = index + 1;
+ }
+ data[i][j] = value;
+ }
+
+ public int add(byte value) {
+ int i = length / chunkSize;
+ int j = length % chunkSize;
+ grow(i);
+ data[i][j] = value;
+ int result = length;
+ length += 1;
+ return result;
+ }
+
+ /**
+ * Copy a slice of a byte array into our buffer.
+ * @param value the array to copy from
+ * @param valueOffset the first location to copy from value
+ * @param valueLength the number of bytes to copy from value
+ * @return the offset of the start of the value
+ */
+ public int add(byte[] value, int valueOffset, int valueLength) {
+ int i = length / chunkSize;
+ int j = length % chunkSize;
+ grow((length + valueLength) / chunkSize);
+ int remaining = valueLength;
+ while (remaining > 0) {
+ int size = Math.min(remaining, chunkSize - j);
+ System.arraycopy(value, valueOffset, data[i], j, size);
+ remaining -= size;
+ valueOffset += size;
+ i += 1;
+ j = 0;
+ }
+ int result = length;
+ length += valueLength;
+ return result;
+ }
+
+ /**
+ * Read the entire stream into this array.
+ * @param in the stream to read from
+ * @throws IOException
+ */
+ public void readAll(InputStream in) throws IOException {
+ int currentChunk = length / chunkSize;
+ int currentOffset = length % chunkSize;
+ grow(currentChunk);
+ int currentLength = in.read(data[currentChunk], currentOffset,
+ chunkSize - currentOffset);
+ while (currentLength > 0) {
+ length += currentLength;
+ currentOffset = length % chunkSize;
+ if (currentOffset == 0) {
+ currentChunk = length / chunkSize;
+ grow(currentChunk);
+ }
+ currentLength = in.read(data[currentChunk], currentOffset,
+ chunkSize - currentOffset);
+ }
+ }
+
+ /**
+ * Byte compare a set of bytes against the bytes in this dynamic array.
+ * @param other source of the other bytes
+ * @param otherOffset start offset in the other array
+ * @param otherLength number of bytes in the other array
+ * @param ourOffset the offset in our array
+ * @param ourLength the number of bytes in our array
+ * @return negative for less, 0 for equal, positive for greater
+ */
+ public int compare(byte[] other, int otherOffset, int otherLength,
+ int ourOffset, int ourLength) {
+ int currentChunk = ourOffset / chunkSize;
+ int currentOffset = ourOffset % chunkSize;
+ int maxLength = Math.min(otherLength, ourLength);
+ while (maxLength > 0 &&
+ other[otherOffset] == data[currentChunk][currentOffset]) {
+ otherOffset += 1;
+ currentOffset += 1;
+ if (currentOffset == chunkSize) {
+ currentChunk += 1;
+ currentOffset = 0;
+ }
+ maxLength -= 1;
+ }
+ if (maxLength == 0) {
+ return otherLength - ourLength;
+ }
+ int otherByte = 0xff & other[otherOffset];
+ int ourByte = 0xff & data[currentChunk][currentOffset];
+ return otherByte > ourByte ? 1 : -1;
+ }
+
+ /**
+ * Get the size of the array.
+ * @return the number of bytes in the array
+ */
+ public int size() {
+ return length;
+ }
+
+ /**
+ * Clear the array to its original pristine state.
+ */
+ public void clear() {
+ length = 0;
+ for(int i=0; i < data.length; ++i) {
+ data[i] = null;
+ }
+ initializedChunks = 0;
+ }
+
+ /**
+ * Set a text value from the bytes in this dynamic array.
+ * @param result the value to set
+ * @param offset the start of the bytes to copy
+ * @param length the number of bytes to copy
+ */
+ public void setText(Text result, int offset, int length) {
+ result.clear();
+ int currentChunk = offset / chunkSize;
+ int currentOffset = offset % chunkSize;
+ int currentLength = Math.min(length, chunkSize - currentOffset);
+ while (length > 0) {
+ result.append(data[currentChunk], currentOffset, currentLength);
+ length -= currentLength;
+ currentChunk += 1;
+ currentOffset = 0;
+ currentLength = Math.min(length, chunkSize - currentOffset);
+ }
+ }
+
+ /**
+ * Write out a range of this dynamic array to an output stream.
+ * @param out the stream to write to
+ * @param offset the first offset to write
+ * @param length the number of bytes to write
+ * @throws IOException
+ */
+ public void write(OutputStream out, int offset,
+ int length) throws IOException {
+ int currentChunk = offset / chunkSize;
+ int currentOffset = offset % chunkSize;
+ while (length > 0) {
+ int currentLength = Math.min(length, chunkSize - currentOffset);
+ out.write(data[currentChunk], currentOffset, currentLength);
+ length -= currentLength;
+ currentChunk += 1;
+ currentOffset = 0;
+ }
+ }
+
+ @Override
+ public String toString() {
+ int i;
+ StringBuilder sb = new StringBuilder(length * 3);
+
+ sb.append('{');
+ int l = length - 1;
+ for (i=0; i<l; i++) {
+ sb.append(Integer.toHexString(get(i)));
+ sb.append(',');
+ }
+ sb.append(get(i));
+ sb.append('}');
+
+ return sb.toString();
+ }
+
+ public void setByteBuffer(ByteBuffer result, int offset, int length) {
+ result.clear();
+ int currentChunk = offset / chunkSize;
+ int currentOffset = offset % chunkSize;
+ int currentLength = Math.min(length, chunkSize - currentOffset);
+ while (length > 0) {
+ result.put(data[currentChunk], currentOffset, currentLength);
+ length -= currentLength;
+ currentChunk += 1;
+ currentOffset = 0;
+ currentLength = Math.min(length, chunkSize - currentOffset);
+ }
+ }
+
+ /**
+ * Gets all the bytes of the array.
+ *
+ * @return Bytes of the array
+ */
+ public byte[] get() {
+ byte[] result = null;
+ if (length > 0) {
+ int currentChunk = 0;
+ int currentOffset = 0;
+ int currentLength = Math.min(length, chunkSize);
+ int destOffset = 0;
+ result = new byte[length];
+ int totalLength = length;
+ while (totalLength > 0) {
+ System.arraycopy(data[currentChunk], currentOffset, result, destOffset, currentLength);
+ destOffset += currentLength;
+ totalLength -= currentLength;
+ currentChunk += 1;
+ currentOffset = 0;
+ currentLength = Math.min(totalLength, chunkSize - currentOffset);
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Get the size of the buffers.
+ */
+ public long getSizeInBytes() {
+ return initializedChunks * chunkSize;
+ }
+}
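The class comment above describes DynamicByteArray as a chunked, growable byte buffer. The sketch below (my own, not part of the patch) shows the basic append-then-read-back flow, with a deliberately tiny chunk size so a single append spans several chunks:

import org.apache.hadoop.io.Text;
import org.apache.hive.orc.impl.DynamicByteArray;

public class DynamicByteArraySketch {
  public static void main(String[] args) {
    // Two initial chunk slots of 4 bytes each; growth happens on demand.
    DynamicByteArray buffer = new DynamicByteArray(2, 4);
    byte[] word = "hello world".getBytes();
    int offset = buffer.add(word, 0, word.length);   // returns the start offset
    System.out.println("offset=" + offset + " size=" + buffer.size());

    Text text = new Text();
    buffer.setText(text, offset, word.length);       // copy a slice back out
    System.out.println(text);                        // prints: hello world
  }
}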
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/DynamicIntArray.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/DynamicIntArray.java b/orc/src/java/org/apache/hive/orc/impl/DynamicIntArray.java
new file mode 100644
index 0000000..f44b5b0
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/DynamicIntArray.java
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+/**
+ * Dynamic int array that uses primitive types and chunks to avoid copying
+ * large number of integers when it resizes.
+ *
+ * The motivation for this class is memory optimization, i.e. space efficient
+ * storage of potentially huge arrays without good a-priori size guesses.
+ *
+ * The API of this class is between a primitive array and an AbstractList. It's
+ * not a Collection implementation because it handles primitive types, but the
+ * API could be extended to support iterators and the like.
+ *
+ * NOTE: Like standard Collection implementations/arrays, this class is not
+ * synchronized.
+ */
+public final class DynamicIntArray {
+ static final int DEFAULT_CHUNKSIZE = 8 * 1024;
+ static final int INIT_CHUNKS = 128;
+
+ private final int chunkSize; // our allocation size
+ private int[][] data; // the real data
+ private int length; // max set element index +1
+ private int initializedChunks = 0; // the number of created chunks
+
+ public DynamicIntArray() {
+ this(DEFAULT_CHUNKSIZE);
+ }
+
+ public DynamicIntArray(int chunkSize) {
+ this.chunkSize = chunkSize;
+
+ data = new int[INIT_CHUNKS][];
+ }
+
+ /**
+ * Ensure that the given index is valid.
+ */
+ private void grow(int chunkIndex) {
+ if (chunkIndex >= initializedChunks) {
+ if (chunkIndex >= data.length) {
+ int newSize = Math.max(chunkIndex + 1, 2 * data.length);
+ int[][] newChunk = new int[newSize][];
+ System.arraycopy(data, 0, newChunk, 0, data.length);
+ data = newChunk;
+ }
+ for (int i=initializedChunks; i <= chunkIndex; ++i) {
+ data[i] = new int[chunkSize];
+ }
+ initializedChunks = chunkIndex + 1;
+ }
+ }
+
+ public int get(int index) {
+ if (index >= length) {
+ throw new IndexOutOfBoundsException("Index " + index +
+ " is outside of 0.." +
+ (length - 1));
+ }
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ return data[i][j];
+ }
+
+ public void set(int index, int value) {
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ grow(i);
+ if (index >= length) {
+ length = index + 1;
+ }
+ data[i][j] = value;
+ }
+
+ public void increment(int index, int value) {
+ int i = index / chunkSize;
+ int j = index % chunkSize;
+ grow(i);
+ if (index >= length) {
+ length = index + 1;
+ }
+ data[i][j] += value;
+ }
+
+ public void add(int value) {
+ int i = length / chunkSize;
+ int j = length % chunkSize;
+ grow(i);
+ data[i][j] = value;
+ length += 1;
+ }
+
+ public int size() {
+ return length;
+ }
+
+ public void clear() {
+ length = 0;
+ for(int i=0; i < data.length; ++i) {
+ data[i] = null;
+ }
+ initializedChunks = 0;
+ }
+
+ public String toString() {
+ int i;
+ StringBuilder sb = new StringBuilder(length * 4);
+
+ sb.append('{');
+ int l = length - 1;
+ for (i=0; i<l; i++) {
+ sb.append(get(i));
+ sb.append(',');
+ }
+ sb.append(get(i));
+ sb.append('}');
+
+ return sb.toString();
+ }
+
+ public int getSizeInBytes() {
+ return 4 * initializedChunks * chunkSize;
+ }
+}
+
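DynamicIntArray mirrors the byte version for primitive ints; increment() is what makes it convenient for counting (for example, per-entry dictionary counts) without a separate get/set pair at the call site. A small illustrative sketch, not from the patch:

import org.apache.hive.orc.impl.DynamicIntArray;

public class DynamicIntArraySketch {
  public static void main(String[] args) {
    DynamicIntArray counts = new DynamicIntArray(4);  // 4-int chunks, grows on demand
    counts.add(7);            // index 0
    counts.add(42);           // index 1
    counts.increment(0, 3);   // counts[0] += 3
    counts.set(9, -1);        // auto-grows; indices 2..8 remain 0
    System.out.println(counts.get(0) + " " + counts.get(1) + " size=" + counts.size());
    System.out.println(counts);                       // flat toString over chunked storage
  }
}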
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/HadoopShims.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/HadoopShims.java b/orc/src/java/org/apache/hive/orc/impl/HadoopShims.java
new file mode 100644
index 0000000..8c9a624
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/HadoopShims.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.VersionInfo;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+
+public interface HadoopShims {
+
+ enum DirectCompressionType {
+ NONE,
+ ZLIB_NOHEADER,
+ ZLIB,
+ SNAPPY,
+ }
+
+ interface DirectDecompressor {
+ void decompress(ByteBuffer var1, ByteBuffer var2) throws IOException;
+ }
+
+ /**
+ * Get a direct decompressor codec, if it is available
+ * @param codec the compression type to get a decompressor for
+ * @return a direct decompressor, or null if none is available
+ */
+ DirectDecompressor getDirectDecompressor(DirectCompressionType codec);
+
+ /**
+ * a hadoop.io ByteBufferPool shim.
+ */
+ public interface ByteBufferPoolShim {
+ /**
+ * Get a new ByteBuffer from the pool. The pool can provide this by
+ * removing a buffer from its internal cache, or by allocating a
+ * new buffer.
+ *
+ * @param direct Whether the buffer should be direct.
+ * @param length The minimum length the buffer will have.
+ * @return A new ByteBuffer. Its capacity can be less
+ * than what was requested, but must be at
+ * least 1 byte.
+ */
+ ByteBuffer getBuffer(boolean direct, int length);
+
+ /**
+ * Release a buffer back to the pool.
+ * The pool may choose to put this buffer into its cache/free it.
+ *
+ * @param buffer a direct bytebuffer
+ */
+ void putBuffer(ByteBuffer buffer);
+ }
+
+ /**
+ * Provides an HDFS ZeroCopyReader shim.
+ * @param in FSDataInputStream to read from (where the cached/mmap buffers are tied to)
+ * @param pool ByteBufferPoolShim to allocate fallback buffers with
+ *
+ * @return returns null if not supported
+ */
+ public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, ByteBufferPoolShim pool) throws IOException;
+
+ public interface ZeroCopyReaderShim extends Closeable {
+ /**
+ * Get a ByteBuffer from the FSDataInputStream - this can be either a HeapByteBuffer or a MappedByteBuffer.
+ * Also moves the underlying stream forward by that amount. The data read can be smaller than maxLength.
+ *
+ * @return ByteBuffer read from the stream,
+ */
+ public ByteBuffer readBuffer(int maxLength, boolean verifyChecksums) throws IOException;
+ /**
+ * Release a ByteBuffer obtained from a read on the underlying stream so that it
+ * can be reused or freed.
+ *
+ */
+ public void releaseBuffer(ByteBuffer buffer);
+
+ /**
+ * Close the underlying stream.
+ * @throws IOException
+ */
+ public void close() throws IOException;
+ }
+ /**
+ * Read data into a Text object in the fastest way possible
+ */
+ public interface TextReaderShim {
+ /**
+ * @param txt the Text object to read into
+ * @param size the number of bytes to read
+ * @throws IOException
+ */
+ void read(Text txt, int size) throws IOException;
+ }
+
+ /**
+ * Wrap a TextReaderShim around an input stream. The reader shim will not
+ * buffer any reads from the underlying stream and will only consume bytes
+ * which are required for TextReaderShim.read() input.
+ */
+ public TextReaderShim getTextReaderShim(InputStream input) throws IOException;
+
+ class Factory {
+ private static HadoopShims SHIMS = null;
+
+ public static synchronized HadoopShims get() {
+ if (SHIMS == null) {
+ String[] versionParts = VersionInfo.getVersion().split("[.]");
+ int major = Integer.parseInt(versionParts[0]);
+ int minor = Integer.parseInt(versionParts[1]);
+ if (major < 2 || (major == 2 && minor < 3)) {
+ SHIMS = new HadoopShims_2_2();
+ } else {
+ SHIMS = new HadoopShimsCurrent();
+ }
+ }
+ return SHIMS;
+ }
+ }
+}
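The Factory at the bottom of HadoopShims picks the concrete shim once, based on the Hadoop VersionInfo, so callers program only against the interface. A hedged sketch of typical use (the printed strings are mine; the null check matters because HadoopShims_2_2 returns no direct decompressor):

import org.apache.hive.orc.impl.HadoopShims;

public class ShimFactorySketch {
  public static void main(String[] args) {
    // Resolves to HadoopShims_2_2 on Hadoop versions before 2.3, HadoopShimsCurrent otherwise.
    HadoopShims shims = HadoopShims.Factory.get();
    System.out.println("shim: " + shims.getClass().getSimpleName());

    // Direct (ByteBuffer-to-ByteBuffer) decompression may be unavailable;
    // the 2.2 shim always returns null, so callers must be ready to fall back.
    HadoopShims.DirectDecompressor zlib =
        shims.getDirectDecompressor(HadoopShims.DirectCompressionType.ZLIB);
    System.out.println("direct zlib decompressor available: " + (zlib != null));
  }
}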
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/HadoopShimsCurrent.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/HadoopShimsCurrent.java b/orc/src/java/org/apache/hive/orc/impl/HadoopShimsCurrent.java
new file mode 100644
index 0000000..87aa9d7
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/HadoopShimsCurrent.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.snappy.SnappyDecompressor;
+import org.apache.hadoop.io.compress.zlib.ZlibDecompressor;
+
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+
+/**
+ * Shims for recent versions of Hadoop
+ */
+public class HadoopShimsCurrent implements HadoopShims {
+
+ private static class DirectDecompressWrapper implements DirectDecompressor {
+ private final org.apache.hadoop.io.compress.DirectDecompressor root;
+
+ DirectDecompressWrapper(org.apache.hadoop.io.compress.DirectDecompressor root) {
+ this.root = root;
+ }
+
+ public void decompress(ByteBuffer input,
+ ByteBuffer output) throws IOException {
+ root.decompress(input, output);
+ }
+ }
+
+ public DirectDecompressor getDirectDecompressor(
+ DirectCompressionType codec) {
+ switch (codec) {
+ case ZLIB:
+ return new DirectDecompressWrapper
+ (new ZlibDecompressor.ZlibDirectDecompressor());
+ case ZLIB_NOHEADER:
+ return new DirectDecompressWrapper
+ (new ZlibDecompressor.ZlibDirectDecompressor
+ (ZlibDecompressor.CompressionHeader.NO_HEADER, 0));
+ case SNAPPY:
+ return new DirectDecompressWrapper
+ (new SnappyDecompressor.SnappyDirectDecompressor());
+ default:
+ return null;
+ }
+ }
+
+ @Override
+ public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
+ ByteBufferPoolShim pool
+ ) throws IOException {
+ return ZeroCopyShims.getZeroCopyReader(in, pool);
+ }
+
+ private final class FastTextReaderShim implements TextReaderShim {
+ private final DataInputStream din;
+
+ public FastTextReaderShim(InputStream in) {
+ this.din = new DataInputStream(in);
+ }
+
+ @Override
+ public void read(Text txt, int len) throws IOException {
+ txt.readWithKnownLength(din, len);
+ }
+ }
+
+ @Override
+ public TextReaderShim getTextReaderShim(InputStream in) throws IOException {
+ return new FastTextReaderShim(in);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/HadoopShims_2_2.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/HadoopShims_2_2.java b/orc/src/java/org/apache/hive/orc/impl/HadoopShims_2_2.java
new file mode 100644
index 0000000..ce0be86
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/HadoopShims_2_2.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.io.Text;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.reflect.Method;
+
+/**
+ * Shims for versions of Hadoop up to and including 2.2.x
+ */
+public class HadoopShims_2_2 implements HadoopShims {
+
+ final boolean zeroCopy;
+ final boolean fastRead;
+
+ HadoopShims_2_2() {
+ boolean zcr = false;
+ try {
+ Class.forName("org.apache.hadoop.fs.CacheFlag", false,
+ HadoopShims_2_2.class.getClassLoader());
+ zcr = true;
+ } catch (ClassNotFoundException ce) {
+ }
+ zeroCopy = zcr;
+ boolean fastRead = false;
+ if (zcr) {
+ for (Method m : Text.class.getMethods()) {
+ if ("readWithKnownLength".equals(m.getName())) {
+ fastRead = true;
+ }
+ }
+ }
+ this.fastRead = fastRead;
+ }
+
+ public DirectDecompressor getDirectDecompressor(
+ DirectCompressionType codec) {
+ return null;
+ }
+
+ @Override
+ public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in,
+ ByteBufferPoolShim pool
+ ) throws IOException {
+ if(zeroCopy) {
+ return ZeroCopyShims.getZeroCopyReader(in, pool);
+ }
+ /* not supported */
+ return null;
+ }
+
+ private final class BasicTextReaderShim implements TextReaderShim {
+ private final InputStream in;
+
+ public BasicTextReaderShim(InputStream in) {
+ this.in = in;
+ }
+
+ @Override
+ public void read(Text txt, int len) throws IOException {
+ int offset = 0;
+ byte[] bytes = new byte[len];
+ while (len > 0) {
+ int written = in.read(bytes, offset, len);
+ if (written < 0) {
+ throw new EOFException("Can't finish read from " + in + " read "
+ + (offset) + " bytes out of " + bytes.length);
+ }
+ len -= written;
+ offset += written;
+ }
+ txt.set(bytes);
+ }
+ }
+
+ @Override
+ public TextReaderShim getTextReaderShim(InputStream in) throws IOException {
+ return new BasicTextReaderShim(in);
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/InStream.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/InStream.java b/orc/src/java/org/apache/hive/orc/impl/InStream.java
new file mode 100644
index 0000000..96c9ed3
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/InStream.java
@@ -0,0 +1,498 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ListIterator;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.hive.common.io.DiskRange;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.protobuf.CodedInputStream;
+
+public abstract class InStream extends InputStream {
+
+ private static final Logger LOG = LoggerFactory.getLogger(InStream.class);
+ public static final int PROTOBUF_MESSAGE_MAX_LIMIT = 1024 << 20; // 1GB
+
+ protected final String name;
+ protected long length;
+
+ public InStream(String name, long length) {
+ this.name = name;
+ this.length = length;
+ }
+
+ public String getStreamName() {
+ return name;
+ }
+
+ public long getStreamLength() {
+ return length;
+ }
+
+ @Override
+ public abstract void close();
+
+ public static class UncompressedStream extends InStream {
+ private List<DiskRange> bytes;
+ private long length;
+ protected long currentOffset;
+ private ByteBuffer range;
+ private int currentRange;
+
+ public UncompressedStream(String name, List<DiskRange> input, long length) {
+ super(name, length);
+ reset(input, length);
+ }
+
+ protected void reset(List<DiskRange> input, long length) {
+ this.bytes = input;
+ this.length = length;
+ currentRange = 0;
+ currentOffset = 0;
+ range = null;
+ }
+
+ @Override
+ public int read() {
+ if (range == null || range.remaining() == 0) {
+ if (currentOffset == length) {
+ return -1;
+ }
+ seek(currentOffset);
+ }
+ currentOffset += 1;
+ return 0xff & range.get();
+ }
+
+ @Override
+ public int read(byte[] data, int offset, int length) {
+ if (range == null || range.remaining() == 0) {
+ if (currentOffset == this.length) {
+ return -1;
+ }
+ seek(currentOffset);
+ }
+ int actualLength = Math.min(length, range.remaining());
+ range.get(data, offset, actualLength);
+ currentOffset += actualLength;
+ return actualLength;
+ }
+
+ @Override
+ public int available() {
+ if (range != null && range.remaining() > 0) {
+ return range.remaining();
+ }
+ return (int) (length - currentOffset);
+ }
+
+ @Override
+ public void close() {
+ currentRange = bytes.size();
+ currentOffset = length;
+ // explicit de-ref of bytes[]
+ bytes.clear();
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ seek(index.getNext());
+ }
+
+ public void seek(long desired) {
+ if (desired == 0 && bytes.isEmpty()) {
+ logEmptySeek(name);
+ return;
+ }
+ int i = 0;
+ for (DiskRange curRange : bytes) {
+ if (desired == 0 && curRange.getData().remaining() == 0) {
+ logEmptySeek(name);
+ return;
+ }
+ if (curRange.getOffset() <= desired &&
+ (desired - curRange.getOffset()) < curRange.getLength()) {
+ currentOffset = desired;
+ currentRange = i;
+ this.range = curRange.getData().duplicate();
+ int pos = range.position();
+ pos += (int)(desired - curRange.getOffset()); // this is why we duplicate
+ this.range.position(pos);
+ return;
+ }
+ ++i;
+ }
+ // if they are seeking to the precise end, go ahead and let them go there
+ int segments = bytes.size();
+ if (segments != 0 && desired == bytes.get(segments - 1).getEnd()) {
+ currentOffset = desired;
+ currentRange = segments - 1;
+ DiskRange curRange = bytes.get(currentRange);
+ this.range = curRange.getData().duplicate();
+ int pos = range.position();
+ pos += (int)(desired - curRange.getOffset()); // this is why we duplicate
+ this.range.position(pos);
+ return;
+ }
+ throw new IllegalArgumentException("Seek in " + name + " to " +
+ desired + " is outside of the data");
+ }
+
+ @Override
+ public String toString() {
+ return "uncompressed stream " + name + " position: " + currentOffset +
+ " length: " + length + " range: " + currentRange +
+ " offset: " + (range == null ? 0 : range.position()) + " limit: " + (range == null ? 0 : range.limit());
+ }
+ }
+
+ private static ByteBuffer allocateBuffer(int size, boolean isDirect) {
+ // TODO: use the same pool as the ORC readers
+ if (isDirect) {
+ return ByteBuffer.allocateDirect(size);
+ } else {
+ return ByteBuffer.allocate(size);
+ }
+ }
+
+ private static class CompressedStream extends InStream {
+ private final List<DiskRange> bytes;
+ private final int bufferSize;
+ private ByteBuffer uncompressed;
+ private final CompressionCodec codec;
+ private ByteBuffer compressed;
+ private long currentOffset;
+ private int currentRange;
+ private boolean isUncompressedOriginal;
+
+ public CompressedStream(String name, List<DiskRange> input, long length,
+ CompressionCodec codec, int bufferSize) {
+ super(name, length);
+ this.bytes = input;
+ this.codec = codec;
+ this.bufferSize = bufferSize;
+ currentOffset = 0;
+ currentRange = 0;
+ }
+
+ private void allocateForUncompressed(int size, boolean isDirect) {
+ uncompressed = allocateBuffer(size, isDirect);
+ }
+
+ private void readHeader() throws IOException {
+ if (compressed == null || compressed.remaining() <= 0) {
+ seek(currentOffset);
+ }
+ if (compressed.remaining() > OutStream.HEADER_SIZE) {
+ int b0 = compressed.get() & 0xff;
+ int b1 = compressed.get() & 0xff;
+ int b2 = compressed.get() & 0xff;
+ boolean isOriginal = (b0 & 0x01) == 1;
+ int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1);
+
+ if (chunkLength > bufferSize) {
+ throw new IllegalArgumentException("Buffer size too small. size = " +
+ bufferSize + " needed = " + chunkLength);
+ }
+ // read 3 bytes, which should be equal to OutStream.HEADER_SIZE always
+ assert OutStream.HEADER_SIZE == 3 : "The Orc HEADER_SIZE must be the same in OutStream and InStream";
+ currentOffset += OutStream.HEADER_SIZE;
+
+ ByteBuffer slice = this.slice(chunkLength);
+
+ if (isOriginal) {
+ uncompressed = slice;
+ isUncompressedOriginal = true;
+ } else {
+ if (isUncompressedOriginal) {
+ allocateForUncompressed(bufferSize, slice.isDirect());
+ isUncompressedOriginal = false;
+ } else if (uncompressed == null) {
+ allocateForUncompressed(bufferSize, slice.isDirect());
+ } else {
+ uncompressed.clear();
+ }
+ codec.decompress(slice, uncompressed);
+ }
+ } else {
+ throw new IllegalStateException("Can't read header at " + this);
+ }
+ }
+
+ @Override
+ public int read() throws IOException {
+ if (uncompressed == null || uncompressed.remaining() == 0) {
+ if (currentOffset == length) {
+ return -1;
+ }
+ readHeader();
+ }
+ return 0xff & uncompressed.get();
+ }
+
+ @Override
+ public int read(byte[] data, int offset, int length) throws IOException {
+ if (uncompressed == null || uncompressed.remaining() == 0) {
+ if (currentOffset == this.length) {
+ return -1;
+ }
+ readHeader();
+ }
+ int actualLength = Math.min(length, uncompressed.remaining());
+ uncompressed.get(data, offset, actualLength);
+ return actualLength;
+ }
+
+ @Override
+ public int available() throws IOException {
+ if (uncompressed == null || uncompressed.remaining() == 0) {
+ if (currentOffset == length) {
+ return 0;
+ }
+ readHeader();
+ }
+ return uncompressed.remaining();
+ }
+
+ @Override
+ public void close() {
+ uncompressed = null;
+ compressed = null;
+ currentRange = bytes.size();
+ currentOffset = length;
+ bytes.clear();
+ }
+
+ @Override
+ public void seek(PositionProvider index) throws IOException {
+ seek(index.getNext());
+ long uncompressedBytes = index.getNext();
+ if (uncompressedBytes != 0) {
+ readHeader();
+ uncompressed.position(uncompressed.position() +
+ (int) uncompressedBytes);
+ } else if (uncompressed != null) {
+ // mark the uncompressed buffer as done
+ uncompressed.position(uncompressed.limit());
+ }
+ }
+
+ /* slices a read only contiguous buffer of chunkLength */
+ private ByteBuffer slice(int chunkLength) throws IOException {
+ int len = chunkLength;
+ final long oldOffset = currentOffset;
+ ByteBuffer slice;
+ if (compressed.remaining() >= len) {
+ slice = compressed.slice();
+ // simple case
+ slice.limit(len);
+ currentOffset += len;
+ compressed.position(compressed.position() + len);
+ return slice;
+ } else if (currentRange >= (bytes.size() - 1)) {
+ // nothing has been modified yet
+ throw new IOException("EOF in " + this + " while trying to read " +
+ chunkLength + " bytes");
+ }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(String.format(
+ "Crossing into next BufferChunk because compressed only has %d bytes (needs %d)",
+ compressed.remaining(), len));
+ }
+
+ // we need to consolidate 2 or more buffers into 1
+ // first copy out compressed buffers
+ ByteBuffer copy = allocateBuffer(chunkLength, compressed.isDirect());
+ currentOffset += compressed.remaining();
+ len -= compressed.remaining();
+ copy.put(compressed);
+ ListIterator<DiskRange> iter = bytes.listIterator(currentRange);
+
+ while (len > 0 && iter.hasNext()) {
+ ++currentRange;
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(String.format("Read slow-path, >1 cross block reads with %s", this.toString()));
+ }
+ DiskRange range = iter.next();
+ compressed = range.getData().duplicate();
+ if (compressed.remaining() >= len) {
+ slice = compressed.slice();
+ slice.limit(len);
+ copy.put(slice);
+ currentOffset += len;
+ compressed.position(compressed.position() + len);
+ return copy;
+ }
+ currentOffset += compressed.remaining();
+ len -= compressed.remaining();
+ copy.put(compressed);
+ }
+
+ // restore offsets for exception clarity
+ seek(oldOffset);
+ throw new IOException("EOF in " + this + " while trying to read " +
+ chunkLength + " bytes");
+ }
+
+ private void seek(long desired) throws IOException {
+ if (desired == 0 && bytes.isEmpty()) {
+ logEmptySeek(name);
+ return;
+ }
+ int i = 0;
+ for (DiskRange range : bytes) {
+ if (range.getOffset() <= desired && desired < range.getEnd()) {
+ currentRange = i;
+ compressed = range.getData().duplicate();
+ int pos = compressed.position();
+ pos += (int)(desired - range.getOffset());
+ compressed.position(pos);
+ currentOffset = desired;
+ return;
+ }
+ ++i;
+ }
+ // if they are seeking to the precise end, go ahead and let them go there
+ int segments = bytes.size();
+ if (segments != 0 && desired == bytes.get(segments - 1).getEnd()) {
+ DiskRange range = bytes.get(segments - 1);
+ currentRange = segments - 1;
+ compressed = range.getData().duplicate();
+ compressed.position(compressed.limit());
+ currentOffset = desired;
+ return;
+ }
+ throw new IOException("Seek outside of data in " + this + " to " + desired);
+ }
+
+ private String rangeString() {
+ StringBuilder builder = new StringBuilder();
+ int i = 0;
+ for (DiskRange range : bytes) {
+ if (i != 0) {
+ builder.append("; ");
+ }
+ builder.append(" range " + i + " = " + range.getOffset()
+ + " to " + (range.getEnd() - range.getOffset()));
+ ++i;
+ }
+ return builder.toString();
+ }
+
+ @Override
+ public String toString() {
+ return "compressed stream " + name + " position: " + currentOffset +
+ " length: " + length + " range: " + currentRange +
+ " offset: " + (compressed == null ? 0 : compressed.position()) + " limit: " + (compressed == null ? 0 : compressed.limit()) +
+ rangeString() +
+ (uncompressed == null ? "" :
+ " uncompressed: " + uncompressed.position() + " to " +
+ uncompressed.limit());
+ }
+ }
+
+ public abstract void seek(PositionProvider index) throws IOException;
+
+ private static void logEmptySeek(String name) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Attempting seek into empty stream (" + name + ") Skipping stream.");
+ }
+ }
+
+ /**
+ * Create an input stream from a list of buffers.
+ * @param streamName the name of the stream
+ * @param buffers the list of ranges of bytes for the stream
+ * @param offsets a list of offsets (the same length as input) that must
+ * contain the first offset of each set of bytes in input
+ * @param length the length in bytes of the stream
+ * @param codec the compression codec
+ * @param bufferSize the compression buffer size
+ * @return an input stream
+ * @throws IOException
+ */
+ @VisibleForTesting
+ @Deprecated
+ public static InStream create(String streamName,
+ ByteBuffer[] buffers,
+ long[] offsets,
+ long length,
+ CompressionCodec codec,
+ int bufferSize) throws IOException {
+ List<DiskRange> input = new ArrayList<DiskRange>(buffers.length);
+ for (int i = 0; i < buffers.length; ++i) {
+ input.add(new BufferChunk(buffers[i], offsets[i]));
+ }
+ return create(streamName, input, length, codec, bufferSize);
+ }
+
+ /**
+ * Create an input stream from a list of disk ranges with data.
+ * @param name the name of the stream
+ * @param input the list of ranges of bytes for the stream; from disk or cache
+ * @param length the length in bytes of the stream
+ * @param codec the compression codec
+ * @param bufferSize the compression buffer size
+ * @return an input stream
+ * @throws IOException
+ */
+ public static InStream create(String name,
+ List<DiskRange> input,
+ long length,
+ CompressionCodec codec,
+ int bufferSize) throws IOException {
+ if (codec == null) {
+ return new UncompressedStream(name, input, length);
+ } else {
+ return new CompressedStream(name, input, length, codec, bufferSize);
+ }
+ }
+
+ /**
+ * Creates coded input stream (used for protobuf message parsing) with higher message size limit.
+ *
+ * @param name the name of the stream
+ * @param input the list of ranges of bytes for the stream; from disk or cache
+ * @param length the length in bytes of the stream
+ * @param codec the compression codec
+ * @param bufferSize the compression buffer size
+ * @return coded input stream
+ * @throws IOException
+ */
+ public static CodedInputStream createCodedInputStream(
+ String name,
+ List<DiskRange> input,
+ long length,
+ CompressionCodec codec,
+ int bufferSize) throws IOException {
+ InStream inStream = create(name, input, length, codec, bufferSize);
+ CodedInputStream codedInputStream = CodedInputStream.newInstance(inStream);
+ codedInputStream.setSizeLimit(PROTOBUF_MESSAGE_MAX_LIMIT);
+ return codedInputStream;
+ }
+}
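As a quick illustration of the chunk header that CompressedStream.readHeader() decodes above, this self-contained sketch (byte values chosen for illustration) shows how the three header bytes combine into the original/compressed flag and the chunk length:

    public class ChunkHeaderSketch {
      public static void main(String[] args) {
        int b0 = 0x40, b1 = 0x0D, b2 = 0x03;                  // header bytes as read from the stream
        boolean isOriginal = (b0 & 0x01) == 1;                // low bit set => chunk stored uncompressed
        int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1); // 98304 + 1664 + 32 = 100000
        System.out.println(isOriginal + " " + chunkLength);   // prints "false 100000"
      }
    }

The same 3-byte layout is produced on the write side by OutStream.writeHeader() later in this patch.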
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/IntegerReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/IntegerReader.java b/orc/src/java/org/apache/hive/orc/impl/IntegerReader.java
new file mode 100644
index 0000000..45cab28
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/IntegerReader.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+
+/**
+ * Interface for reading integers.
+ */
+public interface IntegerReader {
+
+ /**
+ * Seek to the position provided by index.
+ * @param index
+ * @throws IOException
+ */
+ void seek(PositionProvider index) throws IOException;
+
+ /**
+ * Skip number of specified rows.
+ * @param numValues
+ * @throws IOException
+ */
+ void skip(long numValues) throws IOException;
+
+ /**
+ * Check if there are any more values left.
+ * @return true if there are more values to read
+ * @throws IOException
+ */
+ boolean hasNext() throws IOException;
+
+ /**
+ * Return the next available value.
+ * @return the next value
+ * @throws IOException
+ */
+ long next() throws IOException;
+
+ /**
+ * Return the next available vector for values.
+ * @param column the column being read
+ * @param data the vector to read into
+ * @param length the number of values to read
+ * @throws IOException
+ */
+ void nextVector(ColumnVector column,
+ long[] data,
+ int length
+ ) throws IOException;
+
+ /**
+ * Return the next available vector for values. Does not change the
+ * value of column.isRepeating.
+ * @param column the column being read
+ * @param data the vector to read into
+ * @param length the number of values to read
+ * @throws IOException
+ */
+ void nextVector(ColumnVector column,
+ int[] data,
+ int length
+ ) throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/IntegerWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/IntegerWriter.java b/orc/src/java/org/apache/hive/orc/impl/IntegerWriter.java
new file mode 100644
index 0000000..119eed0
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/IntegerWriter.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+
+/**
+ * Interface for writing integers.
+ */
+public interface IntegerWriter {
+
+ /**
+ * Get position from the stream.
+ * @param recorder
+ * @throws IOException
+ */
+ void getPosition(PositionRecorder recorder) throws IOException;
+
+ /**
+ * Write the integer value
+ * @param value
+ * @throws IOException
+ */
+ void write(long value) throws IOException;
+
+ /**
+ * Flush the buffer
+ * @throws IOException
+ */
+ void flush() throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/MemoryManager.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/MemoryManager.java b/orc/src/java/org/apache/hive/orc/impl/MemoryManager.java
new file mode 100644
index 0000000..1dd74ef
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/MemoryManager.java
@@ -0,0 +1,212 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hive.orc.OrcConf;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+import java.lang.management.ManagementFactory;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.locks.ReentrantLock;
+
+/**
+ * Implements a memory manager that keeps a global context of how many ORC
+ * writers there are and manages the memory between them. For use cases with
+ * dynamic partitions, it is easy to end up with many writers in the same task.
+ * By managing the size of each allocation, we try to cut down the size of each
+ * allocation and keep the task from running out of memory.
+ *
+ * This class is not thread-safe, but it is re-entrant; ensure that creation and all
+ * invocations are triggered from the same thread.
+ */
+public class MemoryManager {
+
+ private static final Logger LOG = LoggerFactory.getLogger(MemoryManager.class);
+
+ /**
+ * How often should we check the memory sizes? Measured in rows added
+ * to all of the writers.
+ */
+ private static final int ROWS_BETWEEN_CHECKS = 5000;
+ private final long totalMemoryPool;
+ private final Map<Path, WriterInfo> writerList =
+ new HashMap<Path, WriterInfo>();
+ private long totalAllocation = 0;
+ private double currentScale = 1;
+ private int rowsAddedSinceCheck = 0;
+ private final OwnedLock ownerLock = new OwnedLock();
+
+ @SuppressWarnings("serial")
+ private static class OwnedLock extends ReentrantLock {
+ public Thread getOwner() {
+ return super.getOwner();
+ }
+ }
+
+ private static class WriterInfo {
+ long allocation;
+ Callback callback;
+ WriterInfo(long allocation, Callback callback) {
+ this.allocation = allocation;
+ this.callback = callback;
+ }
+ }
+
+ public interface Callback {
+ /**
+ * The writer needs to check its memory usage
+ * @param newScale the current scale factor for memory allocations
+ * @return true if the writer was over the limit
+ * @throws IOException
+ */
+ boolean checkMemory(double newScale) throws IOException;
+ }
+
+ /**
+ * Create the memory manager.
+ * @param conf use the configuration to find the maximum size of the memory
+ * pool.
+ */
+ public MemoryManager(Configuration conf) {
+ double maxLoad = OrcConf.MEMORY_POOL.getDouble(conf);
+ totalMemoryPool = Math.round(ManagementFactory.getMemoryMXBean().
+ getHeapMemoryUsage().getMax() * maxLoad);
+ ownerLock.lock();
+ }
+
+ /**
+ * Lightweight thread-safety check for multi-threaded access patterns.
+ */
+ private void checkOwner() {
+ if (!ownerLock.isHeldByCurrentThread()) {
+ LOG.warn("Owner thread expected {}, got {}",
+ ownerLock.getOwner(), Thread.currentThread());
+ }
+ }
+
+ /**
+ * Add a new writer's memory allocation to the pool. We use the path
+ * as a unique key to ensure that we don't get duplicates.
+ * @param path the file that is being written
+ * @param requestedAllocation the requested buffer size
+ */
+ public void addWriter(Path path, long requestedAllocation,
+ Callback callback) throws IOException {
+ checkOwner();
+ WriterInfo oldVal = writerList.get(path);
+ // this should always be null, but we handle the case where the memory
+ // manager wasn't told that a writer is no longer in use and the task
+ // starts writing to the same path again.
+ if (oldVal == null) {
+ oldVal = new WriterInfo(requestedAllocation, callback);
+ writerList.put(path, oldVal);
+ totalAllocation += requestedAllocation;
+ } else {
+ // handle a new writer that is writing to the same path
+ totalAllocation += requestedAllocation - oldVal.allocation;
+ oldVal.allocation = requestedAllocation;
+ oldVal.callback = callback;
+ }
+ updateScale(true);
+ }
+
+ /**
+ * Remove the given writer from the pool.
+ * @param path the file that has been closed
+ */
+ public void removeWriter(Path path) throws IOException {
+ checkOwner();
+ WriterInfo val = writerList.get(path);
+ if (val != null) {
+ writerList.remove(path);
+ totalAllocation -= val.allocation;
+ if (writerList.isEmpty()) {
+ rowsAddedSinceCheck = 0;
+ }
+ updateScale(false);
+ }
+ if(writerList.isEmpty()) {
+ rowsAddedSinceCheck = 0;
+ }
+ }
+
+ /**
+ * Get the total pool size that is available for ORC writers.
+ * @return the number of bytes in the pool
+ */
+ public long getTotalMemoryPool() {
+ return totalMemoryPool;
+ }
+
+ /**
+ * The scaling factor for each allocation to ensure that the pool isn't
+ * oversubscribed.
+ * @return a fraction between 0.0 and 1.0 of the requested size that is
+ * available for each writer.
+ */
+ public double getAllocationScale() {
+ return currentScale;
+ }
+
+ /**
+ * Give the memory manager an opportunity for doing a memory check.
+ * @param rows number of rows added
+ * @throws IOException
+ */
+ public void addedRow(int rows) throws IOException {
+ rowsAddedSinceCheck += rows;
+ if (rowsAddedSinceCheck >= ROWS_BETWEEN_CHECKS) {
+ notifyWriters();
+ }
+ }
+
+ /**
+ * Notify all of the writers that they should check their memory usage.
+ * @throws IOException
+ */
+ public void notifyWriters() throws IOException {
+ checkOwner();
+ LOG.debug("Notifying writers after " + rowsAddedSinceCheck);
+ for(WriterInfo writer: writerList.values()) {
+ boolean flushed = writer.callback.checkMemory(currentScale);
+ if (LOG.isDebugEnabled() && flushed) {
+ LOG.debug("flushed " + writer.toString());
+ }
+ }
+ rowsAddedSinceCheck = 0;
+ }
+
+ /**
+ * Update the currentScale based on the current allocation and pool size.
+ * The new scale is passed to the writers on the next notifyWriters() call.
+ * @param isAllocate is this an allocation?
+ */
+ private void updateScale(boolean isAllocate) throws IOException {
+ if (totalAllocation <= totalMemoryPool) {
+ currentScale = 1;
+ } else {
+ currentScale = (double) totalMemoryPool / totalAllocation;
+ }
+ }
+}
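A back-of-envelope sketch of the scaling rule in updateScale() above, with hypothetical numbers: if the pool works out to 1 GiB and three writers each register a 512 MiB allocation, the pool is oversubscribed and every writer is asked (via Callback.checkMemory) to shrink by the same factor:

    public class MemoryScaleSketch {
      public static void main(String[] args) {
        long totalMemoryPool = 1L << 30;          // 1 GiB pool (illustrative value)
        long totalAllocation = 3 * (512L << 20);  // three writers, 512 MiB requested each
        double currentScale = totalAllocation <= totalMemoryPool
            ? 1.0 : (double) totalMemoryPool / totalAllocation;
        long effectiveMiB = ((long) (currentScale * (512L << 20))) >> 20;
        System.out.printf("scale=%.3f, effective buffer=%d MiB%n", currentScale, effectiveMiB);
        // prints roughly: scale=0.667, effective buffer=341 MiB
      }
    }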
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/OrcAcidUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/OrcAcidUtils.java b/orc/src/java/org/apache/hive/orc/impl/OrcAcidUtils.java
new file mode 100644
index 0000000..4244f24
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/OrcAcidUtils.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hive.orc.Reader;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+
+public class OrcAcidUtils {
+ public static final String ACID_STATS = "hive.acid.stats";
+ public static final String DELTA_SIDE_FILE_SUFFIX = "_flush_length";
+
+ /**
+ * Get the filename of the ORC ACID side file that contains the lengths
+ * of the intermediate footers.
+ * @param main the main ORC filename
+ * @return the name of the side file
+ */
+ public static Path getSideFile(Path main) {
+ return new Path(main + DELTA_SIDE_FILE_SUFFIX);
+ }
+
+ /**
+ * Read the side file to get the last flush length.
+ * @param fs the file system to use
+ * @param deltaFile the path of the delta file
+ * @return the maximum size of the file to use
+ * @throws IOException
+ */
+ public static long getLastFlushLength(FileSystem fs,
+ Path deltaFile) throws IOException {
+ Path lengths = getSideFile(deltaFile);
+ long result = Long.MAX_VALUE;
+ if(!fs.exists(lengths)) {
+ return result;
+ }
+ try (FSDataInputStream stream = fs.open(lengths)) {
+ result = -1;
+ while (stream.available() > 0) {
+ result = stream.readLong();
+ }
+ return result;
+ } catch (IOException ioe) {
+ return result;
+ }
+ }
+
+ private static final Charset utf8 = Charset.forName("UTF-8");
+ private static final CharsetDecoder utf8Decoder = utf8.newDecoder();
+
+ public static AcidStats parseAcidStats(Reader reader) {
+ if (reader.hasMetadataValue(ACID_STATS)) {
+ try {
+ ByteBuffer val = reader.getMetadataValue(ACID_STATS).duplicate();
+ return new AcidStats(utf8Decoder.decode(val).toString());
+ } catch (CharacterCodingException e) {
+ throw new IllegalArgumentException("Bad string encoding for " +
+ ACID_STATS, e);
+ }
+ } else {
+ return null;
+ }
+ }
+
+}
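A hedged usage sketch of the side-file lookup above (the delta path below is a placeholder): getLastFlushLength() returns the last long recorded in the _flush_length side file, or Long.MAX_VALUE when no side file exists, so callers can treat the result as the number of readable bytes in a still-open delta:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hive.orc.impl.OrcAcidUtils;

    public class FlushLengthSketch {
      public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // hypothetical delta file; its side file would be <path>_flush_length
        Path delta = new Path("/warehouse/t/delta_0000001_0000001/bucket_00000");
        long readable = OrcAcidUtils.getLastFlushLength(fs, delta);
        System.out.println("readable bytes: " + readable); // Long.MAX_VALUE when no side file exists
      }
    }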
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/OrcIndex.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/OrcIndex.java b/orc/src/java/org/apache/hive/orc/impl/OrcIndex.java
new file mode 100644
index 0000000..cf2b5fe
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/OrcIndex.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import org.apache.hive.orc.OrcProto;
+
+public final class OrcIndex {
+ OrcProto.RowIndex[] rowGroupIndex;
+ OrcProto.BloomFilterIndex[] bloomFilterIndex;
+
+ public OrcIndex(OrcProto.RowIndex[] rgIndex, OrcProto.BloomFilterIndex[] bfIndex) {
+ this.rowGroupIndex = rgIndex;
+ this.bloomFilterIndex = bfIndex;
+ }
+
+ public OrcProto.RowIndex[] getRowGroupIndex() {
+ return rowGroupIndex;
+ }
+
+ public OrcProto.BloomFilterIndex[] getBloomFilterIndex() {
+ return bloomFilterIndex;
+ }
+
+ public void setRowGroupIndex(OrcProto.RowIndex[] rowGroupIndex) {
+ this.rowGroupIndex = rowGroupIndex;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/OrcTail.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/OrcTail.java b/orc/src/java/org/apache/hive/orc/impl/OrcTail.java
new file mode 100644
index 0000000..71e3c77
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/OrcTail.java
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.orc.impl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hive.orc.CompressionCodec;
+import org.apache.hive.orc.OrcFile;
+import org.apache.hive.orc.CompressionKind;
+import org.apache.hive.orc.OrcProto;
+import org.apache.hive.orc.StripeInformation;
+import org.apache.hive.orc.StripeStatistics;
+
+// TODO: Make OrcTail implement FileMetadata or Reader interface
+public final class OrcTail {
+ // postscript + footer - Serialized in OrcSplit
+ private final OrcProto.FileTail fileTail;
+ // serialized representation of metadata, footer and postscript
+ private final ByteBuffer serializedTail;
+ // used to invalidate cache entries
+ private final long fileModificationTime;
+ // lazily deserialized
+ private OrcProto.Metadata metadata;
+
+ public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail) {
+ this(fileTail, serializedTail, -1);
+ }
+
+ public OrcTail(OrcProto.FileTail fileTail, ByteBuffer serializedTail, long fileModificationTime) {
+ this.fileTail = fileTail;
+ this.serializedTail = serializedTail;
+ this.fileModificationTime = fileModificationTime;
+ this.metadata = null;
+ }
+
+ public ByteBuffer getSerializedTail() {
+ return serializedTail;
+ }
+
+ public long getFileModificationTime() {
+ return fileModificationTime;
+ }
+
+ public OrcProto.Footer getFooter() {
+ return fileTail.getFooter();
+ }
+
+ public OrcProto.PostScript getPostScript() {
+ return fileTail.getPostscript();
+ }
+
+ public OrcFile.WriterVersion getWriterVersion() {
+ OrcProto.PostScript ps = fileTail.getPostscript();
+ return (ps.hasWriterVersion()
+ ? OrcFile.WriterVersion.from(ps.getWriterVersion()) : OrcFile.WriterVersion.ORIGINAL);
+ }
+
+ public List<StripeInformation> getStripes() {
+ List<StripeInformation> result = new ArrayList<>(fileTail.getFooter().getStripesCount());
+ for (OrcProto.StripeInformation stripeProto : fileTail.getFooter().getStripesList()) {
+ result.add(new ReaderImpl.StripeInformationImpl(stripeProto));
+ }
+ return result;
+ }
+
+ public CompressionKind getCompressionKind() {
+ return CompressionKind.valueOf(fileTail.getPostscript().getCompression().name());
+ }
+
+ public CompressionCodec getCompressionCodec() {
+ return PhysicalFsWriter.createCodec(getCompressionKind());
+ }
+
+ public int getCompressionBufferSize() {
+ return (int) fileTail.getPostscript().getCompressionBlockSize();
+ }
+
+ public List<StripeStatistics> getStripeStatistics() throws IOException {
+ List<StripeStatistics> result = new ArrayList<>();
+ List<OrcProto.StripeStatistics> ssProto = getStripeStatisticsProto();
+ if (ssProto != null) {
+ for (OrcProto.StripeStatistics ss : ssProto) {
+ result.add(new StripeStatistics(ss.getColStatsList()));
+ }
+ }
+ return result;
+ }
+
+ public List<OrcProto.StripeStatistics> getStripeStatisticsProto() throws IOException {
+ if (serializedTail == null) return null;
+ if (metadata == null) {
+ metadata = ReaderImpl.extractMetadata(serializedTail, 0,
+ (int) fileTail.getPostscript().getMetadataLength(),
+ getCompressionCodec(), getCompressionBufferSize());
+ // clear does not clear the contents but sets position to 0 and limit = capacity
+ serializedTail.clear();
+ }
+ return metadata.getStripeStatsList();
+ }
+
+ public int getMetadataSize() {
+ return (int) getPostScript().getMetadataLength();
+ }
+
+ public List<OrcProto.Type> getTypes() {
+ return getFooter().getTypesList();
+ }
+
+ public OrcProto.FileTail getFileTail() {
+ return fileTail;
+ }
+
+ public OrcProto.FileTail getMinimalFileTail() {
+ OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder(fileTail);
+ OrcProto.Footer.Builder footerBuilder = OrcProto.Footer.newBuilder(fileTail.getFooter());
+ footerBuilder.clearStatistics();
+ fileTailBuilder.setFooter(footerBuilder.build());
+ OrcProto.FileTail result = fileTailBuilder.build();
+ return result;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/hive/orc/impl/OutStream.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/hive/orc/impl/OutStream.java b/orc/src/java/org/apache/hive/orc/impl/OutStream.java
new file mode 100644
index 0000000..7157ac5
--- /dev/null
+++ b/orc/src/java/org/apache/hive/orc/impl/OutStream.java
@@ -0,0 +1,289 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hive.orc.impl;
+
+import org.apache.hive.orc.CompressionCodec;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+public class OutStream extends PositionedOutputStream {
+
+ public interface OutputReceiver {
+ /**
+ * Output the given buffer to the final destination
+ * @param buffer the buffer to output
+ * @throws IOException
+ */
+ void output(ByteBuffer buffer) throws IOException;
+ }
+
+ public static final int HEADER_SIZE = 3;
+ private final String name;
+ private final OutputReceiver receiver;
+ // if enabled the stream will be suppressed when writing stripe
+ private boolean suppress;
+
+ /**
+ * Stores the uncompressed bytes that have been serialized, but not
+ * compressed yet. When this fills, we compress the entire buffer.
+ */
+ private ByteBuffer current = null;
+
+ /**
+ * Stores the compressed bytes until we have a full buffer and then outputs
+ * them to the receiver. If no compression is being done, this (and overflow)
+ * will always be null and the current buffer will be sent directly to the
+ * receiver.
+ */
+ private ByteBuffer compressed = null;
+
+ /**
+ * Since the compressed buffer may start with contents from previous
+ * compression blocks, we allocate an overflow buffer so that the
+ * output of the codec can be split between the two buffers. After the
+ * compressed buffer is sent to the receiver, the overflow buffer becomes
+ * the new compressed buffer.
+ */
+ private ByteBuffer overflow = null;
+ private final int bufferSize;
+ private final CompressionCodec codec;
+ private long compressedBytes = 0;
+ private long uncompressedBytes = 0;
+
+ public OutStream(String name,
+ int bufferSize,
+ CompressionCodec codec,
+ OutputReceiver receiver) throws IOException {
+ this.name = name;
+ this.bufferSize = bufferSize;
+ this.codec = codec;
+ this.receiver = receiver;
+ this.suppress = false;
+ }
+
+ public void clear() throws IOException {
+ flush();
+ suppress = false;
+ }
+
+ /**
+ * Write the length of the compressed bytes. Life is much easier if the
+ * header is constant length, so just use 3 bytes. Considering most of the
+ * codecs want between 32k (snappy) and 256k (lzo, zlib), 3 bytes should
+ * be plenty. We also use the low bit for whether it is the original or
+ * compressed bytes.
+ * @param buffer the buffer to write the header to
+ * @param position the position in the buffer to write at
+ * @param val the size in the file
+ * @param original is it uncompressed
+ */
+ private static void writeHeader(ByteBuffer buffer,
+ int position,
+ int val,
+ boolean original) {
+ buffer.put(position, (byte) ((val << 1) + (original ? 1 : 0)));
+ buffer.put(position + 1, (byte) (val >> 7));
+ buffer.put(position + 2, (byte) (val >> 15));
+ }
+
+ private void getNewInputBuffer() throws IOException {
+ if (codec == null) {
+ current = ByteBuffer.allocate(bufferSize);
+ } else {
+ current = ByteBuffer.allocate(bufferSize + HEADER_SIZE);
+ writeHeader(current, 0, bufferSize, true);
+ current.position(HEADER_SIZE);
+ }
+ }
+
+ /**
+ * Allocate a new output buffer if we are compressing.
+ */
+ private ByteBuffer getNewOutputBuffer() throws IOException {
+ return ByteBuffer.allocate(bufferSize + HEADER_SIZE);
+ }
+
+ private void flip() throws IOException {
+ current.limit(current.position());
+ current.position(codec == null ? 0 : HEADER_SIZE);
+ }
+
+ @Override
+ public void write(int i) throws IOException {
+ if (current == null) {
+ getNewInputBuffer();
+ }
+ if (current.remaining() < 1) {
+ spill();
+ }
+ uncompressedBytes += 1;
+ current.put((byte) i);
+ }
+
+ @Override
+ public void write(byte[] bytes, int offset, int length) throws IOException {
+ if (current == null) {
+ getNewInputBuffer();
+ }
+ int remaining = Math.min(current.remaining(), length);
+ current.put(bytes, offset, remaining);
+ uncompressedBytes += remaining;
+ length -= remaining;
+ while (length != 0) {
+ spill();
+ offset += remaining;
+ remaining = Math.min(current.remaining(), length);
+ current.put(bytes, offset, remaining);
+ uncompressedBytes += remaining;
+ length -= remaining;
+ }
+ }
+
+ private void spill() throws java.io.IOException {
+ // if there isn't anything in the current buffer, don't spill
+ if (current == null ||
+ current.position() == (codec == null ? 0 : HEADER_SIZE)) {
+ return;
+ }
+ flip();
+ if (codec == null) {
+ receiver.output(current);
+ getNewInputBuffer();
+ } else {
+ if (compressed == null) {
+ compressed = getNewOutputBuffer();
+ } else if (overflow == null) {
+ overflow = getNewOutputBuffer();
+ }
+ int sizePosn = compressed.position();
+ compressed.position(compressed.position() + HEADER_SIZE);
+ if (codec.compress(current, compressed, overflow)) {
+ uncompressedBytes = 0;
+ // move position back to after the header
+ current.position(HEADER_SIZE);
+ current.limit(current.capacity());
+ // find the total bytes in the chunk
+ int totalBytes = compressed.position() - sizePosn - HEADER_SIZE;
+ if (overflow != null) {
+ totalBytes += overflow.position();
+ }
+ compressedBytes += totalBytes + HEADER_SIZE;
+ writeHeader(compressed, sizePosn, totalBytes, false);
+ // if we have less than the next header left, spill it.
+ if (compressed.remaining() < HEADER_SIZE) {
+ compressed.flip();
+ receiver.output(compressed);
+ compressed = overflow;
+ overflow = null;
+ }
+ } else {
+ compressedBytes += uncompressedBytes + HEADER_SIZE;
+ uncompressedBytes = 0;
+ // we are using the original, but need to spill the current
+ // compressed buffer first. So back up to where we started,
+ // flip it and add it to done.
+ if (sizePosn != 0) {
+ compressed.position(sizePosn);
+ compressed.flip();
+ receiver.output(compressed);
+ compressed = null;
+ // if we have an overflow, clear it and make it the new compress
+ // buffer
+ if (overflow != null) {
+ overflow.clear();
+ compressed = overflow;
+ overflow = null;
+ }
+ } else {
+ compressed.clear();
+ if (overflow != null) {
+ overflow.clear();
+ }
+ }
+
+ // now add the current buffer into the done list and get a new one.
+ current.position(0);
+ // update the header with the current length
+ writeHeader(current, 0, current.limit() - HEADER_SIZE, true);
+ receiver.output(current);
+ getNewInputBuffer();
+ }
+ }
+ }
+
+ @Override
+ public void getPosition(PositionRecorder recorder) throws IOException {
+ if (codec == null) {
+ recorder.addPosition(uncompressedBytes);
+ } else {
+ recorder.addPosition(compressedBytes);
+ recorder.addPosition(uncompressedBytes);
+ }
+ }
+
+ @Override
+ public void flush() throws IOException {
+ spill();
+ if (compressed != null && compressed.position() != 0) {
+ compressed.flip();
+ receiver.output(compressed);
+ }
+ compressed = null;
+ uncompressedBytes = 0;
+ compressedBytes = 0;
+ overflow = null;
+ current = null;
+ }
+
+ @Override
+ public String toString() {
+ return name;
+ }
+
+ @Override
+ public long getBufferSize() {
+ long result = 0;
+ if (current != null) {
+ result += current.capacity();
+ }
+ if (compressed != null) {
+ result += compressed.capacity();
+ }
+ if (overflow != null) {
+ result += overflow.capacity();
+ }
+ return result;
+ }
+
+ /**
+ * Set suppress flag
+ */
+ public void suppress() {
+ suppress = true;
+ }
+
+ /**
+ * Returns the state of suppress flag
+ * @return value of suppress flag
+ */
+ public boolean isSuppressed() {
+ return suppress;
+ }
+}
+
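A minimal, self-contained usage sketch of the uncompressed path (codec == null), not part of this patch: each spill() hands the current buffer straight to the OutputReceiver, so collecting the buffers reproduces exactly the bytes that were written:

    import java.nio.ByteBuffer;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hive.orc.impl.OutStream;

    public class OutStreamSketch {
      public static void main(String[] args) throws Exception {
        final List<ByteBuffer> chunks = new ArrayList<ByteBuffer>();
        OutStream out = new OutStream("sketch", 1024, null, new OutStream.OutputReceiver() {
          @Override
          public void output(ByteBuffer buffer) {
            chunks.add(buffer);               // collect whatever the stream spills
          }
        });
        out.write(new byte[]{1, 2, 3}, 0, 3);
        out.flush();                          // forces the final spill to the receiver
        System.out.println(chunks.size() + " chunk(s), " + chunks.get(0).remaining() + " bytes");
      }
    }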
[21/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
deleted file mode 100644
index 2d293b5..0000000
--- a/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
+++ /dev/null
@@ -1,2893 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.sql.Date;
-import java.sql.Timestamp;
-import java.util.EnumMap;
-import java.util.Map;
-
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
-import org.apache.hadoop.hive.ql.util.TimestampUtils;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.orc.OrcProto;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.TypeDescription.Category;
-
-/**
- * Convert ORC tree readers.
- */
-public class ConvertTreeReaderFactory extends TreeReaderFactory {
-
- /**
- * Override methods like checkEncoding to pass-thru to the convert TreeReader.
- */
- public static class ConvertTreeReader extends TreeReader {
-
- private TreeReader convertTreeReader;
-
- ConvertTreeReader(int columnId) throws IOException {
- super(columnId);
- }
-
- // The ordering of types here is used to determine which numeric types
- // are common/convertible to one another. Probably better to rely on the
- // ordering explicitly defined here than to assume that the enum values
- // that were arbitrarily assigned in PrimitiveCategory work for our purposes.
- private static EnumMap<TypeDescription.Category, Integer> numericTypes =
- new EnumMap<>(TypeDescription.Category.class);
-
- static {
- registerNumericType(TypeDescription.Category.BOOLEAN, 1);
- registerNumericType(TypeDescription.Category.BYTE, 2);
- registerNumericType(TypeDescription.Category.SHORT, 3);
- registerNumericType(TypeDescription.Category.INT, 4);
- registerNumericType(TypeDescription.Category.LONG, 5);
- registerNumericType(TypeDescription.Category.FLOAT, 6);
- registerNumericType(TypeDescription.Category.DOUBLE, 7);
- registerNumericType(TypeDescription.Category.DECIMAL, 8);
- }
-
- private static void registerNumericType(TypeDescription.Category kind, int level) {
- numericTypes.put(kind, level);
- }
-
- protected void setConvertTreeReader(TreeReader convertTreeReader) {
- this.convertTreeReader = convertTreeReader;
- }
-
- protected TreeReader getStringGroupTreeReader(int columnId,
- TypeDescription fileType) throws IOException {
- switch (fileType.getCategory()) {
- case STRING:
- return new StringTreeReader(columnId);
- case CHAR:
- return new CharTreeReader(columnId, fileType.getMaxLength());
- case VARCHAR:
- return new VarcharTreeReader(columnId, fileType.getMaxLength());
- default:
- throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
- }
- }
-
- protected void assignStringGroupVectorEntry(BytesColumnVector bytesColVector,
- int elementNum, TypeDescription readerType, byte[] bytes) {
- assignStringGroupVectorEntry(bytesColVector,
- elementNum, readerType, bytes, 0, bytes.length);
- }
-
- /*
- * Assign a BytesColumnVector entry when we have a byte array, start, and
- * length for the string group which can be (STRING, CHAR, VARCHAR).
- */
- protected void assignStringGroupVectorEntry(BytesColumnVector bytesColVector,
- int elementNum, TypeDescription readerType, byte[] bytes, int start, int length) {
- switch (readerType.getCategory()) {
- case STRING:
- bytesColVector.setVal(elementNum, bytes, start, length);
- break;
- case CHAR:
- {
- int adjustedDownLen =
- StringExpr.rightTrimAndTruncate(bytes, start, length, readerType.getMaxLength());
- bytesColVector.setVal(elementNum, bytes, start, adjustedDownLen);
- }
- break;
- case VARCHAR:
- {
- int adjustedDownLen =
- StringExpr.truncate(bytes, start, length, readerType.getMaxLength());
- bytesColVector.setVal(elementNum, bytes, start, adjustedDownLen);
- }
- break;
- default:
- throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
- }
- }
-
- protected void convertStringGroupVectorElement(BytesColumnVector bytesColVector,
- int elementNum, TypeDescription readerType) {
- switch (readerType.getCategory()) {
- case STRING:
- // No conversion needed.
- break;
- case CHAR:
- {
- int length = bytesColVector.length[elementNum];
- int adjustedDownLen = StringExpr
- .rightTrimAndTruncate(bytesColVector.vector[elementNum],
- bytesColVector.start[elementNum], length,
- readerType.getMaxLength());
- if (adjustedDownLen < length) {
- bytesColVector.length[elementNum] = adjustedDownLen;
- }
- }
- break;
- case VARCHAR:
- {
- int length = bytesColVector.length[elementNum];
- int adjustedDownLen = StringExpr
- .truncate(bytesColVector.vector[elementNum],
- bytesColVector.start[elementNum], length,
- readerType.getMaxLength());
- if (adjustedDownLen < length) {
- bytesColVector.length[elementNum] = adjustedDownLen;
- }
- }
- break;
- default:
- throw new RuntimeException("Unexpected type kind " + readerType.getCategory().name());
- }
- }
-
- private boolean isParseError;
-
- /*
- * We do this because we want the various parse methods return a primitive.
- *
- * @return true if there was a parse error in the last call to
- * parseLongFromString, etc.
- */
- protected boolean getIsParseError() {
- return isParseError;
- }
-
- protected long parseLongFromString(String string) {
- try {
- long longValue = Long.parseLong(string);
- isParseError = false;
- return longValue;
- } catch (NumberFormatException e) {
- isParseError = true;
- return 0;
- }
- }
-
- protected float parseFloatFromString(String string) {
- try {
- float floatValue = Float.parseFloat(string);
- isParseError = false;
- return floatValue;
- } catch (NumberFormatException e) {
- isParseError = true;
- return Float.NaN;
- }
- }
-
- protected double parseDoubleFromString(String string) {
- try {
- double value = Double.parseDouble(string);
- isParseError = false;
- return value;
- } catch (NumberFormatException e) {
- isParseError = true;
- return Double.NaN;
- }
- }
-
- /**
- * @param string
- * @return the HiveDecimal parsed, or null if there was a parse error.
- */
- protected HiveDecimal parseDecimalFromString(String string) {
- try {
- HiveDecimal value = HiveDecimal.create(string);
- return value;
- } catch (NumberFormatException e) {
- return null;
- }
- }
-
- /**
- * @param string
- * @return the Timestamp parsed, or null if there was a parse error.
- */
- protected Timestamp parseTimestampFromString(String string) {
- try {
- Timestamp value = Timestamp.valueOf(string);
- return value;
- } catch (IllegalArgumentException e) {
- return null;
- }
- }
-
- /**
- * @param string
- * @return the Date parsed, or null if there was a parse error.
- */
- protected Date parseDateFromString(String string) {
- try {
- Date value = Date.valueOf(string);
- return value;
- } catch (IllegalArgumentException e) {
- return null;
- }
- }
-
- protected String stringFromBytesColumnVectorEntry(
- BytesColumnVector bytesColVector, int elementNum) {
- String string;
-
- string = new String(
- bytesColVector.vector[elementNum],
- bytesColVector.start[elementNum], bytesColVector.length[elementNum],
- StandardCharsets.UTF_8);
-
- return string;
- }
-
- private static final double MIN_LONG_AS_DOUBLE = -0x1p63;
- /*
- * We cannot store Long.MAX_VALUE as a double without losing precision. Instead, we store
- * Long.MAX_VALUE + 1 == -Long.MIN_VALUE, and then offset all comparisons by 1.
- */
- private static final double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
-
- public boolean doubleCanFitInLong(double doubleValue) {
-
- // Borrowed from Guava DoubleMath.roundToLong except do not want dependency on Guava and we
- // don't want to catch an exception.
-
- return ((MIN_LONG_AS_DOUBLE - doubleValue < 1.0) &&
- (doubleValue < MAX_LONG_AS_DOUBLE_PLUS_ONE));
- }
-
- @Override
- void checkEncoding(OrcProto.ColumnEncoding encoding) throws IOException {
- // Pass-thru.
- convertTreeReader.checkEncoding(encoding);
- }
-
- @Override
- void startStripe(Map<StreamName, InStream> streams,
- OrcProto.StripeFooter stripeFooter
- ) throws IOException {
- // Pass-thru.
- convertTreeReader.startStripe(streams, stripeFooter);
- }
-
- @Override
- public void seek(PositionProvider[] index) throws IOException {
- // Pass-thru.
- convertTreeReader.seek(index);
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- // Pass-thru.
- convertTreeReader.seek(index);
- }
-
- @Override
- void skipRows(long items) throws IOException {
- // Pass-thru.
- convertTreeReader.skipRows(items);
- }
-
- /**
- * Override this to use convertVector.
- * Source and result are member variables in the subclass with the right
- * type.
- * @param elementNum
- * @throws IOException
- */
- // Override this to use convertVector.
- public void setConvertVectorElement(int elementNum) throws IOException {
- throw new RuntimeException("Expected this method to be overriden");
- }
-
- // Common code used by the conversion.
- public void convertVector(ColumnVector fromColVector,
- ColumnVector resultColVector, final int batchSize) throws IOException {
-
- resultColVector.reset();
- if (fromColVector.isRepeating) {
- resultColVector.isRepeating = true;
- if (fromColVector.noNulls || !fromColVector.isNull[0]) {
- setConvertVectorElement(0);
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[0] = true;
- }
- } else if (fromColVector.noNulls){
- for (int i = 0; i < batchSize; i++) {
- setConvertVectorElement(i);
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!fromColVector.isNull[i]) {
- setConvertVectorElement(i);
- } else {
- resultColVector.noNulls = false;
- resultColVector.isNull[i] = true;
- }
- }
- }
- }
-
- public void downCastAnyInteger(LongColumnVector longColVector, int elementNum,
- TypeDescription readerType) {
- downCastAnyInteger(longColVector, elementNum, longColVector.vector[elementNum], readerType);
- }
-
- public void downCastAnyInteger(LongColumnVector longColVector, int elementNum, long inputLong,
- TypeDescription readerType) {
- long[] vector = longColVector.vector;
- long outputLong;
- Category readerCategory = readerType.getCategory();
- switch (readerCategory) {
- case BOOLEAN:
- // No data loss for boolean.
- vector[elementNum] = inputLong == 0 ? 0 : 1;
- return;
- case BYTE:
- outputLong = (byte) inputLong;
- break;
- case SHORT:
- outputLong = (short) inputLong;
- break;
- case INT:
- outputLong = (int) inputLong;
- break;
- case LONG:
- // No data loss for long.
- vector[elementNum] = inputLong;
- return;
- default:
- throw new RuntimeException("Unexpected type kind " + readerCategory.name());
- }
-
- if (outputLong != inputLong) {
- // Data loss.
- longColVector.isNull[elementNum] = true;
- longColVector.noNulls = false;
- } else {
- vector[elementNum] = outputLong;
- }
- }
-
- protected boolean integerDownCastNeeded(TypeDescription fileType, TypeDescription readerType) {
- Integer fileLevel = numericTypes.get(fileType.getCategory());
- Integer schemaLevel = numericTypes.get(readerType.getCategory());
- return (schemaLevel.intValue() < fileLevel.intValue());
- }
- }
-
- public static class AnyIntegerTreeReader extends ConvertTreeReader {
-
- private TypeDescription.Category fileTypeCategory;
- private TreeReader anyIntegerTreeReader;
-
- private long longValue;
-
- AnyIntegerTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.fileTypeCategory = fileType.getCategory();
- switch (fileTypeCategory) {
- case BOOLEAN:
- anyIntegerTreeReader = new BooleanTreeReader(columnId);
- break;
- case BYTE:
- anyIntegerTreeReader = new ByteTreeReader(columnId);
- break;
- case SHORT:
- anyIntegerTreeReader = new ShortTreeReader(columnId);
- break;
- case INT:
- anyIntegerTreeReader = new IntTreeReader(columnId);
- break;
- case LONG:
- anyIntegerTreeReader = new LongTreeReader(columnId, skipCorrupt);
- break;
- default:
- throw new RuntimeException("Unexpected type kind " + fileType.getCategory().name());
- }
- setConvertTreeReader(anyIntegerTreeReader);
- }
-
- protected long getLong() throws IOException {
- return longValue;
- }
-
- protected String getString(long longValue) {
- if (fileTypeCategory == TypeDescription.Category.BOOLEAN) {
- return longValue == 0 ? "FALSE" : "TRUE";
- } else {
- return Long.toString(longValue);
- }
- }
-
- protected String getString() {
- return getString(longValue);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- anyIntegerTreeReader.nextVector(previousVector, isNull, batchSize);
- }
- }
-
- public static class AnyIntegerFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private final TypeDescription readerType;
- private final boolean downCastNeeded;
-
- AnyIntegerFromAnyIntegerTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType, boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- anyIntegerAsLongTreeReader = new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- downCastNeeded = integerDownCastNeeded(fileType, readerType);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- anyIntegerAsLongTreeReader.nextVector(previousVector, isNull, batchSize);
- LongColumnVector resultColVector = (LongColumnVector) previousVector;
- if (downCastNeeded) {
- if (resultColVector.isRepeating) {
- if (resultColVector.noNulls || !resultColVector.isNull[0]) {
- downCastAnyInteger(resultColVector, 0, readerType);
- } else {
- // Result remains null.
- }
- } else if (resultColVector.noNulls){
- for (int i = 0; i < batchSize; i++) {
- downCastAnyInteger(resultColVector, i, readerType);
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!resultColVector.isNull[i]) {
- downCastAnyInteger(resultColVector, i, readerType);
- } else {
- // Result remains null.
- }
- }
- }
- }
- }
- }
-
- public static class AnyIntegerFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- private final TypeDescription readerType;
- private DoubleColumnVector doubleColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromFloatTreeReader(int columnId, TypeDescription readerType)
- throws IOException {
- super(columnId);
- this.readerType = readerType;
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- double doubleValue = doubleColVector.vector[elementNum];
- if (!doubleCanFitInLong(doubleValue)) {
- longColVector.isNull[elementNum] = true;
- longColVector.noNulls = false;
- } else {
- // UNDONE: Does the overflow check above using double really work here for float?
- float floatValue = (float) doubleValue;
- downCastAnyInteger(longColVector, elementNum, (long) floatValue, readerType);
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, longColVector, batchSize);
- }
- }
-
- public static class AnyIntegerFromDoubleTreeReader extends ConvertTreeReader {
-
- private DoubleTreeReader doubleTreeReader;
-
- private final TypeDescription readerType;
- private DoubleColumnVector doubleColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromDoubleTreeReader(int columnId, TypeDescription readerType)
- throws IOException {
- super(columnId);
- this.readerType = readerType;
- doubleTreeReader = new DoubleTreeReader(columnId);
- setConvertTreeReader(doubleTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- double doubleValue = doubleColVector.vector[elementNum];
- if (!doubleCanFitInLong(doubleValue)) {
- longColVector.isNull[elementNum] = true;
- longColVector.noNulls = false;
- } else {
- downCastAnyInteger(longColVector, elementNum, (long) doubleValue, readerType);
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, longColVector, batchSize);
- }
- }
-
- public static class AnyIntegerFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private final int precision;
- private final int scale;
- private final TypeDescription readerType;
- private DecimalColumnVector decimalColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromDecimalTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.precision = fileType.getPrecision();
- this.scale = fileType.getScale();
- this.readerType = readerType;
- decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
- setConvertTreeReader(decimalTreeReader);
- }
-
- private static HiveDecimal DECIMAL_MAX_LONG = HiveDecimal.create(Long.MAX_VALUE);
- private static HiveDecimal DECIMAL_MIN_LONG = HiveDecimal.create(Long.MIN_VALUE);
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- HiveDecimal decimalValue = decimalColVector.vector[elementNum].getHiveDecimal();
- if (decimalValue.compareTo(DECIMAL_MAX_LONG) > 0 ||
- decimalValue.compareTo(DECIMAL_MIN_LONG) < 0) {
- longColVector.isNull[elementNum] = true;
- longColVector.noNulls = false;
- } else {
- downCastAnyInteger(longColVector, elementNum, decimalValue.longValue(), readerType);
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (decimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- decimalColVector = new DecimalColumnVector(precision, scale);
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
-
- convertVector(decimalColVector, longColVector, batchSize);
- }
- }
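The bounds check above has a direct analogue with java.math.BigDecimal: reject values outside [Long.MIN_VALUE, Long.MAX_VALUE] before truncating to a long. A standalone sketch under that assumption (hypothetical class, not part of this file):

import java.math.BigDecimal;

// Sketch only: null out values that cannot be represented as a long,
// otherwise truncate toward zero as longValue() does.
class DecimalToLongSketch {
  static final BigDecimal MAX = BigDecimal.valueOf(Long.MAX_VALUE);
  static final BigDecimal MIN = BigDecimal.valueOf(Long.MIN_VALUE);

  static Long toLongOrNull(BigDecimal d) {
    if (d.compareTo(MAX) > 0 || d.compareTo(MIN) < 0) {
      return null; // out of range -> the reader marks the element null
    }
    return d.longValue();
  }

  public static void main(String[] args) {
    System.out.println(toLongOrNull(new BigDecimal("123.9")));                   // 123
    System.out.println(toLongOrNull(new BigDecimal("9999999999999999999999"))); // null
  }
}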
-
- public static class AnyIntegerFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private final TypeDescription readerType;
- private BytesColumnVector bytesColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromStringGroupTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.readerType = readerType;
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- long longValue = parseLongFromString(string);
- if (!getIsParseError()) {
- downCastAnyInteger(longColVector, elementNum, longValue, readerType);
- } else {
- longColVector.noNulls = false;
- longColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, longColVector, batchSize);
- }
- }
-
- public static class AnyIntegerFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private final TypeDescription readerType;
- private TimestampColumnVector timestampColVector;
- private LongColumnVector longColVector;
-
- AnyIntegerFromTimestampTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- // Use TimestampWritable's getSeconds.
- long longValue = TimestampUtils.millisToSeconds(
- timestampColVector.asScratchTimestamp(elementNum).getTime());
- downCastAnyInteger(longColVector, elementNum, longValue, readerType);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, longColVector, batchSize);
- }
- }
-
- public static class FloatFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private LongColumnVector longColVector;
- private DoubleColumnVector doubleColVector;
-
- FloatFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- float floatValue = (float) longColVector.vector[elementNum];
- if (!Float.isNaN(floatValue)) {
- doubleColVector.vector[elementNum] = floatValue;
- } else {
- doubleColVector.vector[elementNum] = Double.NaN;
- doubleColVector.noNulls = false;
- doubleColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, doubleColVector, batchSize);
- }
- }
-
- public static class FloatFromDoubleTreeReader extends ConvertTreeReader {
-
- private DoubleTreeReader doubleTreeReader;
-
- FloatFromDoubleTreeReader(int columnId) throws IOException {
- super(columnId);
- doubleTreeReader = new DoubleTreeReader(columnId);
- setConvertTreeReader(doubleTreeReader);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- doubleTreeReader.nextVector(previousVector, isNull, batchSize);
-
- DoubleColumnVector resultColVector = (DoubleColumnVector) previousVector;
- double[] resultVector = resultColVector.vector;
- if (resultColVector.isRepeating) {
- if (resultColVector.noNulls || !resultColVector.isNull[0]) {
- resultVector[0] = (float) resultVector[0];
- } else {
- // Remains null.
- }
- } else if (resultColVector.noNulls){
- for (int i = 0; i < batchSize; i++) {
- resultVector[i] = (float) resultVector[i];
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!resultColVector.isNull[i]) {
- resultVector[i] = (float) resultVector[i];
- } else {
- // Remains null.
- }
- }
- }
- }
- }
-
- public static class FloatFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private final int precision;
- private final int scale;
- private DecimalColumnVector decimalColVector;
- private DoubleColumnVector doubleColVector;
-
- FloatFromDecimalTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.precision = fileType.getPrecision();
- this.scale = fileType.getScale();
- decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
- setConvertTreeReader(decimalTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- doubleColVector.vector[elementNum] =
- (float) decimalColVector.vector[elementNum].doubleValue();
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (decimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- decimalColVector = new DecimalColumnVector(precision, scale);
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
-
- convertVector(decimalColVector, doubleColVector, batchSize);
- }
- }
-
- public static class FloatFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private BytesColumnVector bytesColVector;
- private DoubleColumnVector doubleColVector;
-
- FloatFromStringGroupTreeReader(int columnId, TypeDescription fileType)
- throws IOException {
- super(columnId);
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- float floatValue = parseFloatFromString(string);
- if (!getIsParseError()) {
- doubleColVector.vector[elementNum] = floatValue;
- } else {
- doubleColVector.vector[elementNum] = Double.NaN;
- doubleColVector.noNulls = false;
- doubleColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, doubleColVector, batchSize);
- }
- }
-
- public static class FloatFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private TimestampColumnVector timestampColVector;
- private DoubleColumnVector doubleColVector;
-
- FloatFromTimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
- super(columnId);
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- doubleColVector.vector[elementNum] = (float) TimestampUtils.getDouble(
- timestampColVector.asScratchTimestamp(elementNum));
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, doubleColVector, batchSize);
- }
- }
-
- public static class DoubleFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private LongColumnVector longColVector;
- private DoubleColumnVector doubleColVector;
-
- DoubleFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
-
- double doubleValue = (double) longColVector.vector[elementNum];
- if (!Double.isNaN(doubleValue)) {
- doubleColVector.vector[elementNum] = doubleValue;
- } else {
- doubleColVector.vector[elementNum] = Double.NaN;
- doubleColVector.noNulls = false;
- doubleColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, doubleColVector, batchSize);
- }
- }
-
- public static class DoubleFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- DoubleFromFloatTreeReader(int columnId) throws IOException {
- super(columnId);
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- // we get the DoubleColumnVector produced by float tree reader first, then iterate through
- // the elements and make double -> float -> string -> double conversion to preserve the
- // precision. When float tree reader reads float and assign it to double, java's widening
- // conversion adds more precision which will break all comparisons.
- // Example: float f = 74.72
- // double d = f ---> 74.72000122070312
- // Double.parseDouble(String.valueOf(f)) ---> 74.72
- floatTreeReader.nextVector(previousVector, isNull, batchSize);
-
- DoubleColumnVector doubleColumnVector = (DoubleColumnVector) previousVector;
- if (doubleColumnVector.isRepeating) {
- if (doubleColumnVector.noNulls || !doubleColumnVector.isNull[0]) {
- final float f = (float) doubleColumnVector.vector[0];
- doubleColumnVector.vector[0] = Double.parseDouble(String.valueOf(f));
- }
- } else if (doubleColumnVector.noNulls){
- for (int i = 0; i < batchSize; i++) {
- final float f = (float) doubleColumnVector.vector[i];
- doubleColumnVector.vector[i] = Double.parseDouble(String.valueOf(f));
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!doubleColumnVector.isNull[i]) {
- final float f = (float) doubleColumnVector.vector[i];
- doubleColumnVector.vector[i] = Double.parseDouble(String.valueOf(f));
- }
- }
- }
- }
- }
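The comment in nextVector above can be checked with a few lines of plain Java: the widening cast carries along the float's binary representation error, while re-parsing the float's shortest decimal string recovers the intended value. A standalone sketch (not part of this file), using the 74.72 example from the comment:

// Sketch only: why double <- float goes through the string round-trip.
class FloatWideningSketch {
  public static void main(String[] args) {
    float f = 74.72f;
    double widened = f;                                        // 74.72000122070312
    double roundTrip = Double.parseDouble(String.valueOf(f));  // 74.72
    System.out.println(widened);
    System.out.println(roundTrip);
  }
}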
-
- public static class DoubleFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private final int precision;
- private final int scale;
- private DecimalColumnVector decimalColVector;
- private DoubleColumnVector doubleColVector;
-
- DoubleFromDecimalTreeReader(int columnId, TypeDescription fileType) throws IOException {
- super(columnId);
- this.precision = fileType.getPrecision();
- this.scale = fileType.getScale();
- decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
- setConvertTreeReader(decimalTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- doubleColVector.vector[elementNum] =
- decimalColVector.vector[elementNum].doubleValue();
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (decimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- decimalColVector = new DecimalColumnVector(precision, scale);
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
-
- convertVector(decimalColVector, doubleColVector, batchSize);
- }
- }
-
- public static class DoubleFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private BytesColumnVector bytesColVector;
- private DoubleColumnVector doubleColVector;
-
- DoubleFromStringGroupTreeReader(int columnId, TypeDescription fileType)
- throws IOException {
- super(columnId);
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- double doubleValue = parseDoubleFromString(string);
- if (!getIsParseError()) {
- doubleColVector.vector[elementNum] = doubleValue;
- } else {
- doubleColVector.noNulls = false;
- doubleColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, doubleColVector, batchSize);
- }
- }
-
- public static class DoubleFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private TimestampColumnVector timestampColVector;
- private DoubleColumnVector doubleColVector;
-
- DoubleFromTimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
- super(columnId);
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- doubleColVector.vector[elementNum] = TimestampUtils.getDouble(
- timestampColVector.asScratchTimestamp(elementNum));
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- doubleColVector = (DoubleColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, doubleColVector, batchSize);
- }
- }
-
- public static class DecimalFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private LongColumnVector longColVector;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromAnyIntegerTreeReader(int columnId, TypeDescription fileType, boolean skipCorrupt)
- throws IOException {
- super(columnId);
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- long longValue = longColVector.vector[elementNum];
- HiveDecimalWritable hiveDecimalWritable = new HiveDecimalWritable(longValue);
- // The DecimalColumnVector will enforce precision and scale and set the entry to null when out of bounds.
- decimalColVector.set(elementNum, hiveDecimalWritable);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, decimalColVector, batchSize);
- }
- }
-
- public static class DecimalFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- private DoubleColumnVector doubleColVector;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromFloatTreeReader(int columnId, TypeDescription readerType)
- throws IOException {
- super(columnId);
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- float floatValue = (float) doubleColVector.vector[elementNum];
- if (!Float.isNaN(floatValue)) {
- HiveDecimal decimalValue =
- HiveDecimal.create(Float.toString(floatValue));
- if (decimalValue != null) {
- // The DecimalColumnVector will enforce precision and scale and set the entry to null when out of bounds.
- decimalColVector.set(elementNum, decimalValue);
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, decimalColVector, batchSize);
- }
- }
-
- public static class DecimalFromDoubleTreeReader extends ConvertTreeReader {
-
- private DoubleTreeReader doubleTreeReader;
-
- private DoubleColumnVector doubleColVector;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromDoubleTreeReader(int columnId, TypeDescription readerType)
- throws IOException {
- super(columnId);
- doubleTreeReader = new DoubleTreeReader(columnId);
- setConvertTreeReader(doubleTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- HiveDecimal value =
- HiveDecimal.create(Double.toString(doubleColVector.vector[elementNum]));
- if (value != null) {
- decimalColVector.set(elementNum, value);
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, decimalColVector, batchSize);
- }
- }
-
- public static class DecimalFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private BytesColumnVector bytesColVector;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromStringGroupTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string = stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- HiveDecimal value = parseDecimalFromString(string);
- if (value != null) {
- // The DecimalColumnVector will enforce precision and scale and set the entry to null when out of bounds.
- decimalColVector.set(elementNum, value);
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, decimalColVector, batchSize);
- }
- }
-
- public static class DecimalFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private TimestampColumnVector timestampColVector;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromTimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
- super(columnId);
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- double doubleValue = TimestampUtils.getDouble(
- timestampColVector.asScratchTimestamp(elementNum));
- HiveDecimal value = HiveDecimal.create(Double.toString(doubleValue));
- if (value != null) {
- // The DecimalColumnVector will enforce precision and scale and set the entry to null when out of bounds.
- decimalColVector.set(elementNum, value);
- } else {
- decimalColVector.noNulls = false;
- decimalColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, decimalColVector, batchSize);
- }
- }
-
- public static class DecimalFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private DecimalColumnVector fileDecimalColVector;
- private int filePrecision;
- private int fileScale;
- private int readerPrecision;
- private int readerScale;
- private DecimalColumnVector decimalColVector;
-
- DecimalFromDecimalTreeReader(int columnId, TypeDescription fileType, TypeDescription readerType)
- throws IOException {
- super(columnId);
- filePrecision = fileType.getPrecision();
- fileScale = fileType.getScale();
- readerPrecision = readerType.getPrecision();
- readerScale = readerType.getScale();
- decimalTreeReader = new DecimalTreeReader(columnId, filePrecision, fileScale);
- setConvertTreeReader(decimalTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- decimalColVector.set(elementNum, fileDecimalColVector.vector[elementNum]);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (fileDecimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- fileDecimalColVector = new DecimalColumnVector(filePrecision, fileScale);
- decimalColVector = (DecimalColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(fileDecimalColVector, isNull, batchSize);
-
- convertVector(fileDecimalColVector, decimalColVector, batchSize);
- }
- }
-
- public static class StringGroupFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private final TypeDescription readerType;
- private LongColumnVector longColVector;
- private BytesColumnVector bytesColVector;
-
- StringGroupFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType, boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- long longValue = longColVector.vector[elementNum];
- String string = anyIntegerAsLongTreeReader.getString(longValue);
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- private final TypeDescription readerType;
- private DoubleColumnVector doubleColVector;
- private BytesColumnVector bytesColVector;
-
-
- StringGroupFromFloatTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- float floatValue = (float) doubleColVector.vector[elementNum];
- if (!Float.isNaN(floatValue)) {
- String string = String.valueOf(floatValue);
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- } else {
- bytesColVector.noNulls = false;
- bytesColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromDoubleTreeReader extends ConvertTreeReader {
-
- private DoubleTreeReader doubleTreeReader;
-
- private final TypeDescription readerType;
- private DoubleColumnVector doubleColVector;
- private BytesColumnVector bytesColVector;
-
- StringGroupFromDoubleTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- doubleTreeReader = new DoubleTreeReader(columnId);
- setConvertTreeReader(doubleTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- double doubleValue = doubleColVector.vector[elementNum];
- if (!Double.isNaN(doubleValue)) {
- String string = String.valueOf(doubleValue);
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- } else {
- bytesColVector.noNulls = false;
- bytesColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private int precision;
- private int scale;
- private final TypeDescription readerType;
- private DecimalColumnVector decimalColVector;
- private BytesColumnVector bytesColVector;
- private byte[] scratchBuffer;
-
- StringGroupFromDecimalTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType, boolean skipCorrupt) throws IOException {
- super(columnId);
- this.precision = fileType.getPrecision();
- this.scale = fileType.getScale();
- this.readerType = readerType;
- decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
- setConvertTreeReader(decimalTreeReader);
- scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES];
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- HiveDecimalWritable decWritable = decimalColVector.vector[elementNum];
-
- // Convert decimal into bytes instead of a String for better performance.
- final int byteIndex = decWritable.toBytes(scratchBuffer);
-
- assignStringGroupVectorEntry(
- bytesColVector, elementNum, readerType,
- scratchBuffer, byteIndex, HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES - byteIndex);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (decimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- decimalColVector = new DecimalColumnVector(precision, scale);
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
-
- convertVector(decimalColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private final TypeDescription readerType;
- private TimestampColumnVector timestampColVector;
- private BytesColumnVector bytesColVector;
-
- StringGroupFromTimestampTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String string =
- timestampColVector.asScratchTimestamp(elementNum).toString();
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromDateTreeReader extends ConvertTreeReader {
-
- private DateTreeReader dateTreeReader;
-
- private final TypeDescription readerType;
- private LongColumnVector longColVector;
- private BytesColumnVector bytesColVector;
- private Date date;
-
- StringGroupFromDateTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- dateTreeReader = new DateTreeReader(columnId);
- setConvertTreeReader(dateTreeReader);
- date = new Date(0);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- date.setTime(DateWritable.daysToMillis((int) longColVector.vector[elementNum]));
- String string = date.toString();
- byte[] bytes = string.getBytes();
- assignStringGroupVectorEntry(bytesColVector, elementNum, readerType, bytes);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- bytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- dateTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, bytesColVector, batchSize);
- }
- }
-
- public static class StringGroupFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private final TypeDescription readerType;
-
- StringGroupFromStringGroupTreeReader(int columnId, TypeDescription fileType,
- TypeDescription readerType) throws IOException {
- super(columnId);
- this.readerType = readerType;
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- stringGroupTreeReader.nextVector(previousVector, isNull, batchSize);
-
- BytesColumnVector resultColVector = (BytesColumnVector) previousVector;
-
- if (resultColVector.isRepeating) {
- if (resultColVector.noNulls || !resultColVector.isNull[0]) {
- convertStringGroupVectorElement(resultColVector, 0, readerType);
- } else {
- // Remains null.
- }
- } else if (resultColVector.noNulls){
- for (int i = 0; i < batchSize; i++) {
- convertStringGroupVectorElement(resultColVector, i, readerType);
- }
- } else {
- for (int i = 0; i < batchSize; i++) {
- if (!resultColVector.isNull[i]) {
- convertStringGroupVectorElement(resultColVector, i, readerType);
- } else {
- // Remains null.
- }
- }
- }
- }
- }
-
- public static class StringGroupFromBinaryTreeReader extends ConvertTreeReader {
-
- private BinaryTreeReader binaryTreeReader;
-
- private final TypeDescription readerType;
- private BytesColumnVector inBytesColVector;
- private BytesColumnVector outBytesColVector;
-
- StringGroupFromBinaryTreeReader(int columnId, TypeDescription readerType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.readerType = readerType;
- binaryTreeReader = new BinaryTreeReader(columnId);
- setConvertTreeReader(binaryTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- byte[] bytes = inBytesColVector.vector[elementNum];
- int start = inBytesColVector.start[elementNum];
- int length = inBytesColVector.length[elementNum];
- byte[] string = new byte[length == 0 ? 0 : 3 * length - 1];
- for(int p = 0; p < string.length; p += 2) {
- if (p != 0) {
- string[p++] = ' ';
- }
- int num = 0xff & bytes[start++];
- int digit = num / 16;
- string[p] = (byte)((digit) + (digit < 10 ? '0' : 'a' - 10));
- digit = num % 16;
- string[p + 1] = (byte)((digit) + (digit < 10 ? '0' : 'a' - 10));
- }
- assignStringGroupVectorEntry(outBytesColVector, elementNum, readerType,
- string, 0, string.length);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (inBytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- inBytesColVector = new BytesColumnVector();
- outBytesColVector = (BytesColumnVector) previousVector;
- }
- // Read present/isNull stream
- binaryTreeReader.nextVector(inBytesColVector, isNull, batchSize);
-
- convertVector(inBytesColVector, outBytesColVector, batchSize);
- }
- }
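The per-element loop above renders BINARY values as lowercase, space-separated hex pairs. A compact standalone equivalent that produces the same text (hypothetical helper, not part of this file):

// Sketch only: same output format as setConvertVectorElement above,
// e.g. {0x01, 0xff} -> "01 ff".
class BinaryToHexSketch {
  static String toHex(byte[] bytes, int start, int length) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < length; i++) {
      if (i != 0) {
        sb.append(' ');
      }
      sb.append(String.format("%02x", bytes[start + i] & 0xff));
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    System.out.println(toHex(new byte[]{0x01, (byte) 0xff}, 0, 2)); // prints "01 ff"
  }
}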
-
- public static class TimestampFromAnyIntegerTreeReader extends ConvertTreeReader {
-
- private AnyIntegerTreeReader anyIntegerAsLongTreeReader;
-
- private LongColumnVector longColVector;
- private TimestampColumnVector timestampColVector;
-
- TimestampFromAnyIntegerTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- anyIntegerAsLongTreeReader =
- new AnyIntegerTreeReader(columnId, fileType, skipCorrupt);
- setConvertTreeReader(anyIntegerAsLongTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- long longValue = longColVector.vector[elementNum];
- // UNDONE: What does the boolean setting need to be?
- timestampColVector.set(elementNum, new Timestamp(longValue));
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- timestampColVector = (TimestampColumnVector) previousVector;
- }
- // Read present/isNull stream
- anyIntegerAsLongTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, timestampColVector, batchSize);
- }
- }
-
- public static class TimestampFromFloatTreeReader extends ConvertTreeReader {
-
- private FloatTreeReader floatTreeReader;
-
- private DoubleColumnVector doubleColVector;
- private TimestampColumnVector timestampColVector;
-
- TimestampFromFloatTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- floatTreeReader = new FloatTreeReader(columnId);
- setConvertTreeReader(floatTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- float floatValue = (float) doubleColVector.vector[elementNum];
- Timestamp timestampValue = TimestampUtils.doubleToTimestamp(floatValue);
- // The TimestampColumnVector will set the entry to null when a null timestamp is passed in.
- timestampColVector.set(elementNum, timestampValue);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- timestampColVector = (TimestampColumnVector) previousVector;
- }
- // Read present/isNull stream
- floatTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, timestampColVector, batchSize);
- }
- }
-
- public static class TimestampFromDoubleTreeReader extends ConvertTreeReader {
-
- private DoubleTreeReader doubleTreeReader;
-
- private DoubleColumnVector doubleColVector;
- private TimestampColumnVector timestampColVector;
-
- TimestampFromDoubleTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- doubleTreeReader = new DoubleTreeReader(columnId);
- setConvertTreeReader(doubleTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- double doubleValue = doubleColVector.vector[elementNum];
- Timestamp timestampValue = TimestampUtils.doubleToTimestamp(doubleValue);
- // The TimestampColumnVector will set the entry to null when a null timestamp is passed in.
- timestampColVector.set(elementNum, timestampValue);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (doubleColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- doubleColVector = new DoubleColumnVector();
- timestampColVector = (TimestampColumnVector) previousVector;
- }
- // Read present/isNull stream
- doubleTreeReader.nextVector(doubleColVector, isNull, batchSize);
-
- convertVector(doubleColVector, timestampColVector, batchSize);
- }
- }
-
- public static class TimestampFromDecimalTreeReader extends ConvertTreeReader {
-
- private DecimalTreeReader decimalTreeReader;
-
- private final int precision;
- private final int scale;
- private DecimalColumnVector decimalColVector;
- private TimestampColumnVector timestampColVector;
-
- TimestampFromDecimalTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- this.precision = fileType.getPrecision();
- this.scale = fileType.getScale();
- decimalTreeReader = new DecimalTreeReader(columnId, precision, scale);
- setConvertTreeReader(decimalTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- Timestamp timestampValue =
- TimestampUtils.decimalToTimestamp(
- decimalColVector.vector[elementNum].getHiveDecimal());
- // The TimestampColumnVector will set the entry to null when a null timestamp is passed in.
- timestampColVector.set(elementNum, timestampValue);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (decimalColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- decimalColVector = new DecimalColumnVector(precision, scale);
- timestampColVector = (TimestampColumnVector) previousVector;
- }
- // Read present/isNull stream
- decimalTreeReader.nextVector(decimalColVector, isNull, batchSize);
-
- convertVector(decimalColVector, timestampColVector, batchSize);
- }
- }
-
- public static class TimestampFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private BytesColumnVector bytesColVector;
- private TimestampColumnVector timestampColVector;
-
- TimestampFromStringGroupTreeReader(int columnId, TypeDescription fileType)
- throws IOException {
- super(columnId);
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String stringValue =
- stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- Timestamp timestampValue = parseTimestampFromString(stringValue);
- if (timestampValue != null) {
- timestampColVector.set(elementNum, timestampValue);
- } else {
- timestampColVector.noNulls = false;
- timestampColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- timestampColVector = (TimestampColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, timestampColVector, batchSize);
- }
- }
-
- public static class TimestampFromDateTreeReader extends ConvertTreeReader {
-
- private DateTreeReader dateTreeReader;
-
- private LongColumnVector longColVector;
- private TimestampColumnVector timestampColVector;
-
- TimestampFromDateTreeReader(int columnId, TypeDescription fileType,
- boolean skipCorrupt) throws IOException {
- super(columnId);
- dateTreeReader = new DateTreeReader(columnId);
- setConvertTreeReader(dateTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) {
- long millis =
- DateWritable.daysToMillis((int) longColVector.vector[elementNum]);
- timestampColVector.set(elementNum, new Timestamp(millis));
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (longColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- longColVector = new LongColumnVector();
- timestampColVector = (TimestampColumnVector) previousVector;
- }
- // Read present/isNull stream
- dateTreeReader.nextVector(longColVector, isNull, batchSize);
-
- convertVector(longColVector, timestampColVector, batchSize);
- }
- }
-
- public static class DateFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- private BytesColumnVector bytesColVector;
- private LongColumnVector longColVector;
-
- DateFromStringGroupTreeReader(int columnId, TypeDescription fileType)
- throws IOException {
- super(columnId);
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- String stringValue =
- stringFromBytesColumnVectorEntry(bytesColVector, elementNum);
- Date dateValue = parseDateFromString(stringValue);
- if (dateValue != null) {
- longColVector.vector[elementNum] = DateWritable.dateToDays(dateValue);
- } else {
- longColVector.noNulls = false;
- longColVector.isNull[elementNum] = true;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (bytesColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- bytesColVector = new BytesColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- stringGroupTreeReader.nextVector(bytesColVector, isNull, batchSize);
-
- convertVector(bytesColVector, longColVector, batchSize);
- }
- }
-
- public static class DateFromTimestampTreeReader extends ConvertTreeReader {
-
- private TimestampTreeReader timestampTreeReader;
-
- private TimestampColumnVector timestampColVector;
- private LongColumnVector longColVector;
-
- DateFromTimestampTreeReader(int columnId, boolean skipCorrupt) throws IOException {
- super(columnId);
- timestampTreeReader = new TimestampTreeReader(columnId, skipCorrupt);
- setConvertTreeReader(timestampTreeReader);
- }
-
- @Override
- public void setConvertVectorElement(int elementNum) throws IOException {
- Date dateValue =
- DateWritable.timeToDate(TimestampUtils.millisToSeconds(
- timestampColVector.asScratchTimestamp(elementNum).getTime()));
- longColVector.vector[elementNum] = DateWritable.dateToDays(dateValue);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- if (timestampColVector == null) {
- // Allocate column vector for file; cast column vector for reader.
- timestampColVector = new TimestampColumnVector();
- longColVector = (LongColumnVector) previousVector;
- }
- // Read present/isNull stream
- timestampTreeReader.nextVector(timestampColVector, isNull, batchSize);
-
- convertVector(timestampColVector, longColVector, batchSize);
- }
- }
-
- public static class BinaryFromStringGroupTreeReader extends ConvertTreeReader {
-
- private TreeReader stringGroupTreeReader;
-
- BinaryFromStringGroupTreeReader(int columnId, TypeDescription fileType)
- throws IOException {
- super(columnId);
- stringGroupTreeReader = getStringGroupTreeReader(columnId, fileType);
- setConvertTreeReader(stringGroupTreeReader);
- }
-
- @Override
- public void nextVector(ColumnVector previousVector,
- boolean[] isNull,
- final int batchSize) throws IOException {
- super.nextVector(previousVector, isNull, batchSize);
- }
- }
-
- private static TreeReader createAnyIntegerConvertTreeReader(int columnId,
- TypeDescription fileType,
- TypeDescription readerType,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
-
- // CONVERT from (BOOLEAN, BYTE, SHORT, INT, LONG) to schema type.
- //
- switch (readerType.getCategory()) {
-
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- if (fileType.getCategory() == readerType.getCategory()) {
- throw new IllegalArgumentException("No conversion of type " +
- readerType.getCategory() + " to self needed");
- }
- return new AnyIntegerFromAnyIntegerTreeReader(columnId, fileType, readerType,
- skipCorrupt);
-
- case FLOAT:
- return new FloatFromAnyIntegerTreeReader(columnId, fileType,
- skipCorrupt);
-
- case DOUBLE:
- return new DoubleFromAnyIntegerTreeReader(columnId, fileType,
- skipCorrupt);
-
- case DECIMAL:
- return new DecimalFromAnyIntegerTreeReader(columnId, fileType, skipCorrupt);
-
- case STRING:
- case CHAR:
- case VARCHAR:
- return new StringGroupFromAnyIntegerTreeReader(columnId, fileType, readerType,
- skipCorrupt);
-
- case TIMESTAMP:
- return new TimestampFromAnyIntegerTreeReader(columnId, fileType, skipCorrupt);
-
- // Not currently supported conversion(s):
- case BINARY:
- case DATE:
-
- case STRUCT:
- case LIST:
- case MAP:
- case UNION:
- default:
- throw new IllegalArgumentException("Unsupported type " +
- readerType.getCategory());
- }
- }
-
- private static TreeReader createFloatConvertTreeReader(int columnId,
- TypeDescription fileType,
- TypeDescription readerType,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
-
- // CONVERT from FLOAT to schema type.
- switch (readerType.getCategory()) {
-
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- return new AnyIntegerFromFloatTreeReader(columnId, readerType);
-
- case FLOAT:
- throw new IllegalArgumentException("No conversion of type " +
- readerType.getCategory() + " to self needed");
-
- case DOUBLE:
- return new DoubleFromFloatTreeReader(columnId);
-
- case DECIMAL:
- return new DecimalFromFloatTreeReader(columnId, readerType);
-
- case STRING:
- case CHAR:
- case VARCHAR:
- return new StringGroupFromFloatTreeReader(columnId, readerType, skipCorrupt);
-
- case TIMESTAMP:
- return new TimestampFromFloatTreeReader(columnId, readerType, skipCorrupt);
-
- // Not currently supported conversion(s):
- case BINARY:
- case DATE:
-
- case STRUCT:
- case LIST:
- case MAP:
- case UNION:
- default:
- throw new IllegalArgumentException("Unsupported type " +
- readerType.getCategory());
- }
- }
-
- private static TreeReader createDoubleConvertTreeReader(int columnId,
- TypeDescription fileType,
- TypeDescription readerType,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
-
- // CONVERT from DOUBLE to schema type.
- switch (readerType.getCategory()) {
-
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- return new AnyIntegerFromDoubleTreeReader(columnId, readerType);
-
- case FLOAT:
- return new FloatFromDoubleTreeReader(columnId);
-
- case DOUBLE:
- throw new IllegalArgumentException("No conversion of type " +
- readerType.getCategory() + " to self needed");
-
- case DECIMAL:
- return new DecimalFromDoubleTreeReader(columnId, readerType);
-
- case STRING:
- case CHAR:
- case VARCHAR:
- return new StringGroupFromDoubleTreeReader(columnId, readerType, skipCorrupt);
-
- case TIMESTAMP:
- return new TimestampFromDoubleTreeReader(columnId, readerType, skipCorrupt);
-
- // Not currently supported conversion(s):
- case BINARY:
- case DATE:
-
- case STRUCT:
- case LIST:
- case MAP:
- case UNION:
- default:
- throw new IllegalArgumentException("Unsupported type " +
- readerType.getCategory());
- }
- }
-
- private static TreeReader createDecimalConvertTreeReader(int columnId,
- TypeDescription fileType,
- TypeDescription readerType,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
-
- // CONVERT from DECIMAL to schema type.
- switch (readerType.getCategory()) {
-
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- return new AnyIntegerFromDecimalTreeReader(columnId, fileType, readerType);
-
- case FLOAT:
- return new FloatFromDecimalTreeReader(columnId, fileType, readerType);
-
- case DOUBLE:
- return new DoubleFromDecimalTreeReader(columnId, fileType);
-
- case STRING:
- case CHAR:
- case VARCHAR:
- return new StringGroupFromDecimalTreeReader(columnId, fileType, readerType, skipCorrupt);
-
- case TIMESTAMP:
- return new TimestampFromDecimalTreeReader(columnId, fileType, skipCorrupt);
-
- case DECIMAL:
- return new DecimalFromDecimalTreeReader(columnId, fileType, readerType);
-
- // Not currently supported conversion(s):
- case BINARY:
- case DATE:
-
- case STRUCT:
- case LIST:
- case MAP:
- case UNION:
- default:
- throw new IllegalArgumentException("Unsupported type " +
- readerType.getCategory());
- }
- }
-
- private static TreeReader createStringConvertTreeReader(int columnId,
- TypeDescription fileType,
- TypeDescription readerType,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
-
- // CONVERT from STRING to schema type.
- switch (readerType.getCategory()) {
-
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- return new AnyIntegerFromStringGroupTreeReader(columnId, fileType, readerType);
-
- case FLOAT:
- return new FloatFromStringGroupTreeReader(columnId, fileType);
-
- case DOUBLE:
- return new DoubleFromStringGroupTreeReader(columnId, fileType);
-
- case DECIMAL:
- return new DecimalFromStringGroupTreeReader(columnId, fileType, readerType);
-
- case CHAR:
- return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
-
- case VARCHAR:
- return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
-
- case STRING:
- throw new IllegalArgumentException("No conversion of type " +
- readerType.getCategory() + " to self needed");
-
- case BINARY:
- return new BinaryFromStringGroupTreeReader(columnId, fileType);
-
- case TIMESTAMP:
- return new TimestampFromStringGroupTreeReader(columnId, fileType);
-
- case DATE:
- return new DateFromStringGroupTreeReader(columnId, fileType);
-
- // Not currently supported conversion(s):
-
- case STRUCT:
- case LIST:
- case MAP:
- case UNION:
- default:
- throw new IllegalArgumentException("Unsupported type " +
- readerType.getCategory());
- }
- }
-
- private static TreeReader createCharConvertTreeReader(int columnId,
- TypeDescription fileType,
- TypeDescription readerType,
- SchemaEvolution evolution,
- boolean[] included,
- boolean skipCorrupt) throws IOException {
-
- // CONVERT from CHAR to schema type.
- switch (readerType.getCategory()) {
-
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- return new AnyIntegerFromStringGroupTreeReader(columnId, fileType, readerType);
-
- case FLOAT:
- return new FloatFromStringGroupTreeReader(columnId, fileType);
-
- case DOUBLE:
- return new DoubleFromStringGroupTreeReader(columnId, fileType);
-
- case DECIMAL:
- return new DecimalFromStringGroupTreeReader(columnId, fileType, readerType);
-
- case STRING:
- return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
-
- case VARCHAR:
- return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
-
- case CHAR:
- return new StringGroupFromStringGroupTreeReader(columnId, fileType, readerType);
-
- case BINARY:
- return new BinaryFromStringGroupTreeReader(columnId, fileType);
-
- case TIMESTAMP:
- return new TimestampFromStringGroupTreeReader(columnId, fileType);
-
- case DATE:
- return new DateFromStringGroupTreeReader(columnId, fileT
<TRUNCATED>
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/PhysicalFsWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/PhysicalFsWriter.java b/orc/src/java/org/apache/orc/impl/PhysicalFsWriter.java
deleted file mode 100644
index ba8c13f..0000000
--- a/orc/src/java/org/apache/orc/impl/PhysicalFsWriter.java
+++ /dev/null
@@ -1,529 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.orc.CompressionCodec;
-import org.apache.orc.CompressionCodec.Modifier;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcFile;
-import org.apache.orc.OrcFile.CompressionStrategy;
-import org.apache.orc.OrcProto;
-import org.apache.orc.OrcProto.BloomFilterIndex;
-import org.apache.orc.OrcProto.Footer;
-import org.apache.orc.OrcProto.Metadata;
-import org.apache.orc.OrcProto.PostScript;
-import org.apache.orc.OrcProto.Stream.Kind;
-import org.apache.orc.OrcProto.StripeFooter;
-import org.apache.orc.OrcProto.StripeInformation;
-import org.apache.orc.OrcProto.RowIndex.Builder;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.protobuf.CodedOutputStream;
-
-public class PhysicalFsWriter implements PhysicalWriter {
- private static final Logger LOG = LoggerFactory.getLogger(PhysicalFsWriter.class);
-
- private static final int HDFS_BUFFER_SIZE = 256 * 1024;
-
- private FSDataOutputStream rawWriter = null;
- // the compressed metadata information outStream
- private OutStream writer = null;
- // a protobuf outStream around streamFactory
- private CodedOutputStream protobufWriter = null;
-
- private final FileSystem fs;
- private final Path path;
- private final long blockSize;
- private final int bufferSize;
- private final CompressionCodec codec;
- private final double paddingTolerance;
- private final long defaultStripeSize;
- private final CompressionKind compress;
- private final boolean addBlockPadding;
- private final CompressionStrategy compressionStrategy;
-
- // the streams that make up the current stripe
- private final Map<StreamName, BufferedStream> streams =
- new TreeMap<StreamName, BufferedStream>();
-
- private long adjustedStripeSize;
- private long headerLength;
- private long stripeStart;
- private int metadataLength;
- private int footerLength;
-
- public PhysicalFsWriter(FileSystem fs, Path path, int numColumns, OrcFile.WriterOptions opts) {
- this.fs = fs;
- this.path = path;
- this.defaultStripeSize = this.adjustedStripeSize = opts.getStripeSize();
- this.addBlockPadding = opts.getBlockPadding();
- if (opts.isEnforceBufferSize()) {
- this.bufferSize = opts.getBufferSize();
- } else {
- this.bufferSize = getEstimatedBufferSize(defaultStripeSize, numColumns, opts.getBufferSize());
- }
- this.compress = opts.getCompress();
- this.compressionStrategy = opts.getCompressionStrategy();
- codec = createCodec(compress);
- this.paddingTolerance = opts.getPaddingTolerance();
- this.blockSize = opts.getBlockSize();
- LOG.info("ORC writer created for path: {} with stripeSize: {} blockSize: {}" +
- " compression: {} bufferSize: {}", path, defaultStripeSize, blockSize,
- compress, bufferSize);
- }
-
- @Override
- public void initialize() throws IOException {
- if (rawWriter != null) return;
- rawWriter = fs.create(path, false, HDFS_BUFFER_SIZE,
- fs.getDefaultReplication(path), blockSize);
- rawWriter.writeBytes(OrcFile.MAGIC);
- headerLength = rawWriter.getPos();
- writer = new OutStream("metadata", bufferSize, codec,
- new DirectStream(rawWriter));
- protobufWriter = CodedOutputStream.newInstance(writer);
- }
-
- private void padStripe(long indexSize, long dataSize, int footerSize) throws IOException {
- this.stripeStart = rawWriter.getPos();
- final long currentStripeSize = indexSize + dataSize + footerSize;
- final long available = blockSize - (stripeStart % blockSize);
- final long overflow = currentStripeSize - adjustedStripeSize;
- final float availRatio = (float) available / (float) defaultStripeSize;
-
- if (availRatio > 0.0f && availRatio < 1.0f
- && availRatio > paddingTolerance) {
-      // adjust the default stripe size to fit into the remaining space; also
-      // adjust the next stripe for correction based on the current stripe size
-      // and the user-specified padding tolerance. Since the stripe size can
-      // overflow the default stripe size, we apply this correction to avoid
-      // writing a portion of the last stripe into the next HDFS block.
- double correction = overflow > 0 ? (double) overflow
- / (double) adjustedStripeSize : 0.0;
-
-      // the correction should not be greater than the user-specified padding
-      // tolerance
- correction = correction > paddingTolerance ? paddingTolerance
- : correction;
-
- // adjust next stripe size based on current stripe estimate correction
- adjustedStripeSize = (long) ((1.0f - correction) * (availRatio * defaultStripeSize));
- } else if (availRatio >= 1.0) {
- adjustedStripeSize = defaultStripeSize;
- }
-
- if (availRatio < paddingTolerance && addBlockPadding) {
- long padding = blockSize - (stripeStart % blockSize);
- byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, padding)];
- LOG.info(String.format("Padding ORC by %d bytes (<= %.2f * %d)",
- padding, availRatio, defaultStripeSize));
- stripeStart += padding;
- while (padding > 0) {
- int writeLen = (int) Math.min(padding, pad.length);
- rawWriter.write(pad, 0, writeLen);
- padding -= writeLen;
- }
- adjustedStripeSize = defaultStripeSize;
- } else if (currentStripeSize < blockSize
- && (stripeStart % blockSize) + currentStripeSize > blockSize) {
- // even if you don't pad, reset the default stripe size when crossing a
- // block boundary
- adjustedStripeSize = defaultStripeSize;
- }
- }
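
To make the branchy logic above concrete, here is a hedged walk-through with illustrative numbers (none of these values come from the patch):

    long blockSize          = 256L << 20;  // 256 MB HDFS block
    long stripeStart        = 250L << 20;  // current position in the file
    long defaultStripeSize  = 64L << 20;   // 64 MB target stripe size
    double paddingTolerance = 0.05;

    long available = blockSize - (stripeStart % blockSize);           // 6 MB left
    float availRatio = (float) available / (float) defaultStripeSize; // ~0.094

    // 0 < 0.094 < 1 and 0.094 > paddingTolerance, so padStripe() shrinks the
    // next stripe to fit the remaining ~6 MB rather than padding. Only when
    // availRatio falls below paddingTolerance (and block padding is enabled)
    // does it fill the rest of the block with zero bytes and start the stripe
    // on the next block boundary.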
-
- /**
- * An output receiver that writes the ByteBuffers to the output stream
- * as they are received.
- */
- private class DirectStream implements OutStream.OutputReceiver {
- private final FSDataOutputStream output;
-
- DirectStream(FSDataOutputStream output) {
- this.output = output;
- }
-
- @Override
- public void output(ByteBuffer buffer) throws IOException {
- output.write(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
- }
- }
-
- @Override
- public long getPhysicalStripeSize() {
- return adjustedStripeSize;
- }
-
- @Override
- public boolean isCompressed() {
- return codec != null;
- }
-
-
- public static CompressionCodec createCodec(CompressionKind kind) {
- switch (kind) {
- case NONE:
- return null;
- case ZLIB:
- return new ZlibCodec();
- case SNAPPY:
- return new SnappyCodec();
- case LZO:
- try {
- ClassLoader loader = Thread.currentThread().getContextClassLoader();
- if (loader == null) {
- loader = WriterImpl.class.getClassLoader();
- }
- @SuppressWarnings("unchecked")
- Class<? extends CompressionCodec> lzo =
- (Class<? extends CompressionCodec>)
- loader.loadClass("org.apache.hadoop.hive.ql.io.orc.LzoCodec");
- return lzo.newInstance();
- } catch (ClassNotFoundException e) {
- throw new IllegalArgumentException("LZO is not available.", e);
- } catch (InstantiationException e) {
- throw new IllegalArgumentException("Problem initializing LZO", e);
- } catch (IllegalAccessException e) {
- throw new IllegalArgumentException("Insufficient access to LZO", e);
- }
- default:
- throw new IllegalArgumentException("Unknown compression codec: " +
- kind);
- }
- }
-
- private void writeStripeFooter(StripeFooter footer, long dataSize, long indexSize,
- StripeInformation.Builder dirEntry) throws IOException {
- footer.writeTo(protobufWriter);
- protobufWriter.flush();
- writer.flush();
- dirEntry.setOffset(stripeStart);
- dirEntry.setFooterLength(rawWriter.getPos() - stripeStart - dataSize - indexSize);
- }
-
- @VisibleForTesting
- public static int getEstimatedBufferSize(long stripeSize, int numColumns,
- int bs) {
-    // The worst case is that there are 2 big streams per column and
-    // we want to guarantee that each stream gets ~10 buffers.
-    // This keeps buffers small enough that we don't get really small stripe
-    // sizes.
- int estBufferSize = (int) (stripeSize / (20 * numColumns));
- estBufferSize = getClosestBufferSize(estBufferSize);
- return estBufferSize > bs ? bs : estBufferSize;
- }
-
- private static int getClosestBufferSize(int estBufferSize) {
- final int kb4 = 4 * 1024;
- final int kb8 = 8 * 1024;
- final int kb16 = 16 * 1024;
- final int kb32 = 32 * 1024;
- final int kb64 = 64 * 1024;
- final int kb128 = 128 * 1024;
- final int kb256 = 256 * 1024;
- if (estBufferSize <= kb4) {
- return kb4;
- } else if (estBufferSize > kb4 && estBufferSize <= kb8) {
- return kb8;
- } else if (estBufferSize > kb8 && estBufferSize <= kb16) {
- return kb16;
- } else if (estBufferSize > kb16 && estBufferSize <= kb32) {
- return kb32;
- } else if (estBufferSize > kb32 && estBufferSize <= kb64) {
- return kb64;
- } else if (estBufferSize > kb64 && estBufferSize <= kb128) {
- return kb128;
- } else {
- return kb256;
- }
- }
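
As a worked example of the two methods above (values are illustrative): a 64 MB stripe with 50 columns budgets stripeSize / (20 * numColumns) = 67,108,864 / 1,000 = 67,108 bytes per buffer, which getClosestBufferSize() rounds up to the 128 KB bucket, and the result is then capped at the configured size:

    int estimated = PhysicalFsWriter.getEstimatedBufferSize(
        64L * 1024 * 1024,  // stripe size
        50,                 // number of columns
        256 * 1024);        // configured buffer size (cap)
    // estimated == 128 * 1024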
-
- @Override
- public void writeFileMetadata(Metadata.Builder builder) throws IOException {
- long startPosn = rawWriter.getPos();
- Metadata metadata = builder.build();
- metadata.writeTo(protobufWriter);
- protobufWriter.flush();
- writer.flush();
- this.metadataLength = (int) (rawWriter.getPos() - startPosn);
- }
-
- @Override
- public void writeFileFooter(Footer.Builder builder) throws IOException {
- long bodyLength = rawWriter.getPos() - metadataLength;
- builder.setContentLength(bodyLength);
- builder.setHeaderLength(headerLength);
- long startPosn = rawWriter.getPos();
- Footer footer = builder.build();
- footer.writeTo(protobufWriter);
- protobufWriter.flush();
- writer.flush();
- this.footerLength = (int) (rawWriter.getPos() - startPosn);
- }
-
- @Override
- public void writePostScript(PostScript.Builder builder) throws IOException {
- builder.setCompression(writeCompressionKind(compress));
- builder.setFooterLength(footerLength);
- builder.setMetadataLength(metadataLength);
- if (compress != CompressionKind.NONE) {
- builder.setCompressionBlockSize(bufferSize);
- }
- PostScript ps = builder.build();
- // need to write this uncompressed
- long startPosn = rawWriter.getPos();
- ps.writeTo(rawWriter);
- long length = rawWriter.getPos() - startPosn;
- if (length > 255) {
- throw new IllegalArgumentException("PostScript too large at " + length);
- }
- rawWriter.writeByte((int)length);
- }
-
- @Override
- public void close() throws IOException {
- rawWriter.close();
- }
-
- private OrcProto.CompressionKind writeCompressionKind(CompressionKind kind) {
- switch (kind) {
- case NONE: return OrcProto.CompressionKind.NONE;
- case ZLIB: return OrcProto.CompressionKind.ZLIB;
- case SNAPPY: return OrcProto.CompressionKind.SNAPPY;
- case LZO: return OrcProto.CompressionKind.LZO;
- default:
- throw new IllegalArgumentException("Unknown compression " + kind);
- }
- }
-
- @Override
- public void flush() throws IOException {
- rawWriter.hflush();
- // TODO: reset?
- }
-
- @Override
- public long getRawWriterPosition() throws IOException {
- return rawWriter.getPos();
- }
-
- @Override
- public void appendRawStripe(byte[] stripe, int offset, int length,
- StripeInformation.Builder dirEntry) throws IOException {
- long start = rawWriter.getPos();
- long availBlockSpace = blockSize - (start % blockSize);
-
-    // see if the stripe can fit in the current HDFS block; if not, pad the
-    // remaining space in the block
- if (length < blockSize && length > availBlockSpace &&
- addBlockPadding) {
- byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, availBlockSpace)];
-      LOG.info(String.format("Padding ORC by %d bytes while merging...",
- availBlockSpace));
- start += availBlockSpace;
- while (availBlockSpace > 0) {
- int writeLen = (int) Math.min(availBlockSpace, pad.length);
- rawWriter.write(pad, 0, writeLen);
- availBlockSpace -= writeLen;
- }
- }
-
- rawWriter.write(stripe);
- dirEntry.setOffset(start);
- }
-
-
- /**
- * This class is used to hold the contents of streams as they are buffered.
-   * The TreeWriters write to the outStream; as buffers fill up, the codec
-   * compresses the data and stores the buffers in the output list. When the
-   * stripe is being written, the whole stream is written to the file.
- */
- private class BufferedStream implements OutStream.OutputReceiver {
- private final OutStream outStream;
- private final List<ByteBuffer> output = new ArrayList<ByteBuffer>();
-
- BufferedStream(String name, int bufferSize,
- CompressionCodec codec) throws IOException {
- outStream = new OutStream(name, bufferSize, codec, this);
- }
-
- /**
- * Receive a buffer from the compression codec.
- * @param buffer the buffer to save
- */
- @Override
- public void output(ByteBuffer buffer) {
- output.add(buffer);
- }
-
- /**
- * @return the number of bytes in buffers that are allocated to this stream.
- */
- public long getBufferSize() {
- long result = 0;
- for (ByteBuffer buf: output) {
- result += buf.capacity();
- }
- return outStream.getBufferSize() + result;
- }
-
- /**
-     * Write any saved buffers to the OutputStream if needed, and clear all the buffers.
- */
- public void spillToDiskAndClear() throws IOException {
- if (!outStream.isSuppressed()) {
- for (ByteBuffer buffer: output) {
- rawWriter.write(buffer.array(), buffer.arrayOffset() + buffer.position(),
- buffer.remaining());
- }
- }
- outStream.clear();
- output.clear();
- }
-
- /**
- * @return The number of bytes that will be written to the output. Assumes the stream writing
- * into this receiver has already been flushed.
- */
- public long getOutputSize() {
- long result = 0;
- for (ByteBuffer buffer: output) {
- result += buffer.remaining();
- }
- return result;
- }
-
- @Override
- public String toString() {
- return outStream.toString();
- }
- }
-
- @Override
- public OutStream getOrCreatePhysicalStream(StreamName name) throws IOException {
- BufferedStream result = streams.get(name);
- if (result == null) {
- EnumSet<Modifier> modifiers = createCompressionModifiers(name.getKind());
- result = new BufferedStream(name.toString(), bufferSize,
- codec == null ? null : codec.modify(modifiers));
- streams.put(name, result);
- }
- return result.outStream;
- }
-
- private EnumSet<Modifier> createCompressionModifiers(Kind kind) {
- switch (kind) {
- case BLOOM_FILTER:
- case DATA:
- case DICTIONARY_DATA:
- return EnumSet.of(Modifier.TEXT,
- compressionStrategy == CompressionStrategy.SPEED ? Modifier.FAST : Modifier.DEFAULT);
- case LENGTH:
- case DICTIONARY_COUNT:
- case PRESENT:
- case ROW_INDEX:
- case SECONDARY:
- // easily compressed using the fastest modes
-        return EnumSet.of(Modifier.FASTEST, Modifier.BINARY);
- default:
- LOG.warn("Missing ORC compression modifiers for " + kind);
- return null;
- }
- }
-
- @Override
- public void finalizeStripe(StripeFooter.Builder footerBuilder,
- StripeInformation.Builder dirEntry) throws IOException {
- long indexSize = 0;
- long dataSize = 0;
- for (Map.Entry<StreamName, BufferedStream> pair: streams.entrySet()) {
- BufferedStream receiver = pair.getValue();
- OutStream outStream = receiver.outStream;
- if (!outStream.isSuppressed()) {
- outStream.flush();
- long streamSize = receiver.getOutputSize();
- StreamName name = pair.getKey();
- footerBuilder.addStreams(OrcProto.Stream.newBuilder().setColumn(name.getColumn())
- .setKind(name.getKind()).setLength(streamSize));
- if (StreamName.Area.INDEX == name.getArea()) {
- indexSize += streamSize;
- } else {
- dataSize += streamSize;
- }
- }
- }
- dirEntry.setIndexLength(indexSize).setDataLength(dataSize);
-
- OrcProto.StripeFooter footer = footerBuilder.build();
- // Do we need to pad the file so the stripe doesn't straddle a block boundary?
- padStripe(indexSize, dataSize, footer.getSerializedSize());
-
- // write out the data streams
- for (Map.Entry<StreamName, BufferedStream> pair : streams.entrySet()) {
- pair.getValue().spillToDiskAndClear();
- }
- // Write out the footer.
- writeStripeFooter(footer, dataSize, indexSize, dirEntry);
- }
-
- @Override
- public long estimateMemory() {
- long result = 0;
- for (BufferedStream stream: streams.values()) {
- result += stream.getBufferSize();
- }
- return result;
- }
-
- @Override
- public void writeIndexStream(StreamName name, Builder rowIndex) throws IOException {
- OutStream stream = getOrCreatePhysicalStream(name);
- rowIndex.build().writeTo(stream);
- stream.flush();
- }
-
- @Override
- public void writeBloomFilterStream(
- StreamName name, BloomFilterIndex.Builder bloomFilterIndex) throws IOException {
- OutStream stream = getOrCreatePhysicalStream(name);
- bloomFilterIndex.build().writeTo(stream);
- stream.flush();
- }
-
- @VisibleForTesting
- public OutputStream getStream() throws IOException {
- initialize();
- return rawWriter;
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/PhysicalWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/PhysicalWriter.java b/orc/src/java/org/apache/orc/impl/PhysicalWriter.java
deleted file mode 100644
index 5ba1b9b..0000000
--- a/orc/src/java/org/apache/orc/impl/PhysicalWriter.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import java.io.IOException;
-
-import org.apache.orc.OrcProto.BloomFilterIndex;
-import org.apache.orc.OrcProto.Footer;
-import org.apache.orc.OrcProto.Metadata;
-import org.apache.orc.OrcProto.PostScript;
-import org.apache.orc.OrcProto.RowIndex;
-import org.apache.orc.OrcProto.StripeFooter;
-import org.apache.orc.OrcProto.StripeInformation;
-
-public interface PhysicalWriter {
-
- /**
- * Creates all the streams/connections/etc. necessary to write.
- */
- void initialize() throws IOException;
-
- /**
- * Writes out the file metadata.
- * @param builder Metadata builder to finalize and write.
- */
- void writeFileMetadata(Metadata.Builder builder) throws IOException;
-
- /**
- * Writes out the file footer.
- * @param builder Footer builder to finalize and write.
- */
- void writeFileFooter(Footer.Builder builder) throws IOException;
-
- /**
- * Writes out the postscript (including the size byte if needed).
- * @param builder Postscript builder to finalize and write.
- */
- void writePostScript(PostScript.Builder builder) throws IOException;
-
- /**
-   * Creates a physical stream to write data to.
- * @param name Stream name.
- * @return The output stream.
- */
- OutStream getOrCreatePhysicalStream(StreamName name) throws IOException;
-
- /**
-   * Flushes the data in all the streams, spills them to disk, and writes out the stripe footer.
- * @param footer Stripe footer to be updated with relevant data and written out.
- * @param dirEntry File metadata entry for the stripe, to be updated with relevant data.
- */
- void finalizeStripe(StripeFooter.Builder footer,
- StripeInformation.Builder dirEntry) throws IOException;
-
- /**
- * Writes out the index for the stripe column.
-   * @param name Stream name.
- * @param rowIndex Row index entries to write.
- */
- void writeIndexStream(StreamName name, RowIndex.Builder rowIndex) throws IOException;
-
- /**
-   * Writes out the bloom filter for the stripe column.
- * @param streamName Stream name.
- * @param bloomFilterIndex Bloom filter index to write.
- */
- void writeBloomFilterStream(StreamName streamName,
- BloomFilterIndex.Builder bloomFilterIndex) throws IOException;
-
- /**
- * Closes the writer.
- */
- void close() throws IOException;
-
- /**
- * Force-flushes the writer.
- */
- void flush() throws IOException;
-
- /**
- * @return the physical writer position (e.g. for updater).
- */
- long getRawWriterPosition() throws IOException;
-
- /** @return physical stripe size, taking padding into account. */
- long getPhysicalStripeSize();
-
- /** @return whether the writer is compressed. */
- boolean isCompressed();
-
- /**
- * Appends raw stripe data (e.g. for file merger).
- * @param stripe Stripe data buffer.
- * @param offset Stripe data buffer offset.
- * @param length Stripe data buffer length.
- * @param dirEntry File metadata entry for the stripe, to be updated with relevant data.
- * @throws IOException
- */
- void appendRawStripe(byte[] stripe, int offset, int length,
- StripeInformation.Builder dirEntry) throws IOException;
-
- /**
- * @return the estimated memory usage for the stripe.
- */
- long estimateMemory();
-}
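
Read as a whole, the interface implies a call order; a hedged usage sketch (builder contents elided, names illustrative, not part of the patch):

    void writeFile(PhysicalWriter pw,
                   OrcProto.StripeFooter.Builder stripeFooter,
                   OrcProto.StripeInformation.Builder dirEntry,
                   OrcProto.Metadata.Builder metadata,
                   OrcProto.Footer.Builder footer,
                   OrcProto.PostScript.Builder ps) throws IOException {
      pw.initialize();                           // open the underlying stream(s)
      // ... tree writers push column data into pw.getOrCreatePhysicalStream(...) ...
      pw.finalizeStripe(stripeFooter, dirEntry); // flush, spill, write stripe footer
      pw.writeFileMetadata(metadata);            // stripe statistics section
      pw.writeFileFooter(footer);                // schema, stripe directory, file stats
      pw.writePostScript(ps);                    // compression info + trailing size byte
      pw.close();
    }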
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/PositionProvider.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/PositionProvider.java b/orc/src/java/org/apache/orc/impl/PositionProvider.java
deleted file mode 100644
index 47cf481..0000000
--- a/orc/src/java/org/apache/orc/impl/PositionProvider.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-/**
- * An interface used for seeking to a row index.
- */
-public interface PositionProvider {
- long getNext();
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/PositionRecorder.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/PositionRecorder.java b/orc/src/java/org/apache/orc/impl/PositionRecorder.java
deleted file mode 100644
index 1fff760..0000000
--- a/orc/src/java/org/apache/orc/impl/PositionRecorder.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-/**
- * An interface for recording positions in a stream.
- */
-public interface PositionRecorder {
- void addPosition(long offset);
-}
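
PositionProvider and PositionRecorder are a matched pair: writers record stream positions through the recorder, and readers replay them through the provider when seeking to a row index. A minimal in-memory pairing, offered only as an illustration:

    import java.util.ArrayList;
    import java.util.List;

    class RowIndexPositions implements PositionRecorder, PositionProvider {
      private final List<Long> positions = new ArrayList<>();
      private int next = 0;

      @Override
      public void addPosition(long offset) {  // called at write time
        positions.add(offset);
      }

      @Override
      public long getNext() {                 // consumed at seek time
        return positions.get(next++);
      }
    }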
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/PositionedOutputStream.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/PositionedOutputStream.java b/orc/src/java/org/apache/orc/impl/PositionedOutputStream.java
deleted file mode 100644
index d412939..0000000
--- a/orc/src/java/org/apache/orc/impl/PositionedOutputStream.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-public abstract class PositionedOutputStream extends OutputStream {
-
- /**
- * Record the current position to the recorder.
- * @param recorder the object that receives the position
- * @throws IOException
- */
- public abstract void getPosition(PositionRecorder recorder
- ) throws IOException;
-
- /**
- * Get the memory size currently allocated as buffer associated with this
- * stream.
- * @return the number of bytes used by buffers.
- */
- public abstract long getBufferSize();
-}
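
A minimal concrete subclass, assuming an in-memory buffer where the position is simply the number of bytes written so far (illustrative, not part of the patch):

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;

    class InMemoryPositionedOutputStream extends PositionedOutputStream {
      private final ByteArrayOutputStream buffer = new ByteArrayOutputStream();

      @Override
      public void write(int b) {
        buffer.write(b);
      }

      @Override
      public void getPosition(PositionRecorder recorder) throws IOException {
        recorder.addPosition(buffer.size());  // current byte offset
      }

      @Override
      public long getBufferSize() {
        return buffer.size();                 // bytes currently buffered
      }
    }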
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ReaderImpl.java b/orc/src/java/org/apache/orc/impl/ReaderImpl.java
deleted file mode 100644
index 70fa628..0000000
--- a/orc/src/java/org/apache/orc/impl/ReaderImpl.java
+++ /dev/null
@@ -1,764 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.FileMetadata;
-import org.apache.orc.OrcFile;
-import org.apache.orc.OrcUtils;
-import org.apache.orc.Reader;
-import org.apache.orc.RecordReader;
-import org.apache.orc.TypeDescription;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.CompressionCodec;
-import org.apache.orc.FileFormatException;
-import org.apache.orc.StripeInformation;
-import org.apache.orc.StripeStatistics;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.common.io.DiskRange;
-import org.apache.hadoop.hive.ql.util.JavaDataModel;
-import org.apache.hadoop.io.Text;
-import org.apache.orc.OrcProto;
-
-import com.google.common.collect.Lists;
-import com.google.protobuf.CodedInputStream;
-
-public class ReaderImpl implements Reader {
-
- private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class);
-
- private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;
-
- protected final FileSystem fileSystem;
- private final long maxLength;
- protected final Path path;
- protected final org.apache.orc.CompressionKind compressionKind;
- protected CompressionCodec codec;
- protected int bufferSize;
- protected OrcProto.Metadata metadata;
- private List<OrcProto.StripeStatistics> stripeStats;
- private final int metadataSize;
- protected final List<OrcProto.Type> types;
- private TypeDescription schema;
- private final List<OrcProto.UserMetadataItem> userMetadata;
- private final List<OrcProto.ColumnStatistics> fileStats;
- private final List<StripeInformation> stripes;
- protected final int rowIndexStride;
- private final long contentLength, numberOfRows;
-
- private long deserializedSize = -1;
- protected final Configuration conf;
- private final List<Integer> versionList;
- private final OrcFile.WriterVersion writerVersion;
-
- protected OrcTail tail;
-
- public static class StripeInformationImpl
- implements StripeInformation {
- private final OrcProto.StripeInformation stripe;
-
- public StripeInformationImpl(OrcProto.StripeInformation stripe) {
- this.stripe = stripe;
- }
-
- @Override
- public long getOffset() {
- return stripe.getOffset();
- }
-
- @Override
- public long getLength() {
- return stripe.getDataLength() + getIndexLength() + getFooterLength();
- }
-
- @Override
- public long getDataLength() {
- return stripe.getDataLength();
- }
-
- @Override
- public long getFooterLength() {
- return stripe.getFooterLength();
- }
-
- @Override
- public long getIndexLength() {
- return stripe.getIndexLength();
- }
-
- @Override
- public long getNumberOfRows() {
- return stripe.getNumberOfRows();
- }
-
- @Override
- public String toString() {
- return "offset: " + getOffset() + " data: " + getDataLength() +
- " rows: " + getNumberOfRows() + " tail: " + getFooterLength() +
- " index: " + getIndexLength();
- }
- }
-
- @Override
- public long getNumberOfRows() {
- return numberOfRows;
- }
-
- @Override
- public List<String> getMetadataKeys() {
- List<String> result = new ArrayList<String>();
- for(OrcProto.UserMetadataItem item: userMetadata) {
- result.add(item.getName());
- }
- return result;
- }
-
- @Override
- public ByteBuffer getMetadataValue(String key) {
- for(OrcProto.UserMetadataItem item: userMetadata) {
- if (item.hasName() && item.getName().equals(key)) {
- return item.getValue().asReadOnlyByteBuffer();
- }
- }
- throw new IllegalArgumentException("Can't find user metadata " + key);
- }
-
- public boolean hasMetadataValue(String key) {
- for(OrcProto.UserMetadataItem item: userMetadata) {
- if (item.hasName() && item.getName().equals(key)) {
- return true;
- }
- }
- return false;
- }
-
- @Override
- public org.apache.orc.CompressionKind getCompressionKind() {
- return compressionKind;
- }
-
- @Override
- public int getCompressionSize() {
- return bufferSize;
- }
-
- @Override
- public List<StripeInformation> getStripes() {
- return stripes;
- }
-
- @Override
- public long getContentLength() {
- return contentLength;
- }
-
- @Override
- public List<OrcProto.Type> getTypes() {
- return types;
- }
-
- @Override
- public OrcFile.Version getFileVersion() {
- for (OrcFile.Version version: OrcFile.Version.values()) {
- if ((versionList != null && !versionList.isEmpty()) &&
- version.getMajor() == versionList.get(0) &&
- version.getMinor() == versionList.get(1)) {
- return version;
- }
- }
- return OrcFile.Version.V_0_11;
- }
-
- @Override
- public OrcFile.WriterVersion getWriterVersion() {
- return writerVersion;
- }
-
- @Override
- public OrcProto.FileTail getFileTail() {
- return tail.getFileTail();
- }
-
- @Override
- public int getRowIndexStride() {
- return rowIndexStride;
- }
-
- @Override
- public ColumnStatistics[] getStatistics() {
- ColumnStatistics[] result = new ColumnStatistics[types.size()];
- for(int i=0; i < result.length; ++i) {
- result[i] = ColumnStatisticsImpl.deserialize(fileStats.get(i));
- }
- return result;
- }
-
- @Override
- public TypeDescription getSchema() {
- return schema;
- }
-
- /**
- * Ensure this is an ORC file to prevent users from trying to read text
- * files or RC files as ORC files.
- * @param in the file being read
- * @param path the filename for error messages
- * @param psLen the postscript length
- * @param buffer the tail of the file
- * @throws IOException
- */
- protected static void ensureOrcFooter(FSDataInputStream in,
- Path path,
- int psLen,
- ByteBuffer buffer) throws IOException {
- int magicLength = OrcFile.MAGIC.length();
- int fullLength = magicLength + 1;
- if (psLen < fullLength || buffer.remaining() < fullLength) {
- throw new FileFormatException("Malformed ORC file " + path +
- ". Invalid postscript length " + psLen);
- }
- int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
- byte[] array = buffer.array();
- // now look for the magic string at the end of the postscript.
- if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
- // If it isn't there, this may be the 0.11.0 version of ORC.
- // Read the first 3 bytes of the file to check for the header
- byte[] header = new byte[magicLength];
- in.readFully(0, header, 0, magicLength);
- // if it isn't there, this isn't an ORC file
- if (!Text.decode(header, 0 , magicLength).equals(OrcFile.MAGIC)) {
- throw new FileFormatException("Malformed ORC file " + path +
- ". Invalid postscript.");
- }
- }
- }
-
- /**
- * Ensure this is an ORC file to prevent users from trying to read text
- * files or RC files as ORC files.
- * @param psLen the postscript length
- * @param buffer the tail of the file
- * @throws IOException
- */
- protected static void ensureOrcFooter(ByteBuffer buffer, int psLen) throws IOException {
- int magicLength = OrcFile.MAGIC.length();
- int fullLength = magicLength + 1;
- if (psLen < fullLength || buffer.remaining() < fullLength) {
- throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);
- }
-
- int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;
- byte[] array = buffer.array();
- // now look for the magic string at the end of the postscript.
- if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {
-      // if it isn't there, this may be the 0.11.0 version of the ORC file.
-      // Read the first 3 bytes from the buffer to check for the header
- if (!Text.decode(buffer.array(), 0, magicLength).equals(OrcFile.MAGIC)) {
- throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);
- }
- }
- }
-
- /**
-   * Build a version string out of the version number list.
- * @param version the version number as a list
- * @return the human readable form of the version string
- */
- private static String versionString(List<Integer> version) {
- StringBuilder buffer = new StringBuilder();
- for(int i=0; i < version.size(); ++i) {
- if (i != 0) {
- buffer.append('.');
- }
- buffer.append(version.get(i));
- }
- return buffer.toString();
- }
-
- /**
- * Check to see if this ORC file is from a future version and if so,
- * warn the user that we may not be able to read all of the column encodings.
- * @param log the logger to write any error message to
- * @param path the data source path for error messages
- * @param version the version of hive that wrote the file.
- */
- protected static void checkOrcVersion(Logger log, Path path,
- List<Integer> version) {
- if (version.size() >= 1) {
- int major = version.get(0);
- int minor = 0;
- if (version.size() >= 2) {
- minor = version.get(1);
- }
- if (major > OrcFile.Version.CURRENT.getMajor() ||
- (major == OrcFile.Version.CURRENT.getMajor() &&
- minor > OrcFile.Version.CURRENT.getMinor())) {
- log.warn(path + " was written by a future Hive version " +
- versionString(version) +
- ". This file may not be readable by this version of Hive.");
- }
- }
- }
-
- /**
-   * Constructor that lets the user specify additional options.
- * @param path pathname for file
- * @param options options for reading
- * @throws IOException
- */
- public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
- FileSystem fs = options.getFilesystem();
- if (fs == null) {
- fs = path.getFileSystem(options.getConfiguration());
- }
- this.fileSystem = fs;
- this.path = path;
- this.conf = options.getConfiguration();
- this.maxLength = options.getMaxLength();
- FileMetadata fileMetadata = options.getFileMetadata();
- if (fileMetadata != null) {
- this.compressionKind = fileMetadata.getCompressionKind();
- this.bufferSize = fileMetadata.getCompressionBufferSize();
- this.codec = PhysicalFsWriter.createCodec(compressionKind);
- this.metadataSize = fileMetadata.getMetadataSize();
- this.stripeStats = fileMetadata.getStripeStats();
- this.versionList = fileMetadata.getVersionList();
- this.writerVersion =
- OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum());
- this.types = fileMetadata.getTypes();
- this.rowIndexStride = fileMetadata.getRowIndexStride();
- this.contentLength = fileMetadata.getContentLength();
- this.numberOfRows = fileMetadata.getNumberOfRows();
- this.fileStats = fileMetadata.getFileStats();
- this.stripes = fileMetadata.getStripes();
- this.userMetadata = null; // not cached and not needed here
- } else {
- OrcTail orcTail = options.getOrcTail();
- if (orcTail == null) {
- tail = extractFileTail(fs, path, options.getMaxLength());
- options.orcTail(tail);
- } else {
- tail = orcTail;
- }
- this.compressionKind = tail.getCompressionKind();
- this.codec = tail.getCompressionCodec();
- this.bufferSize = tail.getCompressionBufferSize();
- this.metadataSize = tail.getMetadataSize();
- this.versionList = tail.getPostScript().getVersionList();
- this.types = tail.getFooter().getTypesList();
- this.rowIndexStride = tail.getFooter().getRowIndexStride();
- this.contentLength = tail.getFooter().getContentLength();
- this.numberOfRows = tail.getFooter().getNumberOfRows();
- this.userMetadata = tail.getFooter().getMetadataList();
- this.fileStats = tail.getFooter().getStatisticsList();
- this.writerVersion = tail.getWriterVersion();
- this.stripes = tail.getStripes();
- this.stripeStats = tail.getStripeStatisticsProto();
- }
- this.schema = OrcUtils.convertTypeFromProtobuf(this.types, 0);
- }
-
- /**
- * Get the WriterVersion based on the ORC file postscript.
- * @param writerVersion the integer writer version
- * @return the version of the software that produced the file
- */
- public static OrcFile.WriterVersion getWriterVersion(int writerVersion) {
- for(OrcFile.WriterVersion version: OrcFile.WriterVersion.values()) {
- if (version.getId() == writerVersion) {
- return version;
- }
- }
- return OrcFile.WriterVersion.FUTURE;
- }
-
- private static OrcProto.Footer extractFooter(ByteBuffer bb, int footerAbsPos,
- int footerSize, CompressionCodec codec, int bufferSize) throws IOException {
- bb.position(footerAbsPos);
- bb.limit(footerAbsPos + footerSize);
- return OrcProto.Footer.parseFrom(InStream.createCodedInputStream("footer",
- Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), footerSize, codec, bufferSize));
- }
-
- public static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
- int metadataSize, CompressionCodec codec, int bufferSize) throws IOException {
- bb.position(metadataAbsPos);
- bb.limit(metadataAbsPos + metadataSize);
- return OrcProto.Metadata.parseFrom(InStream.createCodedInputStream("metadata",
- Lists.<DiskRange>newArrayList(new BufferChunk(bb, 0)), metadataSize, codec, bufferSize));
- }
-
- private static OrcProto.PostScript extractPostScript(ByteBuffer bb, Path path,
- int psLen, int psAbsOffset) throws IOException {
- // TODO: when PB is upgraded to 2.6, newInstance(ByteBuffer) method should be used here.
- assert bb.hasArray();
- CodedInputStream in = CodedInputStream.newInstance(
- bb.array(), bb.arrayOffset() + psAbsOffset, psLen);
- OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);
- checkOrcVersion(LOG, path, ps.getVersionList());
-
- // Check compression codec.
- switch (ps.getCompression()) {
- case NONE:
- break;
- case ZLIB:
- break;
- case SNAPPY:
- break;
- case LZO:
- break;
- default:
- throw new IllegalArgumentException("Unknown compression");
- }
- return ps;
- }
-
- public static OrcTail extractFileTail(ByteBuffer buffer)
- throws IOException {
- return extractFileTail(buffer, -1, -1);
- }
-
- public static OrcTail extractFileTail(ByteBuffer buffer, long fileLength, long modificationTime)
- throws IOException {
- int readSize = buffer.limit();
- int psLen = buffer.get(readSize - 1) & 0xff;
- int psOffset = readSize - 1 - psLen;
- ensureOrcFooter(buffer, psLen);
- byte[] psBuffer = new byte[psLen];
- System.arraycopy(buffer.array(), psOffset, psBuffer, 0, psLen);
- OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(psBuffer);
- int footerSize = (int) ps.getFooterLength();
- CompressionCodec codec = PhysicalFsWriter
- .createCodec(CompressionKind.valueOf(ps.getCompression().name()));
- OrcProto.Footer footer = extractFooter(buffer,
- (int) (buffer.position() + ps.getMetadataLength()),
- footerSize, codec, (int) ps.getCompressionBlockSize());
- OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder()
- .setPostscriptLength(psLen)
- .setPostscript(ps)
- .setFooter(footer)
- .setFileLength(fileLength);
- // clear does not clear the contents but sets position to 0 and limit = capacity
- buffer.clear();
- return new OrcTail(fileTailBuilder.build(), buffer.slice(), modificationTime);
- }
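
The method above leans on the fixed tail layout of an ORC file: ... | metadata | footer | postscript | psLen (1 byte). A hedged sketch of just the PostScript-locating step, mirroring the arithmetic above:

    // Given a buffer holding the last bytes of an ORC file, find where the
    // PostScript starts (illustrative helper, not part of the patch).
    static int postScriptOffset(java.nio.ByteBuffer tail) {
      int readSize = tail.limit();
      int psLen = tail.get(readSize - 1) & 0xff;  // final byte = PostScript length
      return readSize - 1 - psLen;                // PostScript begins here
    }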
-
- protected OrcTail extractFileTail(FileSystem fs, Path path,
- long maxFileLength) throws IOException {
- FSDataInputStream file = fs.open(path);
- ByteBuffer buffer;
- OrcProto.PostScript ps;
- OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();
- long modificationTime;
- try {
- // figure out the size of the file using the option or filesystem
- long size;
- if (maxFileLength == Long.MAX_VALUE) {
- FileStatus fileStatus = fs.getFileStatus(path);
- size = fileStatus.getLen();
- modificationTime = fileStatus.getModificationTime();
- } else {
- size = maxFileLength;
- modificationTime = -1;
- }
- fileTailBuilder.setFileLength(size);
-
- //read last bytes into buffer to get PostScript
- int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
- buffer = ByteBuffer.allocate(readSize);
- assert buffer.position() == 0;
- file.readFully((size - readSize),
- buffer.array(), buffer.arrayOffset(), readSize);
- buffer.position(0);
-
- //read the PostScript
- //get length of PostScript
- int psLen = buffer.get(readSize - 1) & 0xff;
- ensureOrcFooter(file, path, psLen, buffer);
- int psOffset = readSize - 1 - psLen;
- ps = extractPostScript(buffer, path, psLen, psOffset);
- bufferSize = (int) ps.getCompressionBlockSize();
- codec = PhysicalFsWriter.createCodec(CompressionKind.valueOf(ps.getCompression().name()));
- fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps);
-
- int footerSize = (int) ps.getFooterLength();
- int metadataSize = (int) ps.getMetadataLength();
-
- //check if extra bytes need to be read
- int extra = Math.max(0, psLen + 1 + footerSize + metadataSize - readSize);
- int tailSize = 1 + psLen + footerSize + metadataSize;
- if (extra > 0) {
- //more bytes need to be read, seek back to the right place and read extra bytes
- ByteBuffer extraBuf = ByteBuffer.allocate(extra + readSize);
- file.readFully((size - readSize - extra), extraBuf.array(),
- extraBuf.arrayOffset() + extraBuf.position(), extra);
- extraBuf.position(extra);
- //append with already read bytes
- extraBuf.put(buffer);
- buffer = extraBuf;
- buffer.position(0);
- buffer.limit(tailSize);
- readSize += extra;
- psOffset = readSize - 1 - psLen;
- } else {
- //footer is already in the bytes in buffer, just adjust position, length
- buffer.position(psOffset - footerSize - metadataSize);
- buffer.limit(buffer.position() + tailSize);
- }
-
- buffer.mark();
- int footerOffset = psOffset - footerSize;
- buffer.position(footerOffset);
- ByteBuffer footerBuffer = buffer.slice();
- buffer.reset();
- OrcProto.Footer footer = extractFooter(footerBuffer, 0, footerSize,
- codec, bufferSize);
- fileTailBuilder.setFooter(footer);
- } finally {
- try {
- file.close();
- } catch (IOException ex) {
- LOG.error("Failed to close the file after another error", ex);
- }
- }
-
- ByteBuffer serializedTail = ByteBuffer.allocate(buffer.remaining());
- serializedTail.put(buffer.slice());
- serializedTail.rewind();
- return new OrcTail(fileTailBuilder.build(), serializedTail, modificationTime);
- }
-
- @Override
- public ByteBuffer getSerializedFileFooter() {
- return tail.getSerializedTail();
- }
-
- @Override
- public RecordReader rows() throws IOException {
- return rows(new Options());
- }
-
- @Override
- public RecordReader rows(Options options) throws IOException {
- LOG.info("Reading ORC rows from " + path + " with " + options);
- return new RecordReaderImpl(this, options);
- }
-
-
- @Override
- public long getRawDataSize() {
-    // if the deserialized size has not been computed yet, compute it; otherwise
-    // return the already computed size. Since we read it from the footer, we
-    // don't have to compute the deserialized size repeatedly.
- if (deserializedSize == -1) {
- List<Integer> indices = Lists.newArrayList();
- for (int i = 0; i < fileStats.size(); ++i) {
- indices.add(i);
- }
- deserializedSize = getRawDataSizeFromColIndices(indices);
- }
- return deserializedSize;
- }
-
- @Override
- public long getRawDataSizeFromColIndices(List<Integer> colIndices) {
- return getRawDataSizeFromColIndices(colIndices, types, fileStats);
- }
-
- public static long getRawDataSizeFromColIndices(
- List<Integer> colIndices, List<OrcProto.Type> types,
- List<OrcProto.ColumnStatistics> stats) {
- long result = 0;
- for (int colIdx : colIndices) {
- result += getRawDataSizeOfColumn(colIdx, types, stats);
- }
- return result;
- }
-
- private static long getRawDataSizeOfColumn(int colIdx, List<OrcProto.Type> types,
- List<OrcProto.ColumnStatistics> stats) {
- OrcProto.ColumnStatistics colStat = stats.get(colIdx);
- long numVals = colStat.getNumberOfValues();
- OrcProto.Type type = types.get(colIdx);
-
- switch (type.getKind()) {
- case BINARY:
-      // the old ORC format doesn't support binary statistics; checking for
-      // binary statistics is not required, as protocol buffers takes care of it.
- return colStat.getBinaryStatistics().getSum();
- case STRING:
- case CHAR:
- case VARCHAR:
-      // the old ORC format doesn't support a sum for string statistics; checking
-      // for its existence is not required, as protocol buffers takes care of it.
-
-      // ORC strings are deserialized to Java strings, so use the Java data
-      // model's string size
- numVals = numVals == 0 ? 1 : numVals;
- int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals);
- return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen);
- case TIMESTAMP:
- return numVals * JavaDataModel.get().lengthOfTimestamp();
- case DATE:
- return numVals * JavaDataModel.get().lengthOfDate();
- case DECIMAL:
- return numVals * JavaDataModel.get().lengthOfDecimal();
- case DOUBLE:
- case LONG:
- return numVals * JavaDataModel.get().primitive2();
- case FLOAT:
- case INT:
- case SHORT:
- case BOOLEAN:
- case BYTE:
- return numVals * JavaDataModel.get().primitive1();
- default:
- LOG.debug("Unknown primitive category: " + type.getKind());
- break;
- }
-
- return 0;
- }
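
As a worked example of the STRING branch above (numbers are illustrative): a column with 1,000 values and a statistics sum of 8,000 bytes averages 8 characters per string, so the estimate is:

    long numVals = 1000;                    // values in the column
    long sum = 8000;                        // total string bytes from statistics
    int avgStrLen = (int) (sum / numVals);  // 8
    long rawSize = numVals
        * JavaDataModel.get().lengthForStringOfLength(avgStrLen);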
-
- @Override
- public long getRawDataSizeOfColumns(List<String> colNames) {
- List<Integer> colIndices = getColumnIndicesFromNames(colNames);
- return getRawDataSizeFromColIndices(colIndices);
- }
-
- private List<Integer> getColumnIndicesFromNames(List<String> colNames) {
- // top level struct
- OrcProto.Type type = types.get(0);
- List<Integer> colIndices = Lists.newArrayList();
- List<String> fieldNames = type.getFieldNamesList();
- int fieldIdx;
- for (String colName : colNames) {
- if (fieldNames.contains(colName)) {
- fieldIdx = fieldNames.indexOf(colName);
- } else {
- String s = "Cannot find field for: " + colName + " in ";
- for (String fn : fieldNames) {
- s += fn + ", ";
- }
- LOG.warn(s);
- continue;
- }
-
- // a single field may span multiple columns. find start and end column
- // index for the requested field
- int idxStart = type.getSubtypes(fieldIdx);
-
- int idxEnd;
-
- // if the specified field is the last field, then the end index will be
- // the last column index
- if (fieldIdx + 1 > fieldNames.size() - 1) {
- idxEnd = getLastIdx() + 1;
- } else {
- idxEnd = type.getSubtypes(fieldIdx + 1);
- }
-
- // if start index and end index are same then the field is a primitive
- // field else complex field (like map, list, struct, union)
- if (idxStart == idxEnd) {
- // simple field
- colIndices.add(idxStart);
- } else {
- // a complex field spans multiple columns
- for (int i = idxStart; i < idxEnd; i++) {
- colIndices.add(i);
- }
- }
- }
- return colIndices;
- }
-
- private int getLastIdx() {
- Set<Integer> indices = new HashSet<>();
- for (OrcProto.Type type : types) {
- indices.addAll(type.getSubtypesList());
- }
- return Collections.max(indices);
- }
-
- @Override
- public List<OrcProto.StripeStatistics> getOrcProtoStripeStatistics() {
- return stripeStats;
- }
-
- @Override
- public List<OrcProto.ColumnStatistics> getOrcProtoFileStatistics() {
- return fileStats;
- }
-
- @Override
- public List<StripeStatistics> getStripeStatistics() throws IOException {
- if (stripeStats == null && metadata == null) {
- metadata = extractMetadata(tail.getSerializedTail(), 0, metadataSize, codec, bufferSize);
- stripeStats = metadata.getStripeStatsList();
- }
- List<StripeStatistics> result = new ArrayList<>();
- for (OrcProto.StripeStatistics ss : stripeStats) {
- result.add(new StripeStatistics(ss.getColStatsList()));
- }
- return result;
- }
-
- public List<OrcProto.UserMetadataItem> getOrcProtoUserMetadata() {
- return userMetadata;
- }
-
- @Override
- public List<Integer> getVersionList() {
- return versionList;
- }
-
- @Override
- public int getMetadataSize() {
- return metadataSize;
- }
-
- @Override
- public String toString() {
- StringBuilder buffer = new StringBuilder();
- buffer.append("ORC Reader(");
- buffer.append(path);
- if (maxLength != -1) {
- buffer.append(", ");
- buffer.append(maxLength);
- }
- buffer.append(")");
- return buffer.toString();
- }
-}
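
For orientation: the getRawDataSizeOfColumn method removed above estimates the
deserialized size of a column purely from footer statistics. Binary and string
columns use the statistics' length sums, while the remaining types multiply the
value count by a fixed per-type cost from JavaDataModel. The sketch below mirrors
that shape with plain inputs; RawSizeSketch and its byte costs are illustrative
stand-ins (rough 64-bit JVM estimates), not Hive's actual JavaDataModel values.

    // Illustrative sketch, not Hive code: same shape as
    // ReaderImpl.getRawDataSizeOfColumn, with plain longs instead of
    // OrcProto.ColumnStatistics and approximate per-type byte costs.
    public class RawSizeSketch {
      enum Kind { BINARY, STRING, TIMESTAMP, DATE, DECIMAL, DOUBLE, LONG, INT, BOOLEAN }

      // numValues: value count from the column statistics
      // sumLength: total binary/string length from the statistics (0 for other types)
      static long rawSize(Kind kind, long numValues, long sumLength) {
        switch (kind) {
          case BINARY:
            return sumLength;                         // sum of binary lengths
          case STRING: {
            long n = numValues == 0 ? 1 : numValues;  // same divide-by-zero guard as above
            long avgLen = sumLength / n;
            return n * (56 + 2 * avgLen);             // rough java.lang.String footprint
          }
          case TIMESTAMP: return numValues * 40;
          case DATE:      return numValues * 32;
          case DECIMAL:   return numValues * 48;
          case DOUBLE:
          case LONG:      return numValues * 8;       // "primitive2" types
          default:        return numValues * 4;       // int, boolean, etc. ("primitive1")
        }
      }

      public static void main(String[] args) {
        // e.g. a string column with 1000 values totalling 20000 characters
        System.out.println(rawSize(Kind.STRING, 1000, 20000));
      }
    }
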
[17/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/RunLengthByteWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RunLengthByteWriter.java b/orc/src/java/org/apache/orc/impl/RunLengthByteWriter.java
deleted file mode 100644
index 09108b2..0000000
--- a/orc/src/java/org/apache/orc/impl/RunLengthByteWriter.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-
-/**
- * A streamFactory that writes a sequence of bytes. A control byte is written before
- * each run with positive values 0 to 127 meaning 3 to 130 repetitions. If the
- * control byte is -1 to -128, 1 to 128 literal byte values follow.
- */
-public class RunLengthByteWriter {
- static final int MIN_REPEAT_SIZE = 3;
- static final int MAX_LITERAL_SIZE = 128;
- static final int MAX_REPEAT_SIZE= 127 + MIN_REPEAT_SIZE;
- private final PositionedOutputStream output;
- private final byte[] literals = new byte[MAX_LITERAL_SIZE];
- private int numLiterals = 0;
- private boolean repeat = false;
- private int tailRunLength = 0;
-
- public RunLengthByteWriter(PositionedOutputStream output) {
- this.output = output;
- }
-
- private void writeValues() throws IOException {
- if (numLiterals != 0) {
- if (repeat) {
- output.write(numLiterals - MIN_REPEAT_SIZE);
- output.write(literals, 0, 1);
- } else {
- output.write(-numLiterals);
- output.write(literals, 0, numLiterals);
- }
- repeat = false;
- tailRunLength = 0;
- numLiterals = 0;
- }
- }
-
- public void flush() throws IOException {
- writeValues();
- output.flush();
- }
-
- public void write(byte value) throws IOException {
- if (numLiterals == 0) {
- literals[numLiterals++] = value;
- tailRunLength = 1;
- } else if (repeat) {
- if (value == literals[0]) {
- numLiterals += 1;
- if (numLiterals == MAX_REPEAT_SIZE) {
- writeValues();
- }
- } else {
- writeValues();
- literals[numLiterals++] = value;
- tailRunLength = 1;
- }
- } else {
- if (value == literals[numLiterals - 1]) {
- tailRunLength += 1;
- } else {
- tailRunLength = 1;
- }
- if (tailRunLength == MIN_REPEAT_SIZE) {
- if (numLiterals + 1 == MIN_REPEAT_SIZE) {
- repeat = true;
- numLiterals += 1;
- } else {
- numLiterals -= MIN_REPEAT_SIZE - 1;
- writeValues();
- literals[0] = value;
- repeat = true;
- numLiterals = MIN_REPEAT_SIZE;
- }
- } else {
- literals[numLiterals++] = value;
- if (numLiterals == MAX_LITERAL_SIZE) {
- writeValues();
- }
- }
- }
- }
-
- public void getPosition(PositionRecorder recorder) throws IOException {
- output.getPosition(recorder);
- recorder.addPosition(numLiterals);
- }
-}
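
For orientation: the byte run-length format produced by the RunLengthByteWriter
removed above is a control byte of 0 to 127 for a run of (control + 3) copies of
the next byte, or a negative control byte -n followed by n literal bytes. The
one-shot encoder below is an illustrative sketch of that framing only;
ByteRleSketch is not a Hive class and skips the incremental buffering the real
writer does.

    import java.io.ByteArrayOutputStream;

    // Illustrative sketch, not Hive code: encode a byte[] with the same
    // control-byte framing as RunLengthByteWriter.
    public class ByteRleSketch {
      static byte[] encode(byte[] input) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        int i = 0;
        while (i < input.length) {
          // measure the run starting at i, capped at 130 (= 127 + MIN_REPEAT of 3)
          int run = 1;
          while (i + run < input.length && run < 130 && input[i + run] == input[i]) {
            run++;
          }
          if (run >= 3) {
            out.write(run - 3);                  // control byte: repetitions - 3
            out.write(input[i]);                 // the repeated value
            i += run;
          } else {
            // collect literals until a run of >= 3 starts or 128 literals are buffered
            int start = i;
            int literals = 0;
            while (i < input.length && literals < 128) {
              int r = 1;
              while (i + r < input.length && r < 3 && input[i + r] == input[i]) {
                r++;
              }
              if (r >= 3) {
                break;                           // a new repeat run starts here
              }
              i++;
              literals++;
            }
            out.write(-literals);                // negative control: literal count
            out.write(input, start, literals);
          }
        }
        return out.toByteArray();
      }

      public static void main(String[] args) {
        byte[] demo = {7, 7, 7, 7, 7, 1, 2, 3};
        // expected framing: 2 7 (a run of five 7s) then -3 1 2 3 (three literals)
        for (byte b : encode(demo)) {
          System.out.print(b + " ");
        }
      }
    }
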
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/RunLengthIntegerReader.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RunLengthIntegerReader.java b/orc/src/java/org/apache/orc/impl/RunLengthIntegerReader.java
deleted file mode 100644
index b91a263..0000000
--- a/orc/src/java/org/apache/orc/impl/RunLengthIntegerReader.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.EOFException;
-import java.io.IOException;
-
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-
-/**
- * A reader that reads a sequence of integers.
- */
-public class RunLengthIntegerReader implements IntegerReader {
- private InStream input;
- private final boolean signed;
- private final long[] literals =
- new long[RunLengthIntegerWriter.MAX_LITERAL_SIZE];
- private int numLiterals = 0;
- private int delta = 0;
- private int used = 0;
- private boolean repeat = false;
- private SerializationUtils utils;
-
- public RunLengthIntegerReader(InStream input, boolean signed) throws IOException {
- this.input = input;
- this.signed = signed;
- this.utils = new SerializationUtils();
- }
-
- private void readValues(boolean ignoreEof) throws IOException {
- int control = input.read();
- if (control == -1) {
- if (!ignoreEof) {
- throw new EOFException("Read past end of RLE integer from " + input);
- }
- used = numLiterals = 0;
- return;
- } else if (control < 0x80) {
- numLiterals = control + RunLengthIntegerWriter.MIN_REPEAT_SIZE;
- used = 0;
- repeat = true;
- delta = input.read();
- if (delta == -1) {
- throw new EOFException("End of stream in RLE Integer from " + input);
- }
- // convert from 0 to 255 to -128 to 127 by converting to a signed byte
- delta = (byte) (0 + delta);
- if (signed) {
- literals[0] = utils.readVslong(input);
- } else {
- literals[0] = utils.readVulong(input);
- }
- } else {
- repeat = false;
- numLiterals = 0x100 - control;
- used = 0;
- for(int i=0; i < numLiterals; ++i) {
- if (signed) {
- literals[i] = utils.readVslong(input);
- } else {
- literals[i] = utils.readVulong(input);
- }
- }
- }
- }
-
- @Override
- public boolean hasNext() throws IOException {
- return used != numLiterals || input.available() > 0;
- }
-
- @Override
- public long next() throws IOException {
- long result;
- if (used == numLiterals) {
- readValues(false);
- }
- if (repeat) {
- result = literals[0] + (used++) * delta;
- } else {
- result = literals[used++];
- }
- return result;
- }
-
- @Override
- public void nextVector(ColumnVector previous,
- long[] data,
- int previousLen) throws IOException {
- previous.isRepeating = true;
- for (int i = 0; i < previousLen; i++) {
- if (!previous.isNull[i]) {
- data[i] = next();
- } else {
- // The default value of null for int type in vectorized
- // processing is 1, so set that if the value is null
- data[i] = 1;
- }
-
- // The default value for nulls in Vectorization for int types is 1
- // and given that non null value can also be 1, we need to check for isNull also
- // when determining the isRepeating flag.
- if (previous.isRepeating
- && i > 0
- && (data[0] != data[i] || previous.isNull[0] != previous.isNull[i])) {
- previous.isRepeating = false;
- }
- }
- }
-
- @Override
- public void nextVector(ColumnVector vector,
- int[] data,
- int size) throws IOException {
- if (vector.noNulls) {
- for(int r=0; r < data.length && r < size; ++r) {
- data[r] = (int) next();
- }
- } else if (!(vector.isRepeating && vector.isNull[0])) {
- for(int r=0; r < data.length && r < size; ++r) {
- if (!vector.isNull[r]) {
- data[r] = (int) next();
- } else {
- data[r] = 1;
- }
- }
- }
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- input.seek(index);
- int consumed = (int) index.getNext();
- if (consumed != 0) {
- // a loop is required for cases where we break the run into two parts
- while (consumed > 0) {
- readValues(false);
- used = consumed;
- consumed -= numLiterals;
- }
- } else {
- used = 0;
- numLiterals = 0;
- }
- }
-
- @Override
- public void skip(long numValues) throws IOException {
- while (numValues > 0) {
- if (used == numLiterals) {
- readValues(false);
- }
- long consume = Math.min(numValues, numLiterals - used);
- used += consume;
- numValues -= consume;
- }
- }
-}
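
For orientation: the version-1 integer RLE consumed by the RunLengthIntegerReader
removed above is framed as either a run (control byte below 0x80 giving the run
length minus 3, a signed single-byte delta, then the base value as a vint) or a
block of literals (control byte 0x80..0xff giving 256 minus the literal count,
followed by that many vints). The decoder below is an illustrative sketch of the
unsigned case only; IntRleV1DecodeSketch is not a Hive class, and the real reader
additionally handles signed (zigzag) values and reads from an InStream.

    import java.util.ArrayList;
    import java.util.List;

    // Illustrative sketch, not Hive code: decode the v1 integer RLE framing
    // that RunLengthIntegerReader.readValues handles, unsigned values only.
    public class IntRleV1DecodeSketch {
      static List<Long> decode(byte[] data) {
        List<Long> out = new ArrayList<>();
        int[] pos = {0};
        while (pos[0] < data.length) {
          int control = data[pos[0]++] & 0xff;
          if (control < 0x80) {
            int count = control + 3;             // run length is control + MIN_REPEAT
            long delta = data[pos[0]++];         // signed single-byte delta
            long base = readVulong(data, pos);   // base value as unsigned varint
            for (int i = 0; i < count; i++) {
              out.add(base + i * delta);
            }
          } else {
            int count = 0x100 - control;         // literal count
            for (int i = 0; i < count; i++) {
              out.add(readVulong(data, pos));
            }
          }
        }
        return out;
      }

      // base-128 varint: 7 value bits per byte, high bit set means "more bytes"
      static long readVulong(byte[] data, int[] pos) {
        long result = 0;
        int shift = 0;
        while (true) {
          int b = data[pos[0]++] & 0xff;
          result |= ((long) (b & 0x7f)) << shift;
          if ((b & 0x80) == 0) {
            return result;
          }
          shift += 7;
        }
      }

      public static void main(String[] args) {
        // run: control 2 (5 values), delta 1, base 7 -> 7, 8, 9, 10, 11
        // literals: control 0xfe (2 values), varints 100 and 200
        byte[] encoded = {2, 1, 7, (byte) 0xfe, 100, (byte) 0xc8, 0x01};
        System.out.println(decode(encoded));     // [7, 8, 9, 10, 11, 100, 200]
      }
    }
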
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/RunLengthIntegerReaderV2.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RunLengthIntegerReaderV2.java b/orc/src/java/org/apache/orc/impl/RunLengthIntegerReaderV2.java
deleted file mode 100644
index 610d9b5..0000000
--- a/orc/src/java/org/apache/orc/impl/RunLengthIntegerReaderV2.java
+++ /dev/null
@@ -1,406 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.util.Arrays;
-
-import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * A reader that reads a sequence of lightweight compressed integers. Refer to
- * {@link RunLengthIntegerWriterV2} for a description of the various lightweight
- * compression techniques.
- */
-public class RunLengthIntegerReaderV2 implements IntegerReader {
- public static final Logger LOG = LoggerFactory.getLogger(RunLengthIntegerReaderV2.class);
-
- private InStream input;
- private final boolean signed;
- private final long[] literals = new long[RunLengthIntegerWriterV2.MAX_SCOPE];
- private boolean isRepeating = false;
- private int numLiterals = 0;
- private int used = 0;
- private final boolean skipCorrupt;
- private final SerializationUtils utils;
- private RunLengthIntegerWriterV2.EncodingType currentEncoding;
-
- public RunLengthIntegerReaderV2(InStream input, boolean signed,
- boolean skipCorrupt) throws IOException {
- this.input = input;
- this.signed = signed;
- this.skipCorrupt = skipCorrupt;
- this.utils = new SerializationUtils();
- }
-
- private final static RunLengthIntegerWriterV2.EncodingType[] encodings = RunLengthIntegerWriterV2.EncodingType.values();
- private void readValues(boolean ignoreEof) throws IOException {
- // read the first 2 bits and determine the encoding type
- isRepeating = false;
- int firstByte = input.read();
- if (firstByte < 0) {
- if (!ignoreEof) {
- throw new EOFException("Read past end of RLE integer from " + input);
- }
- used = numLiterals = 0;
- return;
- }
- currentEncoding = encodings[(firstByte >>> 6) & 0x03];
- switch (currentEncoding) {
- case SHORT_REPEAT: readShortRepeatValues(firstByte); break;
- case DIRECT: readDirectValues(firstByte); break;
- case PATCHED_BASE: readPatchedBaseValues(firstByte); break;
- case DELTA: readDeltaValues(firstByte); break;
- default: throw new IOException("Unknown encoding " + currentEncoding);
- }
- }
-
- private void readDeltaValues(int firstByte) throws IOException {
-
- // extract the number of fixed bits
- int fb = (firstByte >>> 1) & 0x1f;
- if (fb != 0) {
- fb = utils.decodeBitWidth(fb);
- }
-
- // extract the blob run length
- int len = (firstByte & 0x01) << 8;
- len |= input.read();
-
- // read the first value stored as vint
- long firstVal = 0;
- if (signed) {
- firstVal = utils.readVslong(input);
- } else {
- firstVal = utils.readVulong(input);
- }
-
- // store first value to result buffer
- long prevVal = firstVal;
- literals[numLiterals++] = firstVal;
-
- // if fixed bits is 0 then all values have fixed delta
- if (fb == 0) {
- // read the fixed delta value stored as vint (deltas can be negative even
- // if all numbers are positive)
- long fd = utils.readVslong(input);
- if (fd == 0) {
- isRepeating = true;
- assert numLiterals == 1;
- Arrays.fill(literals, numLiterals, numLiterals + len, literals[0]);
- numLiterals += len;
- } else {
- // add fixed deltas to adjacent values
- for(int i = 0; i < len; i++) {
- literals[numLiterals++] = literals[numLiterals - 2] + fd;
- }
- }
- } else {
- long deltaBase = utils.readVslong(input);
- // add delta base and first value
- literals[numLiterals++] = firstVal + deltaBase;
- prevVal = literals[numLiterals - 1];
- len -= 1;
-
- // write the unpacked values, add it to previous value and store final
- // value to result buffer. if the delta base value is negative then it
- // is a decreasing sequence else an increasing sequence
- utils.readInts(literals, numLiterals, len, fb, input);
- while (len > 0) {
- if (deltaBase < 0) {
- literals[numLiterals] = prevVal - literals[numLiterals];
- } else {
- literals[numLiterals] = prevVal + literals[numLiterals];
- }
- prevVal = literals[numLiterals];
- len--;
- numLiterals++;
- }
- }
- }
-
- private void readPatchedBaseValues(int firstByte) throws IOException {
-
- // extract the number of fixed bits
- int fbo = (firstByte >>> 1) & 0x1f;
- int fb = utils.decodeBitWidth(fbo);
-
- // extract the run length of data blob
- int len = (firstByte & 0x01) << 8;
- len |= input.read();
- // runs are always one off
- len += 1;
-
- // extract the number of bytes occupied by base
- int thirdByte = input.read();
- int bw = (thirdByte >>> 5) & 0x07;
- // base width is one off
- bw += 1;
-
- // extract patch width
- int pwo = thirdByte & 0x1f;
- int pw = utils.decodeBitWidth(pwo);
-
- // read fourth byte and extract patch gap width
- int fourthByte = input.read();
- int pgw = (fourthByte >>> 5) & 0x07;
- // patch gap width is one off
- pgw += 1;
-
- // extract the length of the patch list
- int pl = fourthByte & 0x1f;
-
- // read the next base width number of bytes to extract base value
- long base = utils.bytesToLongBE(input, bw);
- long mask = (1L << ((bw * 8) - 1));
- // if MSB of base value is 1 then base is negative value else positive
- if ((base & mask) != 0) {
- base = base & ~mask;
- base = -base;
- }
-
- // unpack the data blob
- long[] unpacked = new long[len];
- utils.readInts(unpacked, 0, len, fb, input);
-
- // unpack the patch blob
- long[] unpackedPatch = new long[pl];
-
- if ((pw + pgw) > 64 && !skipCorrupt) {
- throw new IOException("Corruption in ORC data encountered. To skip" +
- " reading corrupted data, set hive.exec.orc.skip.corrupt.data to" +
- " true");
- }
- int bitSize = utils.getClosestFixedBits(pw + pgw);
- utils.readInts(unpackedPatch, 0, pl, bitSize, input);
-
- // apply the patch directly when decoding the packed data
- int patchIdx = 0;
- long currGap = 0;
- long currPatch = 0;
- long patchMask = ((1L << pw) - 1);
- currGap = unpackedPatch[patchIdx] >>> pw;
- currPatch = unpackedPatch[patchIdx] & patchMask;
- long actualGap = 0;
-
- // special case: if the gap is >255 then the patch value will be 0.
- // if the gap is <=255 then the patch value cannot be 0
- while (currGap == 255 && currPatch == 0) {
- actualGap += 255;
- patchIdx++;
- currGap = unpackedPatch[patchIdx] >>> pw;
- currPatch = unpackedPatch[patchIdx] & patchMask;
- }
- // add the left over gap
- actualGap += currGap;
-
- // unpack data blob, patch it (if required), add base to get final result
- for(int i = 0; i < unpacked.length; i++) {
- if (i == actualGap) {
- // extract the patch value
- long patchedVal = unpacked[i] | (currPatch << fb);
-
- // add base to patched value
- literals[numLiterals++] = base + patchedVal;
-
- // increment the patch to point to next entry in patch list
- patchIdx++;
-
- if (patchIdx < pl) {
- // read the next gap and patch
- currGap = unpackedPatch[patchIdx] >>> pw;
- currPatch = unpackedPatch[patchIdx] & patchMask;
- actualGap = 0;
-
- // special case: if the gap is >255 then the patch will be 0. if the
- // gap is <=255 then the patch cannot be 0
- while (currGap == 255 && currPatch == 0) {
- actualGap += 255;
- patchIdx++;
- currGap = unpackedPatch[patchIdx] >>> pw;
- currPatch = unpackedPatch[patchIdx] & patchMask;
- }
- // add the left over gap
- actualGap += currGap;
-
- // next gap is relative to the current gap
- actualGap += i;
- }
- } else {
- // no patching required. add base to unpacked value to get final value
- literals[numLiterals++] = base + unpacked[i];
- }
- }
-
- }
-
- private void readDirectValues(int firstByte) throws IOException {
-
- // extract the number of fixed bits
- int fbo = (firstByte >>> 1) & 0x1f;
- int fb = utils.decodeBitWidth(fbo);
-
- // extract the run length
- int len = (firstByte & 0x01) << 8;
- len |= input.read();
- // runs are one off
- len += 1;
-
- // write the unpacked values and zigzag decode to result buffer
- utils.readInts(literals, numLiterals, len, fb, input);
- if (signed) {
- for(int i = 0; i < len; i++) {
- literals[numLiterals] = utils.zigzagDecode(literals[numLiterals]);
- numLiterals++;
- }
- } else {
- numLiterals += len;
- }
- }
-
- private void readShortRepeatValues(int firstByte) throws IOException {
-
- // read the number of bytes occupied by the value
- int size = (firstByte >>> 3) & 0x07;
- // #bytes are one off
- size += 1;
-
- // read the run length
- int len = firstByte & 0x07;
- // run lengths values are stored only after MIN_REPEAT value is met
- len += RunLengthIntegerWriterV2.MIN_REPEAT;
-
- // read the repeated value which is store using fixed bytes
- long val = utils.bytesToLongBE(input, size);
-
- if (signed) {
- val = utils.zigzagDecode(val);
- }
-
- if (numLiterals != 0) {
- // Currently this always holds, which makes peekNextAvailLength simpler.
- // If this changes, peekNextAvailLength should be adjusted accordingly.
- throw new AssertionError("readValues called with existing values present");
- }
- // repeat the value for length times
- isRepeating = true;
- // TODO: this is not so useful and V1 reader doesn't do that. Fix? Same if delta == 0
- for(int i = 0; i < len; i++) {
- literals[i] = val;
- }
- numLiterals = len;
- }
-
- @Override
- public boolean hasNext() throws IOException {
- return used != numLiterals || input.available() > 0;
- }
-
- @Override
- public long next() throws IOException {
- long result;
- if (used == numLiterals) {
- numLiterals = 0;
- used = 0;
- readValues(false);
- }
- result = literals[used++];
- return result;
- }
-
- @Override
- public void seek(PositionProvider index) throws IOException {
- input.seek(index);
- int consumed = (int) index.getNext();
- if (consumed != 0) {
- // a loop is required for cases where we break the run into two
- // parts
- while (consumed > 0) {
- numLiterals = 0;
- readValues(false);
- used = consumed;
- consumed -= numLiterals;
- }
- } else {
- used = 0;
- numLiterals = 0;
- }
- }
-
- @Override
- public void skip(long numValues) throws IOException {
- while (numValues > 0) {
- if (used == numLiterals) {
- numLiterals = 0;
- used = 0;
- readValues(false);
- }
- long consume = Math.min(numValues, numLiterals - used);
- used += consume;
- numValues -= consume;
- }
- }
-
- @Override
- public void nextVector(ColumnVector previous,
- long[] data,
- int previousLen) throws IOException {
- previous.isRepeating = true;
- for (int i = 0; i < previousLen; i++) {
- if (!previous.isNull[i]) {
- data[i] = next();
- } else {
- // The default value of null for int type in vectorized
- // processing is 1, so set that if the value is null
- data[i] = 1;
- }
-
- // The default value for nulls in Vectorization for int types is 1
- // and given that non null value can also be 1, we need to check for isNull also
- // when determining the isRepeating flag.
- if (previous.isRepeating
- && i > 0
- && (data[0] != data[i] ||
- previous.isNull[0] != previous.isNull[i])) {
- previous.isRepeating = false;
- }
- }
- }
-
- @Override
- public void nextVector(ColumnVector vector,
- int[] data,
- int size) throws IOException {
- if (vector.noNulls) {
- for(int r=0; r < data.length && r < size; ++r) {
- data[r] = (int) next();
- }
- } else if (!(vector.isRepeating && vector.isNull[0])) {
- for(int r=0; r < data.length && r < size; ++r) {
- if (!vector.isNull[r]) {
- data[r] = (int) next();
- } else {
- data[r] = 1;
- }
- }
- }
- }
-}
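
For orientation: every run in the version-2 format removed above starts with a
header byte whose top two bits select the encoding (readValues dispatches on
them). The simplest case, SHORT_REPEAT, packs the value's byte width and the run
length into the remaining six bits. The sketch below decodes just that header;
ShortRepeatSketch is illustrative, not a Hive class, and assumes an unsigned
column (the real reader additionally zigzag-decodes signed values).

    import java.util.Arrays;

    // Illustrative sketch, not Hive code: SHORT_REPEAT header layout as handled
    // by readShortRepeatValues - 2 bits encoding, 3 bits value width (one off),
    // 3 bits run length (MIN_REPEAT of 3 is added back), then the repeated value
    // stored big endian in 'width' bytes.
    public class ShortRepeatSketch {
      static long[] decodeShortRepeat(byte[] data) {
        int header = data[0] & 0xff;
        int encoding = (header >>> 6) & 0x03;
        if (encoding != 0) {
          throw new IllegalArgumentException("not a SHORT_REPEAT run");
        }
        int width = ((header >>> 3) & 0x07) + 1;    // value width in bytes, one off
        int repeat = (header & 0x07) + 3;           // run length, MIN_REPEAT = 3
        long value = 0;
        for (int i = 0; i < width; i++) {
          value = (value << 8) | (data[1 + i] & 0xff);
        }
        long[] out = new long[repeat];
        Arrays.fill(out, value);
        return out;
      }

      public static void main(String[] args) {
        // header 0x0a = 00 001 010 -> SHORT_REPEAT, 2-byte value, 2 + 3 = 5 repeats
        byte[] encoded = {0x0a, 0x01, (byte) 0x2c}; // value 0x012c = 300
        System.out.println(Arrays.toString(decodeShortRepeat(encoded)));
        // prints [300, 300, 300, 300, 300]
      }
    }
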
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/RunLengthIntegerWriter.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RunLengthIntegerWriter.java b/orc/src/java/org/apache/orc/impl/RunLengthIntegerWriter.java
deleted file mode 100644
index 3e5f2e2..0000000
--- a/orc/src/java/org/apache/orc/impl/RunLengthIntegerWriter.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-
-/**
- * A streamFactory that writes a sequence of integers. A control byte is written before
- * each run with positive values 0 to 127 meaning 3 to 130 repetitions, each
- * repetition is offset by a delta. If the control byte is -1 to -128, 1 to 128
- * literal vint values follow.
- */
-public class RunLengthIntegerWriter implements IntegerWriter {
- static final int MIN_REPEAT_SIZE = 3;
- static final int MAX_DELTA = 127;
- static final int MIN_DELTA = -128;
- static final int MAX_LITERAL_SIZE = 128;
- private static final int MAX_REPEAT_SIZE = 127 + MIN_REPEAT_SIZE;
- private final PositionedOutputStream output;
- private final boolean signed;
- private final long[] literals = new long[MAX_LITERAL_SIZE];
- private int numLiterals = 0;
- private long delta = 0;
- private boolean repeat = false;
- private int tailRunLength = 0;
- private SerializationUtils utils;
-
- public RunLengthIntegerWriter(PositionedOutputStream output,
- boolean signed) {
- this.output = output;
- this.signed = signed;
- this.utils = new SerializationUtils();
- }
-
- private void writeValues() throws IOException {
- if (numLiterals != 0) {
- if (repeat) {
- output.write(numLiterals - MIN_REPEAT_SIZE);
- output.write((byte) delta);
- if (signed) {
- utils.writeVslong(output, literals[0]);
- } else {
- utils.writeVulong(output, literals[0]);
- }
- } else {
- output.write(-numLiterals);
- for(int i=0; i < numLiterals; ++i) {
- if (signed) {
- utils.writeVslong(output, literals[i]);
- } else {
- utils.writeVulong(output, literals[i]);
- }
- }
- }
- repeat = false;
- numLiterals = 0;
- tailRunLength = 0;
- }
- }
-
- @Override
- public void flush() throws IOException {
- writeValues();
- output.flush();
- }
-
- @Override
- public void write(long value) throws IOException {
- if (numLiterals == 0) {
- literals[numLiterals++] = value;
- tailRunLength = 1;
- } else if (repeat) {
- if (value == literals[0] + delta * numLiterals) {
- numLiterals += 1;
- if (numLiterals == MAX_REPEAT_SIZE) {
- writeValues();
- }
- } else {
- writeValues();
- literals[numLiterals++] = value;
- tailRunLength = 1;
- }
- } else {
- if (tailRunLength == 1) {
- delta = value - literals[numLiterals - 1];
- if (delta < MIN_DELTA || delta > MAX_DELTA) {
- tailRunLength = 1;
- } else {
- tailRunLength = 2;
- }
- } else if (value == literals[numLiterals - 1] + delta) {
- tailRunLength += 1;
- } else {
- delta = value - literals[numLiterals - 1];
- if (delta < MIN_DELTA || delta > MAX_DELTA) {
- tailRunLength = 1;
- } else {
- tailRunLength = 2;
- }
- }
- if (tailRunLength == MIN_REPEAT_SIZE) {
- if (numLiterals + 1 == MIN_REPEAT_SIZE) {
- repeat = true;
- numLiterals += 1;
- } else {
- numLiterals -= MIN_REPEAT_SIZE - 1;
- long base = literals[numLiterals];
- writeValues();
- literals[0] = base;
- repeat = true;
- numLiterals = MIN_REPEAT_SIZE;
- }
- } else {
- literals[numLiterals++] = value;
- if (numLiterals == MAX_LITERAL_SIZE) {
- writeValues();
- }
- }
- }
- }
-
- @Override
- public void getPosition(PositionRecorder recorder) throws IOException {
- output.getPosition(recorder);
- recorder.addPosition(numLiterals);
- }
-
-}
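
For orientation: RunLengthIntegerWriter above delegates the actual value
serialization to SerializationUtils.writeVulong/writeVslong, whose implementation
is not part of this diff. The sketch below shows the usual ORC vint scheme those
names refer to - base-128 bytes with a continuation bit, and zigzag mapping for
signed values - but treat it as an approximation for illustration; VarIntSketch is
not the Hive class.

    import java.io.ByteArrayOutputStream;

    // Illustrative sketch, not Hive code: base-128 varints with zigzag for
    // signed values, the shape of encoding the RLE writers rely on.
    public class VarIntSketch {
      static void writeVulong(ByteArrayOutputStream out, long value) {
        while (true) {
          if ((value & ~0x7fL) == 0) {        // the remaining bits fit in 7 bits
            out.write((int) value);
            return;
          }
          out.write((int) (0x80 | (value & 0x7f)));
          value >>>= 7;
        }
      }

      static void writeVslong(ByteArrayOutputStream out, long value) {
        // zigzag: 0, -1, 1, -2, 2, ... -> 0, 1, 2, 3, 4, ...
        writeVulong(out, (value << 1) ^ (value >> 63));
      }

      public static void main(String[] args) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        writeVslong(out, -3);                 // zigzag(-3) = 5 -> single byte 0x05
        writeVulong(out, 300);                // 300 -> 0xac 0x02
        for (byte b : out.toByteArray()) {
          System.out.printf("%02x ", b & 0xff);
        }
      }
    }
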
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java b/orc/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
deleted file mode 100644
index fab2801..0000000
--- a/orc/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
+++ /dev/null
@@ -1,831 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-
-/**
- * A writer that performs lightweight compression over a sequence of integers.
- * <p>
- * There are four types of lightweight integer compression
- * <ul>
- * <li>SHORT_REPEAT</li>
- * <li>DIRECT</li>
- * <li>PATCHED_BASE</li>
- * <li>DELTA</li>
- * </ul>
- * </p>
- * The description and format for these types are as below:
- * <p>
- * <b>SHORT_REPEAT:</b> Used for short repeated integer sequences.
- * <ul>
- * <li>1 byte header
- * <ul>
- * <li>2 bits for encoding type</li>
- * <li>3 bits for bytes required for repeating value</li>
- * <li>3 bits for repeat count (MIN_REPEAT + run length)</li>
- * </ul>
- * </li>
- * <li>Blob - repeat value (fixed bytes)</li>
- * </ul>
- * </p>
- * <p>
- * <b>DIRECT:</b> Used for random integer sequences whose bit width
- * requirement doesn't vary much.
- * <ul>
- * <li>2 bytes header
- * <ul>
- * 1st byte
- * <li>2 bits for encoding type</li>
- * <li>5 bits for fixed bit width of values in blob</li>
- * <li>1 bit for storing MSB of run length</li>
- * </ul>
- * <ul>
- * 2nd byte
- * <li>8 bits for lower run length bits</li>
- * </ul>
- * </li>
- * <li>Blob - stores the direct values using fixed bit width. The length of the
- * data blob is (fixed width * run length) bits long</li>
- * </ul>
- * </p>
- * <p>
- * <b>PATCHED_BASE:</b> Used for random integer sequences whose bit width
- * requirement varies beyond a threshold.
- * <ul>
- * <li>4 bytes header
- * <ul>
- * 1st byte
- * <li>2 bits for encoding type</li>
- * <li>5 bits for fixed bit width of values in blob</li>
- * <li>1 bit for storing MSB of run length</li>
- * </ul>
- * <ul>
- * 2nd byte
- * <li>8 bits for lower run length bits</li>
- * </ul>
- * <ul>
- * 3rd byte
- * <li>3 bits for bytes required to encode base value</li>
- * <li>5 bits for patch width</li>
- * </ul>
- * <ul>
- * 4th byte
- * <li>3 bits for patch gap width</li>
- * <li>5 bits for patch length</li>
- * </ul>
- * </li>
- * <li>Base value - Stored using fixed number of bytes. If MSB is set, base
- * value is negative else positive. Length of base value is (base width * 8)
- * bits.</li>
- * <li>Data blob - Base reduced values as stored using fixed bit width. Length
- * of data blob is (fixed width * run length) bits.</li>
- * <li>Patch blob - Patch blob is a list of gap and patch value. Each entry in
- * the patch list is (patch width + patch gap width) bits long. Gap between the
- * subsequent elements to be patched are stored in upper part of entry whereas
- * patch values are stored in lower part of entry. Length of patch blob is
- * ((patch width + patch gap width) * patch length) bits.</li>
- * </ul>
- * </p>
- * <p>
- * <b>DELTA</b> Used for monotonically increasing or decreasing sequences,
- * sequences with fixed delta values or long repeated sequences.
- * <ul>
- * <li>2 bytes header
- * <ul>
- * 1st byte
- * <li>2 bits for encoding type</li>
- * <li>5 bits for fixed bit width of values in blob</li>
- * <li>1 bit for storing MSB of run length</li>
- * </ul>
- * <ul>
- * 2nd byte
- * <li>8 bits for lower run length bits</li>
- * </ul>
- * </li>
- * <li>Base value - zigzag encoded value written as varint</li>
- * <li>Delta base - zigzag encoded value written as varint</li>
- * <li>Delta blob - only positive values. Monotonicity and ordering are decided
- * based on the sign of the base value and delta base</li>
- * </ul>
- * </p>
- */
-public class RunLengthIntegerWriterV2 implements IntegerWriter {
-
- public enum EncodingType {
- SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA
- }
-
- static final int MAX_SCOPE = 512;
- static final int MIN_REPEAT = 3;
- private static final int MAX_SHORT_REPEAT_LENGTH = 10;
- private long prevDelta = 0;
- private int fixedRunLength = 0;
- private int variableRunLength = 0;
- private final long[] literals = new long[MAX_SCOPE];
- private final PositionedOutputStream output;
- private final boolean signed;
- private EncodingType encoding;
- private int numLiterals;
- private final long[] zigzagLiterals = new long[MAX_SCOPE];
- private final long[] baseRedLiterals = new long[MAX_SCOPE];
- private final long[] adjDeltas = new long[MAX_SCOPE];
- private long fixedDelta;
- private int zzBits90p;
- private int zzBits100p;
- private int brBits95p;
- private int brBits100p;
- private int bitsDeltaMax;
- private int patchWidth;
- private int patchGapWidth;
- private int patchLength;
- private long[] gapVsPatchList;
- private long min;
- private boolean isFixedDelta;
- private SerializationUtils utils;
- private boolean alignedBitpacking;
-
- RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) {
- this(output, signed, true);
- }
-
- public RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed,
- boolean alignedBitpacking) {
- this.output = output;
- this.signed = signed;
- this.alignedBitpacking = alignedBitpacking;
- this.utils = new SerializationUtils();
- clear();
- }
-
- private void writeValues() throws IOException {
- if (numLiterals != 0) {
-
- if (encoding.equals(EncodingType.SHORT_REPEAT)) {
- writeShortRepeatValues();
- } else if (encoding.equals(EncodingType.DIRECT)) {
- writeDirectValues();
- } else if (encoding.equals(EncodingType.PATCHED_BASE)) {
- writePatchedBaseValues();
- } else {
- writeDeltaValues();
- }
-
- // clear all the variables
- clear();
- }
- }
-
- private void writeDeltaValues() throws IOException {
- int len = 0;
- int fb = bitsDeltaMax;
- int efb = 0;
-
- if (alignedBitpacking) {
- fb = utils.getClosestAlignedFixedBits(fb);
- }
-
- if (isFixedDelta) {
- // if fixed run length is greater than threshold then it will be fixed
- // delta sequence with delta value 0 else fixed delta sequence with
- // non-zero delta value
- if (fixedRunLength > MIN_REPEAT) {
- // ex. sequence: 2 2 2 2 2 2 2 2
- len = fixedRunLength - 1;
- fixedRunLength = 0;
- } else {
- // ex. sequence: 4 6 8 10 12 14 16
- len = variableRunLength - 1;
- variableRunLength = 0;
- }
- } else {
- // fixed width 0 is used for long repeating values.
- // sequences that require only 1 bit to encode will have an additional bit
- if (fb == 1) {
- fb = 2;
- }
- efb = utils.encodeBitWidth(fb);
- efb = efb << 1;
- len = variableRunLength - 1;
- variableRunLength = 0;
- }
-
- // extract the 9th bit of run length
- final int tailBits = (len & 0x100) >>> 8;
-
- // create first byte of the header
- final int headerFirstByte = getOpcode() | efb | tailBits;
-
- // second byte of the header stores the remaining 8 bits of runlength
- final int headerSecondByte = len & 0xff;
-
- // write header
- output.write(headerFirstByte);
- output.write(headerSecondByte);
-
- // store the first value from zigzag literal array
- if (signed) {
- utils.writeVslong(output, literals[0]);
- } else {
- utils.writeVulong(output, literals[0]);
- }
-
- if (isFixedDelta) {
- // if delta is fixed then we don't need to store delta blob
- utils.writeVslong(output, fixedDelta);
- } else {
- // store the first value as delta value using zigzag encoding
- utils.writeVslong(output, adjDeltas[0]);
-
- // adjacent delta values are bit packed. The length of adjDeltas array is
- // always one less than the number of literals (delta difference for n
- // elements is n-1). We have already written one element, write the
- // remaining numLiterals - 2 elements here
- utils.writeInts(adjDeltas, 1, numLiterals - 2, fb, output);
- }
- }
-
- private void writePatchedBaseValues() throws IOException {
-
- // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
- // because patch is applied to MSB bits. For example: If fixed bit width of
- // base value is 7 bits and if patch is 3 bits, the actual value is
- // constructed by shifting the patch to left by 7 positions.
- // actual_value = patch << 7 | base_value
- // So, if we align base_value then actual_value can not be reconstructed.
-
- // write the number of fixed bits required in next 5 bits
- final int fb = brBits95p;
- final int efb = utils.encodeBitWidth(fb) << 1;
-
- // adjust variable run length, they are one off
- variableRunLength -= 1;
-
- // extract the 9th bit of run length
- final int tailBits = (variableRunLength & 0x100) >>> 8;
-
- // create first byte of the header
- final int headerFirstByte = getOpcode() | efb | tailBits;
-
- // second byte of the header stores the remaining 8 bits of runlength
- final int headerSecondByte = variableRunLength & 0xff;
-
- // if the min value is negative toggle the sign
- final boolean isNegative = min < 0 ? true : false;
- if (isNegative) {
- min = -min;
- }
-
- // find the number of bytes required for base and shift it by 5 bits
- // to accommodate patch width. The additional bit is used to store the sign
- // of the base value.
- final int baseWidth = utils.findClosestNumBits(min) + 1;
- final int baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1;
- final int bb = (baseBytes - 1) << 5;
-
- // if the base value is negative then set MSB to 1
- if (isNegative) {
- min |= (1L << ((baseBytes * 8) - 1));
- }
-
- // third byte contains 3 bits for number of bytes occupied by base
- // and 5 bits for patchWidth
- final int headerThirdByte = bb | utils.encodeBitWidth(patchWidth);
-
- // fourth byte contains 3 bits for page gap width and 5 bits for
- // patch length
- final int headerFourthByte = (patchGapWidth - 1) << 5 | patchLength;
-
- // write header
- output.write(headerFirstByte);
- output.write(headerSecondByte);
- output.write(headerThirdByte);
- output.write(headerFourthByte);
-
- // write the base value using fixed bytes in big endian order
- for(int i = baseBytes - 1; i >= 0; i--) {
- byte b = (byte) ((min >>> (i * 8)) & 0xff);
- output.write(b);
- }
-
- // base reduced literals are bit packed
- int closestFixedBits = utils.getClosestFixedBits(fb);
-
- utils.writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits,
- output);
-
- // write patch list
- closestFixedBits = utils.getClosestFixedBits(patchGapWidth + patchWidth);
-
- utils.writeInts(gapVsPatchList, 0, gapVsPatchList.length, closestFixedBits,
- output);
-
- // reset run length
- variableRunLength = 0;
- }
-
- /**
- * Store the opcode in 2 MSB bits
- * @return opcode
- */
- private int getOpcode() {
- return encoding.ordinal() << 6;
- }
-
- private void writeDirectValues() throws IOException {
-
- // write the number of fixed bits required in next 5 bits
- int fb = zzBits100p;
-
- if (alignedBitpacking) {
- fb = utils.getClosestAlignedFixedBits(fb);
- }
-
- final int efb = utils.encodeBitWidth(fb) << 1;
-
- // adjust variable run length
- variableRunLength -= 1;
-
- // extract the 9th bit of run length
- final int tailBits = (variableRunLength & 0x100) >>> 8;
-
- // create first byte of the header
- final int headerFirstByte = getOpcode() | efb | tailBits;
-
- // second byte of the header stores the remaining 8 bits of runlength
- final int headerSecondByte = variableRunLength & 0xff;
-
- // write header
- output.write(headerFirstByte);
- output.write(headerSecondByte);
-
- // bit packing the zigzag encoded literals
- utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output);
-
- // reset run length
- variableRunLength = 0;
- }
-
- private void writeShortRepeatValues() throws IOException {
- // get the value that is repeating, compute the bits and bytes required
- long repeatVal = 0;
- if (signed) {
- repeatVal = utils.zigzagEncode(literals[0]);
- } else {
- repeatVal = literals[0];
- }
-
- final int numBitsRepeatVal = utils.findClosestNumBits(repeatVal);
- final int numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? numBitsRepeatVal >>> 3
- : (numBitsRepeatVal >>> 3) + 1;
-
- // write encoding type in top 2 bits
- int header = getOpcode();
-
- // write the number of bytes required for the value
- header |= ((numBytesRepeatVal - 1) << 3);
-
- // write the run length
- fixedRunLength -= MIN_REPEAT;
- header |= fixedRunLength;
-
- // write the header
- output.write(header);
-
- // write the repeating value in big endian byte order
- for(int i = numBytesRepeatVal - 1; i >= 0; i--) {
- int b = (int) ((repeatVal >>> (i * 8)) & 0xff);
- output.write(b);
- }
-
- fixedRunLength = 0;
- }
-
- private void determineEncoding() {
-
- // we need to compute zigzag values for DIRECT encoding if we decide to
- // break early for delta overflows or for shorter runs
- computeZigZagLiterals();
-
- zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
-
- // not a big win for shorter runs to determine encoding
- if (numLiterals <= MIN_REPEAT) {
- encoding = EncodingType.DIRECT;
- return;
- }
-
- // DELTA encoding check
-
- // for identifying monotonic sequences
- boolean isIncreasing = true;
- boolean isDecreasing = true;
- this.isFixedDelta = true;
-
- this.min = literals[0];
- long max = literals[0];
- final long initialDelta = literals[1] - literals[0];
- long currDelta = initialDelta;
- long deltaMax = initialDelta;
- this.adjDeltas[0] = initialDelta;
-
- for (int i = 1; i < numLiterals; i++) {
- final long l1 = literals[i];
- final long l0 = literals[i - 1];
- currDelta = l1 - l0;
- min = Math.min(min, l1);
- max = Math.max(max, l1);
-
- isIncreasing &= (l0 <= l1);
- isDecreasing &= (l0 >= l1);
-
- isFixedDelta &= (currDelta == initialDelta);
- if (i > 1) {
- adjDeltas[i - 1] = Math.abs(currDelta);
- deltaMax = Math.max(deltaMax, adjDeltas[i - 1]);
- }
- }
-
- // its faster to exit under delta overflow condition without checking for
- // PATCHED_BASE condition as encoding using DIRECT is faster and has less
- // overhead than PATCHED_BASE
- if (!utils.isSafeSubtract(max, min)) {
- encoding = EncodingType.DIRECT;
- return;
- }
-
- // invariant - subtracting any number from any other in the literals after
- // this point won't overflow
-
- // if min is equal to max then the delta is 0, this condition happens for
- // fixed values run >10 which cannot be encoded with SHORT_REPEAT
- if (min == max) {
- assert isFixedDelta : min + "==" + max +
- ", isFixedDelta cannot be false";
- assert currDelta == 0 : min + "==" + max + ", currDelta should be zero";
- fixedDelta = 0;
- encoding = EncodingType.DELTA;
- return;
- }
-
- if (isFixedDelta) {
- assert currDelta == initialDelta
- : "currDelta should be equal to initialDelta for fixed delta encoding";
- encoding = EncodingType.DELTA;
- fixedDelta = currDelta;
- return;
- }
-
- // if initialDelta is 0 then we cannot delta encode as we cannot identify
- // the sign of deltas (increasing or decreasing)
- if (initialDelta != 0) {
- // stores the number of bits required for packing delta blob in
- // delta encoding
- bitsDeltaMax = utils.findClosestNumBits(deltaMax);
-
- // monotonic condition
- if (isIncreasing || isDecreasing) {
- encoding = EncodingType.DELTA;
- return;
- }
- }
-
- // PATCHED_BASE encoding check
-
- // percentile values are computed for the zigzag encoded values. if the
- // number of bit requirement between 90th and 100th percentile varies
- // beyond a threshold then we need to patch the values. if the variation
- // is not significant then we can use direct encoding
-
- zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9);
- int diffBitsLH = zzBits100p - zzBits90p;
-
- // if the difference between 90th percentile and 100th percentile fixed
- // bits is > 1 then we need patch the values
- if (diffBitsLH > 1) {
-
- // patching is done only on base reduced values.
- // remove base from literals
- for (int i = 0; i < numLiterals; i++) {
- baseRedLiterals[i] = literals[i] - min;
- }
-
- // 95th percentile width is used to determine max allowed value
- // after which patching will be done
- brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
-
- // 100th percentile is used to compute the max patch width
- brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0);
-
- // after base reducing the values, if the difference in bits between
- // 95th percentile and 100th percentile value is zero then there
- // is no point in patching the values, in which case we will
- // fallback to DIRECT encoding.
- // The decision to use patched base was based on zigzag values, but the
- // actual patching is done on base reduced literals.
- if ((brBits100p - brBits95p) != 0) {
- encoding = EncodingType.PATCHED_BASE;
- preparePatchedBlob();
- return;
- } else {
- encoding = EncodingType.DIRECT;
- return;
- }
- } else {
- // if difference in bits between 95th percentile and 100th percentile is
- // 0, then patch length will become 0. Hence we will fallback to direct
- encoding = EncodingType.DIRECT;
- return;
- }
- }
-
- private void computeZigZagLiterals() {
- // populate zigzag encoded literals
- long zzEncVal = 0;
- for (int i = 0; i < numLiterals; i++) {
- if (signed) {
- zzEncVal = utils.zigzagEncode(literals[i]);
- } else {
- zzEncVal = literals[i];
- }
- zigzagLiterals[i] = zzEncVal;
- }
- }
-
- private void preparePatchedBlob() {
- // mask will be max value beyond which patch will be generated
- long mask = (1L << brBits95p) - 1;
-
- // since we are considering only the 95th percentile, the gap and
- // patch arrays can contain at most 5% of the values
- patchLength = (int) Math.ceil((numLiterals * 0.05));
-
- int[] gapList = new int[patchLength];
- long[] patchList = new long[patchLength];
-
- // #bit for patch
- patchWidth = brBits100p - brBits95p;
- patchWidth = utils.getClosestFixedBits(patchWidth);
-
- // if the patch bit requirement is 64 then it will not be possible to pack
- // gap and patch together in a long. To make sure gap and patch can be
- // packed together adjust the patch width
- if (patchWidth == 64) {
- patchWidth = 56;
- brBits95p = 8;
- mask = (1L << brBits95p) - 1;
- }
-
- int gapIdx = 0;
- int patchIdx = 0;
- int prev = 0;
- int gap = 0;
- int maxGap = 0;
-
- for(int i = 0; i < numLiterals; i++) {
- // if value is above mask then create the patch and record the gap
- if (baseRedLiterals[i] > mask) {
- gap = i - prev;
- if (gap > maxGap) {
- maxGap = gap;
- }
-
- // gaps are relative, so store the previous patched value index
- prev = i;
- gapList[gapIdx++] = gap;
-
- // extract the most significant bits that are over mask bits
- long patch = baseRedLiterals[i] >>> brBits95p;
- patchList[patchIdx++] = patch;
-
- // strip off the MSB to enable safe bit packing
- baseRedLiterals[i] &= mask;
- }
- }
-
- // adjust the patch length to number of entries in gap list
- patchLength = gapIdx;
-
- // if the element to be patched is the first and only element then
- // max gap will be 0, but to store the gap as 0 we need at least 1 bit
- if (maxGap == 0 && patchLength != 0) {
- patchGapWidth = 1;
- } else {
- patchGapWidth = utils.findClosestNumBits(maxGap);
- }
-
- // special case: if the patch gap width is greater than 256, then
- // we need 9 bits to encode the gap width. But we only have 3 bits in
- // header to record the gap width. To deal with this case, we will save
- // two entries in patch list in the following way
- // 256 gap width => 0 for patch value
- // actual gap - 256 => actual patch value
- // We will do the same for gap width = 511. If the element to be patched is
- // the last element in the scope then gap width will be 511. In this case we
- // will have 3 entries in the patch list in the following way
- // 255 gap width => 0 for patch value
- // 255 gap width => 0 for patch value
- // 1 gap width => actual patch value
- if (patchGapWidth > 8) {
- patchGapWidth = 8;
- // for gap = 511, we need two additional entries in patch list
- if (maxGap == 511) {
- patchLength += 2;
- } else {
- patchLength += 1;
- }
- }
-
- // create gap vs patch list
- gapIdx = 0;
- patchIdx = 0;
- gapVsPatchList = new long[patchLength];
- for(int i = 0; i < patchLength; i++) {
- long g = gapList[gapIdx++];
- long p = patchList[patchIdx++];
- while (g > 255) {
- gapVsPatchList[i++] = (255L << patchWidth);
- g -= 255;
- }
-
- // store patch value in LSBs and gap in MSBs
- gapVsPatchList[i] = (g << patchWidth) | p;
- }
- }
-
- /**
- * clears all the variables
- */
- private void clear() {
- numLiterals = 0;
- encoding = null;
- prevDelta = 0;
- fixedDelta = 0;
- zzBits90p = 0;
- zzBits100p = 0;
- brBits95p = 0;
- brBits100p = 0;
- bitsDeltaMax = 0;
- patchGapWidth = 0;
- patchLength = 0;
- patchWidth = 0;
- gapVsPatchList = null;
- min = 0;
- isFixedDelta = true;
- }
-
- @Override
- public void flush() throws IOException {
- if (numLiterals != 0) {
- if (variableRunLength != 0) {
- determineEncoding();
- writeValues();
- } else if (fixedRunLength != 0) {
- if (fixedRunLength < MIN_REPEAT) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- determineEncoding();
- writeValues();
- } else if (fixedRunLength >= MIN_REPEAT
- && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- encoding = EncodingType.SHORT_REPEAT;
- writeValues();
- } else {
- encoding = EncodingType.DELTA;
- isFixedDelta = true;
- writeValues();
- }
- }
- }
- output.flush();
- }
-
- @Override
- public void write(long val) throws IOException {
- if (numLiterals == 0) {
- initializeLiterals(val);
- } else {
- if (numLiterals == 1) {
- prevDelta = val - literals[0];
- literals[numLiterals++] = val;
- // if both values are same count as fixed run else variable run
- if (val == literals[0]) {
- fixedRunLength = 2;
- variableRunLength = 0;
- } else {
- fixedRunLength = 0;
- variableRunLength = 2;
- }
- } else {
- long currentDelta = val - literals[numLiterals - 1];
- if (prevDelta == 0 && currentDelta == 0) {
- // fixed delta run
-
- literals[numLiterals++] = val;
-
- // if variable run is non-zero then we are seeing repeating
- // values at the end of variable run in which case keep
- // updating variable and fixed runs
- if (variableRunLength > 0) {
- fixedRunLength = 2;
- }
- fixedRunLength += 1;
-
- // if fixed run met the minimum condition and if variable
- // run is non-zero then flush the variable run and shift the
- // tail fixed runs to start of the buffer
- if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
- numLiterals -= MIN_REPEAT;
- variableRunLength -= MIN_REPEAT - 1;
- // copy the tail fixed runs
- long[] tailVals = new long[MIN_REPEAT];
- System.arraycopy(literals, numLiterals, tailVals, 0, MIN_REPEAT);
-
- // determine variable encoding and flush values
- determineEncoding();
- writeValues();
-
- // shift tail fixed runs to beginning of the buffer
- for(long l : tailVals) {
- literals[numLiterals++] = l;
- }
- }
-
- // if fixed runs reached max repeat length then write values
- if (fixedRunLength == MAX_SCOPE) {
- determineEncoding();
- writeValues();
- }
- } else {
- // variable delta run
-
- // if fixed run length is non-zero and if it satisfies the
- // short repeat conditions then write the values as short repeats
- // else use delta encoding
- if (fixedRunLength >= MIN_REPEAT) {
- if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
- encoding = EncodingType.SHORT_REPEAT;
- writeValues();
- } else {
- encoding = EncodingType.DELTA;
- isFixedDelta = true;
- writeValues();
- }
- }
-
- // if fixed run length is <MIN_REPEAT and current value is
- // different from previous then treat it as variable run
- if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT) {
- if (val != literals[numLiterals - 1]) {
- variableRunLength = fixedRunLength;
- fixedRunLength = 0;
- }
- }
-
- // after writing values re-initialize the variables
- if (numLiterals == 0) {
- initializeLiterals(val);
- } else {
- // keep updating variable run lengths
- prevDelta = val - literals[numLiterals - 1];
- literals[numLiterals++] = val;
- variableRunLength += 1;
-
- // if variable run length reach the max scope, write it
- if (variableRunLength == MAX_SCOPE) {
- determineEncoding();
- writeValues();
- }
- }
- }
- }
- }
- }
-
- private void initializeLiterals(long val) {
- literals[numLiterals++] = val;
- fixedRunLength = 1;
- variableRunLength = 1;
- }
-
- @Override
- public void getPosition(PositionRecorder recorder) throws IOException {
- output.getPosition(recorder);
- recorder.addPosition(numLiterals);
- }
-}
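
For orientation: writeDirectValues in the file above packs the run metadata into
two header bytes - a 2-bit opcode, a 5-bit encoded bit width, and a 9-bit run
length stored one off. The sketch below rebuilds just those two bytes;
DirectHeaderSketch is illustrative, not a Hive class, and its encodeBitWidth
stand-in covers only widths that are already encodable, whereas the real
SerializationUtils first rounds the width up to the closest fixed size.

    // Illustrative sketch, not Hive code: the two DIRECT header bytes emitted
    // by writeDirectValues.
    public class DirectHeaderSketch {
      static int[] header(int bitWidth, int runLength) {
        int encodedWidth = encodeBitWidth(bitWidth);
        int len = runLength - 1;                 // run lengths are one off
        int first = (1 << 6)                     // opcode DIRECT in the top 2 bits
            | (encodedWidth << 1)                // 5 bits of encoded width
            | ((len >>> 8) & 0x01);              // MSB of the 9-bit run length
        int second = len & 0xff;                 // low 8 bits of the run length
        return new int[] {first, second};
      }

      // minimal stand-in for SerializationUtils.encodeBitWidth (encodable widths only)
      static int encodeBitWidth(int width) {
        if (width >= 1 && width <= 24) {
          return width - 1;                      // widths 1..24 map to 0..23
        }
        switch (width) {
          case 26: return 24;  case 28: return 25;  case 30: return 26;
          case 32: return 27;  case 40: return 28;  case 48: return 29;
          case 56: return 30;  case 64: return 31;
          default: throw new IllegalArgumentException("unsupported width " + width);
        }
      }

      public static void main(String[] args) {
        int[] h = header(16, 300);               // 300 sixteen-bit values
        System.out.printf("0x%02x 0x%02x%n", h[0], h[1]);  // prints 0x5f 0x2b
      }
    }
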
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/SchemaEvolution.java b/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
deleted file mode 100644
index c1bd2b7..0000000
--- a/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
+++ /dev/null
@@ -1,399 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.orc.TypeDescription;
-
-/**
- * Take the file types and the (optional) configuration column names/types and see if there
- * has been schema evolution.
- */
-public class SchemaEvolution {
- // indexed by reader column id
- private final TypeDescription[] readerFileTypes;
- // indexed by reader column id
- private final boolean[] readerIncluded;
- // the offset to the first column id ignoring any ACID columns
- private final int readerColumnOffset;
- // indexed by file column id
- private final boolean[] fileIncluded;
- private final TypeDescription fileSchema;
- private final TypeDescription readerSchema;
- private boolean hasConversion;
- // indexed by reader column id
- private final boolean[] ppdSafeConversion;
-
- public SchemaEvolution(TypeDescription fileSchema, boolean[] includedCols) {
- this(fileSchema, null, includedCols);
- }
-
- public SchemaEvolution(TypeDescription fileSchema,
- TypeDescription readerSchema,
- boolean[] includeCols) {
- this.readerIncluded = includeCols == null ? null : Arrays.copyOf(includeCols, includeCols.length);
- this.fileIncluded = new boolean[fileSchema.getMaximumId() + 1];
- this.hasConversion = false;
- this.fileSchema = fileSchema;
- boolean isAcid = checkAcidSchema(fileSchema);
- this.readerColumnOffset = isAcid ? acidEventFieldNames.size() : 0;
- if (readerSchema != null) {
- if (isAcid) {
- this.readerSchema = createEventSchema(readerSchema);
- } else {
- this.readerSchema = readerSchema;
- }
- if (readerIncluded != null &&
- readerIncluded.length + readerColumnOffset != this.readerSchema.getMaximumId() + 1) {
- throw new IllegalArgumentException("Include vector the wrong length: " +
- this.readerSchema.toJson() + " with include length " +
- readerIncluded.length);
- }
- this.readerFileTypes = new TypeDescription[this.readerSchema.getMaximumId() + 1];
- buildConversionFileTypesArray(fileSchema, this.readerSchema);
- } else {
- this.readerSchema = fileSchema;
- this.readerFileTypes = new TypeDescription[this.readerSchema.getMaximumId() + 1];
- if (readerIncluded != null &&
- readerIncluded.length + readerColumnOffset != this.readerSchema.getMaximumId() + 1) {
- throw new IllegalArgumentException("Include vector the wrong length: " +
- this.readerSchema.toJson() + " with include length " +
- readerIncluded.length);
- }
- buildSameSchemaFileTypesArray();
- }
- this.ppdSafeConversion = populatePpdSafeConversion();
- }
-
- public TypeDescription getReaderSchema() {
- return readerSchema;
- }
-
- /**
- * Returns the non-ACID (aka base) reader type description.
- *
- * @return the reader type ignoring the ACID rowid columns, if any
- */
- public TypeDescription getReaderBaseSchema() {
- return readerSchema.findSubtype(readerColumnOffset);
- }
-
- /**
- * Is there Schema Evolution data type conversion?
- * @return true if any reader column requires a data type conversion
- */
- public boolean hasConversion() {
- return hasConversion;
- }
-
- public TypeDescription getFileType(TypeDescription readerType) {
- return getFileType(readerType.getId());
- }
-
- /**
- * Get whether each column is included from the reader's point of view.
- * @return a boolean array indexed by reader column id
- */
- public boolean[] getReaderIncluded() {
- return readerIncluded;
- }
-
- /**
- * Get whether each column is included from the file's point of view.
- * @return a boolean array indexed by file column id
- */
- public boolean[] getFileIncluded() {
- return fileIncluded;
- }
-
- /**
- * Get the file type by reader type id.
- * @param id reader column id
- * @return the file type mapped to the given reader column id
- */
- public TypeDescription getFileType(int id) {
- return readerFileTypes[id];
- }
-
- /**
- * Check if column is safe for ppd evaluation
- * @param colId reader column id
- * @return true if the specified column is safe for ppd evaluation else false
- */
- public boolean isPPDSafeConversion(final int colId) {
- if (hasConversion()) {
- if (colId < 0 || colId >= ppdSafeConversion.length) {
- return false;
- }
- return ppdSafeConversion[colId];
- }
-
- // when there is no schema evolution PPD is safe
- return true;
- }
-
- private boolean[] populatePpdSafeConversion() {
- if (fileSchema == null || readerSchema == null || readerFileTypes == null) {
- return null;
- }
-
- boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
- boolean safePpd = validatePPDConversion(fileSchema, readerSchema);
- result[readerSchema.getId()] = safePpd;
- List<TypeDescription> children = readerSchema.getChildren();
- if (children != null) {
- for (TypeDescription child : children) {
- TypeDescription fileType = getFileType(child.getId());
- safePpd = validatePPDConversion(fileType, child);
- result[child.getId()] = safePpd;
- }
- }
- return result;
- }
-
- private boolean validatePPDConversion(final TypeDescription fileType,
- final TypeDescription readerType) {
- if (fileType == null) {
- return false;
- }
- if (fileType.getCategory().isPrimitive()) {
- if (fileType.getCategory().equals(readerType.getCategory())) {
- // for decimals alone do equality check to not mess up with precision change
- if (fileType.getCategory().equals(TypeDescription.Category.DECIMAL) &&
- !fileType.equals(readerType)) {
- return false;
- }
- return true;
- }
-
- // only integer and string evolutions are safe
- // byte -> short -> int -> long
- // string <-> char <-> varchar
- // NOTE: Float to double evolution is not safe as floats are stored as doubles in ORC's
- // internal index, but when doing predicate evaluation for queries like "select * from
- // orc_float where f = 74.72" the constant on the filter is converted from string -> double
- // so the precisions will be different and the comparison will fail.
- // Soon, we should convert all sargs that compare equality between floats or
- // doubles to range predicates.
-
- // Similarly string -> char and varchar -> char and vice versa is not possible, as ORC stores
- // char with padded spaces in its internal index.
- switch (fileType.getCategory()) {
- case BYTE:
- if (readerType.getCategory().equals(TypeDescription.Category.SHORT) ||
- readerType.getCategory().equals(TypeDescription.Category.INT) ||
- readerType.getCategory().equals(TypeDescription.Category.LONG)) {
- return true;
- }
- break;
- case SHORT:
- if (readerType.getCategory().equals(TypeDescription.Category.INT) ||
- readerType.getCategory().equals(TypeDescription.Category.LONG)) {
- return true;
- }
- break;
- case INT:
- if (readerType.getCategory().equals(TypeDescription.Category.LONG)) {
- return true;
- }
- break;
- case STRING:
- if (readerType.getCategory().equals(TypeDescription.Category.VARCHAR)) {
- return true;
- }
- break;
- case VARCHAR:
- if (readerType.getCategory().equals(TypeDescription.Category.STRING)) {
- return true;
- }
- break;
- default:
- break;
- }
- }
- return false;
- }
-
- /**
- * Should we read the given reader column?
- * @param readerId the id of column in the extended reader schema
- * @return true if the column should be read
- */
- public boolean includeReaderColumn(int readerId) {
- return readerIncluded == null ||
- readerId <= readerColumnOffset ||
- readerIncluded[readerId - readerColumnOffset];
- }
-
- void buildConversionFileTypesArray(TypeDescription fileType,
- TypeDescription readerType) {
- // if the column isn't included, don't map it
- int readerId = readerType.getId();
- if (!includeReaderColumn(readerId)) {
- return;
- }
- boolean isOk = true;
- // check the easy case first
- if (fileType.getCategory() == readerType.getCategory()) {
- switch (readerType.getCategory()) {
- case BOOLEAN:
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- case DOUBLE:
- case FLOAT:
- case STRING:
- case TIMESTAMP:
- case BINARY:
- case DATE:
- // these are always a match
- break;
- case CHAR:
- case VARCHAR:
- // We do conversion when same CHAR/VARCHAR type but different maxLength.
- if (fileType.getMaxLength() != readerType.getMaxLength()) {
- hasConversion = true;
- }
- break;
- case DECIMAL:
- // We do conversion when same DECIMAL type but different precision/scale.
- if (fileType.getPrecision() != readerType.getPrecision() ||
- fileType.getScale() != readerType.getScale()) {
- hasConversion = true;
- }
- break;
- case UNION:
- case MAP:
- case LIST: {
- // these must be an exact match
- List<TypeDescription> fileChildren = fileType.getChildren();
- List<TypeDescription> readerChildren = readerType.getChildren();
- if (fileChildren.size() == readerChildren.size()) {
- for(int i=0; i < fileChildren.size(); ++i) {
- buildConversionFileTypesArray(fileChildren.get(i), readerChildren.get(i));
- }
- } else {
- isOk = false;
- }
- break;
- }
- case STRUCT: {
- // allow either side to have fewer fields than the other
- List<TypeDescription> fileChildren = fileType.getChildren();
- List<TypeDescription> readerChildren = readerType.getChildren();
- if (fileChildren.size() != readerChildren.size()) {
- hasConversion = true;
- }
- int jointSize = Math.min(fileChildren.size(), readerChildren.size());
- for(int i=0; i < jointSize; ++i) {
- buildConversionFileTypesArray(fileChildren.get(i), readerChildren.get(i));
- }
- break;
- }
- default:
- throw new IllegalArgumentException("Unknown type " + readerType);
- }
- } else {
- /*
- * Check for the few cases where we will not convert.
- */
-
- isOk = ConvertTreeReaderFactory.canConvert(fileType, readerType);
- hasConversion = true;
- }
- if (isOk) {
- if (readerFileTypes[readerId] != null) {
- throw new RuntimeException("reader to file type entry already assigned");
- }
- readerFileTypes[readerId] = fileType;
- fileIncluded[fileType.getId()] = true;
- } else {
- throw new IllegalArgumentException(
- String.format(
- "ORC does not support type conversion from file type %s (%d) to reader type %s (%d)",
- fileType.toString(), fileType.getId(),
- readerType.toString(), readerId));
- }
- }
-
- /**
- * Used to build the reader-to-file type array when the reader schema is
- * the same as the file schema.
- */
- private void buildSameSchemaFileTypesArray() {
- buildSameSchemaFileTypesArrayRecurse(readerSchema);
- }
-
- void buildSameSchemaFileTypesArrayRecurse(TypeDescription readerType) {
- int id = readerType.getId();
- if (!includeReaderColumn(id)) {
- return;
- }
- if (readerFileTypes[id] != null) {
- throw new RuntimeException("reader to file type entry already assigned");
- }
- readerFileTypes[id] = readerType;
- fileIncluded[id] = true;
- List<TypeDescription> children = readerType.getChildren();
- if (children != null) {
- for (TypeDescription child : children) {
- buildSameSchemaFileTypesArrayRecurse(child);
- }
- }
- }
-
- private static boolean checkAcidSchema(TypeDescription type) {
- if (type.getCategory().equals(TypeDescription.Category.STRUCT)) {
- List<String> rootFields = type.getFieldNames();
- if (acidEventFieldNames.equals(rootFields)) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * @param typeDescr the row's type description
- * @return ORC types for the ACID event based on the row's type description
- */
- public static TypeDescription createEventSchema(TypeDescription typeDescr) {
- TypeDescription result = TypeDescription.createStruct()
- .addField("operation", TypeDescription.createInt())
- .addField("originalTransaction", TypeDescription.createLong())
- .addField("bucket", TypeDescription.createInt())
- .addField("rowId", TypeDescription.createLong())
- .addField("currentTransaction", TypeDescription.createLong())
- .addField("row", typeDescr.clone());
- return result;
- }
-
- public static final List<String> acidEventFieldNames= new ArrayList<String>();
- static {
- acidEventFieldNames.add("operation");
- acidEventFieldNames.add("originalTransaction");
- acidEventFieldNames.add("bucket");
- acidEventFieldNames.add("rowId");
- acidEventFieldNames.add("currentTransaction");
- acidEventFieldNames.add("row");
- }
-}
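For readers skimming the removed SchemaEvolution class above, here is a minimal, hypothetical usage sketch. It is illustrative only and not part of this commit; the constructor and methods are taken from the deleted source shown in the diff, and the example assumes the ORC jar is on the classpath.

import org.apache.orc.TypeDescription;
import org.apache.orc.impl.SchemaEvolution;

public class SchemaEvolutionSketch {
  public static void main(String[] args) {
    // The file was written with an int column; the reader asks for a long.
    TypeDescription fileSchema = TypeDescription.createStruct()
        .addField("id", TypeDescription.createInt());
    TypeDescription readerSchema = TypeDescription.createStruct()
        .addField("id", TypeDescription.createLong());
    // A null include vector means "read every column".
    SchemaEvolution evolution =
        new SchemaEvolution(fileSchema, readerSchema, null);
    System.out.println(evolution.hasConversion());        // true: int differs from long
    System.out.println(evolution.isPPDSafeConversion(1)); // true: int -> long is a safe widening
    System.out.println(evolution.getFileType(1));         // the file's int type for column 1
  }
}

Because int to long is a widening conversion, hasConversion() reports that a conversion is needed while isPPDSafeConversion(1) still allows predicate pushdown on the evolved column.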
[16/37] hive git commit: HIVE-17118. Move the hive-orc source files to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/SerializationUtils.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/SerializationUtils.java b/orc/src/java/org/apache/orc/impl/SerializationUtils.java
deleted file mode 100644
index 2e5a59b..0000000
--- a/orc/src/java/org/apache/orc/impl/SerializationUtils.java
+++ /dev/null
@@ -1,1311 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.math.BigInteger;
-
-public final class SerializationUtils {
-
- private final static int BUFFER_SIZE = 64;
- private final byte[] readBuffer;
- private final byte[] writeBuffer;
-
- public SerializationUtils() {
- this.readBuffer = new byte[BUFFER_SIZE];
- this.writeBuffer = new byte[BUFFER_SIZE];
- }
-
- public void writeVulong(OutputStream output,
- long value) throws IOException {
- while (true) {
- if ((value & ~0x7f) == 0) {
- output.write((byte) value);
- return;
- } else {
- output.write((byte) (0x80 | (value & 0x7f)));
- value >>>= 7;
- }
- }
- }
-
- public void writeVslong(OutputStream output,
- long value) throws IOException {
- writeVulong(output, (value << 1) ^ (value >> 63));
- }
-
-
- public long readVulong(InputStream in) throws IOException {
- long result = 0;
- long b;
- int offset = 0;
- do {
- b = in.read();
- if (b == -1) {
- throw new EOFException("Reading Vulong past EOF");
- }
- result |= (0x7f & b) << offset;
- offset += 7;
- } while (b >= 0x80);
- return result;
- }
-
- public long readVslong(InputStream in) throws IOException {
- long result = readVulong(in);
- return (result >>> 1) ^ -(result & 1);
- }
-
- public float readFloat(InputStream in) throws IOException {
- readFully(in, readBuffer, 0, 4);
- int val = (((readBuffer[0] & 0xff) << 0)
- + ((readBuffer[1] & 0xff) << 8)
- + ((readBuffer[2] & 0xff) << 16)
- + ((readBuffer[3] & 0xff) << 24));
- return Float.intBitsToFloat(val);
- }
-
- public void writeFloat(OutputStream output,
- float value) throws IOException {
- int ser = Float.floatToIntBits(value);
- writeBuffer[0] = (byte) ((ser >> 0) & 0xff);
- writeBuffer[1] = (byte) ((ser >> 8) & 0xff);
- writeBuffer[2] = (byte) ((ser >> 16) & 0xff);
- writeBuffer[3] = (byte) ((ser >> 24) & 0xff);
- output.write(writeBuffer, 0, 4);
- }
-
- public double readDouble(InputStream in) throws IOException {
- return Double.longBitsToDouble(readLongLE(in));
- }
-
- public long readLongLE(InputStream in) throws IOException {
- readFully(in, readBuffer, 0, 8);
- return (((readBuffer[0] & 0xff) << 0)
- + ((readBuffer[1] & 0xff) << 8)
- + ((readBuffer[2] & 0xff) << 16)
- + ((long) (readBuffer[3] & 0xff) << 24)
- + ((long) (readBuffer[4] & 0xff) << 32)
- + ((long) (readBuffer[5] & 0xff) << 40)
- + ((long) (readBuffer[6] & 0xff) << 48)
- + ((long) (readBuffer[7] & 0xff) << 56));
- }
-
- private void readFully(final InputStream in, final byte[] buffer, final int off, final int len)
- throws IOException {
- int n = 0;
- while (n < len) {
- int count = in.read(buffer, off + n, len - n);
- if (count < 0) {
- throw new EOFException("Read past EOF for " + in);
- }
- n += count;
- }
- }
-
- public void writeDouble(OutputStream output,
- double value) throws IOException {
- writeLongLE(output, Double.doubleToLongBits(value));
- }
-
- private void writeLongLE(OutputStream output, long value) throws IOException {
- writeBuffer[0] = (byte) ((value >> 0) & 0xff);
- writeBuffer[1] = (byte) ((value >> 8) & 0xff);
- writeBuffer[2] = (byte) ((value >> 16) & 0xff);
- writeBuffer[3] = (byte) ((value >> 24) & 0xff);
- writeBuffer[4] = (byte) ((value >> 32) & 0xff);
- writeBuffer[5] = (byte) ((value >> 40) & 0xff);
- writeBuffer[6] = (byte) ((value >> 48) & 0xff);
- writeBuffer[7] = (byte) ((value >> 56) & 0xff);
- output.write(writeBuffer, 0, 8);
- }
-
- /**
- * Write the arbitrarily sized signed BigInteger in vint format.
- *
- * Signed integers are zigzag encoded, with the low bit used as the sign bit.
- *
- * Each byte uses the low 7 bits for data and the high bit for stop/continue.
- *
- * Bytes are stored LSB first.
- * @param output the stream to write to
- * @param value the value to output
- * @throws IOException
- */
- public static void writeBigInteger(OutputStream output,
- BigInteger value) throws IOException {
- // encode the signed number as a positive integer
- value = value.shiftLeft(1);
- int sign = value.signum();
- if (sign < 0) {
- value = value.negate();
- value = value.subtract(BigInteger.ONE);
- }
- int length = value.bitLength();
- while (true) {
- long lowBits = value.longValue() & 0x7fffffffffffffffL;
- length -= 63;
- // write out the next 63 bits worth of data
- for(int i=0; i < 9; ++i) {
- // if this is the last byte, leave the high bit off
- if (length <= 0 && (lowBits & ~0x7f) == 0) {
- output.write((byte) lowBits);
- return;
- } else {
- output.write((byte) (0x80 | (lowBits & 0x7f)));
- lowBits >>>= 7;
- }
- }
- value = value.shiftRight(63);
- }
- }
-
- /**
- * Read the signed arbitrary-sized BigInteger in vint format
- * @param input the stream to read from
- * @return the read BigInteger
- * @throws IOException
- */
- public static BigInteger readBigInteger(InputStream input) throws IOException {
- BigInteger result = BigInteger.ZERO;
- long work = 0;
- int offset = 0;
- long b;
- do {
- b = input.read();
- if (b == -1) {
- throw new EOFException("Reading BigInteger past EOF from " + input);
- }
- work |= (0x7f & b) << (offset % 63);
- offset += 7;
- // if we've read 63 bits, roll them into the result
- if (offset == 63) {
- result = BigInteger.valueOf(work);
- work = 0;
- } else if (offset % 63 == 0) {
- result = result.or(BigInteger.valueOf(work).shiftLeft(offset-63));
- work = 0;
- }
- } while (b >= 0x80);
- if (work != 0) {
- result = result.or(BigInteger.valueOf(work).shiftLeft((offset/63)*63));
- }
- // convert back to a signed number
- boolean isNegative = result.testBit(0);
- if (isNegative) {
- result = result.add(BigInteger.ONE);
- result = result.negate();
- }
- result = result.shiftRight(1);
- return result;
- }
-
- public enum FixedBitSizes {
- ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
- THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
- TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
- TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR;
- }
-
- /**
- * Count the number of bits required to encode the given value
- * @param value
- * @return bits required to store value
- */
- public int findClosestNumBits(long value) {
- int count = 0;
- while (value != 0) {
- count++;
- value = value >>> 1;
- }
- return getClosestFixedBits(count);
- }
-
- /**
- * zigzag encode the given value
- * @param val
- * @return zigzag encoded value
- */
- public long zigzagEncode(long val) {
- return (val << 1) ^ (val >> 63);
- }
-
- /**
- * zigzag decode the given value
- * @param val
- * @return zigzag decoded value
- */
- public long zigzagDecode(long val) {
- return (val >>> 1) ^ -(val & 1);
- }
-
- /**
- * Compute the bits required to represent pth percentile value
- * @param data - array
- * @param offset - starting offset into the array
- * @param length - number of values to consider
- * @param p - percentile value (>=0.0 to <=1.0)
- * @return pth percentile bits
- */
- public int percentileBits(long[] data, int offset, int length,
- double p) {
- if ((p > 1.0) || (p <= 0.0)) {
- return -1;
- }
-
- // histogram that stores the encoded bit requirement for each value.
- // maximum number of bits that can be encoded is 32 (refer FixedBitSizes)
- int[] hist = new int[32];
-
- // compute the histogram
- for(int i = offset; i < (offset + length); i++) {
- int idx = encodeBitWidth(findClosestNumBits(data[i]));
- hist[idx] += 1;
- }
-
- int perLen = (int) (length * (1.0 - p));
-
- // return the bits required by pth percentile length
- for(int i = hist.length - 1; i >= 0; i--) {
- perLen -= hist[i];
- if (perLen < 0) {
- return decodeBitWidth(i);
- }
- }
-
- return 0;
- }
-
- /**
- * Read n bytes in big endian order and convert to long
- * @return long value
- */
- public long bytesToLongBE(InStream input, int n) throws IOException {
- long out = 0;
- long val = 0;
- while (n > 0) {
- n--;
- // store it in a long and then shift, otherwise integer overflow will occur
- val = input.read();
- out |= (val << (n * 8));
- }
- return out;
- }
-
- /**
- * Calculate the number of bytes required
- * @param n - number of values
- * @param numBits - bit width
- * @return number of bytes required
- */
- int getTotalBytesRequired(int n, int numBits) {
- return (n * numBits + 7) / 8;
- }
-
- /**
- * For a given bit width this function will return the closest available
- * fixed bit width
- * @param n
- * @return closest valid fixed bit
- */
- public int getClosestFixedBits(int n) {
- if (n == 0) {
- return 1;
- }
-
- if (n >= 1 && n <= 24) {
- return n;
- } else if (n > 24 && n <= 26) {
- return 26;
- } else if (n > 26 && n <= 28) {
- return 28;
- } else if (n > 28 && n <= 30) {
- return 30;
- } else if (n > 30 && n <= 32) {
- return 32;
- } else if (n > 32 && n <= 40) {
- return 40;
- } else if (n > 40 && n <= 48) {
- return 48;
- } else if (n > 48 && n <= 56) {
- return 56;
- } else {
- return 64;
- }
- }
-
- public int getClosestAlignedFixedBits(int n) {
- if (n == 0 || n == 1) {
- return 1;
- } else if (n > 1 && n <= 2) {
- return 2;
- } else if (n > 2 && n <= 4) {
- return 4;
- } else if (n > 4 && n <= 8) {
- return 8;
- } else if (n > 8 && n <= 16) {
- return 16;
- } else if (n > 16 && n <= 24) {
- return 24;
- } else if (n > 24 && n <= 32) {
- return 32;
- } else if (n > 32 && n <= 40) {
- return 40;
- } else if (n > 40 && n <= 48) {
- return 48;
- } else if (n > 48 && n <= 56) {
- return 56;
- } else {
- return 64;
- }
- }
-
- /**
- * Finds the closest available fixed bit width match and returns its encoded
- * value (ordinal)
- * @param n - fixed bit width to encode
- * @return encoded fixed bit width
- */
- public int encodeBitWidth(int n) {
- n = getClosestFixedBits(n);
-
- if (n >= 1 && n <= 24) {
- return n - 1;
- } else if (n > 24 && n <= 26) {
- return FixedBitSizes.TWENTYSIX.ordinal();
- } else if (n > 26 && n <= 28) {
- return FixedBitSizes.TWENTYEIGHT.ordinal();
- } else if (n > 28 && n <= 30) {
- return FixedBitSizes.THIRTY.ordinal();
- } else if (n > 30 && n <= 32) {
- return FixedBitSizes.THIRTYTWO.ordinal();
- } else if (n > 32 && n <= 40) {
- return FixedBitSizes.FORTY.ordinal();
- } else if (n > 40 && n <= 48) {
- return FixedBitSizes.FORTYEIGHT.ordinal();
- } else if (n > 48 && n <= 56) {
- return FixedBitSizes.FIFTYSIX.ordinal();
- } else {
- return FixedBitSizes.SIXTYFOUR.ordinal();
- }
- }
-
- /**
- * Decodes the ordinal fixed bit value to actual fixed bit width value
- * @param n - encoded fixed bit width
- * @return decoded fixed bit width
- */
- public int decodeBitWidth(int n) {
- if (n >= FixedBitSizes.ONE.ordinal()
- && n <= FixedBitSizes.TWENTYFOUR.ordinal()) {
- return n + 1;
- } else if (n == FixedBitSizes.TWENTYSIX.ordinal()) {
- return 26;
- } else if (n == FixedBitSizes.TWENTYEIGHT.ordinal()) {
- return 28;
- } else if (n == FixedBitSizes.THIRTY.ordinal()) {
- return 30;
- } else if (n == FixedBitSizes.THIRTYTWO.ordinal()) {
- return 32;
- } else if (n == FixedBitSizes.FORTY.ordinal()) {
- return 40;
- } else if (n == FixedBitSizes.FORTYEIGHT.ordinal()) {
- return 48;
- } else if (n == FixedBitSizes.FIFTYSIX.ordinal()) {
- return 56;
- } else {
- return 64;
- }
- }
-
- /**
- * Bitpack and write the input values to underlying output stream
- * @param input - values to write
- * @param offset - offset
- * @param len - length
- * @param bitSize - bit width
- * @param output - output stream
- * @throws IOException
- */
- public void writeInts(long[] input, int offset, int len, int bitSize,
- OutputStream output) throws IOException {
- if (input == null || input.length < 1 || offset < 0 || len < 1
- || bitSize < 1) {
- return;
- }
-
- switch (bitSize) {
- case 1:
- unrolledBitPack1(input, offset, len, output);
- return;
- case 2:
- unrolledBitPack2(input, offset, len, output);
- return;
- case 4:
- unrolledBitPack4(input, offset, len, output);
- return;
- case 8:
- unrolledBitPack8(input, offset, len, output);
- return;
- case 16:
- unrolledBitPack16(input, offset, len, output);
- return;
- case 24:
- unrolledBitPack24(input, offset, len, output);
- return;
- case 32:
- unrolledBitPack32(input, offset, len, output);
- return;
- case 40:
- unrolledBitPack40(input, offset, len, output);
- return;
- case 48:
- unrolledBitPack48(input, offset, len, output);
- return;
- case 56:
- unrolledBitPack56(input, offset, len, output);
- return;
- case 64:
- unrolledBitPack64(input, offset, len, output);
- return;
- default:
- break;
- }
-
- int bitsLeft = 8;
- byte current = 0;
- for(int i = offset; i < (offset + len); i++) {
- long value = input[i];
- int bitsToWrite = bitSize;
- while (bitsToWrite > bitsLeft) {
- // add the bits to the bottom of the current word
- current |= value >>> (bitsToWrite - bitsLeft);
- // subtract out the bits we just added
- bitsToWrite -= bitsLeft;
- // zero out the bits above bitsToWrite
- value &= (1L << bitsToWrite) - 1;
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
- bitsLeft -= bitsToWrite;
- current |= value << bitsLeft;
- if (bitsLeft == 0) {
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
- }
-
- // flush
- if (bitsLeft != 8) {
- output.write(current);
- current = 0;
- bitsLeft = 8;
- }
- }
-
- private void unrolledBitPack1(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- final int numHops = 8;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = (int) (val | ((input[i] & 1) << 7)
- | ((input[i + 1] & 1) << 6)
- | ((input[i + 2] & 1) << 5)
- | ((input[i + 3] & 1) << 4)
- | ((input[i + 4] & 1) << 3)
- | ((input[i + 5] & 1) << 2)
- | ((input[i + 6] & 1) << 1)
- | (input[i + 7]) & 1);
- output.write(val);
- val = 0;
- }
-
- if (remainder > 0) {
- int startShift = 7;
- for (int i = endUnroll; i < endOffset; i++) {
- val = (int) (val | (input[i] & 1) << startShift);
- startShift -= 1;
- }
- output.write(val);
- }
- }
-
- private void unrolledBitPack2(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- final int numHops = 4;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = (int) (val | ((input[i] & 3) << 6)
- | ((input[i + 1] & 3) << 4)
- | ((input[i + 2] & 3) << 2)
- | (input[i + 3]) & 3);
- output.write(val);
- val = 0;
- }
-
- if (remainder > 0) {
- int startShift = 6;
- for (int i = endUnroll; i < endOffset; i++) {
- val = (int) (val | (input[i] & 3) << startShift);
- startShift -= 2;
- }
- output.write(val);
- }
- }
-
- private void unrolledBitPack4(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- final int numHops = 2;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = (int) (val | ((input[i] & 15) << 4) | (input[i + 1]) & 15);
- output.write(val);
- val = 0;
- }
-
- if (remainder > 0) {
- int startShift = 4;
- for (int i = endUnroll; i < endOffset; i++) {
- val = (int) (val | (input[i] & 15) << startShift);
- startShift -= 4;
- }
- output.write(val);
- }
- }
-
- private void unrolledBitPack8(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 1);
- }
-
- private void unrolledBitPack16(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 2);
- }
-
- private void unrolledBitPack24(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 3);
- }
-
- private void unrolledBitPack32(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 4);
- }
-
- private void unrolledBitPack40(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 5);
- }
-
- private void unrolledBitPack48(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 6);
- }
-
- private void unrolledBitPack56(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 7);
- }
-
- private void unrolledBitPack64(long[] input, int offset, int len,
- OutputStream output) throws IOException {
- unrolledBitPackBytes(input, offset, len, output, 8);
- }
-
- private void unrolledBitPackBytes(long[] input, int offset, int len, OutputStream output, int numBytes) throws IOException {
- final int numHops = 8;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int i = offset;
- for (; i < endUnroll; i = i + numHops) {
- writeLongBE(output, input, i, numHops, numBytes);
- }
-
- if (remainder > 0) {
- writeRemainingLongs(output, i, input, remainder, numBytes);
- }
- }
-
- private void writeRemainingLongs(OutputStream output, int offset, long[] input, int remainder,
- int numBytes) throws IOException {
- final int numHops = remainder;
-
- int idx = 0;
- switch (numBytes) {
- case 1:
- while (remainder > 0) {
- writeBuffer[idx] = (byte) (input[offset + idx] & 255);
- remainder--;
- idx++;
- }
- break;
- case 2:
- while (remainder > 0) {
- writeLongBE2(output, input[offset + idx], idx * 2);
- remainder--;
- idx++;
- }
- break;
- case 3:
- while (remainder > 0) {
- writeLongBE3(output, input[offset + idx], idx * 3);
- remainder--;
- idx++;
- }
- break;
- case 4:
- while (remainder > 0) {
- writeLongBE4(output, input[offset + idx], idx * 4);
- remainder--;
- idx++;
- }
- break;
- case 5:
- while (remainder > 0) {
- writeLongBE5(output, input[offset + idx], idx * 5);
- remainder--;
- idx++;
- }
- break;
- case 6:
- while (remainder > 0) {
- writeLongBE6(output, input[offset + idx], idx * 6);
- remainder--;
- idx++;
- }
- break;
- case 7:
- while (remainder > 0) {
- writeLongBE7(output, input[offset + idx], idx * 7);
- remainder--;
- idx++;
- }
- break;
- case 8:
- while (remainder > 0) {
- writeLongBE8(output, input[offset + idx], idx * 8);
- remainder--;
- idx++;
- }
- break;
- default:
- break;
- }
-
- final int toWrite = numHops * numBytes;
- output.write(writeBuffer, 0, toWrite);
- }
-
- private void writeLongBE(OutputStream output, long[] input, int offset, int numHops, int numBytes) throws IOException {
-
- switch (numBytes) {
- case 1:
- writeBuffer[0] = (byte) (input[offset + 0] & 255);
- writeBuffer[1] = (byte) (input[offset + 1] & 255);
- writeBuffer[2] = (byte) (input[offset + 2] & 255);
- writeBuffer[3] = (byte) (input[offset + 3] & 255);
- writeBuffer[4] = (byte) (input[offset + 4] & 255);
- writeBuffer[5] = (byte) (input[offset + 5] & 255);
- writeBuffer[6] = (byte) (input[offset + 6] & 255);
- writeBuffer[7] = (byte) (input[offset + 7] & 255);
- break;
- case 2:
- writeLongBE2(output, input[offset + 0], 0);
- writeLongBE2(output, input[offset + 1], 2);
- writeLongBE2(output, input[offset + 2], 4);
- writeLongBE2(output, input[offset + 3], 6);
- writeLongBE2(output, input[offset + 4], 8);
- writeLongBE2(output, input[offset + 5], 10);
- writeLongBE2(output, input[offset + 6], 12);
- writeLongBE2(output, input[offset + 7], 14);
- break;
- case 3:
- writeLongBE3(output, input[offset + 0], 0);
- writeLongBE3(output, input[offset + 1], 3);
- writeLongBE3(output, input[offset + 2], 6);
- writeLongBE3(output, input[offset + 3], 9);
- writeLongBE3(output, input[offset + 4], 12);
- writeLongBE3(output, input[offset + 5], 15);
- writeLongBE3(output, input[offset + 6], 18);
- writeLongBE3(output, input[offset + 7], 21);
- break;
- case 4:
- writeLongBE4(output, input[offset + 0], 0);
- writeLongBE4(output, input[offset + 1], 4);
- writeLongBE4(output, input[offset + 2], 8);
- writeLongBE4(output, input[offset + 3], 12);
- writeLongBE4(output, input[offset + 4], 16);
- writeLongBE4(output, input[offset + 5], 20);
- writeLongBE4(output, input[offset + 6], 24);
- writeLongBE4(output, input[offset + 7], 28);
- break;
- case 5:
- writeLongBE5(output, input[offset + 0], 0);
- writeLongBE5(output, input[offset + 1], 5);
- writeLongBE5(output, input[offset + 2], 10);
- writeLongBE5(output, input[offset + 3], 15);
- writeLongBE5(output, input[offset + 4], 20);
- writeLongBE5(output, input[offset + 5], 25);
- writeLongBE5(output, input[offset + 6], 30);
- writeLongBE5(output, input[offset + 7], 35);
- break;
- case 6:
- writeLongBE6(output, input[offset + 0], 0);
- writeLongBE6(output, input[offset + 1], 6);
- writeLongBE6(output, input[offset + 2], 12);
- writeLongBE6(output, input[offset + 3], 18);
- writeLongBE6(output, input[offset + 4], 24);
- writeLongBE6(output, input[offset + 5], 30);
- writeLongBE6(output, input[offset + 6], 36);
- writeLongBE6(output, input[offset + 7], 42);
- break;
- case 7:
- writeLongBE7(output, input[offset + 0], 0);
- writeLongBE7(output, input[offset + 1], 7);
- writeLongBE7(output, input[offset + 2], 14);
- writeLongBE7(output, input[offset + 3], 21);
- writeLongBE7(output, input[offset + 4], 28);
- writeLongBE7(output, input[offset + 5], 35);
- writeLongBE7(output, input[offset + 6], 42);
- writeLongBE7(output, input[offset + 7], 49);
- break;
- case 8:
- writeLongBE8(output, input[offset + 0], 0);
- writeLongBE8(output, input[offset + 1], 8);
- writeLongBE8(output, input[offset + 2], 16);
- writeLongBE8(output, input[offset + 3], 24);
- writeLongBE8(output, input[offset + 4], 32);
- writeLongBE8(output, input[offset + 5], 40);
- writeLongBE8(output, input[offset + 6], 48);
- writeLongBE8(output, input[offset + 7], 56);
- break;
- default:
- break;
- }
-
- final int toWrite = numHops * numBytes;
- output.write(writeBuffer, 0, toWrite);
- }
-
- private void writeLongBE2(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 0);
- }
-
- private void writeLongBE3(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 0);
- }
-
- private void writeLongBE4(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 0);
- }
-
- private void writeLongBE5(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 0);
- }
-
- private void writeLongBE6(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 40);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 5] = (byte) (val >>> 0);
- }
-
- private void writeLongBE7(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 48);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 40);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 5] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 6] = (byte) (val >>> 0);
- }
-
- private void writeLongBE8(OutputStream output, long val, int wbOffset) {
- writeBuffer[wbOffset + 0] = (byte) (val >>> 56);
- writeBuffer[wbOffset + 1] = (byte) (val >>> 48);
- writeBuffer[wbOffset + 2] = (byte) (val >>> 40);
- writeBuffer[wbOffset + 3] = (byte) (val >>> 32);
- writeBuffer[wbOffset + 4] = (byte) (val >>> 24);
- writeBuffer[wbOffset + 5] = (byte) (val >>> 16);
- writeBuffer[wbOffset + 6] = (byte) (val >>> 8);
- writeBuffer[wbOffset + 7] = (byte) (val >>> 0);
- }
-
- /**
- * Read bitpacked integers from input stream
- * @param buffer - input buffer
- * @param offset - offset
- * @param len - length
- * @param bitSize - bit width
- * @param input - input stream
- * @throws IOException
- */
- public void readInts(long[] buffer, int offset, int len, int bitSize,
- InStream input) throws IOException {
- int bitsLeft = 0;
- int current = 0;
-
- switch (bitSize) {
- case 1:
- unrolledUnPack1(buffer, offset, len, input);
- return;
- case 2:
- unrolledUnPack2(buffer, offset, len, input);
- return;
- case 4:
- unrolledUnPack4(buffer, offset, len, input);
- return;
- case 8:
- unrolledUnPack8(buffer, offset, len, input);
- return;
- case 16:
- unrolledUnPack16(buffer, offset, len, input);
- return;
- case 24:
- unrolledUnPack24(buffer, offset, len, input);
- return;
- case 32:
- unrolledUnPack32(buffer, offset, len, input);
- return;
- case 40:
- unrolledUnPack40(buffer, offset, len, input);
- return;
- case 48:
- unrolledUnPack48(buffer, offset, len, input);
- return;
- case 56:
- unrolledUnPack56(buffer, offset, len, input);
- return;
- case 64:
- unrolledUnPack64(buffer, offset, len, input);
- return;
- default:
- break;
- }
-
- for(int i = offset; i < (offset + len); i++) {
- long result = 0;
- int bitsLeftToRead = bitSize;
- while (bitsLeftToRead > bitsLeft) {
- result <<= bitsLeft;
- result |= current & ((1 << bitsLeft) - 1);
- bitsLeftToRead -= bitsLeft;
- current = input.read();
- bitsLeft = 8;
- }
-
- // handle the left over bits
- if (bitsLeftToRead > 0) {
- result <<= bitsLeftToRead;
- bitsLeft -= bitsLeftToRead;
- result |= (current >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
- }
- buffer[i] = result;
- }
- }
-
-
- private void unrolledUnPack1(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- final int numHops = 8;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = input.read();
- buffer[i] = (val >>> 7) & 1;
- buffer[i + 1] = (val >>> 6) & 1;
- buffer[i + 2] = (val >>> 5) & 1;
- buffer[i + 3] = (val >>> 4) & 1;
- buffer[i + 4] = (val >>> 3) & 1;
- buffer[i + 5] = (val >>> 2) & 1;
- buffer[i + 6] = (val >>> 1) & 1;
- buffer[i + 7] = val & 1;
- }
-
- if (remainder > 0) {
- int startShift = 7;
- val = input.read();
- for (int i = endUnroll; i < endOffset; i++) {
- buffer[i] = (val >>> startShift) & 1;
- startShift -= 1;
- }
- }
- }
-
- private void unrolledUnPack2(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- final int numHops = 4;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = input.read();
- buffer[i] = (val >>> 6) & 3;
- buffer[i + 1] = (val >>> 4) & 3;
- buffer[i + 2] = (val >>> 2) & 3;
- buffer[i + 3] = val & 3;
- }
-
- if (remainder > 0) {
- int startShift = 6;
- val = input.read();
- for (int i = endUnroll; i < endOffset; i++) {
- buffer[i] = (val >>> startShift) & 3;
- startShift -= 2;
- }
- }
- }
-
- private void unrolledUnPack4(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- final int numHops = 2;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int val = 0;
- for (int i = offset; i < endUnroll; i = i + numHops) {
- val = input.read();
- buffer[i] = (val >>> 4) & 15;
- buffer[i + 1] = val & 15;
- }
-
- if (remainder > 0) {
- int startShift = 4;
- val = input.read();
- for (int i = endUnroll; i < endOffset; i++) {
- buffer[i] = (val >>> startShift) & 15;
- startShift -= 4;
- }
- }
- }
-
- private void unrolledUnPack8(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- unrolledUnPackBytes(buffer, offset, len, input, 1);
- }
-
- private void unrolledUnPack16(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- unrolledUnPackBytes(buffer, offset, len, input, 2);
- }
-
- private void unrolledUnPack24(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- unrolledUnPackBytes(buffer, offset, len, input, 3);
- }
-
- private void unrolledUnPack32(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- unrolledUnPackBytes(buffer, offset, len, input, 4);
- }
-
- private void unrolledUnPack40(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- unrolledUnPackBytes(buffer, offset, len, input, 5);
- }
-
- private void unrolledUnPack48(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- unrolledUnPackBytes(buffer, offset, len, input, 6);
- }
-
- private void unrolledUnPack56(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- unrolledUnPackBytes(buffer, offset, len, input, 7);
- }
-
- private void unrolledUnPack64(long[] buffer, int offset, int len,
- InStream input) throws IOException {
- unrolledUnPackBytes(buffer, offset, len, input, 8);
- }
-
- private void unrolledUnPackBytes(long[] buffer, int offset, int len, InStream input, int numBytes)
- throws IOException {
- final int numHops = 8;
- final int remainder = len % numHops;
- final int endOffset = offset + len;
- final int endUnroll = endOffset - remainder;
- int i = offset;
- for (; i < endUnroll; i = i + numHops) {
- readLongBE(input, buffer, i, numHops, numBytes);
- }
-
- if (remainder > 0) {
- readRemainingLongs(buffer, i, input, remainder, numBytes);
- }
- }
-
- private void readRemainingLongs(long[] buffer, int offset, InStream input, int remainder,
- int numBytes) throws IOException {
- final int toRead = remainder * numBytes;
- // bulk read to buffer
- int bytesRead = input.read(readBuffer, 0, toRead);
- while (bytesRead != toRead) {
- bytesRead += input.read(readBuffer, bytesRead, toRead - bytesRead);
- }
-
- int idx = 0;
- switch (numBytes) {
- case 1:
- while (remainder > 0) {
- buffer[offset++] = readBuffer[idx] & 255;
- remainder--;
- idx++;
- }
- break;
- case 2:
- while (remainder > 0) {
- buffer[offset++] = readLongBE2(input, idx * 2);
- remainder--;
- idx++;
- }
- break;
- case 3:
- while (remainder > 0) {
- buffer[offset++] = readLongBE3(input, idx * 3);
- remainder--;
- idx++;
- }
- break;
- case 4:
- while (remainder > 0) {
- buffer[offset++] = readLongBE4(input, idx * 4);
- remainder--;
- idx++;
- }
- break;
- case 5:
- while (remainder > 0) {
- buffer[offset++] = readLongBE5(input, idx * 5);
- remainder--;
- idx++;
- }
- break;
- case 6:
- while (remainder > 0) {
- buffer[offset++] = readLongBE6(input, idx * 6);
- remainder--;
- idx++;
- }
- break;
- case 7:
- while (remainder > 0) {
- buffer[offset++] = readLongBE7(input, idx * 7);
- remainder--;
- idx++;
- }
- break;
- case 8:
- while (remainder > 0) {
- buffer[offset++] = readLongBE8(input, idx * 8);
- remainder--;
- idx++;
- }
- break;
- default:
- break;
- }
- }
-
- private void readLongBE(InStream in, long[] buffer, int start, int numHops, int numBytes)
- throws IOException {
- final int toRead = numHops * numBytes;
- // bulk read to buffer
- int bytesRead = in.read(readBuffer, 0, toRead);
- while (bytesRead != toRead) {
- bytesRead += in.read(readBuffer, bytesRead, toRead - bytesRead);
- }
-
- switch (numBytes) {
- case 1:
- buffer[start + 0] = readBuffer[0] & 255;
- buffer[start + 1] = readBuffer[1] & 255;
- buffer[start + 2] = readBuffer[2] & 255;
- buffer[start + 3] = readBuffer[3] & 255;
- buffer[start + 4] = readBuffer[4] & 255;
- buffer[start + 5] = readBuffer[5] & 255;
- buffer[start + 6] = readBuffer[6] & 255;
- buffer[start + 7] = readBuffer[7] & 255;
- break;
- case 2:
- buffer[start + 0] = readLongBE2(in, 0);
- buffer[start + 1] = readLongBE2(in, 2);
- buffer[start + 2] = readLongBE2(in, 4);
- buffer[start + 3] = readLongBE2(in, 6);
- buffer[start + 4] = readLongBE2(in, 8);
- buffer[start + 5] = readLongBE2(in, 10);
- buffer[start + 6] = readLongBE2(in, 12);
- buffer[start + 7] = readLongBE2(in, 14);
- break;
- case 3:
- buffer[start + 0] = readLongBE3(in, 0);
- buffer[start + 1] = readLongBE3(in, 3);
- buffer[start + 2] = readLongBE3(in, 6);
- buffer[start + 3] = readLongBE3(in, 9);
- buffer[start + 4] = readLongBE3(in, 12);
- buffer[start + 5] = readLongBE3(in, 15);
- buffer[start + 6] = readLongBE3(in, 18);
- buffer[start + 7] = readLongBE3(in, 21);
- break;
- case 4:
- buffer[start + 0] = readLongBE4(in, 0);
- buffer[start + 1] = readLongBE4(in, 4);
- buffer[start + 2] = readLongBE4(in, 8);
- buffer[start + 3] = readLongBE4(in, 12);
- buffer[start + 4] = readLongBE4(in, 16);
- buffer[start + 5] = readLongBE4(in, 20);
- buffer[start + 6] = readLongBE4(in, 24);
- buffer[start + 7] = readLongBE4(in, 28);
- break;
- case 5:
- buffer[start + 0] = readLongBE5(in, 0);
- buffer[start + 1] = readLongBE5(in, 5);
- buffer[start + 2] = readLongBE5(in, 10);
- buffer[start + 3] = readLongBE5(in, 15);
- buffer[start + 4] = readLongBE5(in, 20);
- buffer[start + 5] = readLongBE5(in, 25);
- buffer[start + 6] = readLongBE5(in, 30);
- buffer[start + 7] = readLongBE5(in, 35);
- break;
- case 6:
- buffer[start + 0] = readLongBE6(in, 0);
- buffer[start + 1] = readLongBE6(in, 6);
- buffer[start + 2] = readLongBE6(in, 12);
- buffer[start + 3] = readLongBE6(in, 18);
- buffer[start + 4] = readLongBE6(in, 24);
- buffer[start + 5] = readLongBE6(in, 30);
- buffer[start + 6] = readLongBE6(in, 36);
- buffer[start + 7] = readLongBE6(in, 42);
- break;
- case 7:
- buffer[start + 0] = readLongBE7(in, 0);
- buffer[start + 1] = readLongBE7(in, 7);
- buffer[start + 2] = readLongBE7(in, 14);
- buffer[start + 3] = readLongBE7(in, 21);
- buffer[start + 4] = readLongBE7(in, 28);
- buffer[start + 5] = readLongBE7(in, 35);
- buffer[start + 6] = readLongBE7(in, 42);
- buffer[start + 7] = readLongBE7(in, 49);
- break;
- case 8:
- buffer[start + 0] = readLongBE8(in, 0);
- buffer[start + 1] = readLongBE8(in, 8);
- buffer[start + 2] = readLongBE8(in, 16);
- buffer[start + 3] = readLongBE8(in, 24);
- buffer[start + 4] = readLongBE8(in, 32);
- buffer[start + 5] = readLongBE8(in, 40);
- buffer[start + 6] = readLongBE8(in, 48);
- buffer[start + 7] = readLongBE8(in, 56);
- break;
- default:
- break;
- }
- }
-
- private long readLongBE2(InStream in, int rbOffset) {
- return (((readBuffer[rbOffset] & 255) << 8)
- + ((readBuffer[rbOffset + 1] & 255) << 0));
- }
-
- private long readLongBE3(InStream in, int rbOffset) {
- return (((readBuffer[rbOffset] & 255) << 16)
- + ((readBuffer[rbOffset + 1] & 255) << 8)
- + ((readBuffer[rbOffset + 2] & 255) << 0));
- }
-
- private long readLongBE4(InStream in, int rbOffset) {
- return (((long) (readBuffer[rbOffset] & 255) << 24)
- + ((readBuffer[rbOffset + 1] & 255) << 16)
- + ((readBuffer[rbOffset + 2] & 255) << 8)
- + ((readBuffer[rbOffset + 3] & 255) << 0));
- }
-
- private long readLongBE5(InStream in, int rbOffset) {
- return (((long) (readBuffer[rbOffset] & 255) << 32)
- + ((long) (readBuffer[rbOffset + 1] & 255) << 24)
- + ((readBuffer[rbOffset + 2] & 255) << 16)
- + ((readBuffer[rbOffset + 3] & 255) << 8)
- + ((readBuffer[rbOffset + 4] & 255) << 0));
- }
-
- private long readLongBE6(InStream in, int rbOffset) {
- return (((long) (readBuffer[rbOffset] & 255) << 40)
- + ((long) (readBuffer[rbOffset + 1] & 255) << 32)
- + ((long) (readBuffer[rbOffset + 2] & 255) << 24)
- + ((readBuffer[rbOffset + 3] & 255) << 16)
- + ((readBuffer[rbOffset + 4] & 255) << 8)
- + ((readBuffer[rbOffset + 5] & 255) << 0));
- }
-
- private long readLongBE7(InStream in, int rbOffset) {
- return (((long) (readBuffer[rbOffset] & 255) << 48)
- + ((long) (readBuffer[rbOffset + 1] & 255) << 40)
- + ((long) (readBuffer[rbOffset + 2] & 255) << 32)
- + ((long) (readBuffer[rbOffset + 3] & 255) << 24)
- + ((readBuffer[rbOffset + 4] & 255) << 16)
- + ((readBuffer[rbOffset + 5] & 255) << 8)
- + ((readBuffer[rbOffset + 6] & 255) << 0));
- }
-
- private long readLongBE8(InStream in, int rbOffset) {
- return (((long) (readBuffer[rbOffset] & 255) << 56)
- + ((long) (readBuffer[rbOffset + 1] & 255) << 48)
- + ((long) (readBuffer[rbOffset + 2] & 255) << 40)
- + ((long) (readBuffer[rbOffset + 3] & 255) << 32)
- + ((long) (readBuffer[rbOffset + 4] & 255) << 24)
- + ((readBuffer[rbOffset + 5] & 255) << 16)
- + ((readBuffer[rbOffset + 6] & 255) << 8)
- + ((readBuffer[rbOffset + 7] & 255) << 0));
- }
-
- // Do not want to use Guava LongMath.checkedSubtract() here as it will throw
- // ArithmeticException in case of overflow
- public boolean isSafeSubtract(long left, long right) {
- return (left ^ right) >= 0 | (left ^ (left - right)) >= 0;
- }
-}
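As a side note on the variable-length integer format documented in the removed SerializationUtils above, the following hypothetical round trip illustrates how zigzag encoding keeps small negative values short. It is not part of this commit and is written only against the public methods shown in the diff.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.math.BigInteger;
import org.apache.orc.impl.SerializationUtils;

public class VarintSketch {
  public static void main(String[] args) throws Exception {
    SerializationUtils utils = new SerializationUtils();
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // Zigzag maps -1 -> 1, 1 -> 2, -2 -> 3, ... so small magnitudes stay small.
    System.out.println(utils.zigzagEncode(-1L));              // prints 1
    utils.writeVslong(out, -3L);                               // one byte: zigzag(-3) = 5
    utils.writeVulong(out, 300L);                              // two bytes: two 7-bit groups
    SerializationUtils.writeBigInteger(out, BigInteger.valueOf(-9876543210L));
    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
    System.out.println(utils.readVslong(in));                  // -3
    System.out.println(utils.readVulong(in));                  // 300
    System.out.println(SerializationUtils.readBigInteger(in)); // -9876543210
  }
}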
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/SettableUncompressedStream.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/SettableUncompressedStream.java b/orc/src/java/org/apache/orc/impl/SettableUncompressedStream.java
deleted file mode 100644
index f9e29eb..0000000
--- a/orc/src/java/org/apache/orc/impl/SettableUncompressedStream.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.util.List;
-
-import org.apache.hadoop.hive.common.DiskRangeInfo;
-import org.apache.hadoop.hive.common.io.DiskRange;
-import org.apache.orc.impl.InStream;
-
-/**
- * An uncompressed stream whose underlying byte buffer can be set.
- */
-public class SettableUncompressedStream extends InStream.UncompressedStream {
-
- public SettableUncompressedStream(String name, List<DiskRange> input, long length) {
- super(name, input, length);
- setOffset(input);
- }
-
- public void setBuffers(DiskRangeInfo diskRangeInfo) {
- reset(diskRangeInfo.getDiskRanges(), diskRangeInfo.getTotalLength());
- setOffset(diskRangeInfo.getDiskRanges());
- }
-
- private void setOffset(List<DiskRange> list) {
- currentOffset = list.isEmpty() ? 0 : list.get(0).getOffset();
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/SnappyCodec.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/SnappyCodec.java b/orc/src/java/org/apache/orc/impl/SnappyCodec.java
deleted file mode 100644
index dd4f30c..0000000
--- a/orc/src/java/org/apache/orc/impl/SnappyCodec.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.orc.CompressionCodec;
-import org.iq80.snappy.Snappy;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.EnumSet;
-
-public class SnappyCodec implements CompressionCodec, DirectDecompressionCodec {
- private static final HadoopShims SHIMS = HadoopShims.Factory.get();
-
- Boolean direct = null;
-
- @Override
- public boolean compress(ByteBuffer in, ByteBuffer out,
- ByteBuffer overflow) throws IOException {
- int inBytes = in.remaining();
- // I should work on a patch for Snappy to support an overflow buffer
- // to prevent the extra buffer copy.
- byte[] compressed = new byte[Snappy.maxCompressedLength(inBytes)];
- int outBytes =
- Snappy.compress(in.array(), in.arrayOffset() + in.position(), inBytes,
- compressed, 0);
- if (outBytes < inBytes) {
- int remaining = out.remaining();
- if (remaining >= outBytes) {
- System.arraycopy(compressed, 0, out.array(), out.arrayOffset() +
- out.position(), outBytes);
- out.position(out.position() + outBytes);
- } else {
- System.arraycopy(compressed, 0, out.array(), out.arrayOffset() +
- out.position(), remaining);
- out.position(out.limit());
- System.arraycopy(compressed, remaining, overflow.array(),
- overflow.arrayOffset(), outBytes - remaining);
- overflow.position(outBytes - remaining);
- }
- return true;
- } else {
- return false;
- }
- }
-
- @Override
- public void decompress(ByteBuffer in, ByteBuffer out) throws IOException {
- if(in.isDirect() && out.isDirect()) {
- directDecompress(in, out);
- return;
- }
- int inOffset = in.position();
- int uncompressLen =
- Snappy.uncompress(in.array(), in.arrayOffset() + inOffset,
- in.limit() - inOffset, out.array(), out.arrayOffset() + out.position());
- out.position(uncompressLen + out.position());
- out.flip();
- }
-
- @Override
- public boolean isAvailable() {
- if (direct == null) {
- try {
- if (SHIMS.getDirectDecompressor(
- HadoopShims.DirectCompressionType.SNAPPY) != null) {
- direct = Boolean.valueOf(true);
- } else {
- direct = Boolean.valueOf(false);
- }
- } catch (UnsatisfiedLinkError ule) {
- direct = Boolean.valueOf(false);
- }
- }
- return direct.booleanValue();
- }
-
- @Override
- public void directDecompress(ByteBuffer in, ByteBuffer out)
- throws IOException {
- HadoopShims.DirectDecompressor decompressShim =
- SHIMS.getDirectDecompressor(HadoopShims.DirectCompressionType.SNAPPY);
- decompressShim.decompress(in, out);
- out.flip(); // flip for read
- }
-
- @Override
- public CompressionCodec modify(EnumSet<Modifier> modifiers) {
- // snappy allows no modifications
- return this;
- }
-}
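For orientation, a hypothetical round trip through the removed SnappyCodec follows. It is illustrative only and not part of this commit; it assumes the org.iq80.snappy library used by the deleted class is on the classpath and that both buffers are heap-backed, as the non-direct code path requires.

import java.nio.ByteBuffer;
import org.apache.orc.impl.SnappyCodec;

public class SnappyCodecSketch {
  public static void main(String[] args) throws Exception {
    SnappyCodec codec = new SnappyCodec();
    ByteBuffer in = ByteBuffer.wrap(new byte[1024]);     // all zeros, highly compressible
    ByteBuffer compressed = ByteBuffer.allocate(1024);
    ByteBuffer overflow = ByteBuffer.allocate(1024);
    // compress() returns false when the compressed form would not be smaller.
    if (codec.compress(in, compressed, overflow)) {
      compressed.flip();
      ByteBuffer restored = ByteBuffer.allocate(1024);
      codec.decompress(compressed, restored);            // decompress() flips the output for reading
      System.out.println("restored " + restored.remaining() + " bytes");
    }
  }
}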
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/StreamName.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/StreamName.java b/orc/src/java/org/apache/orc/impl/StreamName.java
deleted file mode 100644
index b3fd145..0000000
--- a/orc/src/java/org/apache/orc/impl/StreamName.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc.impl;
-
-import org.apache.orc.OrcProto;
-
-/**
- * The name of a stream within a stripe.
- */
-public class StreamName implements Comparable<StreamName> {
- private final int column;
- private final OrcProto.Stream.Kind kind;
-
- public static enum Area {
- DATA, INDEX
- }
-
- public StreamName(int column, OrcProto.Stream.Kind kind) {
- this.column = column;
- this.kind = kind;
- }
-
- public boolean equals(Object obj) {
- if (obj != null && obj instanceof StreamName) {
- StreamName other = (StreamName) obj;
- return other.column == column && other.kind == kind;
- } else {
- return false;
- }
- }
-
- @Override
- public int compareTo(StreamName streamName) {
- if (streamName == null) {
- return -1;
- }
- Area area = getArea(kind);
- Area otherArea = streamName.getArea(streamName.kind);
- if (area != otherArea) {
- return -area.compareTo(otherArea);
- }
- if (column != streamName.column) {
- return column < streamName.column ? -1 : 1;
- }
- return kind.compareTo(streamName.kind);
- }
-
- public int getColumn() {
- return column;
- }
-
- public OrcProto.Stream.Kind getKind() {
- return kind;
- }
-
- public Area getArea() {
- return getArea(kind);
- }
-
- public static Area getArea(OrcProto.Stream.Kind kind) {
- switch (kind) {
- case ROW_INDEX:
- case DICTIONARY_COUNT:
- case BLOOM_FILTER:
- return Area.INDEX;
- default:
- return Area.DATA;
- }
- }
-
- @Override
- public String toString() {
- return "Stream for column " + column + " kind " + kind;
- }
-
- @Override
- public int hashCode() {
- return column * 101 + kind.getNumber();
- }
-}
-
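To make the ordering defined by the removed StreamName.compareTo concrete, here is a small hypothetical sketch (not part of this commit): index-area streams sort ahead of data-area streams, then by column, then by kind.

import java.util.TreeSet;
import org.apache.orc.OrcProto;
import org.apache.orc.impl.StreamName;

public class StreamNameSketch {
  public static void main(String[] args) {
    TreeSet<StreamName> streams = new TreeSet<>();
    streams.add(new StreamName(1, OrcProto.Stream.Kind.DATA));
    streams.add(new StreamName(1, OrcProto.Stream.Kind.ROW_INDEX));
    streams.add(new StreamName(0, OrcProto.Stream.Kind.ROW_INDEX));
    // Iteration order per compareTo:
    //   Stream for column 0 kind ROW_INDEX
    //   Stream for column 1 kind ROW_INDEX
    //   Stream for column 1 kind DATA
    streams.forEach(System.out::println);
  }
}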
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/java/org/apache/orc/impl/StringRedBlackTree.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/StringRedBlackTree.java b/orc/src/java/org/apache/orc/impl/StringRedBlackTree.java
deleted file mode 100644
index c353ab0..0000000
--- a/orc/src/java/org/apache/orc/impl/StringRedBlackTree.java
+++ /dev/null
@@ -1,210 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc.impl;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-import org.apache.hadoop.io.Text;
-import org.apache.orc.impl.DynamicByteArray;
-import org.apache.orc.impl.DynamicIntArray;
-import org.apache.orc.impl.RedBlackTree;
-
-/**
- * A red-black tree that stores strings. The strings are stored as UTF-8 bytes
- * and an offset for each entry.
- */
-public class StringRedBlackTree extends RedBlackTree {
- private final DynamicByteArray byteArray = new DynamicByteArray();
- private final DynamicIntArray keyOffsets;
- private final Text newKey = new Text();
-
- public StringRedBlackTree(int initialCapacity) {
- super(initialCapacity);
- keyOffsets = new DynamicIntArray(initialCapacity);
- }
-
- public int add(String value) {
- newKey.set(value);
- return addNewKey();
- }
-
- private int addNewKey() {
- // if the newKey is actually new, add it to our byteArray and store the offset & length
- if (add()) {
- int len = newKey.getLength();
- keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len));
- }
- return lastAdd;
- }
-
- public int add(Text value) {
- newKey.set(value);
- return addNewKey();
- }
-
- public int add(byte[] bytes, int offset, int length) {
- newKey.set(bytes, offset, length);
- return addNewKey();
- }
-
- @Override
- protected int compareValue(int position) {
- int start = keyOffsets.get(position);
- int end;
- if (position + 1 == keyOffsets.size()) {
- end = byteArray.size();
- } else {
- end = keyOffsets.get(position+1);
- }
- return byteArray.compare(newKey.getBytes(), 0, newKey.getLength(),
- start, end - start);
- }
-
- /**
- * The information about each node.
- */
- public interface VisitorContext {
- /**
- * Get the position where the key was originally added.
- * @return the number returned by add.
- */
- int getOriginalPosition();
-
- /**
- * Write the bytes for the string to the given output stream.
- * @param out the stream to write to.
- * @throws IOException
- */
- void writeBytes(OutputStream out) throws IOException;
-
- /**
- * Get the original string.
- * @return the string
- */
- Text getText();
-
- /**
- * Get the number of bytes.
- * @return the string's length in bytes
- */
- int getLength();
- }
-
- /**
- * The interface for visitors.
- */
- public interface Visitor {
- /**
- * Called once for each node of the tree in sort order.
- * @param context the information about each node
- * @throws IOException
- */
- void visit(VisitorContext context) throws IOException;
- }
-
- private class VisitorContextImpl implements VisitorContext {
- private int originalPosition;
- private int start;
- private int end;
- private final Text text = new Text();
-
- public int getOriginalPosition() {
- return originalPosition;
- }
-
- public Text getText() {
- byteArray.setText(text, start, end - start);
- return text;
- }
-
- public void writeBytes(OutputStream out) throws IOException {
- byteArray.write(out, start, end - start);
- }
-
- public int getLength() {
- return end - start;
- }
-
- void setPosition(int position) {
- originalPosition = position;
- start = keyOffsets.get(originalPosition);
- if (position + 1 == keyOffsets.size()) {
- end = byteArray.size();
- } else {
- end = keyOffsets.get(originalPosition + 1);
- }
- }
- }
-
- private void recurse(int node, Visitor visitor, VisitorContextImpl context
- ) throws IOException {
- if (node != NULL) {
- recurse(getLeft(node), visitor, context);
- context.setPosition(node);
- visitor.visit(context);
- recurse(getRight(node), visitor, context);
- }
- }
-
- /**
- * Visit all of the nodes in the tree in sorted order.
- * @param visitor the action to be applied to each node
- * @throws IOException
- */
- public void visit(Visitor visitor) throws IOException {
- recurse(root, visitor, new VisitorContextImpl());
- }
-
- /**
- * Reset the table to empty.
- */
- public void clear() {
- super.clear();
- byteArray.clear();
- keyOffsets.clear();
- }
-
- public void getText(Text result, int originalPosition) {
- int offset = keyOffsets.get(originalPosition);
- int length;
- if (originalPosition + 1 == keyOffsets.size()) {
- length = byteArray.size() - offset;
- } else {
- length = keyOffsets.get(originalPosition + 1) - offset;
- }
- byteArray.setText(result, offset, length);
- }
-
- /**
- * Get the size of the character data in the table.
- * @return the bytes used by the table
- */
- public int getCharacterSize() {
- return byteArray.size();
- }
-
- /**
- * Calculate the approximate size in memory.
- * @return the number of bytes used in storing the tree.
- */
- public long getSizeInBytes() {
- return byteArray.getSizeInBytes() + keyOffsets.getSizeInBytes() +
- super.getSizeInBytes();
- }
-}
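A minimal sketch (not part of this commit) of how the dictionary tree above behaves: add() deduplicates keys and returns the position assigned on first insertion, while visit() walks the entries in sorted order with their original positions, which is what the ORC writer relies on when emitting string dictionaries.

    // Hypothetical illustration only; visit() can throw IOException.
    StringRedBlackTree dict = new StringRedBlackTree(16);
    int banana = dict.add("banana"); // 0 -- first new key
    int apple = dict.add("apple");   // 1 -- second new key
    int again = dict.add("banana");  // 0 -- duplicate, same position as before
    dict.visit(context -> System.out.println(
        context.getOriginalPosition() + " " + context.getText()));
    // prints "1 apple" then "0 banana": sorted order, original positions preserved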
[06/37] hive git commit: HIVE-17118. Move the hive-orc source files
to make the package names unique.
Posted by om...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestColumnStatistics.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestColumnStatistics.java b/orc/src/test/org/apache/orc/TestColumnStatistics.java
deleted file mode 100644
index 93d4bdb..0000000
--- a/orc/src/test/org/apache/orc/TestColumnStatistics.java
+++ /dev/null
@@ -1,365 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.orc;
-
-import static junit.framework.Assert.assertEquals;
-import static org.junit.Assume.assumeTrue;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.PrintStream;
-import java.sql.Timestamp;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.orc.impl.ColumnStatisticsImpl;
-import org.apache.orc.tools.FileDump;
-import org.apache.orc.tools.TestFileDump;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-/**
- * Test ColumnStatisticsImpl for ORC.
- */
-public class TestColumnStatistics {
-
- @Test
- public void testLongMerge() throws Exception {
- TypeDescription schema = TypeDescription.createInt();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateInteger(10, 2);
- stats2.updateInteger(1, 1);
- stats2.updateInteger(1000, 1);
- stats1.merge(stats2);
- IntegerColumnStatistics typed = (IntegerColumnStatistics) stats1;
- assertEquals(1, typed.getMinimum());
- assertEquals(1000, typed.getMaximum());
- stats1.reset();
- stats1.updateInteger(-10, 1);
- stats1.updateInteger(10000, 1);
- stats1.merge(stats2);
- assertEquals(-10, typed.getMinimum());
- assertEquals(10000, typed.getMaximum());
- }
-
- @Test
- public void testDoubleMerge() throws Exception {
- TypeDescription schema = TypeDescription.createDouble();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateDouble(10.0);
- stats1.updateDouble(100.0);
- stats2.updateDouble(1.0);
- stats2.updateDouble(1000.0);
- stats1.merge(stats2);
- DoubleColumnStatistics typed = (DoubleColumnStatistics) stats1;
- assertEquals(1.0, typed.getMinimum(), 0.001);
- assertEquals(1000.0, typed.getMaximum(), 0.001);
- stats1.reset();
- stats1.updateDouble(-10);
- stats1.updateDouble(10000);
- stats1.merge(stats2);
- assertEquals(-10, typed.getMinimum(), 0.001);
- assertEquals(10000, typed.getMaximum(), 0.001);
- }
-
-
- @Test
- public void testStringMerge() throws Exception {
- TypeDescription schema = TypeDescription.createString();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateString(new Text("bob"));
- stats1.updateString(new Text("david"));
- stats1.updateString(new Text("charles"));
- stats2.updateString(new Text("anne"));
- byte[] erin = new byte[]{0, 1, 2, 3, 4, 5, 101, 114, 105, 110};
- stats2.updateString(erin, 6, 4, 5);
- assertEquals(24, ((StringColumnStatistics)stats2).getSum());
- stats1.merge(stats2);
- StringColumnStatistics typed = (StringColumnStatistics) stats1;
- assertEquals("anne", typed.getMinimum());
- assertEquals("erin", typed.getMaximum());
- assertEquals(39, typed.getSum());
- stats1.reset();
- stats1.updateString(new Text("aaa"));
- stats1.updateString(new Text("zzz"));
- stats1.merge(stats2);
- assertEquals("aaa", typed.getMinimum());
- assertEquals("zzz", typed.getMaximum());
- }
-
- @Test
- public void testDateMerge() throws Exception {
- TypeDescription schema = TypeDescription.createDate();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateDate(new DateWritable(1000));
- stats1.updateDate(new DateWritable(100));
- stats2.updateDate(new DateWritable(10));
- stats2.updateDate(new DateWritable(2000));
- stats1.merge(stats2);
- DateColumnStatistics typed = (DateColumnStatistics) stats1;
- assertEquals(new DateWritable(10).get(), typed.getMinimum());
- assertEquals(new DateWritable(2000).get(), typed.getMaximum());
- stats1.reset();
- stats1.updateDate(new DateWritable(-10));
- stats1.updateDate(new DateWritable(10000));
- stats1.merge(stats2);
- assertEquals(new DateWritable(-10).get(), typed.getMinimum());
- assertEquals(new DateWritable(10000).get(), typed.getMaximum());
- }
-
- @Test
- public void testTimestampMerge() throws Exception {
- TypeDescription schema = TypeDescription.createTimestamp();
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateTimestamp(new Timestamp(10));
- stats1.updateTimestamp(new Timestamp(100));
- stats2.updateTimestamp(new Timestamp(1));
- stats2.updateTimestamp(new Timestamp(1000));
- stats1.merge(stats2);
- TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
- assertEquals(1, typed.getMinimum().getTime());
- assertEquals(1000, typed.getMaximum().getTime());
- stats1.reset();
- stats1.updateTimestamp(new Timestamp(-10));
- stats1.updateTimestamp(new Timestamp(10000));
- stats1.merge(stats2);
- assertEquals(-10, typed.getMinimum().getTime());
- assertEquals(10000, typed.getMaximum().getTime());
- }
-
- @Test
- public void testDecimalMerge() throws Exception {
- TypeDescription schema = TypeDescription.createDecimal()
- .withPrecision(38).withScale(16);
-
- ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
- ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
- stats1.updateDecimal(new HiveDecimalWritable(10));
- stats1.updateDecimal(new HiveDecimalWritable(100));
- stats2.updateDecimal(new HiveDecimalWritable(1));
- stats2.updateDecimal(new HiveDecimalWritable(1000));
- stats1.merge(stats2);
- DecimalColumnStatistics typed = (DecimalColumnStatistics) stats1;
- assertEquals(1, typed.getMinimum().longValue());
- assertEquals(1000, typed.getMaximum().longValue());
- stats1.reset();
- stats1.updateDecimal(new HiveDecimalWritable(-10));
- stats1.updateDecimal(new HiveDecimalWritable(10000));
- stats1.merge(stats2);
- assertEquals(-10, typed.getMinimum().longValue());
- assertEquals(10000, typed.getMaximum().longValue());
- }
-
-
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- fs.setWorkingDirectory(workDir);
- testFilePath = new Path("TestOrcFile." + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- private static BytesWritable bytes(int... items) {
- BytesWritable result = new BytesWritable();
- result.setSize(items.length);
- for (int i = 0; i < items.length; ++i) {
- result.getBytes()[i] = (byte) items[i];
- }
- return result;
- }
-
- void appendRow(VectorizedRowBatch batch, BytesWritable bytes,
- String str) {
- int row = batch.size++;
- if (bytes == null) {
- batch.cols[0].noNulls = false;
- batch.cols[0].isNull[row] = true;
- } else {
- ((BytesColumnVector) batch.cols[0]).setVal(row, bytes.getBytes(),
- 0, bytes.getLength());
- }
- if (str == null) {
- batch.cols[1].noNulls = false;
- batch.cols[1].isNull[row] = true;
- } else {
- ((BytesColumnVector) batch.cols[1]).setVal(row, str.getBytes());
- }
- }
-
- @Test
- public void testHasNull() throws Exception {
- TypeDescription schema =
- TypeDescription.createStruct()
- .addField("bytes1", TypeDescription.createBinary())
- .addField("string1", TypeDescription.createString());
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .rowIndexStride(1000)
- .stripeSize(10000)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch(5000);
- // STRIPE 1
- // RG1
- for(int i=0; i<1000; i++) {
- appendRow(batch, bytes(1, 2, 3), "RG1");
- }
- writer.addRowBatch(batch);
- batch.reset();
- // RG2
- for(int i=0; i<1000; i++) {
- appendRow(batch, bytes(1, 2, 3), null);
- }
- writer.addRowBatch(batch);
- batch.reset();
- // RG3
- for(int i=0; i<1000; i++) {
- appendRow(batch, bytes(1, 2, 3), "RG3");
- }
- writer.addRowBatch(batch);
- batch.reset();
- // RG4
- for (int i = 0; i < 1000; i++) {
- appendRow(batch, bytes(1,2,3), null);
- }
- writer.addRowBatch(batch);
- batch.reset();
- // RG5
- for(int i=0; i<1000; i++) {
- appendRow(batch, bytes(1, 2, 3), null);
- }
- writer.addRowBatch(batch);
- batch.reset();
- // STRIPE 2
- for (int i = 0; i < 5000; i++) {
- appendRow(batch, bytes(1,2,3), null);
- }
- writer.addRowBatch(batch);
- batch.reset();
- // STRIPE 3
- for (int i = 0; i < 5000; i++) {
- appendRow(batch, bytes(1,2,3), "STRIPE-3");
- }
- writer.addRowBatch(batch);
- batch.reset();
- // STRIPE 4
- for (int i = 0; i < 5000; i++) {
- appendRow(batch, bytes(1,2,3), null);
- }
- writer.addRowBatch(batch);
- batch.reset();
- writer.close();
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
-
- // check the file level stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(20000, stats[0].getNumberOfValues());
- assertEquals(20000, stats[1].getNumberOfValues());
- assertEquals(7000, stats[2].getNumberOfValues());
- assertEquals(false, stats[0].hasNull());
- assertEquals(false, stats[1].hasNull());
- assertEquals(true, stats[2].hasNull());
-
- // check the stripe level stats
- List<StripeStatistics> stripeStats = reader.getStripeStatistics();
- // stripe 1 stats
- StripeStatistics ss1 = stripeStats.get(0);
- ColumnStatistics ss1_cs1 = ss1.getColumnStatistics()[0];
- ColumnStatistics ss1_cs2 = ss1.getColumnStatistics()[1];
- ColumnStatistics ss1_cs3 = ss1.getColumnStatistics()[2];
- assertEquals(false, ss1_cs1.hasNull());
- assertEquals(false, ss1_cs2.hasNull());
- assertEquals(true, ss1_cs3.hasNull());
-
- // stripe 2 stats
- StripeStatistics ss2 = stripeStats.get(1);
- ColumnStatistics ss2_cs1 = ss2.getColumnStatistics()[0];
- ColumnStatistics ss2_cs2 = ss2.getColumnStatistics()[1];
- ColumnStatistics ss2_cs3 = ss2.getColumnStatistics()[2];
- assertEquals(false, ss2_cs1.hasNull());
- assertEquals(false, ss2_cs2.hasNull());
- assertEquals(true, ss2_cs3.hasNull());
-
- // stripe 3 stats
- StripeStatistics ss3 = stripeStats.get(2);
- ColumnStatistics ss3_cs1 = ss3.getColumnStatistics()[0];
- ColumnStatistics ss3_cs2 = ss3.getColumnStatistics()[1];
- ColumnStatistics ss3_cs3 = ss3.getColumnStatistics()[2];
- assertEquals(false, ss3_cs1.hasNull());
- assertEquals(false, ss3_cs2.hasNull());
- assertEquals(false, ss3_cs3.hasNull());
-
- // stripe 4 stats
- StripeStatistics ss4 = stripeStats.get(3);
- ColumnStatistics ss4_cs1 = ss4.getColumnStatistics()[0];
- ColumnStatistics ss4_cs2 = ss4.getColumnStatistics()[1];
- ColumnStatistics ss4_cs3 = ss4.getColumnStatistics()[2];
- assertEquals(false, ss4_cs1.hasNull());
- assertEquals(false, ss4_cs2.hasNull());
- assertEquals(true, ss4_cs3.hasNull());
-
- // Test file dump
- PrintStream origOut = System.out;
- String outputFilename = "orc-file-has-null.out";
- FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
-
- // replace stdout and run command
- System.setOut(new PrintStream(myOut));
- FileDump.main(new String[]{testFilePath.toString(), "--rowindex=2"});
- System.out.flush();
- System.setOut(origOut);
- // If called with an expression evaluating to false, the test will halt
- // and be ignored.
- assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
- TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
- }
-}
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java b/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java
deleted file mode 100644
index 526dd81..0000000
--- a/orc/src/test/org/apache/orc/TestNewIntegerEncoding.java
+++ /dev/null
@@ -1,1373 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.io.File;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import com.google.common.collect.Lists;
-import com.google.common.primitives.Longs;
-
-@RunWith(value = Parameterized.class)
-public class TestNewIntegerEncoding {
-
- private OrcFile.EncodingStrategy encodingStrategy;
-
- public TestNewIntegerEncoding( OrcFile.EncodingStrategy es) {
- this.encodingStrategy = es;
- }
-
- @Parameters
- public static Collection<Object[]> data() {
- Object[][] data = new Object[][] { { OrcFile.EncodingStrategy.COMPRESSION },
- { OrcFile.EncodingStrategy.SPEED } };
- return Arrays.asList(data);
- }
-
- public static class TSRow {
- Timestamp ts;
-
- public TSRow(Timestamp ts) {
- this.ts = ts;
- }
- }
-
- public static TypeDescription getRowSchema() {
- return TypeDescription.createStruct()
- .addField("int1", TypeDescription.createInt())
- .addField("long1", TypeDescription.createLong());
- }
-
- public static void appendRow(VectorizedRowBatch batch,
- int int1, long long1) {
- int row = batch.size++;
- ((LongColumnVector) batch.cols[0]).vector[row] = int1;
- ((LongColumnVector) batch.cols[1]).vector[row] = long1;
- }
-
- public static void appendLong(VectorizedRowBatch batch,
- long long1) {
- int row = batch.size++;
- ((LongColumnVector) batch.cols[0]).vector[row] = long1;
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir", "target"
- + File.separator + "test" + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile."
- + testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testBasicRow() throws Exception {
- TypeDescription schema= getRowSchema();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- appendRow(batch, 111, 1111L);
- appendRow(batch, 111, 1111L);
- appendRow(batch, 111, 1111L);
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(111, ((LongColumnVector) batch.cols[0]).vector[r]);
- assertEquals(1111, ((LongColumnVector) batch.cols[1]).vector[r]);
- }
- }
- }
-
- @Test
- public void testBasicOld() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
- long[] inp = new long[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6,
- 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1,
- 2, 5, 1, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1,
- 9, 2, 6, 3, 7, 1, 9, 2, 6, 2000, 2, 1, 1, 1, 1, 1, 3, 7, 1, 9, 2, 6, 1,
- 1, 1, 1, 1 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .compress(CompressionKind.NONE)
- .version(OrcFile.Version.V_0_11)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- batch = reader.getSchema().createRowBatch();
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testBasicNew() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6,
- 7, 8, 9, 10, 1, 1, 1, 1, 1, 1, 10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1,
- 2, 5, 1, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1, 9, 2, 6, 3, 7, 1,
- 9, 2, 6, 3, 7, 1, 9, 2, 6, 2000, 2, 1, 1, 1, 1, 1, 3, 7, 1, 9, 2, 6, 1,
- 1, 1, 1, 1 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- batch = reader.getSchema().createRowBatch();
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testBasicDelta1() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { -500, -400, -350, -325, -310 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testBasicDelta2() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { -500, -600, -650, -675, -710 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testBasicDelta3() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { 500, 400, 350, 325, 310 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testBasicDelta4() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { 500, 600, 650, 675, 710 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testDeltaOverflow() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[]{4513343538618202719l, 4513343538618202711l,
- 2911390882471569739l,
- -9181829309989854913l};
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- for (Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile
- .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testDeltaOverflow2() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[]{Long.MAX_VALUE, 4513343538618202711l,
- 2911390882471569739l,
- Long.MIN_VALUE};
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- for (Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile
- .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testDeltaOverflow3() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[]{-4513343538618202711l, -2911390882471569739l, -2,
- Long.MAX_VALUE};
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(
- testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
- .compress(CompressionKind.NONE).bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- for (Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile
- .createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testIntegerMin() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- input.add((long) Integer.MIN_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testIntegerMax() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- input.add((long) Integer.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testLongMin() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- input.add(Long.MIN_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testLongMax() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- input.add(Long.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testRandomInt() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 100000; i++) {
- input.add((long) rand.nextInt());
- }
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(100000);
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testRandomLong() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 100000; i++) {
- input.add(rand.nextLong());
- }
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(100000);
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseNegativeMin() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
- 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
- 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
- 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
- 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, -13, 1, 2, 3,
- 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
- 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
- 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
- 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
- 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
- 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
- 2, 16 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseNegativeMin2() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
- 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
- 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
- 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
- 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, -1, 1, 2, 3,
- 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
- 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
- 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
- 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
- 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
- 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
- 2, 16 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseNegativeMin3() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { 20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2, 2,
- 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1,
- 1, 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1, 2, 2, 1, 1,
- 52, 4, 1, 2, 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6,
- 2, 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1, 33, 2, 0, 1, 2, 3,
- 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6, 15, 25, 1, 1, 1, 46, 2, 1, 1,
- 141, 3, 1, 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2, 2, 1, 9, 10, 2, 1, 4,
- 13, 1, 5, 4, 4, 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5, 2, 4, 1,
- 1, 2, 16, 1, 2966, 3, 1, 1, 25501, 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1,
- 2, 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3, 112, 13, 2, 2, 1, 5, 10, 3, 1,
- 1, 13, 2, 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2, 207, 1, 1, 2, 4, 3, 3, 2,
- 2, 16 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseNegativeMin4() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- long[] inp = new long[] { 13, 13, 11, 8, 13, 10, 10, 11, 11, 14, 11, 7, 13,
- 12, 12, 11, 15, 12, 12, 9, 8, 10, 13, 11, 8, 6, 5, 6, 11, 7, 15, 10, 7,
- 6, 8, 7, 9, 9, 11, 33, 11, 3, 7, 4, 6, 10, 14, 12, 5, 14, 7, 6 };
- List<Long> input = Lists.newArrayList(Longs.asList(inp));
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseAt0() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(0, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseAt1() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(1, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseAt255() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(255, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseAt256() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(256, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBase510() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(510, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBase511() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(100));
- }
- input.set(511, 20000L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseMax1() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for (int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(60));
- }
- input.set(511, Long.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for (Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseMax2() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for (int i = 0; i < 5120; i++) {
- input.add((long) rand.nextInt(60));
- }
- input.set(128, Long.MAX_VALUE);
- input.set(256, Long.MAX_VALUE);
- input.set(511, Long.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(5120);
- for (Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseMax3() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- input.add(371946367L);
- input.add(11963367L);
- input.add(68639400007L);
- input.add(100233367L);
- input.add(6367L);
- input.add(10026367L);
- input.add(3670000L);
- input.add(3602367L);
- input.add(4719226367L);
- input.add(7196367L);
- input.add(444442L);
- input.add(210267L);
- input.add(21033L);
- input.add(160267L);
- input.add(400267L);
- input.add(23634347L);
- input.add(16027L);
- input.add(46026367L);
- input.add(Long.MAX_VALUE);
- input.add(33333L);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for (Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseMax4() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- for (int i = 0; i < 25; i++) {
- input.add(371292224226367L);
- input.add(119622332222267L);
- input.add(686329400222007L);
- input.add(100233333222367L);
- input.add(636272333322222L);
- input.add(10202633223267L);
- input.add(36700222022230L);
- input.add(36023226224227L);
- input.add(47192226364427L);
- input.add(71963622222447L);
- input.add(22244444222222L);
- input.add(21220263327442L);
- input.add(21032233332232L);
- input.add(16026322232227L);
- input.add(40022262272212L);
- input.add(23634342227222L);
- input.add(16022222222227L);
- input.add(46026362222227L);
- input.add(46026362222227L);
- input.add(33322222222323L);
- }
- input.add(Long.MAX_VALUE);
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
- for (Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-
- @Test
- public void testPatchedBaseTimestamp() throws Exception {
- TypeDescription schema = TypeDescription.createStruct()
- .addField("ts", TypeDescription.createTimestamp());
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
-
- List<Timestamp> tslist = Lists.newArrayList();
- tslist.add(Timestamp.valueOf("2099-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1999-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1995-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2010-03-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1996-08-02 00:00:00"));
- tslist.add(Timestamp.valueOf("1998-11-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
- tslist.add(Timestamp.valueOf("1993-08-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2008-01-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2007-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
- tslist.add(Timestamp.valueOf("2003-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2008-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1994-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2004-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2001-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2000-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2000-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2006-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2011-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2002-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("2005-01-01 00:00:00"));
- tslist.add(Timestamp.valueOf("1974-01-01 00:00:00"));
- int idx = 0;
- for (Timestamp ts : tslist) {
- ((TimestampColumnVector) batch.cols[0]).set(batch.size++, ts);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(tslist.get(idx++),
- ((TimestampColumnVector) batch.cols[0]).asScratchTimestamp(r));
- }
- }
- }
-
- @Test
- public void testDirectLargeNegatives() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch();
-
- appendLong(batch, -7486502418706614742L);
- appendLong(batch, 0L);
- appendLong(batch, 1L);
- appendLong(batch, 1L);
- appendLong(batch, -5535739865598783616L);
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(5, batch.size);
- assertEquals(-7486502418706614742L,
- ((LongColumnVector) batch.cols[0]).vector[0]);
- assertEquals(0L,
- ((LongColumnVector) batch.cols[0]).vector[1]);
- assertEquals(1L,
- ((LongColumnVector) batch.cols[0]).vector[2]);
- assertEquals(1L,
- ((LongColumnVector) batch.cols[0]).vector[3]);
- assertEquals(-5535739865598783616L,
- ((LongColumnVector) batch.cols[0]).vector[4]);
- assertEquals(false, rows.nextBatch(batch));
- }
-
- @Test
- public void testSeek() throws Exception {
- TypeDescription schema = TypeDescription.createLong();
-
- List<Long> input = Lists.newArrayList();
- Random rand = new Random();
- for(int i = 0; i < 100000; i++) {
- input.add((long) rand.nextInt());
- }
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .compress(CompressionKind.NONE)
- .stripeSize(100000)
- .bufferSize(10000)
- .version(OrcFile.Version.V_0_11)
- .encodingStrategy(encodingStrategy));
- VectorizedRowBatch batch = schema.createRowBatch(100000);
- for(Long l : input) {
- appendLong(batch, l);
- }
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- int idx = 55555;
- rows.seekToRow(idx);
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(input.get(idx++).longValue(),
- ((LongColumnVector) batch.cols[0]).vector[r]);
- }
- }
- }
-}
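
For reference, a minimal, self-contained sketch of the write/read round trip that the deleted integer-encoding tests above exercise, using the same org.apache.orc writer/reader API shown in the diff; the file path and the values written below are illustrative, not taken from the tests:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.OrcFile;
    import org.apache.orc.Reader;
    import org.apache.orc.RecordReader;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.Writer;

    public class LongRoundTrip {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/long-round-trip.orc");  // illustrative path
        TypeDescription schema = TypeDescription.createLong();

        // Write a single batch of longs, appending values the way the tests do.
        Writer writer = OrcFile.createWriter(path,
            OrcFile.writerOptions(conf).setSchema(schema));
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector col = (LongColumnVector) batch.cols[0];
        for (long v = 0; v < 100; ++v) {
          col.vector[batch.size++] = v;
        }
        writer.addRowBatch(batch);
        writer.close();

        // Read the file back batch by batch and verify the values round-trip.
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        RecordReader rows = reader.rows();
        batch = reader.getSchema().createRowBatch();
        long expected = 0;
        while (rows.nextBatch(batch)) {
          LongColumnVector read = (LongColumnVector) batch.cols[0];
          for (int r = 0; r < batch.size; ++r) {
            if (read.vector[r] != expected++) {
              throw new AssertionError("unexpected value at row " + r);
            }
          }
        }
        rows.close();
      }
    }
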
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestOrcNullOptimization.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestOrcNullOptimization.java b/orc/src/test/org/apache/orc/TestOrcNullOptimization.java
deleted file mode 100644
index 0b605c9..0000000
--- a/orc/src/test/org/apache/orc/TestOrcNullOptimization.java
+++ /dev/null
@@ -1,415 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
-import java.util.Random;
-
-import junit.framework.Assert;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-
-import org.apache.orc.impl.RecordReaderImpl;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-
-import com.google.common.collect.Lists;
-
-public class TestOrcNullOptimization {
-
- TypeDescription createMyStruct() {
- return TypeDescription.createStruct()
- .addField("a", TypeDescription.createInt())
- .addField("b", TypeDescription.createString())
- .addField("c", TypeDescription.createBoolean())
- .addField("d", TypeDescription.createList(
- TypeDescription.createStruct()
- .addField("z", TypeDescription.createInt())));
- }
-
- void addRow(Writer writer, VectorizedRowBatch batch,
- Integer a, String b, Boolean c,
- Integer... d) throws IOException {
- if (batch.size == batch.getMaxSize()) {
- writer.addRowBatch(batch);
- batch.reset();
- }
- int row = batch.size++;
- LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
- BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
- LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
- ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
- StructColumnVector dStruct = (StructColumnVector) dColumn.child;
- LongColumnVector dInt = (LongColumnVector) dStruct.fields[0];
- if (a == null) {
- aColumn.noNulls = false;
- aColumn.isNull[row] = true;
- } else {
- aColumn.vector[row] = a;
- }
- if (b == null) {
- bColumn.noNulls = false;
- bColumn.isNull[row] = true;
- } else {
- bColumn.setVal(row, b.getBytes());
- }
- if (c == null) {
- cColumn.noNulls = false;
- cColumn.isNull[row] = true;
- } else {
- cColumn.vector[row] = c ? 1 : 0;
- }
- if (d == null) {
- dColumn.noNulls = false;
- dColumn.isNull[row] = true;
- } else {
- dColumn.offsets[row] = dColumn.childCount;
- dColumn.lengths[row] = d.length;
- dColumn.childCount += d.length;
- for(int e=0; e < d.length; ++e) {
- dInt.vector[(int) dColumn.offsets[row] + e] = d[e];
- }
- }
- }
-
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
-
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcNullOptimization." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @Test
- public void testMultiStripeWithNull() throws Exception {
- TypeDescription schema = createMyStruct();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000));
- Random rand = new Random(100);
- VectorizedRowBatch batch = schema.createRowBatch();
- addRow(writer, batch, null, null, true, 100);
- for (int i = 2; i < 20000; i++) {
- addRow(writer, batch, rand.nextInt(1), "a", true, 100);
- }
- addRow(writer, batch, null, null, true, 100);
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(20000, reader.getNumberOfRows());
- assertEquals(20000, stats[0].getNumberOfValues());
-
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
- assertEquals("count: 19998 hasNull: true min: 0 max: 0 sum: 0",
- stats[1].toString());
-
- assertEquals("a", ((StringColumnStatistics) stats[2]).getMaximum());
- assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
- assertEquals(19998, stats[2].getNumberOfValues());
- assertEquals("count: 19998 hasNull: true min: a max: a sum: 19998",
- stats[2].toString());
-
- // check the schema
- assertEquals("struct<a:int,b:string,c:boolean,d:array<struct<z:int>>>",
- reader.getSchema().toString());
-
- RecordReader rows = reader.rows();
-
- List<Boolean> expected = Lists.newArrayList();
- for (StripeInformation sinfo : reader.getStripes()) {
- expected.add(false);
- }
- // only the first and last stripe will have PRESENT stream
- expected.set(0, true);
- expected.set(expected.size() - 1, true);
-
- List<Boolean> got = Lists.newArrayList();
- // check whether the stripe footer contains a PRESENT stream
- for (StripeInformation sinfo : reader.getStripes()) {
- OrcProto.StripeFooter sf =
- ((RecordReaderImpl) rows).readStripeFooter(sinfo);
- got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
- != -1);
- }
- assertEquals(expected, got);
-
- batch = reader.getSchema().createRowBatch();
- LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
- BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
- LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
- ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
- LongColumnVector dElements =
- (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]);
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(1024, batch.size);
-
- // row 1
- assertEquals(true, aColumn.isNull[0]);
- assertEquals(true, bColumn.isNull[0]);
- assertEquals(1, cColumn.vector[0]);
- assertEquals(0, dColumn.offsets[0]);
- assertEquals(1, dColumn.lengths[0]);
- assertEquals(100, dElements.vector[0]);
-
- rows.seekToRow(19998);
- rows.nextBatch(batch);
- assertEquals(2, batch.size);
-
- // last-1 row
- assertEquals(0, aColumn.vector[0]);
- assertEquals("a", bColumn.toString(0));
- assertEquals(1, cColumn.vector[0]);
- assertEquals(0, dColumn.offsets[0]);
- assertEquals(1, dColumn.lengths[0]);
- assertEquals(100, dElements.vector[0]);
-
- // last row
- assertEquals(true, aColumn.isNull[1]);
- assertEquals(true, bColumn.isNull[1]);
- assertEquals(1, cColumn.vector[1]);
- assertEquals(1, dColumn.offsets[1]);
- assertEquals(1, dColumn.lengths[1]);
- assertEquals(100, dElements.vector[1]);
-
- assertEquals(false, rows.nextBatch(batch));
- rows.close();
- }
-
- @Test
- public void testMultiStripeWithoutNull() throws Exception {
- TypeDescription schema = createMyStruct();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .compress(CompressionKind.NONE)
- .bufferSize(10000));
- Random rand = new Random(100);
- VectorizedRowBatch batch = schema.createRowBatch();
- for (int i = 1; i < 20000; i++) {
- addRow(writer, batch, rand.nextInt(1), "a", true, 100);
- }
- addRow(writer, batch, 0, "b", true, 100);
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(20000, reader.getNumberOfRows());
- assertEquals(20000, stats[0].getNumberOfValues());
-
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMaximum());
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
- assertEquals(0, ((IntegerColumnStatistics) stats[1]).getSum());
- assertEquals("count: 20000 hasNull: false min: 0 max: 0 sum: 0",
- stats[1].toString());
-
- assertEquals("b", ((StringColumnStatistics) stats[2]).getMaximum());
- assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
- assertEquals(20000, stats[2].getNumberOfValues());
- assertEquals("count: 20000 hasNull: false min: a max: b sum: 20000",
- stats[2].toString());
-
- // check the schema
- Assert.assertEquals("struct<a:int,b:string,c:boolean,d:array<struct<z:int>>>",
- reader.getSchema().toString());
-
- RecordReader rows = reader.rows();
-
- // none of the stripes will have PRESENT stream
- List<Boolean> expected = Lists.newArrayList();
- for (StripeInformation sinfo : reader.getStripes()) {
- expected.add(false);
- }
-
- List<Boolean> got = Lists.newArrayList();
- // check whether the stripe footer contains a PRESENT stream
- for (StripeInformation sinfo : reader.getStripes()) {
- OrcProto.StripeFooter sf =
- ((RecordReaderImpl) rows).readStripeFooter(sinfo);
- got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
- != -1);
- }
- assertEquals(expected, got);
-
- rows.seekToRow(19998);
-
- batch = reader.getSchema().createRowBatch();
- LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
- BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
- LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
- ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
- LongColumnVector dElements =
- (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]);
-
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(2, batch.size);
-
- // last-1 row
- assertEquals(0, aColumn.vector[0]);
- assertEquals("a", bColumn.toString(0));
- assertEquals(1, cColumn.vector[0]);
- assertEquals(0, dColumn.offsets[0]);
- assertEquals(1, dColumn.lengths[0]);
- assertEquals(100, dElements.vector[0]);
-
- // last row
- assertEquals(0, aColumn.vector[1]);
- assertEquals("b", bColumn.toString(1));
- assertEquals(1, cColumn.vector[1]);
- assertEquals(1, dColumn.offsets[1]);
- assertEquals(1, dColumn.lengths[1]);
- assertEquals(100, dElements.vector[1]);
- rows.close();
- }
-
- @Test
- public void testColumnsWithNullAndCompression() throws Exception {
- TypeDescription schema = createMyStruct();
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf)
- .setSchema(schema)
- .stripeSize(100000)
- .bufferSize(10000));
- VectorizedRowBatch batch = schema.createRowBatch();
- addRow(writer, batch, 3, "a", true, 100);
- addRow(writer, batch, null, "b", true, 100);
- addRow(writer, batch, 3, null, false, 100);
- addRow(writer, batch, 3, "d", true, 100);
- addRow(writer, batch, 2, "e", true, 100);
- addRow(writer, batch, 2, "f", true, 100);
- addRow(writer, batch, 2, "g", true, 100);
- addRow(writer, batch, 2, "h", true, 100);
- writer.addRowBatch(batch);
- writer.close();
-
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- // check the stats
- ColumnStatistics[] stats = reader.getStatistics();
- assertEquals(8, reader.getNumberOfRows());
- assertEquals(8, stats[0].getNumberOfValues());
-
- assertEquals(3, ((IntegerColumnStatistics) stats[1]).getMaximum());
- assertEquals(2, ((IntegerColumnStatistics) stats[1]).getMinimum());
- assertEquals(true, ((IntegerColumnStatistics) stats[1]).isSumDefined());
- assertEquals(17, ((IntegerColumnStatistics) stats[1]).getSum());
- assertEquals("count: 7 hasNull: true min: 2 max: 3 sum: 17",
- stats[1].toString());
-
- assertEquals("h", ((StringColumnStatistics) stats[2]).getMaximum());
- assertEquals("a", ((StringColumnStatistics) stats[2]).getMinimum());
- assertEquals(7, stats[2].getNumberOfValues());
- assertEquals("count: 7 hasNull: true min: a max: h sum: 7",
- stats[2].toString());
-
- // check the schema
- batch = reader.getSchema().createRowBatch();
- LongColumnVector aColumn = (LongColumnVector) batch.cols[0];
- BytesColumnVector bColumn = (BytesColumnVector) batch.cols[1];
- LongColumnVector cColumn = (LongColumnVector) batch.cols[2];
- ListColumnVector dColumn = (ListColumnVector) batch.cols[3];
- LongColumnVector dElements =
- (LongColumnVector)(((StructColumnVector) dColumn.child).fields[0]);
- Assert.assertEquals("struct<a:int,b:string,c:boolean,d:array<struct<z:int>>>",
- reader.getSchema().toString());
-
- RecordReader rows = reader.rows();
- // only the last stripe will have a PRESENT stream
- List<Boolean> expected = Lists.newArrayList();
- for (StripeInformation sinfo : reader.getStripes()) {
- expected.add(false);
- }
- expected.set(expected.size() - 1, true);
-
- List<Boolean> got = Lists.newArrayList();
- // check whether the stripe footer contains a PRESENT stream
- for (StripeInformation sinfo : reader.getStripes()) {
- OrcProto.StripeFooter sf =
- ((RecordReaderImpl) rows).readStripeFooter(sinfo);
- got.add(sf.toString().indexOf(OrcProto.Stream.Kind.PRESENT.toString())
- != -1);
- }
- assertEquals(expected, got);
-
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(8, batch.size);
-
- // row 1
- assertEquals(3, aColumn.vector[0]);
- assertEquals("a", bColumn.toString(0));
- assertEquals(1, cColumn.vector[0]);
- assertEquals(0, dColumn.offsets[0]);
- assertEquals(1, dColumn.lengths[0]);
- assertEquals(100, dElements.vector[0]);
-
- // row 2
- assertEquals(true, aColumn.isNull[1]);
- assertEquals("b", bColumn.toString(1));
- assertEquals(1, cColumn.vector[1]);
- assertEquals(1, dColumn.offsets[1]);
- assertEquals(1, dColumn.lengths[1]);
- assertEquals(100, dElements.vector[1]);
-
- // row 3
- assertEquals(3, aColumn.vector[2]);
- assertEquals(true, bColumn.isNull[2]);
- assertEquals(0, cColumn.vector[2]);
- assertEquals(2, dColumn.offsets[2]);
- assertEquals(1, dColumn.lengths[2]);
- assertEquals(100, dElements.vector[2]);
-
- rows.close();
- }
-}
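
The addRow helper in the deleted TestOrcNullOptimization above shows the null-marking convention for vectorized batches: clear noNulls on the column and set isNull for the row, otherwise write the value; the PRESENT-stream checks then verify that stripes containing no nulls omit that stream. A minimal sketch of just the null-marking part, with illustrative class and method names:

    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.TypeDescription;

    public class NullMarkingSketch {
      // Write a possibly-null long into column `col` of the batch at row `row`.
      static void setNullableLong(VectorizedRowBatch batch, int col, int row, Long value) {
        LongColumnVector v = (LongColumnVector) batch.cols[col];
        if (value == null) {
          v.noNulls = false;      // the column now contains at least one null
          v.isNull[row] = true;   // this particular row is null
        } else {
          v.vector[row] = value;  // non-null rows just get the value
        }
      }

      public static void main(String[] args) {
        TypeDescription schema = TypeDescription.createStruct()
            .addField("a", TypeDescription.createInt());
        VectorizedRowBatch batch = schema.createRowBatch();
        setNullableLong(batch, 0, batch.size++, null);
        setNullableLong(batch, 0, batch.size++, 42L);
        System.out.println("rows in batch: " + batch.size);
      }
    }
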
http://git-wip-us.apache.org/repos/asf/hive/blob/df8921d8/orc/src/test/org/apache/orc/TestOrcTimezone1.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestOrcTimezone1.java b/orc/src/test/org/apache/orc/TestOrcTimezone1.java
deleted file mode 100644
index 72dc455..0000000
--- a/orc/src/test/org/apache/orc/TestOrcTimezone1.java
+++ /dev/null
@@ -1,189 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.orc;
-
-import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.assertNotNull;
-
-import java.io.File;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.TimeZone;
-
-import junit.framework.Assert;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TestName;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-
-import com.google.common.collect.Lists;
-
-/**
- * Tests ORC timestamp round trips when the writer and reader use different default time zones.
- */
-@RunWith(Parameterized.class)
-public class TestOrcTimezone1 {
- Path workDir = new Path(System.getProperty("test.tmp.dir",
- "target" + File.separator + "test" + File.separator + "tmp"));
- Configuration conf;
- FileSystem fs;
- Path testFilePath;
- String writerTimeZone;
- String readerTimeZone;
- static TimeZone defaultTimeZone = TimeZone.getDefault();
-
- public TestOrcTimezone1(String writerTZ, String readerTZ) {
- this.writerTimeZone = writerTZ;
- this.readerTimeZone = readerTZ;
- }
-
- @Parameterized.Parameters
- public static Collection<Object[]> data() {
- List<Object[]> result = Arrays.asList(new Object[][]{
- /* Extreme timezones */
- {"GMT-12:00", "GMT+14:00"},
- /* No difference in DST */
- {"America/Los_Angeles", "America/Los_Angeles"}, /* same timezone both with DST */
- {"Europe/Berlin", "Europe/Berlin"}, /* same as above but europe */
- {"America/Phoenix", "Asia/Kolkata"} /* Writer no DST, Reader no DST */,
- {"Europe/Berlin", "America/Los_Angeles"} /* Writer DST, Reader DST */,
- {"Europe/Berlin", "America/Chicago"} /* Writer DST, Reader DST */,
- /* With DST difference */
- {"Europe/Berlin", "UTC"},
- {"UTC", "Europe/Berlin"} /* Writer no DST, Reader DST */,
- {"America/Los_Angeles", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
- {"Europe/Berlin", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
- /* Timezone offsets for the reader has changed historically */
- {"Asia/Saigon", "Pacific/Enderbury"},
- {"UTC", "Asia/Jerusalem"},
-
- // NOTE:
- // "1995-01-01 03:00:00.688888888" this is not a valid time in Pacific/Enderbury timezone.
- // On 1995-01-01 00:00:00 GMT offset moved from -11:00 hr to +13:00 which makes all values
- // on 1995-01-01 invalid. Try this with joda time
- // new MutableDateTime("1995-01-01", DateTimeZone.forTimeZone(readerTimeZone));
- });
- return result;
- }
-
- @Rule
- public TestName testCaseName = new TestName();
-
- @Before
- public void openFileSystem() throws Exception {
- conf = new Configuration();
- fs = FileSystem.getLocal(conf);
- testFilePath = new Path(workDir, "TestOrcFile." +
- testCaseName.getMethodName() + ".orc");
- fs.delete(testFilePath, false);
- }
-
- @After
- public void restoreTimeZone() {
- TimeZone.setDefault(defaultTimeZone);
- }
-
- @Test
- public void testTimestampWriter() throws Exception {
- TypeDescription schema = TypeDescription.createTimestamp();
-
- TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
- Writer writer = OrcFile.createWriter(testFilePath,
- OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
- .bufferSize(10000));
- assertEquals(writerTimeZone, TimeZone.getDefault().getID());
- List<String> ts = Lists.newArrayList();
- ts.add("2003-01-01 01:00:00.000000222");
- ts.add("1996-08-02 09:00:00.723100809");
- ts.add("1999-01-01 02:00:00.999999999");
- ts.add("1995-01-02 03:00:00.688888888");
- ts.add("2002-01-01 04:00:00.1");
- ts.add("2010-03-02 05:00:00.000009001");
- ts.add("2005-01-01 06:00:00.000002229");
- ts.add("2006-01-01 07:00:00.900203003");
- ts.add("2003-01-01 08:00:00.800000007");
- ts.add("1998-11-02 10:00:00.857340643");
- ts.add("2008-10-02 11:00:00.0");
- ts.add("2037-01-01 00:00:00.000999");
- ts.add("2014-03-28 00:00:00.0");
- VectorizedRowBatch batch = schema.createRowBatch();
- TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
- for (String t : ts) {
- times.set(batch.size++, Timestamp.valueOf(t));
- }
- writer.addRowBatch(batch);
- writer.close();
-
- TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
- Reader reader = OrcFile.createReader(testFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- assertEquals(readerTimeZone, TimeZone.getDefault().getID());
- RecordReader rows = reader.rows();
- batch = reader.getSchema().createRowBatch();
- times = (TimestampColumnVector) batch.cols[0];
- int idx = 0;
- while (rows.nextBatch(batch)) {
- for(int r=0; r < batch.size; ++r) {
- assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
- }
- }
- rows.close();
- }
-
- @Test
- public void testReadTimestampFormat_0_11() throws Exception {
- TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
- Path oldFilePath = new Path(getClass().getClassLoader().
- getSystemResource("orc-file-11-format.orc").getPath());
- Reader reader = OrcFile.createReader(oldFilePath,
- OrcFile.readerOptions(conf).filesystem(fs));
- TypeDescription schema = reader.getSchema();
- int col = schema.getFieldNames().indexOf("ts");
- VectorizedRowBatch batch = schema.createRowBatch(10);
- TimestampColumnVector ts = (TimestampColumnVector) batch.cols[col];
-
- boolean[] include = new boolean[schema.getMaximumId() + 1];
- include[schema.getChildren().get(col).getId()] = true;
- RecordReader rows = reader.rows
- (new Reader.Options().include(include));
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
- ts.asScratchTimestamp(0));
-
- // check the contents of the last row
- rows.seekToRow(7499);
- assertEquals(true, rows.nextBatch(batch));
- assertEquals(1, batch.size);
- assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"),
- ts.asScratchTimestamp(0));
-
- // no more rows left; close the reader
- Assert.assertEquals(false, rows.nextBatch(batch));
- rows.close();
- }
-}
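
The deleted TestOrcTimezone1 above writes timestamps with one default time zone in effect and reads them back under another, expecting the same wall-clock values. A minimal sketch of that round trip, assuming the same org.apache.orc API; the path, time zones, and timestamp value are illustrative:

    import java.sql.Timestamp;
    import java.util.TimeZone;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.OrcFile;
    import org.apache.orc.Reader;
    import org.apache.orc.RecordReader;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.Writer;

    public class TimezoneRoundTrip {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/timezone-round-trip.orc");  // illustrative path
        TypeDescription schema = TypeDescription.createTimestamp();
        Timestamp value = Timestamp.valueOf("2003-01-01 01:00:00.000000222");

        // Write with the "writer" time zone as the JVM default.
        TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"));
        Writer writer = OrcFile.createWriter(path,
            OrcFile.writerOptions(conf).setSchema(schema));
        VectorizedRowBatch batch = schema.createRowBatch();
        ((TimestampColumnVector) batch.cols[0]).set(batch.size++, value);
        writer.addRowBatch(batch);
        writer.close();

        // Read with a different default time zone; the wall-clock value should round-trip.
        TimeZone.setDefault(TimeZone.getTimeZone("Asia/Kolkata"));
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        RecordReader rows = reader.rows();
        batch = reader.getSchema().createRowBatch();
        TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
        while (rows.nextBatch(batch)) {
          for (int r = 0; r < batch.size; ++r) {
            System.out.println(times.asScratchTimestamp(r));  // expect the written value
          }
        }
        rows.close();
      }
    }
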