You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by om...@apache.org on 2016/05/20 21:22:58 UTC
[20/27] hive git commit: HIVE-11417. Move the ReaderImpl and
RowReaderImpl to the ORC module,
by making shims for the row by row reader. (omalley reviewed by prasanth_j)
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestStringDictionary.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestStringDictionary.java b/orc/src/test/org/apache/orc/TestStringDictionary.java
new file mode 100644
index 0000000..46209bb
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestStringDictionary.java
@@ -0,0 +1,290 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import org.apache.orc.impl.RecordReaderImpl;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+public class TestStringDictionary {
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
+ + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before // JUnit 4 runs this before every test method in the class
+ public void openFileSystem() throws Exception {
+ conf = new Configuration(); // fresh per test, so conf changes made by one test do not carry over
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false); // drop any file left by an earlier run; false = non-recursive
+ }
+
+ /** Writes 20000 distinct strings and asserts that every column encoding is DIRECT_V2. */
+ @Test
+ public void testTooManyDistinct() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector col = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ col.setVal(batch.size++, String.valueOf(i).getBytes());
+ }
+ writer.addRowBatch(batch); // flush whatever remains in the batch
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ col = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(idx++), col.toString(r));
+ }
+ }
+ assertEquals(20000, idx); // an empty or truncated read must fail, not pass vacuously
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+ // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
+ }
+ }
+ rows.close(); // release the record reader once the stripe footers have been read
+ }
+
+ /** Writes 20000 strings drawn from 10000 possible values and asserts DICTIONARY_V2 encoding. */
+ @Test
+ public void testHalfDistinct() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE).bufferSize(10000));
+ Random rand = new Random(123); // fixed seed keeps the generated input reproducible
+ int[] input = new int[20000];
+ for (int i = 0; i < 20000; i++) {
+ input[i] = rand.nextInt(10000);
+ }
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector col = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ col.setVal(batch.size++, String.valueOf(input[i]).getBytes());
+ }
+ writer.addRowBatch(batch); // flush whatever remains in the batch
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ col = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(input[idx++]), col.toString(r));
+ }
+ }
+ assertEquals(input.length, idx); // an empty or truncated read must fail, not pass vacuously
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+ // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
+ }
+ }
+ rows.close(); // release the record reader once the stripe footers have been read
+ }
+
+ /** 20000 distinct strings with ROW_INDEX_STRIDE_DICTIONARY_CHECK set to false; asserts DIRECT_V2. */
+ @Test
+ public void testTooManyDistinctCheckDisabled() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+ conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), false);
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector string = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ string.setVal(batch.size++, String.valueOf(i).getBytes());
+ }
+ writer.addRowBatch(batch); // flush whatever remains in the batch
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ string = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(idx++), string.toString(r));
+ }
+ }
+ assertEquals(20000, idx); // an empty or truncated read must fail, not pass vacuously
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+ // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DIRECT_V2, encoding.getKind());
+ }
+ }
+ rows.close(); // release the record reader once the stripe footers have been read
+ }
+
+ /** Half-distinct input with ROW_INDEX_STRIDE_DICTIONARY_CHECK set to false; asserts DICTIONARY_V2. */
+ @Test
+ public void testHalfDistinctCheckDisabled() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ conf.setBoolean(OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getAttribute(), false);
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE).bufferSize(10000));
+ Random rand = new Random(123); // fixed seed keeps the generated input reproducible
+ int[] input = new int[20000];
+ for (int i = 0; i < 20000; i++) {
+ input[i] = rand.nextInt(10000);
+ }
+
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector string = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ string.setVal(batch.size++, String.valueOf(input[i]).getBytes());
+ }
+ writer.addRowBatch(batch); // flush whatever remains in the batch
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ string = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(input[idx++]), string.toString(r));
+ }
+ }
+ assertEquals(input.length, idx); // an empty or truncated read must fail, not pass vacuously
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+ // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY_V2, encoding.getKind());
+ }
+ }
+ rows.close(); // release the record reader once the stripe footers have been read
+ }
+
+ /** Writes 20000 distinct strings with file version V_0_11 and asserts DICTIONARY encoding. */
+ @Test
+ public void testTooManyDistinctV11AlwaysDictionary() throws Exception {
+ TypeDescription schema = TypeDescription.createString();
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).compress(CompressionKind.NONE)
+ .version(OrcFile.Version.V_0_11).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector string = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 20000; i++) {
+ if (batch.size == batch.getMaxSize()) {
+ writer.addRowBatch(batch);
+ batch.reset();
+ }
+ string.setVal(batch.size++, String.valueOf(i).getBytes());
+ }
+ writer.addRowBatch(batch); // flush whatever remains in the batch
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ batch = reader.getSchema().createRowBatch();
+ string = (BytesColumnVector) batch.cols[0];
+ RecordReader rows = reader.rows();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(String.valueOf(idx++), string.toString(r));
+ }
+ }
+ assertEquals(20000, idx); // an empty or truncated read must fail, not pass vacuously
+
+ // make sure the encoding type is correct
+ for (StripeInformation stripe : reader.getStripes()) {
+ // hacky but does the job, this casting will work as long as this test resides
+ // within the same package as ORC reader
+ OrcProto.StripeFooter footer = ((RecordReaderImpl) rows).readStripeFooter(stripe);
+ for (int i = 0; i < footer.getColumnsCount(); ++i) {
+ OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+ assertEquals(OrcProto.ColumnEncoding.Kind.DICTIONARY, encoding.getKind());
+ }
+ }
+ rows.close(); // release the record reader once the stripe footers have been read
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestTypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestTypeDescription.java b/orc/src/test/org/apache/orc/TestTypeDescription.java
new file mode 100644
index 0000000..0ac1e64
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestTypeDescription.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.orc.TypeDescription;
+import org.junit.Test;
+
+public class TestTypeDescription {
+
+ @Test // checks toJson() and toString() for a binary type, a flat struct and a nested struct
+ public void testJson() {
+ TypeDescription bin = TypeDescription.createBinary(); // simplest case: one primitive type
+ assertEquals("{\"category\": \"binary\", \"id\": 0, \"max\": 0}",
+ bin.toJson());
+ assertEquals("binary", bin.toString());
+ TypeDescription struct = TypeDescription.createStruct() // flat struct of three primitive fields
+ .addField("f1", TypeDescription.createInt())
+ .addField("f2", TypeDescription.createString())
+ .addField("f3", TypeDescription.createDecimal());
+ assertEquals("struct<f1:int,f2:string,f3:decimal(38,10)>",
+ struct.toString()); // createDecimal() with no further settings is expected to print as decimal(38,10)
+ assertEquals("{\"category\": \"struct\", \"id\": 0, \"max\": 3, \"fields\": [\n"
+ + " \"f1\": {\"category\": \"int\", \"id\": 1, \"max\": 1},\n"
+ + " \"f2\": {\"category\": \"string\", \"id\": 2, \"max\": 2},\n"
+ + " \"f3\": {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 38, \"scale\": 10}]}",
+ struct.toJson());
+ struct = TypeDescription.createStruct() // nested case: a union child, a struct child and a char(100)
+ .addField("f1", TypeDescription.createUnion()
+ .addUnionChild(TypeDescription.createByte())
+ .addUnionChild(TypeDescription.createDecimal()
+ .withPrecision(20).withScale(10)))
+ .addField("f2", TypeDescription.createStruct()
+ .addField("f3", TypeDescription.createDate())
+ .addField("f4", TypeDescription.createDouble())
+ .addField("f5", TypeDescription.createBoolean()))
+ .addField("f6", TypeDescription.createChar().withMaxLength(100));
+ assertEquals("struct<f1:uniontype<tinyint,decimal(20,10)>,f2:struct<f3:date,f4:double,f5:boolean>,f6:char(100)>",
+ struct.toString());
+ assertEquals(
+ "{\"category\": \"struct\", \"id\": 0, \"max\": 8, \"fields\": [\n" +
+ " \"f1\": {\"category\": \"uniontype\", \"id\": 1, \"max\": 3, \"children\": [\n" +
+ " {\"category\": \"tinyint\", \"id\": 2, \"max\": 2},\n" +
+ " {\"category\": \"decimal\", \"id\": 3, \"max\": 3, \"precision\": 20, \"scale\": 10}]},\n" +
+ " \"f2\": {\"category\": \"struct\", \"id\": 4, \"max\": 7, \"fields\": [\n" +
+ " \"f3\": {\"category\": \"date\", \"id\": 5, \"max\": 5},\n" +
+ " \"f4\": {\"category\": \"double\", \"id\": 6, \"max\": 6},\n" +
+ " \"f5\": {\"category\": \"boolean\", \"id\": 7, \"max\": 7}]},\n" +
+ " \"f6\": {\"category\": \"char\", \"id\": 8, \"max\": 8, \"length\": 100}]}",
+ struct.toJson()); // in the expected JSON, "max" appears to be the largest id inside each type's subtree
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/TestUnrolledBitPack.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/TestUnrolledBitPack.java b/orc/src/test/org/apache/orc/TestUnrolledBitPack.java
new file mode 100644
index 0000000..ef8fcd0
--- /dev/null
+++ b/orc/src/test/org/apache/orc/TestUnrolledBitPack.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import com.google.common.collect.Lists;
+import com.google.common.primitives.Longs;
+
+@RunWith(value = Parameterized.class)
+public class TestUnrolledBitPack {
+
+ private long val;
+
+ public TestUnrolledBitPack(long val) { // receives one parameter value from data() per run
+ this.val = val; // the non-zero value that testBitPacking interleaves with zeros
+ }
+
+ @Parameters // supplies the parameter sets: one Object[] per run, each holding the single constructor argument
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { -1 }, { 1 }, { 7 }, { -128 }, { 32000 }, { 8300000 },
+ { Integer.MAX_VALUE }, { 540000000000L }, { 140000000000000L }, { 36000000000000000L },
+ { Long.MAX_VALUE } }; // magnitudes grow step by step, presumably to hit different bit widths (confirm)
+ return Arrays.asList(data); // NOTE(review): int literals box to Integer; relies on the runner widening them to long
+ }
+
+ Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
+ + File.separator + "tmp"));
+
+ Configuration conf;
+ FileSystem fs;
+ Path testFilePath;
+
+ @Rule
+ public TestName testCaseName = new TestName();
+
+ @Before // JUnit 4 runs this before every test method in the class
+ public void openFileSystem() throws Exception {
+ conf = new Configuration(); // fresh configuration for each test run
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
+ fs.delete(testFilePath, false); // drop any file left by an earlier run; false = non-recursive
+ }
+
+ /** Round-trips a fixed pattern of {@code val} and zeros through an ORC long column. */
+ @Test
+ public void testBitPacking() throws Exception {
+ TypeDescription schema = TypeDescription.createLong();
+
+ long[] inp = new long[] { val, 0, val, val, 0, val, 0, val, val, 0, val, 0, val, val, 0, 0,
+ val, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val,
+ 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0,
+ 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0, val, 0, val, 0, 0, val, 0,
+ val, 0, val, 0, 0, val, 0, val, 0, 0, val, val };
+ List<Long> input = Lists.newArrayList(Longs.asList(inp));
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .compress(CompressionKind.NONE).bufferSize(10000));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (Long l : input) {
+ ((LongColumnVector) batch.cols[0]).vector[batch.size++] = l;
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals(input.get(idx++).longValue(), ((LongColumnVector) batch.cols[0]).vector[r]);
+ }
+ }
+ assertEquals(input.size(), idx); // an empty or truncated read must fail, not pass vacuously
+ rows.close(); // release the record reader now that all rows have been checked
+ }
+
+}