Posted to commits@hive.apache.org by om...@apache.org on 2016/05/20 21:22:56 UTC

[18/27] hive git commit: HIVE-11417. Move the ReaderImpl and RowReaderImpl to the ORC module, by making shims for the row by row reader. (omalley reviewed by prasanth_j)

http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java b/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java
new file mode 100644
index 0000000..289a86e
--- /dev/null
+++ b/orc/src/test/org/apache/orc/impl/TestOrcWideTable.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+public class TestOrcWideTable {
+
+  @Test
+  public void testBufferSizeFor1Col() throws IOException {
+    assertEquals(128 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+        1, 128*1024));
+  }
+
+  @Test
+  public void testBufferSizeFor50Col() throws IOException {
+    assertEquals(256 * 1024, WriterImpl.getEstimatedBufferSize(256 * 1024 * 1024,
+        50, 256*1024));
+  }
+
+  @Test
+  public void testBufferSizeFor1000Col() throws IOException {
+    assertEquals(32 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+        1000, 128*1024));
+  }
+
+  @Test
+  public void testBufferSizeFor2000Col() throws IOException {
+    assertEquals(16 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+        2000, 256*1024));
+  }
+
+  @Test
+  public void testBufferSizeFor4000Col() throws IOException {
+    assertEquals(8 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+        4000, 256*1024));
+  }
+
+  @Test
+  public void testBufferSizeFor25000Col() throws IOException {
+    assertEquals(4 * 1024, WriterImpl.getEstimatedBufferSize(512 * 1024 * 1024,
+        25000, 256*1024));
+  }
+}
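
For reference, the sizes asserted above follow from a simple heuristic:
divide the stripe size by 20 times the column count (two large streams per
column, roughly ten buffers each), round the result up to the next supported
buffer size, and cap it at the requested default. A minimal sketch of that
arithmetic, assuming the 20x divisor and the 4K-256K size ladder implied by
the expected values (the shipped WriterImpl may differ in detail):

public class BufferSizeSketch {
  // Sketch only: stripeSize / (20 * numColumns), rounded up to the
  // 4K/8K/16K/32K/64K/128K/256K ladder and capped at the default.
  static int estimateBufferSize(long stripeSize, int numColumns,
                                int defaultBufferSize) {
    long estimated = stripeSize / (20L * numColumns);
    int closest = 256 * 1024;                  // ladder maximum
    for (int size = 4 * 1024; size <= 256 * 1024; size *= 2) {
      if (estimated <= size) {                 // round up to the next rung
        closest = size;
        break;
      }
    }
    return Math.min(closest, defaultBufferSize);
  }

  public static void main(String[] args) {
    // Mirrors testBufferSizeFor1000Col above: prints 32768 (32K).
    System.out.println(estimateBufferSize(512L * 1024 * 1024, 1000, 128 * 1024));
  }
}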

http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/impl/TestRLEv2.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestRLEv2.java b/orc/src/test/org/apache/orc/impl/TestRLEv2.java
new file mode 100644
index 0000000..e139619
--- /dev/null
+++ b/orc/src/test/org/apache/orc/impl/TestRLEv2.java
@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.tools.FileDump;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+public class TestRLEv2 {
+  Path workDir = new Path(System.getProperty("test.tmp.dir",
+      "target" + File.separator + "test" + File.separator + "tmp"));
+  Path testFilePath;
+  Configuration conf;
+  FileSystem fs;
+
+  @Rule
+  public TestName testCaseName = new TestName();
+
+  @Before
+  public void openFileSystem () throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    testFilePath = new Path(workDir, "TestRLEv2." +
+        testCaseName.getMethodName() + ".orc");
+    fs.delete(testFilePath, false);
+  }
+
+  void appendInt(VectorizedRowBatch batch, int i) {
+    ((LongColumnVector) batch.cols[0]).vector[batch.size++] = i;
+  }
+
+  @Test
+  public void testFixedDeltaZero() throws Exception {
+    TypeDescription schema = TypeDescription.createInt();
+    Writer w = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .compress(CompressionKind.NONE)
+            .setSchema(schema)
+            .rowIndexStride(0)
+            .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+            .version(OrcFile.Version.V_0_12)
+    );
+    VectorizedRowBatch batch = schema.createRowBatch(5120);
+    for (int i = 0; i < 5120; ++i) {
+      appendInt(batch, 123);
+    }
+    w.addRowBatch(batch);
+    w.close();
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toUri().toString()});
+    System.out.flush();
+    String outDump = new String(myOut.toByteArray());
+    // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 123,
+    // zigzag encoded varint) and 1 byte delta (delta = 0). In total, 5 bytes per run.
+    assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
+    System.setOut(origOut);
+  }
+
+  @Test
+  public void testFixedDeltaOne() throws Exception {
+    TypeDescription schema = TypeDescription.createInt();
+    Writer w = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .compress(CompressionKind.NONE)
+            .setSchema(schema)
+            .rowIndexStride(0)
+            .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+            .version(OrcFile.Version.V_0_12)
+    );
+    VectorizedRowBatch batch = schema.createRowBatch(5120);
+    for (int i = 0; i < 5120; ++i) {
+      appendInt(batch, i % 512);
+    }
+    w.addRowBatch(batch);
+    w.close();
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toUri().toString()});
+    System.out.flush();
+    String outDump = new String(myOut.toByteArray());
+    // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
+    // and 1 byte delta (delta = 1). In total, 4 bytes per run.
+    assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 40"));
+    System.setOut(origOut);
+  }
+
+  @Test
+  public void testFixedDeltaOneDescending() throws Exception {
+    TypeDescription schema = TypeDescription.createInt();
+    Writer w = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .compress(CompressionKind.NONE)
+            .setSchema(schema)
+            .rowIndexStride(0)
+            .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+            .version(OrcFile.Version.V_0_12)
+    );
+    VectorizedRowBatch batch = schema.createRowBatch(5120);
+    for (int i = 0; i < 5120; ++i) {
+      appendInt(batch, 512 - (i % 512));
+    }
+    w.addRowBatch(batch);
+    w.close();
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toUri().toString()});
+    System.out.flush();
+    String outDump = new String(myOut.toByteArray());
+    // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 512, zigzag + varint)
+    // and 1 byte delta (delta = -1, which zigzag encodes to 1 byte). In total, 5 bytes per run.
+    assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
+    System.setOut(origOut);
+  }
+
+  @Test
+  public void testFixedDeltaLarge() throws Exception {
+    TypeDescription schema = TypeDescription.createInt();
+    Writer w = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .compress(CompressionKind.NONE)
+            .setSchema(schema)
+            .rowIndexStride(0)
+            .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+            .version(OrcFile.Version.V_0_12)
+    );
+    VectorizedRowBatch batch = schema.createRowBatch(5120);
+    for (int i = 0; i < 5120; ++i) {
+      appendInt(batch, i % 512 + ((i % 512) * 100));
+    }
+    w.addRowBatch(batch);
+    w.close();
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toUri().toString()});
+    System.out.flush();
+    String outDump = new String(myOut.toByteArray());
+    // 10 runs of 512 elements. Each run has 2 bytes header, 1 byte base (base = 0)
+    // and 2 bytes delta (delta = 101, zigzag encoded varint). In total, 5 bytes per run.
+    assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 50"));
+    System.setOut(origOut);
+  }
+
+  @Test
+  public void testFixedDeltaLargeDescending() throws Exception {
+    TypeDescription schema = TypeDescription.createInt();
+    Writer w = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .compress(CompressionKind.NONE)
+            .setSchema(schema)
+            .rowIndexStride(0)
+            .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+            .version(OrcFile.Version.V_0_12)
+    );
+    VectorizedRowBatch batch = schema.createRowBatch(5120);
+    for (int i = 0; i < 5120; ++i) {
+      appendInt(batch, (512 - i % 512) + ((i % 512) * 100));
+    }
+    w.addRowBatch(batch);
+    w.close();
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toUri().toString()});
+    System.out.flush();
+    String outDump = new String(myOut.toByteArray());
+    // 10 runs of 512 elements. Each run has 2 bytes header, 2 bytes base (base = 512, zigzag + varint)
+    // and 2 bytes delta (delta = 99, zigzag encoded varint). In total, 6 bytes per run.
+    assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 60"));
+    System.setOut(origOut);
+  }
+
+  @Test
+  public void testShortRepeat() throws Exception {
+    TypeDescription schema = TypeDescription.createInt();
+    Writer w = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .compress(CompressionKind.NONE)
+            .setSchema(schema)
+            .rowIndexStride(0)
+            .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+            .version(OrcFile.Version.V_0_12)
+    );
+    VectorizedRowBatch batch = schema.createRowBatch(5120);
+    for (int i = 0; i < 5; ++i) {
+      appendInt(batch, 10);
+    }
+    w.addRowBatch(batch);
+    w.close();
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toUri().toString()});
+    System.out.flush();
+    String outDump = new String(myOut.toByteArray());
+    // 1 byte header + 1 byte value
+    assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 2"));
+    System.setOut(origOut);
+  }
+
+  @Test
+  public void testDeltaUnknownSign() throws Exception {
+    TypeDescription schema = TypeDescription.createInt();
+    Writer w = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .compress(CompressionKind.NONE)
+            .setSchema(schema)
+            .rowIndexStride(0)
+            .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+            .version(OrcFile.Version.V_0_12)
+    );
+    VectorizedRowBatch batch = schema.createRowBatch(5120);
+    appendInt(batch, 0);
+    for (int i = 0; i < 511; ++i) {
+      appendInt(batch, i);
+    }
+    w.addRowBatch(batch);
+    w.close();
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toUri().toString()});
+    System.out.flush();
+    String outDump = new String(myOut.toByteArray());
+    // Monotonicity is undetermined for the sequence 0,0,1,2,3,...,510, so DIRECT
+    // encoding is used: 2 bytes of header plus 640 bytes of data (512 values at a
+    // fixed width of 10 bits each, 5120/8 = 640). Total bytes 642.
+    assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 642"));
+    System.setOut(origOut);
+  }
+
+  @Test
+  public void testPatchedBase() throws Exception {
+    TypeDescription schema = TypeDescription.createInt();
+    Writer w = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .compress(CompressionKind.NONE)
+            .setSchema(schema)
+            .rowIndexStride(0)
+            .encodingStrategy(OrcFile.EncodingStrategy.COMPRESSION)
+            .version(OrcFile.Version.V_0_12)
+    );
+
+    Random rand = new Random(123);
+    VectorizedRowBatch batch = schema.createRowBatch(5120);
+    appendInt(batch, 10000000);
+    for (int i = 0; i < 511; ++i) {
+      appendInt(batch, rand.nextInt(i+1));
+    }
+    w.addRowBatch(batch);
+    w.close();
+
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toUri().toString()});
+    System.out.flush();
+    String outDump = new String(myOut.toByteArray());
+    // The 10000000 outlier should force PATCHED_BASE encoding.
+    assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 583"));
+    System.setOut(origOut);
+  }
+}
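
For reference, the per-run byte counts in the comments above come from
RLEv2's fixed-delta sub-format: a 2-byte header, the base value as a
zigzag-encoded varint, and the constant delta as a zigzag-encoded varint. A
small sketch of that length arithmetic, assuming standard zigzag and
base-128 varint rules (this is bookkeeping for the assertions, not the
encoder itself):

public class DeltaRunSizeSketch {
  // Zigzag maps signed to unsigned (0, -1, 1, -2, ... -> 0, 1, 2, 3, ...)
  // so small magnitudes of either sign stay small.
  static int zigzagVarintLen(long value) {
    long zz = (value << 1) ^ (value >> 63);
    int len = 1;
    while ((zz >>>= 7) != 0) {   // a varint carries 7 payload bits per byte
      ++len;
    }
    return len;
  }

  static int fixedDeltaRunLen(long base, long delta) {
    return 2 + zigzagVarintLen(base) + zigzagVarintLen(delta);
  }

  public static void main(String[] args) {
    System.out.println(fixedDeltaRunLen(123, 0));   // 5, testFixedDeltaZero
    System.out.println(fixedDeltaRunLen(0, 1));     // 4, testFixedDeltaOne
    System.out.println(fixedDeltaRunLen(512, -1));  // 5, ...OneDescending
    System.out.println(fixedDeltaRunLen(0, 101));   // 5, testFixedDeltaLarge
    System.out.println(fixedDeltaRunLen(512, 99));  // 6, ...LargeDescending
  }
}

Ten such runs per test give the 50/40/50/50/60 byte stream lengths asserted
against the FileDump output.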

http://git-wip-us.apache.org/repos/asf/hive/blob/ffb79509/orc/src/test/org/apache/orc/impl/TestReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/test/org/apache/orc/impl/TestReaderImpl.java b/orc/src/test/org/apache/orc/impl/TestReaderImpl.java
new file mode 100644
index 0000000..23d0dab
--- /dev/null
+++ b/orc/src/test/org/apache/orc/impl/TestReaderImpl.java
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2016 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.fs.Seekable;
+import org.apache.orc.FileFormatException;
+import org.apache.hadoop.io.Text;
+import org.apache.orc.OrcFile;
+import org.junit.Test;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.rules.ExpectedException;
+
+public class TestReaderImpl {
+
+  @Rule
+  public ExpectedException thrown = ExpectedException.none();
+
+  private final Path path = new Path("test-file.orc");
+  private FSDataInputStream in;
+  private int psLen;
+  private ByteBuffer buffer;
+
+  @Before
+  public void setup() {
+    in = null;
+  }
+
+  @Test
+  public void testEnsureOrcFooterSmallTextFile() throws IOException {
+    prepareTestCase("1".getBytes());
+    thrown.expect(FileFormatException.class);
+    ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+  }
+
+  @Test
+  public void testEnsureOrcFooterLargeTextFile() throws IOException {
+    prepareTestCase("This is Some Text File".getBytes());
+    thrown.expect(FileFormatException.class);
+    ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+  }
+
+  @Test
+  public void testEnsureOrcFooter011ORCFile() throws IOException {
+    prepareTestCase(composeContent(OrcFile.MAGIC, "FOOTER"));
+    ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+  }
+
+  @Test
+  public void testEnsureOrcFooterCorrectORCFooter() throws IOException {
+    prepareTestCase(composeContent("", OrcFile.MAGIC));
+    ReaderImpl.ensureOrcFooter(in, path, psLen, buffer);
+  }
+
+  private void prepareTestCase(byte[] bytes) {
+    buffer = ByteBuffer.wrap(bytes);
+    psLen = buffer.get(bytes.length - 1) & 0xff;
+    in = new FSDataInputStream(new SeekableByteArrayInputStream(bytes));
+  }
+
+  private byte[] composeContent(String headerStr, String footerStr) throws CharacterCodingException {
+    ByteBuffer header = Text.encode(headerStr);
+    ByteBuffer footer = Text.encode(footerStr);
+    int headerLen = header.remaining();
+    int footerLen = footer.remaining() + 1;
+
+    ByteBuffer buf = ByteBuffer.allocate(headerLen + footerLen);
+
+    buf.put(header);
+    buf.put(footer);
+    buf.put((byte) footerLen);
+    return buf.array();
+  }
+
+  private static final class SeekableByteArrayInputStream extends ByteArrayInputStream
+          implements Seekable, PositionedReadable {
+
+    public SeekableByteArrayInputStream(byte[] buf) {
+      super(buf);
+    }
+
+    @Override
+    public void seek(long pos) throws IOException {
+      this.reset();
+      this.skip(pos);
+    }
+
+    @Override
+    public long getPos() throws IOException {
+      return pos;
+    }
+
+    @Override
+    public boolean seekToNewSource(long targetPos) throws IOException {
+      return false;
+    }
+
+    @Override
+    public int read(long position, byte[] buffer, int offset, int length)
+            throws IOException {
+      long oldPos = getPos();
+      int nread = -1;
+      try {
+        seek(position);
+        nread = read(buffer, offset, length);
+      } finally {
+        seek(oldPos);
+      }
+      return nread;
+    }
+
+    @Override
+    public void readFully(long position, byte[] buffer, int offset, int length)
+            throws IOException {
+      int nread = 0;
+      while (nread < length) {
+        int nbytes = read(position + nread, buffer, offset + nread, length - nread);
+        if (nbytes < 0) {
+          throw new EOFException("End of file reached before reading fully.");
+        }
+        nread += nbytes;
+      }
+    }
+
+    @Override
+    public void readFully(long position, byte[] buffer)
+            throws IOException {
+      readFully(position, buffer, 0, buffer.length);
+    }
+  }
+}
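
For reference, composeContent above mimics the ORC file tail that
ensureOrcFooter validates: the last byte of the file is the postscript
length, the magic string sits immediately before that byte in modern files,
and pre-0.11 files instead carry the magic at offset 0 (hence the
testEnsureOrcFooter011ORCFile case). A minimal sketch of that check over an
in-memory byte array, assuming only the layout these tests exercise:

import java.nio.charset.StandardCharsets;

public class OrcTailCheckSketch {
  private static final byte[] MAGIC = "ORC".getBytes(StandardCharsets.UTF_8);

  // Sketch only: accepts the two magic positions the tests exercise --
  // immediately before the trailing postscript-length byte (modern files)
  // or at offset 0 (files written before ORC 0.11).
  static boolean looksLikeOrc(byte[] file) {
    if (file.length < MAGIC.length + 1) {
      return false;                      // cannot hold magic + psLen byte
    }
    return matchesAt(file, file.length - 1 - MAGIC.length)
        || matchesAt(file, 0);
  }

  private static boolean matchesAt(byte[] buf, int off) {
    for (int i = 0; i < MAGIC.length; ++i) {
      if (buf[off + i] != MAGIC[i]) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    System.out.println(looksLikeOrc("1".getBytes(StandardCharsets.UTF_8))); // false
    byte[] tail = {'O', 'R', 'C', 4};    // like composeContent("", "ORC")
    System.out.println(looksLikeOrc(tail));                                 // true
  }
}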