You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2017/02/18 16:51:17 UTC

orc git commit: ORC-144. Implement test cases and fix the documentation for RLEv2. (Douglas Dinka via omalley)

Repository: orc
Updated Branches:
  refs/heads/master fbf0b71d3 -> 6c603c22c


ORC-144. Implement test cases and fix the documentation for RLEv2. (Douglas
Dinka via omalley)

Fixes #93

Signed-off-by: Owen O'Malley <om...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/6c603c22
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/6c603c22
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/6c603c22

Branch: refs/heads/master
Commit: 6c603c22c86a92f1f160e08357220a69ee7748a0
Parents: fbf0b71
Author: Owen O'Malley <om...@apache.org>
Authored: Fri Feb 17 09:58:18 2017 -0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Sat Feb 18 08:49:37 2017 -0800

----------------------------------------------------------------------
 .../src/test/org/apache/orc/impl/TestRLEv2.java | 82 ++++++++++++++++++++
 site/_docs/run-length.md                        | 18 +++--
 2 files changed, 92 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/6c603c22/java/tools/src/test/org/apache/orc/impl/TestRLEv2.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/impl/TestRLEv2.java b/java/tools/src/test/org/apache/orc/impl/TestRLEv2.java
index 56386c1..6558023 100644
--- a/java/tools/src/test/org/apache/orc/impl/TestRLEv2.java
+++ b/java/tools/src/test/org/apache/orc/impl/TestRLEv2.java
@@ -21,7 +21,11 @@ import static org.junit.Assert.assertEquals;
 
 import java.io.ByteArrayOutputStream;
 import java.io.File;
+import java.io.IOException;
 import java.io.PrintStream;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Random;
 
 import org.apache.hadoop.conf.Configuration;
@@ -29,8 +33,10 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.CompressionCodec;
 import org.apache.orc.CompressionKind;
 import org.apache.orc.OrcFile;
+import org.apache.orc.PhysicalWriter;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.Writer;
 import org.apache.orc.tools.FileDump;
@@ -304,4 +310,80 @@ public class TestRLEv2 {
     assertEquals(true, outDump.contains("Stream: column 0 section DATA start: 3 length 583"));
     System.setOut(origOut);
   }
+
+  static class TestOutputCatcher implements PhysicalWriter.OutputReceiver { // test receiver that records every buffer passed to output()
+    int currentBuffer = 0; // index into buffers of the buffer currently being read back
+    List<ByteBuffer> buffers = new ArrayList<ByteBuffer>(); // received buffers, in arrival order
+
+    @Override
+    public void output(ByteBuffer buffer) throws IOException {
+      buffers.add(buffer); // stores the reference itself; no copy is made
+    }
+
+    @Override
+    public void suppress() { // does nothing: this receiver ignores suppress requests
+    }
+
+    ByteBuffer getCurrentBuffer() { // returns the next buffer that still has unread bytes, or null when all are exhausted
+      while (currentBuffer < buffers.size() &&
+          buffers.get(currentBuffer).remaining() == 0) {
+        currentBuffer += 1; // skip buffers that have been fully consumed
+      }
+      return currentBuffer < buffers.size() ? buffers.get(currentBuffer) : null;
+    }
+
+    // Asserts that the captured output equals the expected ints (each narrowed to a byte), in order, with no unread bytes left over.
+    public void compareBytes(int... expected) {
+      for(int i=0; i < expected.length; ++i) {
+        ByteBuffer current = getCurrentBuffer(); // NOTE(review): null if the output is shorter than expected, so the get() below throws NullPointerException rather than failing an assertion
+        assertEquals("position " + i, (byte) expected[i], current.get()); // get() advances the buffer, so this comparison consumes the captured output
+      }
+      assertEquals(null, getCurrentBuffer()); // any remaining unread byte means the output was longer than expected
+    }
+  }
+
+  static TestOutputCatcher encodeV2(long[] input, // values written to the RLEv2 writer, in order
+                                    boolean signed) throws IOException { // forwarded unchanged to the RunLengthIntegerWriterV2 constructor
+    TestOutputCatcher catcher = new TestOutputCatcher();
+    RunLengthIntegerWriterV2 writer =
+        new RunLengthIntegerWriterV2(new OutStream("test", 10000, null, // NOTE(review): 10000 is presumably the buffer size and null "no compression codec" - confirm against OutStream
+            catcher), signed);
+    for(long x: input) {
+      writer.write(x);
+    }
+    writer.flush(); // presumably pushes any pending encoded bytes through the OutStream to the catcher - confirm
+    return catcher; // callers inspect the captured bytes via compareBytes()
+  }
+
+  @Test
+  public void testShortRepeatExample() throws Exception { // five identical values must encode to exactly three bytes
+    long[] input = {10000, 10000, 10000, 10000, 10000};
+    TestOutputCatcher output = encodeV2(input, false); // signed = false
+    output.compareBytes(0x0a, 0x27, 0x10); // 0x2710 == 10000; 0x0a is presumably the short-repeat header - confirm against the RLEv2 spec
+  }
+
+  @Test
+  public void testDirectExample() throws Exception { // four values must encode to exactly ten bytes
+    long[] input = {23713, 43806, 57005, 48879}; // 0x5ca1, 0xab1e, 0xdead, 0xbeef
+    TestOutputCatcher output = encodeV2(input, false); // signed = false
+    output.compareBytes(0x5e, 0x03, 0x5c, 0xa1, 0xab, 0x1e, 0xde, 0xad, 0xbe, // 0x5e 0x03 is presumably the direct-encoding header - confirm against the RLEv2 spec
+        0xef); // the last eight bytes are the four inputs as 16-bit big-endian values
+  }
+
+  @Test
+  public void testPatchedBaseExample() throws Exception { // same input and expected bytes as the patched base example in site/_docs/run-length.md
+    long[] input = {2030, 2000, 2020, 1000000, 2040, 2050, 2060, 2070, 2080, // 20 values in total; 1000000 is the single large outlier
+        2090, 2100, 2110, 2120, 2130, 2140, 2150, 2160, 2170, 2180, 2190};
+    TestOutputCatcher output = encodeV2(input, false); // signed = false
+    output.compareBytes(0x8e, 0x13, 0x2b, 0x21, 0x07, 0xd0, 0x1e, 0x00, 0x14, // 28 expected bytes in total; 0x07d0 == 2000, the minimum of the input
+        0x70, 0x28, 0x32, 0x3c, 0x46, 0x50, 0x5a, 0x64, 0x6e, 0x78, 0x82, 0x8c,
+        0x96, 0xa0, 0xaa, 0xb4, 0xbe, 0xfc, 0xe8);
+  }
+
+  @Test
+  public void testDeltaExample() throws Exception { // ten values must encode to exactly eight bytes
+    long[] input = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29}; // the first ten primes
+    TestOutputCatcher output = encodeV2(input, false); // signed = false
+    output.compareBytes(0xc6, 0x09, 0x02, 0x02, 0x22, 0x42, 0x42, 0x46); // presumably header, base value, first delta, then packed deltas - confirm against the RLEv2 spec
+  }
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/6c603c22/site/_docs/run-length.md
----------------------------------------------------------------------
diff --git a/site/_docs/run-length.md b/site/_docs/run-length.md
index 0566feb..699b713 100644
--- a/site/_docs/run-length.md
+++ b/site/_docs/run-length.md
@@ -201,15 +201,17 @@ the index values and the additional value bits.
   combined length of each patch (PGW + PW) must be less or equal to
   64.
 
-The unsigned sequence of [2030, 2000, 2020, 1000000, 2040, 2050, 2060,
-2070, 2080, 2090] has a minimum of 2000, which makes the adjusted
-sequence [30, 0, 20, 998000, 40, 50, 60, 70, 80, 90]. It has an
-encoding of patched base (2), a bit width of 8 (7), a length of 10
-(9), a base value width of 2 bytes (1), a patch width of 12 bits (11),
+The unsigned sequence of [2030, 2000, 2020, 1000000, 2040, 2050, 2060, 2070,
+2080, 2090, 2100, 2110, 2120, 2130, 2140, 2150, 2160, 2170, 2180, 2190]
+has a minimum of 2000, which makes the adjusted
+sequence [30, 0, 20, 998000, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
+150, 160, 170, 180, 190]. It has an
+encoding of patched base (2), a bit width of 8 (7), a length of 20
+(19), a base value width of 2 bytes (1), a patch width of 12 bits (11),
 patch gap width of 2 bits (1), and a patch list length of 1 (1). The
-base value is 2000 and the combined result is [0x8e, 0x09, 0x2b, 0x21,
-0x07, 0xd0, 0x1e, 0x00, 0x14, 0x70, 0x28, 0x32, 0x3c, 0x46, 0x50,
-0x5a, 0xfc, 0xe8]
+base value is 2000 and the combined result is [0x8e, 0x13, 0x2b, 0x21, 0x07,
+0xd0, 0x1e, 0x00, 0x14, 0x70, 0x28, 0x32, 0x3c, 0x46, 0x50, 0x5a, 0x64, 0x6e,
+0x78, 0x82, 0x8c, 0x96, 0xa0, 0xaa, 0xb4, 0xbe, 0xfc, 0xe8]
 
 ## Delta