You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by cu...@apache.org on 2013/03/28 00:42:31 UTC

svn commit: r1461896 - in /avro/trunk: ./ lang/java/trevni/core/src/main/java/org/apache/trevni/ lang/java/trevni/doc/apt/

Author: cutting
Date: Wed Mar 27 23:42:30 2013
New Revision: 1461896

URL: http://svn.apache.org/r1461896
Log:
AVRO-1259. Java: Improve Trevni's encoding of sparse columns.

Modified:
    avro/trunk/CHANGES.txt
    avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ArrayColumnOutputBuffer.java
    avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileReader.java
    avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileWriter.java
    avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnOutputBuffer.java
    avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/InputBuffer.java
    avro/trunk/lang/java/trevni/doc/apt/spec.apt

Modified: avro/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/avro/trunk/CHANGES.txt?rev=1461896&r1=1461895&r2=1461896&view=diff
==============================================================================
--- avro/trunk/CHANGES.txt (original)
+++ avro/trunk/CHANGES.txt Wed Mar 27 23:42:30 2013
@@ -19,6 +19,9 @@ Trunk (not yet released)
     types BigDecimal, BigInteger, URI, URL, Date and File can now be
     fields in generated classes. (Alexandre Normand and cutting)
 
+    AVRO-1259. Java: Improve Trevni's encoding of sparse columns.
+    (cutting)
+
   BUG FIXES
 
 Avro 1.7.4 (22 February 2012)

Modified: avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ArrayColumnOutputBuffer.java
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ArrayColumnOutputBuffer.java?rev=1461896&r1=1461895&r2=1461896&view=diff
==============================================================================
--- avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ArrayColumnOutputBuffer.java (original)
+++ avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ArrayColumnOutputBuffer.java Wed Mar 27 23:42:30 2013
@@ -23,6 +23,11 @@ import java.io.IOException;
 class ArrayColumnOutputBuffer extends ColumnOutputBuffer {
   private int length;                             // remaining in current array
 
+  private static final int NONE = -1;
+
+  private int runLength;                          // length of current run
+  private int runValue = NONE;                    // what kind of run
+
   public ArrayColumnOutputBuffer(ColumnFileWriter writer, ColumnMetaData meta)
     throws IOException {
     super(writer, meta);
@@ -30,16 +35,47 @@ class ArrayColumnOutputBuffer extends Co
     assert !getMeta().hasIndexValues();
   }
 
-  @Override public void writeLength(int length) throws IOException {
+  @Override public void writeLength(int l) throws IOException {
     assert this.length == 0;
-    this.length = length;
-    getBuffer().writeLength(length);
+    assert l >= 0;
+    this.length = l;
+    if (l == runValue) {
+      runLength++;                                // continue a run
+      return;
+    }
+    flushRun();                                   // end a run
+    if (l == 1 || l == 0) {
+      runLength = 1;                              // start a run
+      runValue = l;
+    } else {
+      getBuffer().writeLength(l);                 // not a run
+    }
   }
 
   @Override public void writeValue(Object value) throws IOException {
     assert length > 0;
-    getBuffer().writeValue(value, getMeta().getType());
+    if (getMeta().getType() != ValueType.NULL) {
+      flushRun();
+      getBuffer().writeValue(value, getMeta().getType());
+    }
     length -= 1;
   }
 
+  @Override void flushBuffer() throws IOException {
+    flushRun();
+    super.flushBuffer();
+  }
+
+  private void flushRun() throws IOException {
+    if (runLength == 0)                           // not in run
+      return;
+    else if (runLength == 1)                      // single value
+      getBuffer().writeLength(runValue);
+    else                                          // a run
+      getBuffer().writeLength((3-runValue)-(runLength<<1));
+
+    runLength = 0;                                // reset
+    runValue = NONE;
+  }
+
 }

Modified: avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileReader.java
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileReader.java?rev=1461896&r1=1461895&r2=1461896&view=diff
==============================================================================
--- avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileReader.java (original)
+++ avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileReader.java Wed Mar 27 23:42:30 2013
@@ -111,6 +111,8 @@ public class ColumnFileReader implements
       throw new IOException("Not a data file.");
     }
+    // valid only if the magic matches the current version or an earlier one
     if (!(Arrays.equals(ColumnFileWriter.MAGIC, magic)
-          || !Arrays.equals(ColumnFileWriter.MAGIC_0, magic)))
+          || Arrays.equals(ColumnFileWriter.MAGIC_1, magic)
+          || Arrays.equals(ColumnFileWriter.MAGIC_0, magic)))
       throw new IOException("Not a data file.");
   }

Modified: avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileWriter.java
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileWriter.java?rev=1461896&r1=1461895&r2=1461896&view=diff
==============================================================================
--- avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileWriter.java (original)
+++ avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnFileWriter.java Wed Mar 27 23:42:30 2013
@@ -30,7 +30,8 @@ import java.util.HashSet;
 public class ColumnFileWriter {
 
   static final byte[] MAGIC_0 = new byte[] {'T', 'r', 'v', 0};
-  static final byte[] MAGIC = new byte[] {'T', 'r', 'v', 1};
+  static final byte[] MAGIC_1 = new byte[] {'T', 'r', 'v', 1};
+  static final byte[] MAGIC = new byte[] {'T', 'r', 'v', 2};
 
   private ColumnFileMetaData metaData;
   private ColumnOutputBuffer[] columns;

Modified: avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnOutputBuffer.java
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnOutputBuffer.java?rev=1461896&r1=1461895&r2=1461896&view=diff
==============================================================================
--- avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnOutputBuffer.java (original)
+++ avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/ColumnOutputBuffer.java Wed Mar 27 23:42:30 2013
@@ -70,7 +70,7 @@ class ColumnOutputBuffer {
     rowCount++;
   }
 
-  private void flushBuffer() throws IOException {
+  void flushBuffer() throws IOException {
     if (rowCount == 0) return;
     ByteBuffer raw = buffer.asByteBuffer();
     ByteBuffer c = codec.compress(raw);

Modified: avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/InputBuffer.java
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/InputBuffer.java?rev=1461896&r1=1461895&r2=1461896&view=diff
==============================================================================
--- avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/InputBuffer.java (original)
+++ avro/trunk/lang/java/trevni/core/src/main/java/org/apache/trevni/InputBuffer.java Wed Mar 27 23:42:30 2013
@@ -38,6 +38,9 @@ class InputBuffer {
   
   private int bitCount;                           // position in booleans
 
+  private int runLength;                          // length of run
+  private int runValue;                           // value of run
+
   public InputBuffer(Input in) throws IOException { this(in, 0); }
 
   public InputBuffer(Input in, long position) throws IOException {
@@ -56,6 +59,7 @@ class InputBuffer {
   }
 
   public void seek(long position) throws IOException {
+    runLength = 0;
     if (position >= (offset-limit) && position <= offset) {
       pos = (int)(limit - (offset - position));   // seek in buffer;
       return;
@@ -133,7 +137,18 @@ class InputBuffer {
 
   public int readLength() throws IOException {
     bitCount = 0;
-    return readInt();
+    if (runLength > 0) {
+      runLength--;                                // in run
+      return runValue;
+    }
+
+    int length = readInt();
+    if (length >= 0)                              // not a run
+      return length;
+
+    runLength = (1-length)>>>1;                   // start of run
+    runValue = (length+1) & 1;
+    return runValue;
   }
 
   public int readInt() throws IOException {

Modified: avro/trunk/lang/java/trevni/doc/apt/spec.apt
URL: http://svn.apache.org/viewvc/avro/trunk/lang/java/trevni/doc/apt/spec.apt?rev=1461896&r1=1461895&r2=1461896&view=diff
==============================================================================
--- avro/trunk/lang/java/trevni/doc/apt/spec.apt (original)
+++ avro/trunk/lang/java/trevni/doc/apt/spec.apt Wed Mar 27 23:42:30 2013
@@ -261,7 +261,9 @@ whether subsequent bytes are present.  F
   * <<trevni.array>> if present, indicates that each row in this column
     contains a sequence of values of the named type rather than just a
     single value.  An integer length precedes each sequence of values
-    indicating the count of values in the sequence.
+    indicating the count of values in the sequence.  A negative length
+    instead encodes a run of consecutive lengths that are all zero or all
+    one: -1 means two zeros, -2 two ones, -3 three zeros, -4 three ones, etc.
 
   * <<trevni.parent>> if present, the name of an <array> column whose
     lengths are also used by this column.  Thus values of this column
@@ -348,7 +350,7 @@ name=value  type=string              par
 
   A <<file header>> consists of:
 
-  * Four bytes, ASCII 'T', 'r', 'v', followed by 1.
+  * Four bytes, ASCII 'T', 'r', 'v', followed by 0x02.
 
   * a <fixed64> indicating the number of rows in the file