You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@accumulo.apache.org by GitBox <gi...@apache.org> on 2019/11/26 20:59:30 UTC
[GitHub] [accumulo-testing] keith-turner commented on a change in pull request #119: Add asynchronous input format and bulk key.

keith-turner commented on a change in pull request #119: Add asynchronous input format and bulk key.
URL: https://github.com/apache/accumulo-testing/pull/119#discussion_r350971559
 
 

 ##########
 File path: src/main/java/org/apache/accumulo/testing/continuous/BulkKey.java
 ##########
 @@ -0,0 +1,236 @@
+package org.apache.accumulo.testing.continuous;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.accumulo.core.data.ByteSequence;
+import org.apache.accumulo.core.data.Key;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.io.WritableUtils;
+
+/**
+ * Supports immediate sorting via eager deser of the key object. This has the benefit of reducing
+ * the amount of deserialization that may occur when sorting keys in memory
+ */
+public class BulkKey implements WritableComparable<BulkKey> {
+
+  protected Key key = new Key();
+  protected int hashCode = 31;
+
+  static final byte[] EMPTY = {};
+
+  Text row = new Text();
+  Text cf = new Text();
+  Text cq = new Text();
+  Text cv = new Text();
+
+  public BulkKey() {}
+
+  public BulkKey(Key key) {
+    this.key = key;
+    hashCode = key.hashCode();
+  }
+
+  public BulkKey(byte[] row, byte[] cf, byte[] cq, byte[] cv, long ts, boolean deleted) {
+    // don't copy the arrays
+    this.key = new Key(row, cf, cq, cv, ts, deleted, false);
+    hashCode = key.hashCode();
+  }
+
+  public Key getKey() {
+    return key;
+  }
+
+  public ByteSequence getRowData() {
+    return key.getRowData();
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+
+    final int rowsize = WritableUtils.readVInt(in);
+    final byte[] row = readBytes(in, rowsize);
+
+    final int cfsize = WritableUtils.readVInt(in);
+    final byte[] cf = readBytes(in, cfsize);
+
+    final int cqsize = WritableUtils.readVInt(in);
+    final byte[] cq = readBytes(in, cqsize);
+
+    final int cvsize = WritableUtils.readVInt(in);
+    final byte[] cv = readBytes(in, cvsize);
+
+    final long ts = WritableUtils.readVLong(in);
+    boolean isDeleted = in.readBoolean();
+
+    key = new Key(row, cf, cq, cv, ts, isDeleted, false);
+
+    hashCode = key.hashCode();
+  }
+
+  private static byte[] readBytes(DataInput in, int size) throws IOException {
+    if (size == 0)
+      return EMPTY;
+    final byte[] data = new byte[size];
+    in.readFully(data, 0, data.length);
+    return data;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+
+    key.getRow(row);
+    key.getColumnFamily(cf);
+    key.getColumnQualifier(cq);
+    key.getColumnVisibility(cv);
+
+    WritableUtils.writeVInt(out, row.getLength());
 
 Review comment:
   For the bulk ingest data, the row, family, and qualifier are all really small.  The length of each of these should fit in a single byte.  Could just always store the len in a single byte here instead of using writeVInt.  Seems like this  could simplify the code in KeyShortCircuitComparator.
   
   Could have a little util method like 
   
   ```java
     void writeLen(OutputStream out, int len){
         Preconditions.checkArgument(len<128);
         out.write(len);
     }
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services