You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2016/07/10 18:12:34 UTC

hive git commit: HIVE-13887 : LazySimpleSerDe should parse "NULL" dates faster (Gopal V via Ashutosh Chauhan)

Repository: hive
Updated Branches:
  refs/heads/master 0bdaeba6c -> 146a15280


HIVE-13887 : LazySimpleSerDe should parse "NULL" dates faster (Gopal V via Ashutosh Chauhan)

Signed-off-by: Ashutosh Chauhan <ha...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/146a1528
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/146a1528
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/146a1528

Branch: refs/heads/master
Commit: 146a152803971d0bd5e992aa7085ce14a06b3b94
Parents: 0bdaeba
Author: Gopal V <go...@apache.org>
Authored: Sat May 28 22:55:38 2016 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Sun Jul 10 11:11:49 2016 -0700

----------------------------------------------------------------------
 .../benchmark/serde/LazySimpleSerDeBench.java   | 173 ++++++++++++++++++-
 .../hadoop/hive/serde2/lazy/LazyDate.java       |   4 +
 .../hadoop/hive/serde2/lazy/LazyTimestamp.java  |   4 +
 .../hadoop/hive/serde2/lazy/LazyUtils.java      |  12 ++
 .../lazy/fast/LazySimpleDeserializeRead.java    |  11 +-
 5 files changed, 199 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
----------------------------------------------------------------------
diff --git a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
index a1b63d5..826bf53 100644
--- a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
+++ b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
@@ -15,16 +15,19 @@ package org.apache.hive.benchmark.serde;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.util.Date;
 import java.util.Random;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
 import org.apache.hadoop.hive.serde2.lazy.LazyByte;
+import org.apache.hadoop.hive.serde2.lazy.LazyDate;
 import org.apache.hadoop.hive.serde2.lazy.LazyDouble;
 import org.apache.hadoop.hive.serde2.lazy.LazyFloat;
 import org.apache.hadoop.hive.serde2.lazy.LazyInteger;
 import org.apache.hadoop.hive.serde2.lazy.LazyLong;
 import org.apache.hadoop.hive.serde2.lazy.LazyShort;
+import org.apache.hadoop.hive.serde2.lazy.LazyTimestamp;
 import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
@@ -53,15 +56,14 @@ public class LazySimpleSerDeBench {
    * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.serde.LazySimpleSerDeBench
    * <p/>
    */
+  public static final int DEFAULT_ITER_TIME = 1000000;
+  public static final int DEFAULT_DATA_SIZE = 4096;
 
   @BenchmarkMode(Mode.AverageTime)
   @Fork(1)
   @State(Scope.Thread)
   @OutputTimeUnit(TimeUnit.NANOSECONDS)
   public static abstract class AbstractDeserializer {
-    public static final int DEFAULT_ITER_TIME = 1000000;
-
-    public static final int DEFAULT_DATA_SIZE = 4096;
 
     public int[] offsets = new int[DEFAULT_DATA_SIZE];
     public int[] sizes = new int[DEFAULT_DATA_SIZE];
@@ -445,6 +447,171 @@ public class LazySimpleSerDeBench {
     }
   }
 
+  /** Benchmarks LazyDate.init over DEFAULT_DATA_SIZE well-formed yyyy-mm-dd strings. */
+  @BenchmarkMode(Mode.AverageTime)
+  @Fork(1)
+  @State(Scope.Thread)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public static class GoodLazyDate {
+    final LazyDate obj = new LazyDate(
+        LazyPrimitiveObjectInspectorFactory.LAZY_DATE_OBJECT_INSPECTOR);
+
+    public int[] offsets = new int[DEFAULT_DATA_SIZE];
+    public int[] sizes = new int[DEFAULT_DATA_SIZE];
+    protected final ByteArrayRef ref = new ByteArrayRef();
+
+    /** Fills the shared buffer with random dates and records each value's offset/size. */
+    @Setup
+    public void setup() {
+      ByteArrayOutputStream bos = new ByteArrayOutputStream();
+      Random r = new Random();
+      int len = 0;
+      final long base = -320000000L * 1000L; // 1959
+      final long span = Integer.MAX_VALUE * 1000L; // ~68 years, so dates end around 2027
+      for (int i = 0; i < DEFAULT_DATA_SIZE; i++) {
+        // Dates before the epoch are valid too. The unsigned shift keeps the random value
+        // non-negative; Math.abs(Long.MIN_VALUE) would stay negative.
+        // java.sql.Date prints yyyy-mm-dd, the form LazyDate parses with Date.valueOf;
+        // java.util.Date.toString() gives "dow mon dd hh:mm:ss zzz yyyy", which it rejects.
+        java.sql.Date dt = new java.sql.Date(base + ((r.nextLong() >>> 1) % span));
+        byte[] ds = dt.toString().getBytes();
+        sizes[i] = ds.length;
+        offsets[i] = len;
+        len += ds.length;
+        // write(byte[], int, int) on ByteArrayOutputStream declares no IOException
+        bos.write(ds, 0, ds.length);
+      }
+      ref.setData(bos.toByteArray());
+    }
+
+    /** Parses DEFAULT_ITER_TIME values, cycling through the generated data. */
+    @Benchmark
+    @Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
+    @Measurement(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
+    public void bench() {
+      for (int i = 0; i < DEFAULT_ITER_TIME; i++) {
+        obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]);
+      }
+    }
+  }
+
+  /** Times LazyDate.init on the data of RandomDataInitializer (constructed with 4). */
+  public static class RandomLazyDate extends RandomDataInitializer {
+    final LazyDate obj = new LazyDate(
+        LazyPrimitiveObjectInspectorFactory.LAZY_DATE_OBJECT_INSPECTOR);
+
+    public RandomLazyDate() {
+      super(4);
+    }
+
+    @Override
+    public void bench() {
+      for (int iter = 0; iter < DEFAULT_ITER_TIME; iter++) {
+        final int slot = iter % sizes.length; // wrap around the prepared entries
+        obj.init(ref, offsets[slot], sizes[slot]);
+      }
+    }
+  }
+
+  /** Times LazyDate.init on the data of RandomDataInitializer (constructed with 8). */
+  public static class WorstLazyDate extends RandomDataInitializer {
+    final LazyDate obj = new LazyDate(
+        LazyPrimitiveObjectInspectorFactory.LAZY_DATE_OBJECT_INSPECTOR);
+    public WorstLazyDate() {
+      super(8);
+    }
+
+    @Override
+    public void bench() {
+      for (int iter = 0; iter < DEFAULT_ITER_TIME; iter++) {
+        final int slot = iter % sizes.length; // wrap around the prepared entries
+        obj.init(ref, offsets[slot], sizes[slot]);
+      }
+    }
+  }
+  
+  /** Benchmarks LazyTimestamp.init over DEFAULT_DATA_SIZE "yyyy-mm-dd 00:00:01" strings. */
+  @BenchmarkMode(Mode.AverageTime)
+  @Fork(1)
+  @State(Scope.Thread)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public static class GoodLazyTimestamp {
+    final LazyTimestamp obj = new LazyTimestamp(
+        LazyPrimitiveObjectInspectorFactory.LAZY_TIMESTAMP_OBJECT_INSPECTOR);
+
+    public int[] offsets = new int[DEFAULT_DATA_SIZE];
+    public int[] sizes = new int[DEFAULT_DATA_SIZE];
+    protected final ByteArrayRef ref = new ByteArrayRef();
+
+    /** Fills the shared buffer with random timestamps and records each offset/size. */
+    @Setup
+    public void setup() {
+      ByteArrayOutputStream bos = new ByteArrayOutputStream();
+      Random r = new Random();
+      int len = 0;
+      final long base = -320000000L * 1000L; // 1959
+      final long span = Integer.MAX_VALUE * 1000L; // ~68 years, so dates end around 2027
+      for (int i = 0; i < DEFAULT_DATA_SIZE; i++) {
+        // Timestamps before the epoch are valid too. The unsigned shift keeps the random
+        // value non-negative; Math.abs(Long.MIN_VALUE) would stay negative.
+        // java.sql.Date prints yyyy-mm-dd, so the field reads "yyyy-mm-dd 00:00:01";
+        // java.util.Date.toString() would yield "dow mon dd hh:mm:ss zzz yyyy 00:00:01".
+        java.sql.Date dt = new java.sql.Date(base + ((r.nextLong() >>> 1) % span));
+        byte[] ds = String.format("%s 00:00:01", dt.toString()).getBytes();
+        sizes[i] = ds.length;
+        offsets[i] = len;
+        len += ds.length;
+        // write(byte[], int, int) on ByteArrayOutputStream declares no IOException
+        bos.write(ds, 0, ds.length);
+      }
+      ref.setData(bos.toByteArray());
+    }
+
+    /** Parses DEFAULT_ITER_TIME values, cycling through the generated data. */
+    @Benchmark
+    @Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
+    @Measurement(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
+    public void bench() {
+      for (int i = 0; i < DEFAULT_ITER_TIME; i++) {
+        obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]);
+      }
+    }
+  }
+
+  /** Times LazyTimestamp.init on the data of RandomDataInitializer (constructed with 4). */
+  public static class RandomLazyTimestamp extends RandomDataInitializer {
+    final LazyTimestamp obj = new LazyTimestamp(
+        LazyPrimitiveObjectInspectorFactory.LAZY_TIMESTAMP_OBJECT_INSPECTOR);
+    public RandomLazyTimestamp() {
+      super(4);
+    }
+
+    @Override
+    public void bench() {
+      for (int iter = 0; iter < DEFAULT_ITER_TIME; iter++) {
+        final int slot = iter % sizes.length; // wrap around the prepared entries
+        obj.init(ref, offsets[slot], sizes[slot]);
+      }
+    }
+  }
+
+  /** Times LazyTimestamp.init on the data of RandomDataInitializer (constructed with 8). */
+  public static class WorstLazyTimestamp extends RandomDataInitializer {
+    final LazyTimestamp obj = new LazyTimestamp(
+        LazyPrimitiveObjectInspectorFactory.LAZY_TIMESTAMP_OBJECT_INSPECTOR);
+    public WorstLazyTimestamp() {
+      super(8);
+    }
+
+    @Override
+    public void bench() {
+      for (int iter = 0; iter < DEFAULT_ITER_TIME; iter++) {
+        final int slot = iter % sizes.length; // wrap around the prepared entries
+        obj.init(ref, offsets[slot], sizes[slot]);
+      }
+    }
+  }
+
   public static void main(String[] args) throws RunnerException {
     Options opt = new OptionsBuilder().include(
         ".*" + LazySimpleSerDeBench.class.getSimpleName() + ".*").build();

http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java
index 0579ff2..c00faac 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java
@@ -59,6 +59,10 @@ public class LazyDate extends LazyPrimitive<LazyDateObjectInspector, DateWritabl
   @Override
   public void init(ByteArrayRef bytes, int start, int length) {
     String s = null;
+    if (!LazyUtils.isDateMaybe(bytes.getData(), start, length)) {
+      isNull = true;
+      return;
+    }
     try {
       s = Text.decode(bytes.getData(), start, length);
       data.set(Date.valueOf(s));

http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java
index 8f0c3d2..56945d1 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java
@@ -59,6 +59,10 @@ public class LazyTimestamp extends LazyPrimitive<LazyTimestampObjectInspector, T
   @Override
   public void init(ByteArrayRef bytes, int start, int length) {
     String s = null;
+    if (!LazyUtils.isDateMaybe(bytes.getData(), start, length)) {
+      isNull = true;
+      return;
+    }
     try {
       s = new String(bytes.getData(), start, length, "US-ASCII");
     } catch (UnsupportedEncodingException e) {

http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
index 6d7369b..73c72e1 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
@@ -109,6 +109,18 @@ public final class LazyUtils {
   }
 
   /**
+   * Returns false when the bytes definitely cannot be parsed into a date/timestamp.
+   *
+   * A four-digit year plus the two dash separators mean the string has to be at least
+   * yyyy-m-d = 8 bytes long. A Timestamp needs to be at least 1 byte longer, so passing
+   * this Date length check is necessary, but not sufficient.
+   */
+  public static boolean isDateMaybe(byte[] buf, int offset, int len) {
+    // maybe valid - too expensive to check without a parse
+    return len >= 8;
+  }
+
+  /**
    * Returns -1 if the first byte sequence is lexicographically less than the
    * second; returns +1 if the second byte sequence is lexicographically less
    * than the first; otherwise return 0.

http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
index 7e9f94e..765ba7e 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
@@ -21,6 +21,7 @@ package org.apache.hadoop.hive.serde2.lazy.fast;
 import java.io.UnsupportedEncodingException;
 import java.nio.charset.CharacterCodingException;
 import java.sql.Date;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
@@ -381,7 +382,7 @@ public final class LazySimpleDeserializeRead extends DeserializeRead {
       break;
     case DATE:
       {
-        if (fieldLength == 0) {
+        if (!LazyUtils.isDateMaybe(bytes, fieldStart, fieldLength)) {
           return true;
         }
         String s = null;
@@ -396,7 +397,7 @@ public final class LazySimpleDeserializeRead extends DeserializeRead {
       break;
     case TIMESTAMP:
       {
-        if (fieldLength == 0) {
+        if (!LazyUtils.isDateMaybe(bytes, fieldStart, fieldLength)) {
           return true;
         }
         String s = null;
@@ -425,6 +426,9 @@ public final class LazySimpleDeserializeRead extends DeserializeRead {
       break;
     case INTERVAL_YEAR_MONTH:
       {
+        if (fieldLength == 0) {
+          return true;
+        }
         String s = null;
         try {
           s = Text.decode(bytes, fieldStart, fieldLength);
@@ -437,6 +441,9 @@ public final class LazySimpleDeserializeRead extends DeserializeRead {
       break;
     case INTERVAL_DAY_TIME:
       {
+        if (fieldLength == 0) {
+          return true;
+        }
         String s = null;
         try {
           s = Text.decode(bytes, fieldStart, fieldLength);