You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2016/07/10 18:12:34 UTC
hive git commit: HIVE-13887 : LazySimpleSerDe should parse "NULL"
dates faster (Gopal V via Ashutosh Chauhan)
Repository: hive
Updated Branches:
refs/heads/master 0bdaeba6c -> 146a15280
HIVE-13887 : LazySimpleSerDe should parse "NULL" dates faster (Gopal V via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/146a1528
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/146a1528
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/146a1528
Branch: refs/heads/master
Commit: 146a152803971d0bd5e992aa7085ce14a06b3b94
Parents: 0bdaeba
Author: Gopal V <go...@apache.org>
Authored: Sat May 28 22:55:38 2016 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Sun Jul 10 11:11:49 2016 -0700
----------------------------------------------------------------------
.../benchmark/serde/LazySimpleSerDeBench.java | 173 ++++++++++++++++++-
.../hadoop/hive/serde2/lazy/LazyDate.java | 4 +
.../hadoop/hive/serde2/lazy/LazyTimestamp.java | 4 +
.../hadoop/hive/serde2/lazy/LazyUtils.java | 12 ++
.../lazy/fast/LazySimpleDeserializeRead.java | 11 +-
5 files changed, 199 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
----------------------------------------------------------------------
diff --git a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
index a1b63d5..826bf53 100644
--- a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
+++ b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/serde/LazySimpleSerDeBench.java
@@ -15,16 +15,19 @@ package org.apache.hive.benchmark.serde;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.util.Date;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
import org.apache.hadoop.hive.serde2.lazy.LazyByte;
+import org.apache.hadoop.hive.serde2.lazy.LazyDate;
import org.apache.hadoop.hive.serde2.lazy.LazyDouble;
import org.apache.hadoop.hive.serde2.lazy.LazyFloat;
import org.apache.hadoop.hive.serde2.lazy.LazyInteger;
import org.apache.hadoop.hive.serde2.lazy.LazyLong;
import org.apache.hadoop.hive.serde2.lazy.LazyShort;
+import org.apache.hadoop.hive.serde2.lazy.LazyTimestamp;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
@@ -53,15 +56,14 @@ public class LazySimpleSerDeBench {
* $ java -cp target/benchmarks.jar org.apache.hive.benchmark.serde.LazySimpleSerDeBench
* <p/>
*/
+ public static final int DEFAULT_ITER_TIME = 1000000;
+ public static final int DEFAULT_DATA_SIZE = 4096;
@BenchmarkMode(Mode.AverageTime)
@Fork(1)
@State(Scope.Thread)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
public static abstract class AbstractDeserializer {
- public static final int DEFAULT_ITER_TIME = 1000000;
-
- public static final int DEFAULT_DATA_SIZE = 4096;
public int[] offsets = new int[DEFAULT_DATA_SIZE];
public int[] sizes = new int[DEFAULT_DATA_SIZE];
@@ -445,6 +447,171 @@ public class LazySimpleSerDeBench {
}
}
+  /** Benchmarks LazyDate.init() on well-formed yyyy-mm-dd input (the successful-parse path). */
+  @BenchmarkMode(Mode.AverageTime)
+  @Fork(1)
+  @State(Scope.Thread)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public static class GoodLazyDate {
+    final LazyDate obj = new LazyDate(
+        LazyPrimitiveObjectInspectorFactory.LAZY_DATE_OBJECT_INSPECTOR);
+
+    public int[] offsets = new int[DEFAULT_DATA_SIZE];
+    public int[] sizes = new int[DEFAULT_DATA_SIZE];
+    protected final ByteArrayRef ref = new ByteArrayRef();
+
+    /** Packs DEFAULT_DATA_SIZE random date strings into ref, recording each offset and size. */
+    @Setup
+    public void setup() {
+      sizes = new int[DEFAULT_DATA_SIZE];
+      offsets = new int[sizes.length];
+      ByteArrayOutputStream bos = new ByteArrayOutputStream();
+      Random r = new Random();
+      int len = 0;
+      final long base = -320000000L * 1000L; // 1959
+      final long span = Integer.MAX_VALUE * 1000L; // ~68 years in milliseconds
+      for (int i = 0; i < DEFAULT_DATA_SIZE; i++) {
+        // -ve dates are also valid dates - the dates are within 1959 to 2027.
+        // ">>> 1" keeps the offset non-negative; Math.abs(Long.MIN_VALUE) is still negative.
+        // java.sql.Date prints yyyy-mm-dd; java.util.Date.toString() output never parses.
+        java.sql.Date dt = new java.sql.Date(base + ((r.nextLong() >>> 1) % span));
+        byte[] ds = dt.toString().getBytes(java.nio.charset.StandardCharsets.US_ASCII);
+        sizes[i] = ds.length;
+        offsets[i] = len;
+        len += ds.length;
+        bos.write(ds, 0, ds.length); // this overload declares no IOException
+      }
+      ref.setData(bos.toByteArray());
+    }
+
+    /** Re-initializes the lazy object DEFAULT_ITER_TIME times, cycling through the fields. */
+    @Benchmark
+    @Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
+    @Measurement(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
+    public void bench() {
+      for (int i = 0; i < DEFAULT_ITER_TIME; i++) {
+        obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]);
+      }
+    }
+  }
+
+  /** Runs LazyDate.init() over RandomDataInitializer data; the meaning of 4 is defined there. */
+  public static class RandomLazyDate extends RandomDataInitializer {
+    final LazyDate obj = new LazyDate(
+        LazyPrimitiveObjectInspectorFactory.LAZY_DATE_OBJECT_INSPECTOR);
+
+    public RandomLazyDate() {
+      super(4);
+    }
+
+    @Override
+    public void bench() {
+      final int slots = sizes.length; // number of pre-generated fields to cycle through
+      for (int round = 0; round < DEFAULT_ITER_TIME; round++) {
+        obj.init(ref, offsets[round % slots], sizes[round % slots]);
+      }
+    }
+  }
+
+  /** Runs LazyDate.init() over RandomDataInitializer data; the meaning of 8 is defined there. */
+  public static class WorstLazyDate extends RandomDataInitializer {
+    final LazyDate obj = new LazyDate(
+        LazyPrimitiveObjectInspectorFactory.LAZY_DATE_OBJECT_INSPECTOR);
+    public WorstLazyDate() {
+      super(8);
+    }
+
+    @Override
+    public void bench() {
+      final int slots = sizes.length; // number of pre-generated fields to cycle through
+      for (int round = 0; round < DEFAULT_ITER_TIME; round++) {
+        obj.init(ref, offsets[round % slots], sizes[round % slots]);
+      }
+    }
+  }
+
+  /** Benchmarks LazyTimestamp.init() on well-formed "yyyy-mm-dd 00:00:01" input strings. */
+  @BenchmarkMode(Mode.AverageTime)
+  @Fork(1)
+  @State(Scope.Thread)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public static class GoodLazyTimestamp {
+    final LazyTimestamp obj = new LazyTimestamp(
+        LazyPrimitiveObjectInspectorFactory.LAZY_TIMESTAMP_OBJECT_INSPECTOR);
+
+    public int[] offsets = new int[DEFAULT_DATA_SIZE];
+    public int[] sizes = new int[DEFAULT_DATA_SIZE];
+    protected final ByteArrayRef ref = new ByteArrayRef();
+
+    /** Packs DEFAULT_DATA_SIZE random timestamp strings into ref, recording offsets and sizes. */
+    @Setup
+    public void setup() {
+      sizes = new int[DEFAULT_DATA_SIZE];
+      offsets = new int[sizes.length];
+      ByteArrayOutputStream bos = new ByteArrayOutputStream();
+      Random r = new Random();
+      int len = 0;
+      final long base = -320000000L * 1000L; // 1959
+      final long span = Integer.MAX_VALUE * 1000L; // ~68 years in milliseconds
+      for (int i = 0; i < DEFAULT_DATA_SIZE; i++) {
+        // -ve dates are also valid Timestamps - dates are within 1959 to 2027.
+        // ">>> 1" keeps the offset non-negative; Math.abs(Long.MIN_VALUE) is still negative.
+        // java.sql.Date prints yyyy-mm-dd; java.util.Date.toString() is not a timestamp prefix.
+        java.sql.Date dt = new java.sql.Date(base + ((r.nextLong() >>> 1) % span));
+        byte[] ds = String.format("%s 00:00:01", dt.toString())
+            .getBytes(java.nio.charset.StandardCharsets.US_ASCII);
+        sizes[i] = ds.length;
+        offsets[i] = len;
+        len += ds.length;
+        bos.write(ds, 0, ds.length); // this overload declares no IOException
+      }
+      ref.setData(bos.toByteArray());
+    }
+
+    @Benchmark
+    @Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
+    @Measurement(iterations = 2, time = 2, timeUnit = TimeUnit.MILLISECONDS)
+    public void bench() {
+      for (int i = 0; i < DEFAULT_ITER_TIME; i++) {
+        obj.init(ref, offsets[i % sizes.length], sizes[i % sizes.length]);
+      }
+    }
+  }
+
+  /** Runs LazyTimestamp.init() over RandomDataInitializer data; 4 is interpreted there. */
+  public static class RandomLazyTimestamp extends RandomDataInitializer {
+    final LazyTimestamp obj = new LazyTimestamp(
+        LazyPrimitiveObjectInspectorFactory.LAZY_TIMESTAMP_OBJECT_INSPECTOR);
+    public RandomLazyTimestamp() {
+      super(4);
+    }
+
+    @Override
+    public void bench() {
+      final int slots = sizes.length; // number of pre-generated fields to cycle through
+      for (int round = 0; round < DEFAULT_ITER_TIME; round++) {
+        obj.init(ref, offsets[round % slots], sizes[round % slots]);
+      }
+    }
+  }
+
+  /** Runs LazyTimestamp.init() over RandomDataInitializer data; 8 is interpreted there. */
+  public static class WorstLazyTimestamp extends RandomDataInitializer {
+    final LazyTimestamp obj = new LazyTimestamp(
+        LazyPrimitiveObjectInspectorFactory.LAZY_TIMESTAMP_OBJECT_INSPECTOR);
+    public WorstLazyTimestamp() {
+      super(8);
+    }
+
+    @Override
+    public void bench() {
+      final int slots = sizes.length; // number of pre-generated fields to cycle through
+      for (int round = 0; round < DEFAULT_ITER_TIME; round++) {
+        obj.init(ref, offsets[round % slots], sizes[round % slots]);
+      }
+    }
+  }
+
public static void main(String[] args) throws RunnerException {
Options opt = new OptionsBuilder().include(
".*" + LazySimpleSerDeBench.class.getSimpleName() + ".*").build();
http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java
index 0579ff2..c00faac 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyDate.java
@@ -59,6 +59,10 @@ public class LazyDate extends LazyPrimitive<LazyDateObjectInspector, DateWritabl
@Override
public void init(ByteArrayRef bytes, int start, int length) {
String s = null;
+ if (!LazyUtils.isDateMaybe(bytes.getData(), start, length)) {
+ isNull = true;
+ return;
+ }
try {
s = Text.decode(bytes.getData(), start, length);
data.set(Date.valueOf(s));
http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java
index 8f0c3d2..56945d1 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyTimestamp.java
@@ -59,6 +59,10 @@ public class LazyTimestamp extends LazyPrimitive<LazyTimestampObjectInspector, T
@Override
public void init(ByteArrayRef bytes, int start, int length) {
String s = null;
+ if (!LazyUtils.isDateMaybe(bytes.getData(), start, length)) {
+ isNull = true;
+ return;
+ }
try {
s = new String(bytes.getData(), start, length, "US-ASCII");
} catch (UnsupportedEncodingException e) {
http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
index 6d7369b..73c72e1 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyUtils.java
@@ -109,6 +109,18 @@ public final class LazyUtils {
}
/**
+ * Returns false when the bytes definitely cannot be parsed into a date/timestamp.
+ *
+ * Y2k requirements and dash requirements say a date string has to be at least
+ * yyyy-m-d = 8 bytes long; a Timestamp needs to be at least 1 byte longer. Passing
+ * this length check is therefore necessary, but not sufficient, for a successful parse.
+ */
+ public static boolean isDateMaybe(byte[] buf, int offset, int len) {
+ // maybe valid - too expensive to check without a parse; only len is inspected here
+ return len >= 8;
+ }
+
+ /**
* Returns -1 if the first byte sequence is lexicographically less than the
* second; returns +1 if the second byte sequence is lexicographically less
* than the first; otherwise return 0.
http://git-wip-us.apache.org/repos/asf/hive/blob/146a1528/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
index 7e9f94e..765ba7e 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/fast/LazySimpleDeserializeRead.java
@@ -21,6 +21,7 @@ package org.apache.hadoop.hive.serde2.lazy.fast;
import java.io.UnsupportedEncodingException;
import java.nio.charset.CharacterCodingException;
import java.sql.Date;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.common.type.HiveDecimal;
@@ -381,7 +382,7 @@ public final class LazySimpleDeserializeRead extends DeserializeRead {
break;
case DATE:
{
- if (fieldLength == 0) {
+ if (!LazyUtils.isDateMaybe(bytes, fieldStart, fieldLength)) {
return true;
}
String s = null;
@@ -396,7 +397,7 @@ public final class LazySimpleDeserializeRead extends DeserializeRead {
break;
case TIMESTAMP:
{
- if (fieldLength == 0) {
+ if (!LazyUtils.isDateMaybe(bytes, fieldStart, fieldLength)) {
return true;
}
String s = null;
@@ -425,6 +426,9 @@ public final class LazySimpleDeserializeRead extends DeserializeRead {
break;
case INTERVAL_YEAR_MONTH:
{
+ if (fieldLength == 0) {
+ return true;
+ }
String s = null;
try {
s = Text.decode(bytes, fieldStart, fieldLength);
@@ -437,6 +441,9 @@ public final class LazySimpleDeserializeRead extends DeserializeRead {
break;
case INTERVAL_DAY_TIME:
{
+ if (fieldLength == 0) {
+ return true;
+ }
String s = null;
try {
s = Text.decode(bytes, fieldStart, fieldLength);