You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by kg...@apache.org on 2022/06/13 15:26:42 UTC
[hive] branch master updated: HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich)
This is an automated email from the ASF dual-hosted git repository.
kgyrtkirk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new c4f6eb2f914 HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich)
c4f6eb2f914 is described below
commit c4f6eb2f91478152c89070a7455df9a1b8980c75
Author: okumin <gi...@okumin.com>
AuthorDate: Tue Jun 14 00:26:36 2022 +0900
HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich)
---
.../udf/generic/GenericUDAFMkCollectionEvaluator.java | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java
index b05023dd37a..cffc7f76510 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java
@@ -95,11 +95,27 @@ public class GenericUDAFMkCollectionEvaluator extends GenericUDAFEvaluator
throw new RuntimeException("Buffer type unknown");
}
}
+
+ private void reset() {
+ if (bufferType == BufferType.LIST) {
+ container.clear();
+ } else if (bufferType == BufferType.SET) {
+ // Don't reuse a container because HashSet#clear can be very slow. The operation takes O(N)
+ // and N is the capacity of the internal hash table. The internal capacity grows based on
+ // the number of elements and it never shrinks. Thus, HashSet#clear takes O(N) every time
+ // once a skewed key appears.
+ // In order to avoid too many resizing in average cases, we set the initial capacity to the
+ // number of elements of the previous aggregation.
+ container = new LinkedHashSet<>(container.size());
+ } else {
+ throw new RuntimeException("Buffer type unknown");
+ }
+ }
}
@Override
public void reset(AggregationBuffer agg) throws HiveException {
- ((MkArrayAggregationBuffer) agg).container.clear();
+ ((MkArrayAggregationBuffer) agg).reset();
}
@Override