You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iotdb.apache.org by ja...@apache.org on 2023/03/29 08:50:07 UTC
[iotdb] branch master updated: [IOTDB-5683] Add special case process for aggregation function Mode
This is an automated email from the ASF dual-hosted git repository.
jackietien pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git
The following commit(s) were added to refs/heads/master by this push:
new af8f6afaad [IOTDB-5683] Add special case process for aggregation function Mode
af8f6afaad is described below
commit af8f6afaad222640c69df887f7997f31f8a2aee9
Author: Weihao Li <60...@users.noreply.github.com>
AuthorDate: Wed Mar 29 16:49:58 2023 +0800
[IOTDB-5683] Add special case process for aggregation function Mode
---
docs/UserGuide/Operators-Functions/Aggregation.md | 30 ++++++-------
.../UserGuide/Operators-Functions/Aggregation.md | 30 ++++++-------
.../iotdb/db/it/aggregation/IoTDBModeIT.java | 24 ++++++++++-
.../src/main/codegen/templates/ModeAccumulator.ftl | 49 ++++++++++++++++------
4 files changed, 89 insertions(+), 44 deletions(-)
diff --git a/docs/UserGuide/Operators-Functions/Aggregation.md b/docs/UserGuide/Operators-Functions/Aggregation.md
index a6c0e07126..1a73578784 100644
--- a/docs/UserGuide/Operators-Functions/Aggregation.md
+++ b/docs/UserGuide/Operators-Functions/Aggregation.md
@@ -27,21 +27,21 @@ All aggregate functions except `COUNT()`, `COUNT_IF()` ignore null values and re
The aggregate functions supported by IoTDB are as follows:
-| Function Name | Function Description | Allowed Input Data Types | required attributes [...]
-|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------| ------------------------ |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- [...]
-| SUM | Summation. | INT32 INT64 FLOAT DOUBLE | No [...]
-| COUNT | Counts the number of data points. | All types | No [...]
-| AVG | Average. | INT32 INT64 FLOAT DOUBLE | No [...]
-| EXTREME | Finds the value with the largest absolute value. Returns a positive value if the maximum absolute value of positive and negative values is equal. | INT32 INT64 FLOAT DOUBLE | No [...]
-| MAX_VALUE | Find the maximum value. | INT32 INT64 FLOAT DOUBLE | No [...]
-| MIN_VALUE | Find the minimum value. | INT32 INT64 FLOAT DOUBLE | No [...]
-| FIRST_VALUE | Find the value with the smallest timestamp. | All data types | No [...]
-| LAST_VALUE | Find the value with the largest timestamp. | All data types | No [...]
-| MAX_TIME | Find the maximum timestamp. | All data Types | No [...]
-| MIN_TIME | Find the minimum timestamp. | All data Types | No [...]
-| COUNT_IF | Find the number of data points that continuously meet a given condition and the number of data points that meet the condition (represented by keep) meet the specified threshold. | BOOLEAN | `[keep >=/>/=/!=/</<=]threshold`:The specified threshold or threshold condition, it is equivalent to `keep >= threshold` if `threshold` is used alone, type of `threshold` is `INT64`<br/> `ignoreNull`:Optional, default value is `true`;If the value is `true`, null valu [...]
-| TIME_DURATION | Find the difference between the timestamp of the largest non-null value and the timestamp of the smallest non-null value in a column | All data Types | No [...]
-| MODE | Find the mode. Note: Having too many different values in the input series risks a memory exception. | All data Types | No [...]
+| Function Name | Function Description | Allowed Input Data Types | required attributes [...]
+|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------------------------ |--------------------------------------------------------------------------------------------------------------------------------------------------------- [...]
+| SUM | Summation. | INT32 INT64 FLOAT DOUBLE | No [...]
+| COUNT | Counts the number of data points. | All types | No [...]
+| AVG | Average. | INT32 INT64 FLOAT DOUBLE | No [...]
+| EXTREME | Finds the value with the largest absolute value. Returns a positive value if the maximum absolute value of positive and negative values is equal. | INT32 INT64 FLOAT DOUBLE | No [...]
+| MAX_VALUE | Find the maximum value. | INT32 INT64 FLOAT DOUBLE | No [...]
+| MIN_VALUE | Find the minimum value. | INT32 INT64 FLOAT DOUBLE | No [...]
+| FIRST_VALUE | Find the value with the smallest timestamp. | All data types | No [...]
+| LAST_VALUE | Find the value with the largest timestamp. | All data types | No [...]
+| MAX_TIME | Find the maximum timestamp. | All data Types | No [...]
+| MIN_TIME | Find the minimum timestamp. | All data Types | No [...]
+| COUNT_IF | Find the number of data points that continuously meet a given condition and the number of data points that meet the condition (represented by keep) meet the specified threshold. | BOOLEAN | `[keep >=/>/=/!=/</<=]threshold`:The specified threshold or threshold condition, it is equivalent to `keep >= threshold` if `threshold` is used alone, t [...]
+| TIME_DURATION | Find the difference between the timestamp of the largest non-null value and the timestamp of the smallest non-null value in a column | All data Types | No [...]
+| MODE | Find the mode. Note: </br>1.Having too many different values in the input series risks a memory exception; </br>2.If all the elements have the same number of occurrences, that is no Mode, return the value with earliest time; </br>3.If there are many Modes, return the Mode with earliest time. | All data Types | No [...]
## COUNT
### example
diff --git a/docs/zh/UserGuide/Operators-Functions/Aggregation.md b/docs/zh/UserGuide/Operators-Functions/Aggregation.md
index 995aef81fa..87177d6145 100644
--- a/docs/zh/UserGuide/Operators-Functions/Aggregation.md
+++ b/docs/zh/UserGuide/Operators-Functions/Aggregation.md
@@ -27,21 +27,21 @@
IoTDB 支持的聚合函数如下:
-| 函数名 | 功能描述 | 允许的输入类型 | 必要的属性参数 | 输出类型 |
-|---------------|-----------------------------------------------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|
-| SUM | 求和。 | INT32 INT64 FLOAT DOUBLE | 无 | DOUBLE |
-| COUNT | 计算数据点数。 | 所有类型 | 无 | INT64 |
-| AVG | 求平均值。 | INT32 INT64 FLOAT DOUBLE | 无 | DOUBLE |
-| EXTREME | 求具有最大绝对值的值。如果正值和负值的最大绝对值相等,则返回正值。 | INT32 INT64 FLOAT DOUBLE | 无 | 与输入类型一致 |
-| MAX_VALUE | 求最大值。 | INT32 INT64 FLOAT DOUBLE | 无 | 与输入类型一致 |
-| MIN_VALUE | 求最小值。 | INT32 INT64 FLOAT DOUBLE | 无 | 与输入类型一致 |
-| FIRST_VALUE | 求时间戳最小的值。 | 所有类型 | 无 | 与输入类型一致 |
-| LAST_VALUE | 求时间戳最大的值。 | 所有类型 | 无 | 与输入类型一致 |
-| MAX_TIME | 求最大时间戳。 | 所有类型 | 无 | Timestamp |
-| MIN_TIME | 求最小时间戳。 | 所有类型 | 无 | Timestamp |
-| COUNT_IF | 求数据点连续满足某一给定条件,且满足条件的数据点个数(用keep表示)满足指定阈值的次数。 | BOOLEAN | `[keep >=/>/=/!=/</<=]threshold`:被指定的阈值或阈值条件,若只使用`threshold`则等价于`keep >= threshold`,`threshold`类型为`INT64`<br/> `ignoreNull`:可选,默认为`true`;为`true`表示忽略null值,即如果中间出现null值,直接忽略,不会打断连续性;为`false`表示不忽略null值,即如果中间出现null值,会打断连续性 | INT64 |
-| TIME_DURATION | 求某一列最大一个不为NULL的值所在时间戳与最小一个不为NULL的值所在时间戳的时间戳差 | 所有类型 | 无 | INT64 |
-| MODE | 求众数。注意:输入序列的不同值个数过多时会有内存异常风险。 | 所有类型 | 无 | INT64 |
+| 函数名 | 功能描述 | 允许的输入类型 | 必要的属性参数 | 输出类型 |
+|---------------|-------------------------------------------------------------------------------------------------------------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|
+| SUM | 求和。 | INT32 INT64 FLOAT DOUBLE | 无 | DOUBLE |
+| COUNT | 计算数据点数。 | 所有类型 | 无 | INT64 |
+| AVG | 求平均值。 | INT32 INT64 FLOAT DOUBLE | 无 | DOUBLE |
+| EXTREME | 求具有最大绝对值的值。如果正值和负值的最大绝对值相等,则返回正值。 | INT32 INT64 FLOAT DOUBLE | 无 | 与输入类型一致 |
+| MAX_VALUE | 求最大值。 | INT32 INT64 FLOAT DOUBLE | 无 | 与输入类型一致 |
+| MIN_VALUE | 求最小值。 | INT32 INT64 FLOAT DOUBLE | 无 | 与输入类型一致 |
+| FIRST_VALUE | 求时间戳最小的值。 | 所有类型 | 无 | 与输入类型一致 |
+| LAST_VALUE | 求时间戳最大的值。 | 所有类型 | 无 | 与输入类型一致 |
+| MAX_TIME | 求最大时间戳。 | 所有类型 | 无 | Timestamp |
+| MIN_TIME | 求最小时间戳。 | 所有类型 | 无 | Timestamp |
+| COUNT_IF | 求数据点连续满足某一给定条件,且满足条件的数据点个数(用keep表示)满足指定阈值的次数。 | BOOLEAN | `[keep >=/>/=/!=/</<=]threshold`:被指定的阈值或阈值条件,若只使用`threshold`则等价于`keep >= threshold`,`threshold`类型为`INT64`<br/> `ignoreNull`:可选,默认为`true`;为`true`表示忽略null值,即如果中间出现null值,直接忽略,不会打断连续性;为`false`表示不忽略null值,即如果中间出现null值,会打断连续性 | INT64 |
+| TIME_DURATION | 求某一列最大一个不为NULL的值所在时间戳与最小一个不为NULL的值所在时间戳的时间戳差 | 所有类型 | 无 | INT64 |
+| MODE | 求众数。注意:</br>1.输入序列的不同值个数过多时会有内存异常风险; </br>2.如果所有元素出现的频次相同,即没有众数,则返回对应时间戳最小的值; </br>3.如果有多个众数,则返回对应时间戳最小的众数。 | 所有类型 | 无 | INT64 |
### COUNT_IF
#### 语法
diff --git a/integration-test/src/test/java/org/apache/iotdb/db/it/aggregation/IoTDBModeIT.java b/integration-test/src/test/java/org/apache/iotdb/db/it/aggregation/IoTDBModeIT.java
index 37e8883182..49bee98459 100644
--- a/integration-test/src/test/java/org/apache/iotdb/db/it/aggregation/IoTDBModeIT.java
+++ b/integration-test/src/test/java/org/apache/iotdb/db/it/aggregation/IoTDBModeIT.java
@@ -66,6 +66,13 @@ public class IoTDBModeIT {
"INSERT INTO root.db.d2(timestamp,s1,s2,s3,s4,s5,s6) values(10000000001, 2, 2, false, 2, 2, \"2\")",
"INSERT INTO root.db.d2(timestamp,s1,s2,s3,s4,s5,s6) values(10000000002, 2, 2, false, 2, 2, \"2\")",
"INSERT INTO root.db.d2(timestamp,s1,s2,s3,s4,s5,s6) values(10000000003, 2, 2, false, 2, 2, \"2\")",
+ "INSERT INTO root.test.d1(timestamp,s1) values(1, 5)",
+ "INSERT INTO root.test.d1(timestamp,s1) values(2, 5)",
+ "INSERT INTO root.test.d1(timestamp,s1) values(3, 2)",
+ "INSERT INTO root.test.d1(timestamp,s1) values(4, 2)",
+ "INSERT INTO root.test.d1(timestamp,s1) values(5, 8)",
+ "INSERT INTO root.test.d1(timestamp,s1) values(6, 8)",
+ "INSERT INTO root.test.d1(timestamp,s1) values(7, 1)",
"flush"
};
@@ -145,7 +152,7 @@ public class IoTDBModeIT {
};
String[] retArray = new String[] {"2,2,false,2.0,2.0,2,"};
resultSetEqualTest(
- "select mode(s1),mode(s2),mode(s3),mode(s4),mode(s5),mode(s6) from root.** group by level = 0",
+ "select mode(s1),mode(s2),mode(s3),mode(s4),mode(s5),mode(s6) from root.db.* group by level = 0",
expectedHeader,
retArray);
}
@@ -157,4 +164,19 @@ public class IoTDBModeIT {
TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()
+ ": MODE with slidingWindow is not supported now");
}
+
+ @Test
+ public void testNoMode() {
+ String[] expectedHeader = new String[] {mode("root.test.d1.s1")};
+ String[] retArray = new String[] {"5.0,"};
+ resultSetEqualTest(
+ "select mode(s1) from root.test.d1 where time <= 6", expectedHeader, retArray);
+ }
+
+ @Test
+ public void testManyModes() {
+ String[] expectedHeader = new String[] {mode("root.test.d1.s1")};
+ String[] retArray = new String[] {"5.0,"};
+ resultSetEqualTest("select mode(s1) from root.test.d1", expectedHeader, retArray);
+ }
}
diff --git a/server/src/main/codegen/templates/ModeAccumulator.ftl b/server/src/main/codegen/templates/ModeAccumulator.ftl
index 62175bf628..e81ae32d15 100644
--- a/server/src/main/codegen/templates/ModeAccumulator.ftl
+++ b/server/src/main/codegen/templates/ModeAccumulator.ftl
@@ -25,6 +25,8 @@
package org.apache.iotdb.db.mpp.aggregation;
+import com.google.common.collect.ImmutableList;
+import org.apache.commons.collections4.comparators.ComparatorChain;
import org.apache.iotdb.db.conf.IoTDBDescriptor;
import org.apache.iotdb.tsfile.file.metadata.enums.TSDataType;
import org.apache.iotdb.tsfile.file.metadata.statistics.Statistics;
@@ -32,6 +34,7 @@ import org.apache.iotdb.tsfile.read.common.block.column.Column;
import org.apache.iotdb.tsfile.read.common.block.column.ColumnBuilder;
import org.apache.iotdb.tsfile.utils.Binary;
import org.apache.iotdb.tsfile.utils.BitMap;
+import org.apache.iotdb.tsfile.utils.Pair;
import org.apache.iotdb.tsfile.utils.ReadWriteIOUtils;
import java.io.ByteArrayInputStream;
@@ -49,7 +52,8 @@ import static com.google.common.base.Preconditions.checkArgument;
*/
@SuppressWarnings("unused")
public class ${className} implements Accumulator {
- private final Map<${type.javaBoxName}, Long> countMap = new HashMap<>();
+ // pair left records count of element, pair right records min time of element
+ private final Map<${type.javaBoxName}, Pair<Long, Long>> countMap = new HashMap<>();
<#if type.dataType != "boolean">
private final int MAP_SIZE_THRESHOLD = IoTDBDescriptor.getInstance().getConfig().getModeMapSizeThreshold();
@@ -62,7 +66,11 @@ public class ${className} implements Accumulator {
continue;
}
if (!column[1].isNull(i)) {
- countMap.compute(column[1].get${type.dataType?cap_first}(i), (k, v) -> v == null ? 1 : v + 1);
+ final long time = column[0].getLong(i);
+ countMap.compute(
+ column[1].get${type.dataType?cap_first}(i),
+ (k, v) ->
+ v == null ? new Pair<>(1L, time) : new Pair<>(v.left + 1, Math.min(v.right, time)));
<#if type.dataType != "boolean">
if (countMap.size() > MAP_SIZE_THRESHOLD) {
@@ -96,7 +104,7 @@ public class ${className} implements Accumulator {
// Step of ModeAccumulator is STATIC,
// countMap only need to record one entry which key is finalResult
- countMap.put(finalResult.get${type.dataType?cap_first}(0), 0L);
+ countMap.put(finalResult.get${type.dataType?cap_first}(0), new Pair<>(0L, Long.MIN_VALUE));
}
@Override
@@ -110,7 +118,14 @@ public class ${className} implements Accumulator {
tsBlockBuilder.appendNull();
} else {
tsBlockBuilder.write${type.dataType?cap_first}(
- Collections.max(countMap.entrySet(), Map.Entry.comparingByValue()).getKey());
+ Collections.max(
+ countMap.entrySet(),
+ Map.Entry.comparingByValue(
+ new ComparatorChain<>(
+ ImmutableList.of(
+ (v1, v2) -> v1.left.compareTo(v2.left),
+ (v1, v2) -> v2.right.compareTo(v1.right)))))
+ .getKey());
}
}
@@ -138,9 +153,10 @@ public class ${className} implements Accumulator {
ByteArrayOutputStream stream = new ByteArrayOutputStream();
try {
ReadWriteIOUtils.write(countMap.size(), stream);
- for (Map.Entry<${type.javaBoxName}, Long> entry : countMap.entrySet()) {
+ for (Map.Entry<${type.javaBoxName}, Pair<Long, Long>> entry : countMap.entrySet()) {
ReadWriteIOUtils.write(entry.getKey(), stream);
- ReadWriteIOUtils.write(entry.getValue(), stream);
+ ReadWriteIOUtils.write(entry.getValue().left, stream);
+ ReadWriteIOUtils.write(entry.getValue().right, stream);
}
} catch (IOException e) {
// Totally memory operation. This case won't happen.
@@ -153,13 +169,20 @@ public class ${className} implements Accumulator {
try {
int size = ReadWriteIOUtils.readInt(stream);
for (int i = 0; i < size; i++) {
- countMap.compute(ReadWriteIOUtils.read${type.dataType?cap_first}(stream), (k, v) -> {
- try {
- return v == null ? ReadWriteIOUtils.readLong(stream) : v + ReadWriteIOUtils.readLong(stream);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- });
+ countMap.compute(
+ ReadWriteIOUtils.read${type.dataType?cap_first}(stream),
+ (k, v) -> {
+ try {
+ return v == null
+ ? new Pair<>(
+ ReadWriteIOUtils.readLong(stream), ReadWriteIOUtils.readLong(stream))
+ : new Pair<>(
+ v.left + ReadWriteIOUtils.readLong(stream),
+ Math.min(v.right, ReadWriteIOUtils.readLong(stream)));
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ });
<#if type.dataType != "boolean">
if (countMap.size() > MAP_SIZE_THRESHOLD) {