You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iotdb.apache.org by ja...@apache.org on 2023/03/29 08:50:07 UTC

[iotdb] branch master updated: [IOTDB-5683] Add special case process for aggregation function Mode

This is an automated email from the ASF dual-hosted git repository.

jackietien pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git


The following commit(s) were added to refs/heads/master by this push:
     new af8f6afaad [IOTDB-5683] Add special case process for aggregation function Mode
af8f6afaad is described below

commit af8f6afaad222640c69df887f7997f31f8a2aee9
Author: Weihao Li <60...@users.noreply.github.com>
AuthorDate: Wed Mar 29 16:49:58 2023 +0800

    [IOTDB-5683] Add special case process for aggregation function Mode
---
 docs/UserGuide/Operators-Functions/Aggregation.md  | 30 ++++++-------
 .../UserGuide/Operators-Functions/Aggregation.md   | 30 ++++++-------
 .../iotdb/db/it/aggregation/IoTDBModeIT.java       | 24 ++++++++++-
 .../src/main/codegen/templates/ModeAccumulator.ftl | 49 ++++++++++++++++------
 4 files changed, 89 insertions(+), 44 deletions(-)

diff --git a/docs/UserGuide/Operators-Functions/Aggregation.md b/docs/UserGuide/Operators-Functions/Aggregation.md
index a6c0e07126..1a73578784 100644
--- a/docs/UserGuide/Operators-Functions/Aggregation.md
+++ b/docs/UserGuide/Operators-Functions/Aggregation.md
@@ -27,21 +27,21 @@ All aggregate functions except `COUNT()`, `COUNT_IF()` ignore null values and re
 
 The aggregate functions supported by IoTDB are as follows:
 
-| Function Name | Function Description                                                                                                                                 | Allowed Input Data Types | required attributes                                                                                                                                                                                                                                                                                      [...]
-|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------| ------------------------ |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- [...]
-| SUM           | Summation.                                                                                                                                           | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                                                                                                                                                                       [...]
-| COUNT         | Counts the number of data points.                                                                                                                    | All types                | No                                                                                                                                                                                                                                                                                                       [...]
-| AVG           | Average.                                                                                                                                             | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                                                                                                                                                                       [...]
-| EXTREME       | Finds the value with the largest absolute value. Returns a positive value if the maximum absolute value of positive and negative values is equal.    | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                                                                                                                                                                       [...]
-| MAX_VALUE     | Find the maximum value.                                                                                                                              | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                                                                                                                                                                       [...]
-| MIN_VALUE     | Find the minimum value.                                                                                                                              | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                                                                                                                                                                       [...]
-| FIRST_VALUE   | Find the value with the smallest timestamp.                                                                                                          | All data types           | No                                                                                                                                                                                                                                                                                                       [...]
-| LAST_VALUE    | Find the value with the largest timestamp.                                                                                                           | All data types           | No                                                                                                                                                                                                                                                                                                       [...]
-| MAX_TIME      | Find the maximum timestamp.                                                                                                                          | All data Types           | No                                                                                                                                                                                                                                                                                                       [...]
-| MIN_TIME      | Find the minimum timestamp.                                                                                                                          | All data Types           | No                                                                                                                                                                                                                                                                                                       [...]
-| COUNT_IF      | Find the number of data points that continuously meet a given condition and the number of data points that meet the condition (represented by keep) meet the specified threshold. | BOOLEAN                  | `[keep >=/>/=/!=/</<=]threshold`:The specified threshold or threshold condition, it is equivalent to `keep >= threshold` if `threshold` is used alone, type of `threshold` is `INT64`<br/> `ignoreNull`:Optional, default value is `true`;If the value is `true`, null valu [...]
-| TIME_DURATION | Find the difference between the timestamp of the largest non-null value and the timestamp of the smallest non-null value in a column               |  All data Types   | No                                                                                                                                                                                                                                                                                                                [...]
-|  MODE         | Find the mode. Note: Having too many different values in the input series risks a memory exception.                                                                               | All data Types                     | No                                                                                                                                                                                                                                                                [...]
+| Function Name | Function Description                                                                                                                                                                                                                                                                                 | Allowed Input Data Types | required attributes                                                                                                                                      [...]
+|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------------------------ |--------------------------------------------------------------------------------------------------------------------------------------------------------- [...]
+| SUM           | Summation.                                                                                                                                                                                                                                                                                           | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                       [...]
+| COUNT         | Counts the number of data points.                                                                                                                                                                                                                                                                    | All types                | No                                                                                                                                                       [...]
+| AVG           | Average.                                                                                                                                                                                                                                                                                             | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                       [...]
+| EXTREME       | Finds the value with the largest absolute value. Returns a positive value if the maximum absolute value of positive and negative values is equal.                                                                                                                                                    | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                       [...]
+| MAX_VALUE     | Find the maximum value.                                                                                                                                                                                                                                                                              | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                       [...]
+| MIN_VALUE     | Find the minimum value.                                                                                                                                                                                                                                                                              | INT32 INT64 FLOAT DOUBLE | No                                                                                                                                                       [...]
+| FIRST_VALUE   | Find the value with the smallest timestamp.                                                                                                                                                                                                                                                          | All data types           | No                                                                                                                                                       [...]
+| LAST_VALUE    | Find the value with the largest timestamp.                                                                                                                                                                                                                                                           | All data types           | No                                                                                                                                                       [...]
+| MAX_TIME      | Find the maximum timestamp.                                                                                                                                                                                                                                                                          | All data Types           | No                                                                                                                                                       [...]
+| MIN_TIME      | Find the minimum timestamp.                                                                                                                                                                                                                                                                          | All data Types           | No                                                                                                                                                       [...]
+| COUNT_IF      | Find the number of data points that continuously meet a given condition and the number of data points that meet the condition (represented by keep) meet the specified threshold.                                                                                                                    | BOOLEAN                  | `[keep >=/>/=/!=/</<=]threshold`:The specified threshold or threshold condition, it is equivalent to `keep >= threshold` if `threshold` is used alone, t [...]
+| TIME_DURATION | Find the difference between the timestamp of the largest non-null value and the timestamp of the smallest non-null value in a column                                                                                                                                                                 |  All data Types   | No                                                                                                                                                              [...]
+|  MODE         | Find the mode. Note: </br>1.Having too many different values in the input series risks a memory exception; </br>2.If all the elements have the same number of occurrences, that is no Mode, return the value with earliest time; </br>3.If there are many Modes, return the Mode with earliest time. | All data Types                     | No                                                                                                                                             [...]
 ## COUNT
 
 ### example
diff --git a/docs/zh/UserGuide/Operators-Functions/Aggregation.md b/docs/zh/UserGuide/Operators-Functions/Aggregation.md
index 995aef81fa..87177d6145 100644
--- a/docs/zh/UserGuide/Operators-Functions/Aggregation.md
+++ b/docs/zh/UserGuide/Operators-Functions/Aggregation.md
@@ -27,21 +27,21 @@
 
 IoTDB 支持的聚合函数如下:
 
-| 函数名           | 功能描述                                          | 允许的输入类型                  | 必要的属性参数                                                                                                                                                                                                                    | 输出类型      |
-|---------------|-----------------------------------------------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|
-| SUM           | 求和。                                           | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | DOUBLE    |
-| COUNT         | 计算数据点数。                                       | 所有类型                     | 无                                                                                                                                                                                                                          | INT64     |
-| AVG           | 求平均值。                                         | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | DOUBLE    |
-| EXTREME       | 求具有最大绝对值的值。如果正值和负值的最大绝对值相等,则返回正值。             | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | 与输入类型一致   |
-| MAX_VALUE     | 求最大值。                                         | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | 与输入类型一致   |
-| MIN_VALUE     | 求最小值。                                         | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | 与输入类型一致   |
-| FIRST_VALUE   | 求时间戳最小的值。                                     | 所有类型                     | 无                                                                                                                                                                                                                          | 与输入类型一致   |
-| LAST_VALUE    | 求时间戳最大的值。                                     | 所有类型                     | 无                                                                                                                                                                                                                          | 与输入类型一致   |
-| MAX_TIME      | 求最大时间戳。                                       | 所有类型                     | 无                                                                                                                                                                                                                          | Timestamp |
-| MIN_TIME      | 求最小时间戳。                                       | 所有类型                     | 无                                                                                                                                                                                                                          | Timestamp |
-| COUNT_IF      | 求数据点连续满足某一给定条件,且满足条件的数据点个数(用keep表示)满足指定阈值的次数。 | BOOLEAN                  | `[keep >=/>/=/!=/</<=]threshold`:被指定的阈值或阈值条件,若只使用`threshold`则等价于`keep >= threshold`,`threshold`类型为`INT64`<br/> `ignoreNull`:可选,默认为`true`;为`true`表示忽略null值,即如果中间出现null值,直接忽略,不会打断连续性;为`false`表示不忽略null值,即如果中间出现null值,会打断连续性 | INT64     |
-| TIME_DURATION | 求某一列最大一个不为NULL的值所在时间戳与最小一个不为NULL的值所在时间戳的时间戳差  | 所有类型                     | 无                                                                                                                                                                                                                          |   INT64        |
-| MODE          | 求众数。注意:输入序列的不同值个数过多时会有内存异常风险。                 | 所有类型                     | 无                                                                                                                                                                                                                          | INT64     |
+| 函数名           | 功能描述                                                                                                        | 允许的输入类型                  | 必要的属性参数                                                                                                                                                                                                                    | 输出类型      |
+|---------------|-------------------------------------------------------------------------------------------------------------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------|
+| SUM           | 求和。                                                                                                         | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | DOUBLE    |
+| COUNT         | 计算数据点数。                                                                                                     | 所有类型                     | 无                                                                                                                                                                                                                          | INT64     |
+| AVG           | 求平均值。                                                                                                       | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | DOUBLE    |
+| EXTREME       | 求具有最大绝对值的值。如果正值和负值的最大绝对值相等,则返回正值。                                                                           | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | 与输入类型一致   |
+| MAX_VALUE     | 求最大值。                                                                                                       | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | 与输入类型一致   |
+| MIN_VALUE     | 求最小值。                                                                                                       | INT32 INT64 FLOAT DOUBLE | 无                                                                                                                                                                                                                          | 与输入类型一致   |
+| FIRST_VALUE   | 求时间戳最小的值。                                                                                                   | 所有类型                     | 无                                                                                                                                                                                                                          | 与输入类型一致   |
+| LAST_VALUE    | 求时间戳最大的值。                                                                                                   | 所有类型                     | 无                                                                                                                                                                                                                          | 与输入类型一致   |
+| MAX_TIME      | 求最大时间戳。                                                                                                     | 所有类型                     | 无                                                                                                                                                                                                                          | Timestamp |
+| MIN_TIME      | 求最小时间戳。                                                                                                     | 所有类型                     | 无                                                                                                                                                                                                                          | Timestamp |
+| COUNT_IF      | 求数据点连续满足某一给定条件,且满足条件的数据点个数(用keep表示)满足指定阈值的次数。                                                               | BOOLEAN                  | `[keep >=/>/=/!=/</<=]threshold`:被指定的阈值或阈值条件,若只使用`threshold`则等价于`keep >= threshold`,`threshold`类型为`INT64`<br/> `ignoreNull`:可选,默认为`true`;为`true`表示忽略null值,即如果中间出现null值,直接忽略,不会打断连续性;为`false`表示不忽略null值,即如果中间出现null值,会打断连续性 | INT64     |
+| TIME_DURATION | 求某一列最大一个不为NULL的值所在时间戳与最小一个不为NULL的值所在时间戳的时间戳差                                                                | 所有类型                     | 无                                                                                                                                                                                                                          |   INT64        |
+| MODE          | 求众数。注意:</br>1.输入序列的不同值个数过多时会有内存异常风险; </br>2.如果所有元素出现的频次相同,即没有众数,则返回对应时间戳最小的值; </br>3.如果有多个众数,则返回对应时间戳最小的众数。 | 所有类型                     | 无                                                                                                                                                                                                                          | INT64     |
 ### COUNT_IF
 
 #### 语法
diff --git a/integration-test/src/test/java/org/apache/iotdb/db/it/aggregation/IoTDBModeIT.java b/integration-test/src/test/java/org/apache/iotdb/db/it/aggregation/IoTDBModeIT.java
index 37e8883182..49bee98459 100644
--- a/integration-test/src/test/java/org/apache/iotdb/db/it/aggregation/IoTDBModeIT.java
+++ b/integration-test/src/test/java/org/apache/iotdb/db/it/aggregation/IoTDBModeIT.java
@@ -66,6 +66,13 @@ public class IoTDBModeIT {
         "INSERT INTO root.db.d2(timestamp,s1,s2,s3,s4,s5,s6) values(10000000001, 2, 2, false, 2, 2, \"2\")",
         "INSERT INTO root.db.d2(timestamp,s1,s2,s3,s4,s5,s6) values(10000000002, 2, 2, false, 2, 2, \"2\")",
         "INSERT INTO root.db.d2(timestamp,s1,s2,s3,s4,s5,s6) values(10000000003, 2, 2, false, 2, 2, \"2\")",
+        "INSERT INTO root.test.d1(timestamp,s1) values(1, 5)",
+        "INSERT INTO root.test.d1(timestamp,s1) values(2, 5)",
+        "INSERT INTO root.test.d1(timestamp,s1) values(3, 2)",
+        "INSERT INTO root.test.d1(timestamp,s1) values(4, 2)",
+        "INSERT INTO root.test.d1(timestamp,s1) values(5, 8)",
+        "INSERT INTO root.test.d1(timestamp,s1) values(6, 8)",
+        "INSERT INTO root.test.d1(timestamp,s1) values(7, 1)",
         "flush"
       };
 
@@ -145,7 +152,7 @@ public class IoTDBModeIT {
         };
     String[] retArray = new String[] {"2,2,false,2.0,2.0,2,"};
     resultSetEqualTest(
-        "select mode(s1),mode(s2),mode(s3),mode(s4),mode(s5),mode(s6) from root.** group by level = 0",
+        "select mode(s1),mode(s2),mode(s3),mode(s4),mode(s5),mode(s6) from root.db.* group by level = 0",
         expectedHeader,
         retArray);
   }
@@ -157,4 +164,19 @@ public class IoTDBModeIT {
         TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()
             + ": MODE with slidingWindow is not supported now");
   }
+
+  @Test
+  public void testNoMode() {
+    String[] expectedHeader = new String[] {mode("root.test.d1.s1")};
+    String[] retArray = new String[] {"5.0,"};
+    resultSetEqualTest(
+        "select mode(s1) from root.test.d1 where time <= 6", expectedHeader, retArray);
+  }
+
+  @Test
+  public void testManyModes() {
+    String[] expectedHeader = new String[] {mode("root.test.d1.s1")};
+    String[] retArray = new String[] {"5.0,"};
+    resultSetEqualTest("select mode(s1) from root.test.d1", expectedHeader, retArray);
+  }
 }
diff --git a/server/src/main/codegen/templates/ModeAccumulator.ftl b/server/src/main/codegen/templates/ModeAccumulator.ftl
index 62175bf628..e81ae32d15 100644
--- a/server/src/main/codegen/templates/ModeAccumulator.ftl
+++ b/server/src/main/codegen/templates/ModeAccumulator.ftl
@@ -25,6 +25,8 @@
 
 package org.apache.iotdb.db.mpp.aggregation;
 
+import com.google.common.collect.ImmutableList;
+import org.apache.commons.collections4.comparators.ComparatorChain;
 import org.apache.iotdb.db.conf.IoTDBDescriptor;
 import org.apache.iotdb.tsfile.file.metadata.enums.TSDataType;
 import org.apache.iotdb.tsfile.file.metadata.statistics.Statistics;
@@ -32,6 +34,7 @@ import org.apache.iotdb.tsfile.read.common.block.column.Column;
 import org.apache.iotdb.tsfile.read.common.block.column.ColumnBuilder;
 import org.apache.iotdb.tsfile.utils.Binary;
 import org.apache.iotdb.tsfile.utils.BitMap;
+import org.apache.iotdb.tsfile.utils.Pair;
 import org.apache.iotdb.tsfile.utils.ReadWriteIOUtils;
 
 import java.io.ByteArrayInputStream;
@@ -49,7 +52,8 @@ import static com.google.common.base.Preconditions.checkArgument;
 */
 @SuppressWarnings("unused")
 public class ${className} implements Accumulator {
-  private final Map<${type.javaBoxName}, Long> countMap = new HashMap<>();
+  // pair left records count of element, pair right records min time of element
+  private final Map<${type.javaBoxName}, Pair<Long, Long>> countMap = new HashMap<>();
 
   <#if type.dataType != "boolean">
   private final int MAP_SIZE_THRESHOLD = IoTDBDescriptor.getInstance().getConfig().getModeMapSizeThreshold();
@@ -62,7 +66,11 @@ public class ${className} implements Accumulator {
         continue;
       }
       if (!column[1].isNull(i)) {
-        countMap.compute(column[1].get${type.dataType?cap_first}(i), (k, v) -> v == null ? 1 : v + 1);
+          final long time = column[0].getLong(i);
+        countMap.compute(
+            column[1].get${type.dataType?cap_first}(i),
+            (k, v) ->
+                v == null ? new Pair<>(1L, time) : new Pair<>(v.left + 1, Math.min(v.right, time)));
         <#if type.dataType != "boolean">
 
         if (countMap.size() > MAP_SIZE_THRESHOLD) {
@@ -96,7 +104,7 @@ public class ${className} implements Accumulator {
 
     // Step of ModeAccumulator is STATIC,
     // countMap only need to record one entry which key is finalResult
-    countMap.put(finalResult.get${type.dataType?cap_first}(0), 0L);
+    countMap.put(finalResult.get${type.dataType?cap_first}(0), new Pair<>(0L, Long.MIN_VALUE));
   }
 
   @Override
@@ -110,7 +118,14 @@ public class ${className} implements Accumulator {
       tsBlockBuilder.appendNull();
     } else {
       tsBlockBuilder.write${type.dataType?cap_first}(
-          Collections.max(countMap.entrySet(), Map.Entry.comparingByValue()).getKey());
+          Collections.max(
+                  countMap.entrySet(),
+                  Map.Entry.comparingByValue(
+                      new ComparatorChain<>(
+                          ImmutableList.of(
+                              (v1, v2) -> v1.left.compareTo(v2.left),
+                              (v1, v2) -> v2.right.compareTo(v1.right)))))
+              .getKey());
     }
   }
 
@@ -138,9 +153,10 @@ public class ${className} implements Accumulator {
     ByteArrayOutputStream stream = new ByteArrayOutputStream();
     try {
       ReadWriteIOUtils.write(countMap.size(), stream);
-      for (Map.Entry<${type.javaBoxName}, Long> entry : countMap.entrySet()) {
+      for (Map.Entry<${type.javaBoxName}, Pair<Long, Long>> entry : countMap.entrySet()) {
         ReadWriteIOUtils.write(entry.getKey(), stream);
-        ReadWriteIOUtils.write(entry.getValue(), stream);
+        ReadWriteIOUtils.write(entry.getValue().left, stream);
+        ReadWriteIOUtils.write(entry.getValue().right, stream);
       }
     } catch (IOException e) {
       // Totally memory operation. This case won't happen.
@@ -153,13 +169,20 @@ public class ${className} implements Accumulator {
     try {
       int size = ReadWriteIOUtils.readInt(stream);
       for (int i = 0; i < size; i++) {
-        countMap.compute(ReadWriteIOUtils.read${type.dataType?cap_first}(stream), (k, v) -> {
-          try {
-            return v == null ? ReadWriteIOUtils.readLong(stream) : v + ReadWriteIOUtils.readLong(stream);
-          } catch (IOException e) {
-            throw new RuntimeException(e);
-          }
-        });
+        countMap.compute(
+            ReadWriteIOUtils.read${type.dataType?cap_first}(stream),
+            (k, v) -> {
+              try {
+                return v == null
+                    ? new Pair<>(
+                        ReadWriteIOUtils.readLong(stream), ReadWriteIOUtils.readLong(stream))
+                    : new Pair<>(
+                        v.left + ReadWriteIOUtils.readLong(stream),
+                        Math.min(v.right, ReadWriteIOUtils.readLong(stream)));
+              } catch (IOException e) {
+                throw new RuntimeException(e);
+              }
+            });
         <#if type.dataType != "boolean">
 
         if (countMap.size() > MAP_SIZE_THRESHOLD) {