You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iotdb.apache.org by hu...@apache.org on 2022/10/31 08:44:17 UTC

[iotdb] branch research/timeseries-master-data updated: Research/timeseries master data (#7822)

This is an automated email from the ASF dual-hosted git repository.

hui pushed a commit to branch research/timeseries-master-data
in repository https://gitbox.apache.org/repos/asf/iotdb.git


The following commit(s) were added to refs/heads/research/timeseries-master-data by this push:
     new b141678540 Research/timeseries master data (#7822)
b141678540 is described below

commit b14167854082a5d54dc4f855c41d92bb7a93b6f2
Author: Husaimawx <44...@users.noreply.github.com>
AuthorDate: Mon Oct 31 16:44:09 2022 +0800

    Research/timeseries master data (#7822)
    
    * add new udf function(MasterRepair)
    
    * add docs of new udf function(MasterRepair)
    
    * add udf accuracy
    
    * add docs for udf accuracy
    
    Co-authored-by: MasterRepair <11...@users.noreply.github.com>
---
 docs/UserGuide/UDF-Library/Data-Quality.md         | 57 ++++++++++++++++-
 docs/zh/UserGuide/UDF-Library/Data-Quality.md      | 57 ++++++++++++++++-
 .../iotdb/library/dquality/UDTFAccuracy.java       | 73 ++++++++++++++++++++++
 .../iotdb/library/drepair/util/KDTreeUtil.java     | 19 +-----
 .../library/drepair/util/MasterRepairUtil.java     | 21 ++++++-
 5 files changed, 204 insertions(+), 23 deletions(-)

diff --git a/docs/UserGuide/UDF-Library/Data-Quality.md b/docs/UserGuide/UDF-Library/Data-Quality.md
index d82cd719d5..7db56dbd1f 100644
--- a/docs/UserGuide/UDF-Library/Data-Quality.md
+++ b/docs/UserGuide/UDF-Library/Data-Quality.md
@@ -516,4 +516,59 @@ Output series:
 |2020-01-01T00:00:02.000+08:00|                      0.8833333333333333|
 |2020-01-01T00:00:32.000+08:00|                                     1.0|
 +-----------------------------+----------------------------------------+
-```
\ No newline at end of file
+```
+
+## Accuracy
+
+### Usage
+
+This function is used to calculate the accuracy of time series based on master data.
+
+**Name**: Accuracy
+
+**Input Series:** Support multiple input series. The types are INT32 / INT64 / FLOAT / DOUBLE.
+
+**Parameters:**
+
++ `omega`: The window size. It is a non-negative integer whose unit is millisecond. By default, it will be estimated according to the distances of two tuples with various time differences.
++ `eta`: The distance threshold. It is a positive number. By default, it will be estimated according to the distance distribution of tuples in windows.
++ `k`: The number of neighbors in master data. It is a positive integer. By default, it will be estimated according to the tuple distance of the k-th nearest neighbor in the master data.
+
+**Output Series**: Output a single value. The type is DOUBLE. The range is [0,1].
+
+### Examples
+
+Input series:
+
+```
++-----------------------------+------------+------------+------------+------------+------------+------------+
+|                         Time|root.test.t1|root.test.t2|root.test.t3|root.test.m1|root.test.m2|root.test.m3|
++-----------------------------+------------+------------+------------+------------+------------+------------+
+|2021-07-01T12:00:01.000+08:00|        1704|     1154.55|       0.195|        1704|     1154.55|       0.195|
+|2021-07-01T12:00:02.000+08:00|        1702|     1152.30|       0.193|        1702|     1152.30|       0.193|
+|2021-07-01T12:00:03.000+08:00|        1702|     1148.65|       0.192|        1702|     1148.65|       0.192|
+|2021-07-01T12:00:04.000+08:00|        1701|     1145.20|       0.194|        1701|     1145.20|       0.194|
+|2021-07-01T12:00:07.000+08:00|        1703|     1150.55|       0.195|        1703|     1150.55|       0.195|
+|2021-07-01T12:00:08.000+08:00|        1694|     1151.55|       0.193|        1704|     1151.55|       0.193|
+|2021-07-01T12:01:09.000+08:00|        1705|     1153.55|       0.194|        1705|     1153.55|       0.194|
+|2021-07-01T12:01:10.000+08:00|        1706|     1152.30|       0.190|        1706|     1152.30|       0.190|
++-----------------------------+------------+------------+------------+------------+------------+------------+
+```
+
+SQL for query:
+
+```sql
+select Accuracy(t1,t2,t3,m1,m2,m3) from root.test
+```
+
+Output series:
+
+
+```
++-----------------------------+---------------------------------------------------------------------------------------+
+|                         Time|Accuracy(root.test.t1,root.test.t2,root.test.t3,root.test.m1,root.test.m2,root.test.m3)|
++-----------------------------+---------------------------------------------------------------------------------------+
+|2021-07-01T12:00:01.000+08:00|                                                                                  0.875|
++-----------------------------+---------------------------------------------------------------------------------------+
+```
+
diff --git a/docs/zh/UserGuide/UDF-Library/Data-Quality.md b/docs/zh/UserGuide/UDF-Library/Data-Quality.md
index 95b20fd9e7..5a82ee424e 100644
--- a/docs/zh/UserGuide/UDF-Library/Data-Quality.md
+++ b/docs/zh/UserGuide/UDF-Library/Data-Quality.md
@@ -521,4 +521,59 @@ select validity(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 00:
 |2020-01-01T00:00:02.000+08:00|                      0.8833333333333333|
 |2020-01-01T00:00:32.000+08:00|                                     1.0|
 +-----------------------------+----------------------------------------+
-```
\ No newline at end of file
+```
+
+## Accuracy
+
+### 函数简介
+
+本函数基于主数据计算原始时间序列准确性。
+
+**函数名**:Accuracy
+
+**输入序列:** 支持多个输入序列,类型为 INT32 / INT64 / FLOAT / DOUBLE。
+
+**参数:**
+
+- `omega`:算法窗口大小,非负整数(单位为毫秒), 在缺省情况下,算法根据不同时间差下的两个元组距离自动估计该参数。
+- `eta`:算法距离阈值,正数, 在缺省情况下,算法根据窗口中元组的距离分布自动估计该参数。
+- `k`:主数据中的近邻数量,正整数, 在缺省情况下,算法根据主数据中的k个近邻的元组距离自动估计该参数。
+
+**输出序列**:输出单个值,类型为DOUBLE,值的范围为[0,1]。
+
+### 使用示例
+
+输入序列:
+
+```
++-----------------------------+------------+------------+------------+------------+------------+------------+
+|                         Time|root.test.t1|root.test.t2|root.test.t3|root.test.m1|root.test.m2|root.test.m3|
++-----------------------------+------------+------------+------------+------------+------------+------------+
+|2021-07-01T12:00:01.000+08:00|        1704|     1154.55|       0.195|        1704|     1154.55|       0.195|
+|2021-07-01T12:00:02.000+08:00|        1702|     1152.30|       0.193|        1702|     1152.30|       0.193|
+|2021-07-01T12:00:03.000+08:00|        1702|     1148.65|       0.192|        1702|     1148.65|       0.192|
+|2021-07-01T12:00:04.000+08:00|        1701|     1145.20|       0.194|        1701|     1145.20|       0.194|
+|2021-07-01T12:00:07.000+08:00|        1703|     1150.55|       0.195|        1703|     1150.55|       0.195|
+|2021-07-01T12:00:08.000+08:00|        1694|     1151.55|       0.193|        1704|     1151.55|       0.193|
+|2021-07-01T12:01:09.000+08:00|        1705|     1153.55|       0.194|        1705|     1153.55|       0.194|
+|2021-07-01T12:01:10.000+08:00|        1706|     1152.30|       0.190|        1706|     1152.30|       0.190|
++-----------------------------+------------+------------+------------+------------+------------+------------+
+```
+
+用于查询的 SQL 语句:
+
+```sql
+select Accuracy(t1,t2,t3,m1,m2,m3) from root.test
+```
+
+输出序列:
+
+
+```
++-----------------------------+---------------------------------------------------------------------------------------+
+|                         Time|Accuracy(root.test.t1,root.test.t2,root.test.t3,root.test.m1,root.test.m2,root.test.m3)|
++-----------------------------+---------------------------------------------------------------------------------------+
+|2021-07-01T12:00:01.000+08:00|                                                                                  0.875|
++-----------------------------+---------------------------------------------------------------------------------------+
+```
+
diff --git a/library-udf/src/main/java/org/apache/iotdb/library/dquality/UDTFAccuracy.java b/library-udf/src/main/java/org/apache/iotdb/library/dquality/UDTFAccuracy.java
new file mode 100644
index 0000000000..70bda912e5
--- /dev/null
+++ b/library-udf/src/main/java/org/apache/iotdb/library/dquality/UDTFAccuracy.java
@@ -0,0 +1,73 @@
+package org.apache.iotdb.library.dquality;
+
+import org.apache.iotdb.library.drepair.util.MasterRepairUtil;
+import org.apache.iotdb.udf.api.UDTF;
+import org.apache.iotdb.udf.api.access.Row;
+import org.apache.iotdb.udf.api.collector.PointCollector;
+import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations;
+import org.apache.iotdb.udf.api.customizer.parameter.UDFParameterValidator;
+import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters;
+import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy;
+import org.apache.iotdb.udf.api.type.Type;
+
+public class UDTFAccuracy implements UDTF {
+  private MasterRepairUtil masterRepairUtil;
+
+  @Override
+  public void validate(UDFParameterValidator validator) throws Exception {
+    for (int i = 0; i < validator.getParameters().getAttributes().size(); i++) {
+      validator.validateInputSeriesDataType(i, Type.DOUBLE, Type.FLOAT, Type.INT32, Type.INT64);
+    }
+    if (validator.getParameters().hasAttribute("omega")) {
+      validator.validate(
+          omega -> (int) omega >= 0,
+          "Parameter omega should be non-negative.",
+          validator.getParameters().getInt("omega"));
+    }
+    if (validator.getParameters().hasAttribute("eta")) {
+      validator.validate(
+          eta -> (double) eta > 0,
+          "Parameter eta should be larger than 0.",
+          validator.getParameters().getDouble("eta"));
+    }
+    if (validator.getParameters().hasAttribute("k")) {
+      validator.validate(
+          k -> (int) k > 0,
+          "Parameter k should be a positive integer.",
+          validator.getParameters().getInt("k"));
+    }
+    if (validator.getParameters().hasAttribute("output_column")) {
+      validator.validate(
+          output_column -> (int) output_column > 0,
+          "Parameter output_column should be a positive integer.",
+          validator.getParameters().getInt("output_column"));
+    }
+  }
+
+  @Override
+  public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations)
+      throws Exception {
+    configurations.setAccessStrategy(new RowByRowAccessStrategy());
+    configurations.setOutputDataType(Type.DOUBLE);
+    int columnCnt = parameters.getDataTypes().size() / 2;
+    long omega = parameters.getLongOrDefault("omega", -1);
+    double eta = parameters.getDoubleOrDefault("eta", Double.NaN);
+    int k = parameters.getIntOrDefault("k", -1);
+    masterRepairUtil = new MasterRepairUtil(columnCnt, omega, eta, k);
+  }
+
+  @Override
+  public void transform(Row row, PointCollector collector) throws Exception {
+    if (!masterRepairUtil.isNullRow(row)) {
+      masterRepairUtil.addRow(row);
+    }
+  }
+
+  @Override
+  public void terminate(PointCollector collector) throws Exception {
+    masterRepairUtil.repair();
+    int repaired_cnt = masterRepairUtil.getRepaired_cnt();
+    int total_cnt = masterRepairUtil.getTotal_cnt();
+    collector.putDouble(1, (double) (total_cnt - repaired_cnt) / total_cnt);
+  }
+}
diff --git a/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/KDTreeUtil.java b/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/KDTreeUtil.java
index 49f2c48da0..dd78398ba3 100644
--- a/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/KDTreeUtil.java
+++ b/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/KDTreeUtil.java
@@ -11,22 +11,13 @@ public class KDTreeUtil {
   private Node kdtree;
 
   private static class Node {
-    // 分割的维度
     int partitiondimension;
-    // 分割的值
     double partitionValue;
-    // 如果为非叶子节点,该属性为空
-    // 否则为数据
     ArrayList<Double> value;
-    // 是否为叶子
     boolean isLeaf = false;
-    // 左树
     Node left;
-    // 右树
     Node right;
-    // 每个维度的最小值
     ArrayList<Double> min;
-    // 每个维度的最大值
     ArrayList<Double> max;
   }
 
@@ -46,7 +37,6 @@ public class KDTreeUtil {
       node.value = data.get(0);
       return;
     }
-    // 选择方差最大的维度
     node.partitiondimension = -1;
     double var = -1;
     double tmpvar;
@@ -57,14 +47,13 @@ public class KDTreeUtil {
         node.partitiondimension = i;
       }
     }
-    // 如果方差=0,表示所有数据都相同,判定为叶子节点
+
     if (var == 0d) {
       node.isLeaf = true;
       node.value = data.get(0);
       return;
     }
 
-    // 选择分割的值
     node.partitionValue = UtilZ.median(data, node.partitiondimension);
 
     ArrayList<ArrayList<Double>> maxmin = UtilZ.maxmin(data, dimensions);
@@ -167,11 +156,6 @@ public class KDTreeUtil {
           nearest.add(node.value);
         }
       } else {
-        /*
-         * 得到该节点代表的超矩形中点到查找点的最小距离mindistance
-         * 如果mindistance<distance表示有可能在这个节点的子节点上找到更近的点
-         * 否则不可能找到
-         */
         double mindistance = UtilZ.mindistance(input, node.max, node.min, std);
         if (mindistance < distance) {
           while (!node.isLeaf) {
@@ -290,7 +274,6 @@ public class KDTreeUtil {
       ArrayList<ArrayList<Double>> mm = new ArrayList<>();
       ArrayList<Double> min_v = new ArrayList<>();
       ArrayList<Double> max_v = new ArrayList<>();
-      // 初始化 第一行为min,第二行为max
       for (int i = 0; i < dimensions; i++) {
         double min_temp = Double.MAX_VALUE;
         double max_temp = Double.MIN_VALUE;
diff --git a/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/MasterRepairUtil.java b/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/MasterRepairUtil.java
index f99a81a731..6e054e2860 100644
--- a/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/MasterRepairUtil.java
+++ b/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/MasterRepairUtil.java
@@ -18,6 +18,8 @@ public class MasterRepairUtil {
   private int k;
   private double[] std;
   private KDTreeUtil kdTreeUtil;
+  private int repaired_cnt;
+  private int total_cnt;
 
   public MasterRepairUtil(int columnCnt, long omega, double eta, int k) throws Exception {
     this.columnCnt = columnCnt;
@@ -102,6 +104,8 @@ public class MasterRepairUtil {
     for (int l = i - 1; l >= 0; l--) {
       if (this.td_time.get(i) <= this.td_time.get(l) + omega) {
         W_i.add(l);
+      } else {
+        break;
       }
     }
     return W_i;
@@ -121,6 +125,8 @@ public class MasterRepairUtil {
   }
 
   public void master_repair() {
+    this.repaired_cnt = 0;
+    this.total_cnt = 0;
     for (int i = 0; i < this.td.size(); i++) {
       ArrayList<Double> tuple = this.td.get(i);
       ArrayList<Integer> W_i = cal_W(i);
@@ -144,6 +150,10 @@ public class MasterRepairUtil {
           }
         }
       }
+      if (!repair_tuple.toString().equals(tuple.toString())) {
+        repaired_cnt++;
+      }
+      total_cnt++;
       this.td_cleaned.add(repair_tuple);
     }
   }
@@ -231,9 +241,6 @@ public class MasterRepairUtil {
     buildKDTree();
     call_std();
     set_parameters();
-    System.out.println(this.omega);
-    System.out.println(this.eta);
-    System.out.println(this.k);
     master_repair();
   }
 
@@ -245,6 +252,14 @@ public class MasterRepairUtil {
     return intervals;
   }
 
+  public int getRepaired_cnt() {
+    return repaired_cnt;
+  }
+
+  public int getTotal_cnt() {
+    return total_cnt;
+  }
+
   public void fillNullValue() {
     for (int i = 0; i < columnCnt; i++) {
       double temp = this.td.get(0).get(i);