You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iotdb.apache.org by hu...@apache.org on 2022/10/31 08:44:17 UTC
[iotdb] branch research/timeseries-master-data updated: Research/timeseries master data (#7822)
This is an automated email from the ASF dual-hosted git repository.
hui pushed a commit to branch research/timeseries-master-data
in repository https://gitbox.apache.org/repos/asf/iotdb.git
The following commit(s) were added to refs/heads/research/timeseries-master-data by this push:
new b141678540 Research/timeseries master data (#7822)
b141678540 is described below
commit b14167854082a5d54dc4f855c41d92bb7a93b6f2
Author: Husaimawx <44...@users.noreply.github.com>
AuthorDate: Mon Oct 31 16:44:09 2022 +0800
Research/timeseries master data (#7822)
* add new udf function(MasterRepair)
* add docs of new udf function(MasterRepair)
* add udf accuracy
* add docs for udf accuracy
Co-authored-by: MasterRepair <11...@users.noreply.github.com>
---
docs/UserGuide/UDF-Library/Data-Quality.md | 57 ++++++++++++++++-
docs/zh/UserGuide/UDF-Library/Data-Quality.md | 57 ++++++++++++++++-
.../iotdb/library/dquality/UDTFAccuracy.java | 73 ++++++++++++++++++++++
.../iotdb/library/drepair/util/KDTreeUtil.java | 19 +-----
.../library/drepair/util/MasterRepairUtil.java | 21 ++++++-
5 files changed, 204 insertions(+), 23 deletions(-)
diff --git a/docs/UserGuide/UDF-Library/Data-Quality.md b/docs/UserGuide/UDF-Library/Data-Quality.md
index d82cd719d5..7db56dbd1f 100644
--- a/docs/UserGuide/UDF-Library/Data-Quality.md
+++ b/docs/UserGuide/UDF-Library/Data-Quality.md
@@ -516,4 +516,59 @@ Output series:
|2020-01-01T00:00:02.000+08:00| 0.8833333333333333|
|2020-01-01T00:00:32.000+08:00| 1.0|
+-----------------------------+----------------------------------------+
-```
\ No newline at end of file
+```
+
+## Accuracy
+
+### Usage
+
+This function is used to calculate the Accuracy of time series based on master data.
+
+**Name**: Accuracy
+
+**Input Series:** Support multiple input series. The types are INT32 / INT64 / FLOAT / DOUBLE.
+
+**Parameters:**
+
++ `omega`: The window size. It is a non-negative integer whose unit is millisecond. By default, it will be estimated according to the distances of two tuples with various time differences.
++ `eta`: The distance threshold. It is a positive number. By default, it will be estimated according to the distance distribution of tuples in windows.
++ `k`: The number of neighbors in master data. It is a positive integer. By default, it will be estimated according to the tuple distance of the k-th nearest neighbor in the master data.
+
+**Output Series**: Output a single value. The type is DOUBLE. The range is [0,1].
+
+### Examples
+
+Input series:
+
+```
++-----------------------------+------------+------------+------------+------------+------------+------------+
+| Time|root.test.t1|root.test.t2|root.test.t3|root.test.m1|root.test.m2|root.test.m3|
++-----------------------------+------------+------------+------------+------------+------------+------------+
+|2021-07-01T12:00:01.000+08:00| 1704| 1154.55| 0.195| 1704| 1154.55| 0.195|
+|2021-07-01T12:00:02.000+08:00| 1702| 1152.30| 0.193| 1702| 1152.30| 0.193|
+|2021-07-01T12:00:03.000+08:00| 1702| 1148.65| 0.192| 1702| 1148.65| 0.192|
+|2021-07-01T12:00:04.000+08:00| 1701| 1145.20| 0.194| 1701| 1145.20| 0.194|
+|2021-07-01T12:00:07.000+08:00| 1703| 1150.55| 0.195| 1703| 1150.55| 0.195|
+|2021-07-01T12:00:08.000+08:00| 1694| 1151.55| 0.193| 1704| 1151.55| 0.193|
+|2021-07-01T12:01:09.000+08:00| 1705| 1153.55| 0.194| 1705| 1153.55| 0.194|
+|2021-07-01T12:01:10.000+08:00| 1706| 1152.30| 0.190| 1706| 1152.30| 0.190|
++-----------------------------+------------+------------+------------+------------+------------+------------+
+```
+
+SQL for query:
+
+```sql
+select Accuracy(t1,t2,t3,m1,m2,m3) from root.test
+```
+
+Output series:
+
+
+```
++-----------------------------+---------------------------------------------------------------------------------------+
+| Time|Accuracy(root.test.t1,root.test.t2,root.test.t3,root.test.m1,root.test.m2,root.test.m3)|
++-----------------------------+---------------------------------------------------------------------------------------+
+|2021-07-01T12:00:01.000+08:00| 0.875|
++-----------------------------+---------------------------------------------------------------------------------------+
+```
+
diff --git a/docs/zh/UserGuide/UDF-Library/Data-Quality.md b/docs/zh/UserGuide/UDF-Library/Data-Quality.md
index 95b20fd9e7..5a82ee424e 100644
--- a/docs/zh/UserGuide/UDF-Library/Data-Quality.md
+++ b/docs/zh/UserGuide/UDF-Library/Data-Quality.md
@@ -521,4 +521,59 @@ select validity(s1,"window"="15") from root.test.d1 where time <= 2020-01-01 00:
|2020-01-01T00:00:02.000+08:00| 0.8833333333333333|
|2020-01-01T00:00:32.000+08:00| 1.0|
+-----------------------------+----------------------------------------+
-```
\ No newline at end of file
+```
+
+## Accuracy
+
+### 函数简介
+
+本函数基于主数据计算原始时间序列准确性。
+
+**函数名**:Accuracy
+
+**输入序列:** 支持多个输入序列,类型为 INT32 / INT64 / FLOAT / DOUBLE。
+
+**参数:**
+
+- `omega`:算法窗口大小,非负整数(单位为毫秒), 在缺省情况下,算法根据不同时间差下的两个元组距离自动估计该参数。
+- `eta`:算法距离阈值,正数, 在缺省情况下,算法根据窗口中元组的距离分布自动估计该参数。
+- `k`:主数据中的近邻数量,正整数, 在缺省情况下,算法根据主数据中的k个近邻的元组距离自动估计该参数。
+
+**输出序列**:输出单个值,类型为DOUBLE,值的范围为[0,1]。
+
+### 使用示例
+
+输入序列:
+
+```
++-----------------------------+------------+------------+------------+------------+------------+------------+
+| Time|root.test.t1|root.test.t2|root.test.t3|root.test.m1|root.test.m2|root.test.m3|
++-----------------------------+------------+------------+------------+------------+------------+------------+
+|2021-07-01T12:00:01.000+08:00| 1704| 1154.55| 0.195| 1704| 1154.55| 0.195|
+|2021-07-01T12:00:02.000+08:00| 1702| 1152.30| 0.193| 1702| 1152.30| 0.193|
+|2021-07-01T12:00:03.000+08:00| 1702| 1148.65| 0.192| 1702| 1148.65| 0.192|
+|2021-07-01T12:00:04.000+08:00| 1701| 1145.20| 0.194| 1701| 1145.20| 0.194|
+|2021-07-01T12:00:07.000+08:00| 1703| 1150.55| 0.195| 1703| 1150.55| 0.195|
+|2021-07-01T12:00:08.000+08:00| 1694| 1151.55| 0.193| 1704| 1151.55| 0.193|
+|2021-07-01T12:01:09.000+08:00| 1705| 1153.55| 0.194| 1705| 1153.55| 0.194|
+|2021-07-01T12:01:10.000+08:00| 1706| 1152.30| 0.190| 1706| 1152.30| 0.190|
++-----------------------------+------------+------------+------------+------------+------------+------------+
+```
+
+用于查询的 SQL 语句:
+
+```sql
+select Accuracy(t1,t2,t3,m1,m2,m3) from root.test
+```
+
+输出序列:
+
+
+```
++-----------------------------+---------------------------------------------------------------------------------------+
+| Time|Accuracy(root.test.t1,root.test.t2,root.test.t3,root.test.m1,root.test.m2,root.test.m3)|
++-----------------------------+---------------------------------------------------------------------------------------+
+|2021-07-01T12:00:01.000+08:00| 0.875|
++-----------------------------+---------------------------------------------------------------------------------------+
+```
+
diff --git a/library-udf/src/main/java/org/apache/iotdb/library/dquality/UDTFAccuracy.java b/library-udf/src/main/java/org/apache/iotdb/library/dquality/UDTFAccuracy.java
new file mode 100644
index 0000000000..70bda912e5
--- /dev/null
+++ b/library-udf/src/main/java/org/apache/iotdb/library/dquality/UDTFAccuracy.java
@@ -0,0 +1,73 @@
+package org.apache.iotdb.library.dquality;
+
+import org.apache.iotdb.library.drepair.util.MasterRepairUtil;
+import org.apache.iotdb.udf.api.UDTF;
+import org.apache.iotdb.udf.api.access.Row;
+import org.apache.iotdb.udf.api.collector.PointCollector;
+import org.apache.iotdb.udf.api.customizer.config.UDTFConfigurations;
+import org.apache.iotdb.udf.api.customizer.parameter.UDFParameterValidator;
+import org.apache.iotdb.udf.api.customizer.parameter.UDFParameters;
+import org.apache.iotdb.udf.api.customizer.strategy.RowByRowAccessStrategy;
+import org.apache.iotdb.udf.api.type.Type;
+
+/**
+ * UDTF that reports the accuracy of a multi-column time series measured against master data.
+ *
+ * <p>Every non-null input row is handed to a {@link MasterRepairUtil}. When the query ends the
+ * collected rows are repaired, and the fraction of rows that the repair left unchanged is emitted
+ * as a single DOUBLE point.
+ */
+public class UDTFAccuracy implements UDTF {
+  private MasterRepairUtil masterRepairUtil;
+
+  /**
+   * Checks the data types of the input series and the ranges of the optional attributes.
+   *
+   * @param validator gives access to the query parameters and performs the individual checks
+   * @throws Exception whatever the validator or the parameter accessors raise
+   */
+  @Override
+  public void validate(UDFParameterValidator validator) throws Exception {
+    UDFParameters parameters = validator.getParameters();
+    // Bound the loop by the number of input series. Bounding it by the number of key-value
+    // attributes would skip the check when no attribute is given and run past the last series
+    // when there are more attributes than series.
+    int seriesCnt = parameters.getDataTypes().size();
+    for (int i = 0; i < seriesCnt; i++) {
+      validator.validateInputSeriesDataType(i, Type.DOUBLE, Type.FLOAT, Type.INT32, Type.INT64);
+    }
+    if (parameters.hasAttribute("omega")) {
+      // Read omega as a long, the same width beforeStart uses, so that large window sizes are
+      // not rejected during validation.
+      validator.validate(
+          omega -> (long) omega >= 0,
+          "Parameter omega should be non-negative.",
+          parameters.getLong("omega"));
+    }
+    if (parameters.hasAttribute("eta")) {
+      validator.validate(
+          eta -> (double) eta > 0,
+          "Parameter eta should be larger than 0.",
+          parameters.getDouble("eta"));
+    }
+    if (parameters.hasAttribute("k")) {
+      validator.validate(
+          k -> (int) k > 0,
+          "Parameter k should be a positive integer.",
+          parameters.getInt("k"));
+    }
+    // NOTE(review): output_column is validated here but never read by this class - confirm
+    // whether it is still needed.
+    if (parameters.hasAttribute("output_column")) {
+      validator.validate(
+          output_column -> (int) output_column > 0,
+          "Parameter output_column should be a positive integer.",
+          parameters.getInt("output_column"));
+    }
+  }
+
+  /**
+   * Configures row-by-row access with a DOUBLE output and creates the repair helper.
+   *
+   * <p>Half of the input series count is passed to the helper as its column count. Attributes
+   * that are absent are passed as -1 (omega, k) or NaN (eta).
+   *
+   * @param parameters the query parameters
+   * @param configurations receives the access strategy and the output data type
+   * @throws Exception whatever the parameter accessors or the helper constructor raise
+   */
+  @Override
+  public void beforeStart(UDFParameters parameters, UDTFConfigurations configurations)
+      throws Exception {
+    configurations.setAccessStrategy(new RowByRowAccessStrategy());
+    configurations.setOutputDataType(Type.DOUBLE);
+    int columnCnt = parameters.getDataTypes().size() / 2;
+    long omega = parameters.getLongOrDefault("omega", -1);
+    double eta = parameters.getDoubleOrDefault("eta", Double.NaN);
+    int k = parameters.getIntOrDefault("k", -1);
+    masterRepairUtil = new MasterRepairUtil(columnCnt, omega, eta, k);
+  }
+
+  /**
+   * Passes the row to the repair helper unless the helper classifies it as a null row.
+   *
+   * @param row the current input row
+   * @param collector unused; the result is emitted in {@link #terminate(PointCollector)}
+   * @throws Exception whatever the helper raises
+   */
+  @Override
+  public void transform(Row row, PointCollector collector) throws Exception {
+    if (!masterRepairUtil.isNullRow(row)) {
+      masterRepairUtil.addRow(row);
+    }
+  }
+
+  /**
+   * Runs the repair and emits {@code (total - repaired) / total} as one point at timestamp 1.
+   *
+   * <p>Nothing is emitted when the helper reports zero rows, because the ratio would be NaN.
+   *
+   * @param collector receives the accuracy value
+   * @throws Exception whatever the helper or the collector raise
+   */
+  @Override
+  public void terminate(PointCollector collector) throws Exception {
+    masterRepairUtil.repair();
+    int repairedCnt = masterRepairUtil.getRepaired_cnt();
+    int totalCnt = masterRepairUtil.getTotal_cnt();
+    if (totalCnt == 0) {
+      // 0 / 0 in double arithmetic is NaN, which is not a meaningful accuracy value.
+      return;
+    }
+    // NOTE(review): the user guide example shows the result at the first input timestamp,
+    // whereas this writes it at timestamp 1 - confirm which one is intended.
+    collector.putDouble(1, (double) (totalCnt - repairedCnt) / totalCnt);
+  }
+}
diff --git a/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/KDTreeUtil.java b/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/KDTreeUtil.java
index 49f2c48da0..dd78398ba3 100644
--- a/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/KDTreeUtil.java
+++ b/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/KDTreeUtil.java
@@ -11,22 +11,13 @@ public class KDTreeUtil {
private Node kdtree;
private static class Node {
- // 分割的维度
int partitiondimension;
- // 分割的值
double partitionValue;
- // 如果为非叶子节点,该属性为空
- // 否则为数据
ArrayList<Double> value;
- // 是否为叶子
boolean isLeaf = false;
- // 左树
Node left;
- // 右树
Node right;
- // 每个维度的最小值
ArrayList<Double> min;
- // 每个维度的最大值
ArrayList<Double> max;
}
@@ -46,7 +37,6 @@ public class KDTreeUtil {
node.value = data.get(0);
return;
}
- // 选择方差最大的维度
node.partitiondimension = -1;
double var = -1;
double tmpvar;
@@ -57,14 +47,13 @@ public class KDTreeUtil {
node.partitiondimension = i;
}
}
- // 如果方差=0,表示所有数据都相同,判定为叶子节点
+
if (var == 0d) {
node.isLeaf = true;
node.value = data.get(0);
return;
}
- // 选择分割的值
node.partitionValue = UtilZ.median(data, node.partitiondimension);
ArrayList<ArrayList<Double>> maxmin = UtilZ.maxmin(data, dimensions);
@@ -167,11 +156,6 @@ public class KDTreeUtil {
nearest.add(node.value);
}
} else {
- /*
- * 得到该节点代表的超矩形中点到查找点的最小距离mindistance
- * 如果mindistance<distance表示有可能在这个节点的子节点上找到更近的点
- * 否则不可能找到
- */
double mindistance = UtilZ.mindistance(input, node.max, node.min, std);
if (mindistance < distance) {
while (!node.isLeaf) {
@@ -290,7 +274,6 @@ public class KDTreeUtil {
ArrayList<ArrayList<Double>> mm = new ArrayList<>();
ArrayList<Double> min_v = new ArrayList<>();
ArrayList<Double> max_v = new ArrayList<>();
- // 初始化 第一行为min,第二行为max
for (int i = 0; i < dimensions; i++) {
double min_temp = Double.MAX_VALUE;
double max_temp = Double.MIN_VALUE;
diff --git a/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/MasterRepairUtil.java b/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/MasterRepairUtil.java
index f99a81a731..6e054e2860 100644
--- a/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/MasterRepairUtil.java
+++ b/library-udf/src/main/java/org/apache/iotdb/library/drepair/util/MasterRepairUtil.java
@@ -18,6 +18,8 @@ public class MasterRepairUtil {
private int k;
private double[] std;
private KDTreeUtil kdTreeUtil;
+ private int repaired_cnt;
+ private int total_cnt;
public MasterRepairUtil(int columnCnt, long omega, double eta, int k) throws Exception {
this.columnCnt = columnCnt;
@@ -102,6 +104,8 @@ public class MasterRepairUtil {
for (int l = i - 1; l >= 0; l--) {
if (this.td_time.get(i) <= this.td_time.get(l) + omega) {
W_i.add(l);
+ } else {
+ break;
}
}
return W_i;
@@ -121,6 +125,8 @@ public class MasterRepairUtil {
}
public void master_repair() {
+ this.repaired_cnt = 0;
+ this.total_cnt = 0;
for (int i = 0; i < this.td.size(); i++) {
ArrayList<Double> tuple = this.td.get(i);
ArrayList<Integer> W_i = cal_W(i);
@@ -144,6 +150,10 @@ public class MasterRepairUtil {
}
}
}
+ if (!repair_tuple.toString().equals(tuple.toString())) {
+ repaired_cnt++;
+ }
+ total_cnt++;
this.td_cleaned.add(repair_tuple);
}
}
@@ -231,9 +241,6 @@ public class MasterRepairUtil {
buildKDTree();
call_std();
set_parameters();
- System.out.println(this.omega);
- System.out.println(this.eta);
- System.out.println(this.k);
master_repair();
}
@@ -245,6 +252,14 @@ public class MasterRepairUtil {
return intervals;
}
+ public int getRepaired_cnt() {
+ return repaired_cnt;
+ }
+
+ public int getTotal_cnt() {
+ return total_cnt;
+ }
+
public void fillNullValue() {
for (int i = 0; i < columnCnt; i++) {
double temp = this.td.get(0).get(i);