You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@dolphinscheduler.apache.org by GitBox <gi...@apache.org> on 2019/12/05 04:09:50 UTC

[GitHub] [incubator-dolphinscheduler] chongchongzi opened a new pull request #1396: [FEATURE][#1306]Data quality inspection component(数据质量检测组件)

chongchongzi opened a new pull request #1396: [FEATURE][#1306]Data quality inspection component(数据质量检测组件)
URL: https://github.com/apache/incubator-dolphinscheduler/pull/1396
 
 
   **一、创建测试表**
   CREATE TABLE `measure` (
     `id` bigint(20) NOT NULL COMMENT '主键',
     `a_varchar` varchar(30) CHARACTER SET utf8 DEFAULT NULL COMMENT 'varchar类型',
     `a_int` int(11) DEFAULT NULL COMMENT 'int类型',
     `a_integer` int(11) DEFAULT NULL COMMENT 'integer类型',
     `a_double` double(11,2) DEFAULT NULL COMMENT 'double类型',
     `a_float` float(11,1) DEFAULT NULL COMMENT 'float类型',
     `a_char` char(30) CHARACTER SET utf8 DEFAULT NULL COMMENT 'char类型',
     `a_decimal` decimal(11,2) DEFAULT NULL COMMENT 'decimal类型',
     `a_timestamp` timestamp NULL DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP COMMENT 'timestamp类型',
     `a_datetime` datetime DEFAULT NULL COMMENT 'datetime类型',
     PRIMARY KEY (`id`)
   ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='测试measure表';
   
   **二、向测试表导入测试数据**
   INSERT INTO `measure` (`id`, `a_varchar`, `a_int`, `a_integer`, `a_double`, `a_float`, `a_char`, `a_decimal`, `a_timestamp`, `a_datetime`) VALUES (1, 'Jone', 18, 18, 123.01, 15.1, 'a', 86.25, '2019-12-4 20:47:55', '2019-12-4 20:47:53');
   INSERT INTO `measure` (`id`, `a_varchar`, `a_int`, `a_integer`, `a_double`, `a_float`, `a_char`, `a_decimal`, `a_timestamp`, `a_datetime`) VALUES (2, 'Jack', NULL, 19, 12.01, 16.1, 'b', 65.08, '2019-12-4 20:47:58', '2019-12-4 20:47:56');
   INSERT INTO `measure` (`id`, `a_varchar`, `a_int`, `a_integer`, `a_double`, `a_float`, `a_char`, `a_decimal`, `a_timestamp`, `a_datetime`) VALUES (3, 'Tom', 28, 20, 23.01, 17.7, 'c', 65.78, '2019-12-4 20:48:00', '2019-12-4 20:47:59');
   INSERT INTO `measure` (`id`, `a_varchar`, `a_int`, `a_integer`, `a_double`, `a_float`, `a_char`, `a_decimal`, `a_timestamp`, `a_datetime`) VALUES (4, 'Sandy', 21, 21, 30.56, 18.9, 'd', 8.59, '2019-12-4 20:48:04', '2019-12-4 20:48:02');
   INSERT INTO `measure` (`id`, `a_varchar`, `a_int`, `a_integer`, `a_double`, `a_float`, `a_char`, `a_decimal`, `a_timestamp`, `a_datetime`) VALUES (5, '', 11, 50, 40.56, 19.7, 'e', 52.56, '2019-12-4 20:48:12', '2019-12-4 20:48:05');
   
   **三、新建数据源**
   
   **四、新建项目-新建工作流-选择数据质量分析组件**
   ![image](https://user-images.githubusercontent.com/20510737/70203417-e2082680-1757-11ea-8b9d-60bafbb4593e.png)
   
   **五、向数据质量分析组件添加相关参数**
   
   ![image](https://user-images.githubusercontent.com/20510737/70203432-e92f3480-1757-11ea-9b60-33f960de8e35.png)
   ![image](https://user-images.githubusercontent.com/20510737/70203444-ecc2bb80-1757-11ea-91a1-22ec84a46e1b.png)
   
   **测量参数说明:**
   ![image](https://user-images.githubusercontent.com/20510737/70203452-f2b89c80-1757-11ea-97f2-3e14b5af31d2.png)
   
   第一个输入框:你要测量的字段(必填)
   第二个下拉框:你期望测量字段的类型(INT,VARCHAR,INTEGER,BIGINT,LONG,DOUBLE,FLOAT,CHAR, DECIMAL,TIMESTAMP,DATETIME),目前只实现了这几个常用的,如果需要增加可以提需求。
   第三个下拉框:你要测量的字段和期望值的对比操作符(=,!=,>,>=,<,<=,contain,notContain),其中(=,!=,>,>=,<,<=)作用于(INT,INTEGER,BIGINT,LONG,DOUBLE,FLOAT,DECIMAL,TIMESTAMP,DATETIME);(contain,notContain)作用于VARCHAR,CHAR)
   第四个输入框:期望值(必填)
   第五个输入框:你期望的匹配率(正整数,必填:1-100)%,作用于遍历字段在SQL的结果和测量的规则的匹配率,计算公式:测量出符合规则的数量/SQL的条数=匹配率
   第六个下拉框:是否失败(true,false),如果选择true,最终会将测量该字段的匹配率和期望的匹配率进行比较,如果测量的匹配率小于期望的匹配率,该任务状态设置为失败。如果选择false,不会进行比较。
   
   **六、运行工作流,查看邮件**
   ![image](https://user-images.githubusercontent.com/20510737/70203461-f77d5080-1757-11ea-8a54-32290425a489.png)
   
   测量结果说明:
   字段名【DataIsEmpty】: 字段数据为空数量
   字段名【DataIsNull】: 字段数据为NULL数量
   字段名【DataMatch】: 字段匹配数量
   字段名【DataNotMatch】: 字段不匹配数量
   字段名【DataTotal】: 字段总数量
   字段名【MatchRate】: 字段匹配率
   字段名【TypeNotMatch】: 字段类型不匹配
   
   
   **七、查看相关日志**
   measureResult: {
   	"a_char【DataIsEmpty】": 0,
   	"a_char【DataIsNull】": 0,
   	"a_char【DataMatch】": 5,
   	"a_char【DataNotMatch】": 0,
   	"a_char【DataTotal】": 5,
   	"a_char【MatchRate】": "100.00%",
   	"a_char【TypeNotMatch】": 0,
   	"a_datetime【DataIsEmpty】": 0,
   	"a_datetime【DataIsNull】": 0,
   	"a_datetime【DataMatch】": 5,
   	"a_datetime【DataNotMatch】": 0,
   	"a_datetime【DataTotal】": 5,
   	"a_datetime【MatchRate】": "100.00%",
   	"a_datetime【TypeNotMatch】": 0,
   	"a_decimal【DataIsEmpty】": 0,
   	"a_decimal【DataIsNull】": 0,
   	"a_decimal【DataMatch】": 5,
   	"a_decimal【DataNotMatch】": 0,
   	"a_decimal【DataTotal】": 5,
   	"a_decimal【MatchRate】": "100.00%",
   	"a_decimal【TypeNotMatch】": 0,
   	"a_double【DataIsEmpty】": 0,
   	"a_double【DataIsNull】": 0,
   	"a_double【DataMatch】": 5,
   	"a_double【DataNotMatch】": 0,
   	"a_double【DataTotal】": 5,
   	"a_double【MatchRate】": "100.00%",
   	"a_double【TypeNotMatch】": 0,
   	"a_float【DataIsEmpty】": 0,
   	"a_float【DataIsNull】": 0,
   	"a_float【DataMatch】": 5,
   	"a_float【DataNotMatch】": 0,
   	"a_float【DataTotal】": 5,
   	"a_float【MatchRate】": "100.00%",
   	"a_float【TypeNotMatch】": 0,
   	"a_integer【DataIsEmpty】": 0,
   	"a_integer【DataIsNull】": 0,
   	"a_integer【DataMatch】": 0,
   	"a_integer【DataNotMatch】": 0,
   	"a_integer【TypeNotMatch】": 5,
   	"a_int【DataIsEmpty】": 0,
   	"a_int【DataIsNull】": 1,
   	"a_int【DataMatch】": 4,
   	"a_int【DataNotMatch】": 0,
   	"a_int【DataTotal】": 5,
   	"a_int【MatchRate】": "80.00%",
   	"a_int【TypeNotMatch】": 0,
   	"a_timestamp【DataIsEmpty】": 0,
   	"a_timestamp【DataIsNull】": 0,
   	"a_timestamp【DataMatch】": 5,
   	"a_timestamp【DataNotMatch】": 0,
   	"a_timestamp【DataTotal】": 5,
   	"a_timestamp【MatchRate】": "100.00%",
   	"a_timestamp【TypeNotMatch】": 0,
   	"a_varchar【DataIsEmpty】": 1,
   	"a_varchar【DataIsNull】": 0,
   	"a_varchar【DataMatch】": 1,
   	"a_varchar【DataNotMatch】": 3,
   	"a_varchar【DataTotal】": 4,
   	"a_varchar【MatchRate】": "25.00%",
   	"a_varchar【TypeNotMatch】": 0,
   	"id【DataIsEmpty】": 0,
   	"id【DataIsNull】": 0,
   	"id【DataMatch】": 5,
   	"id【DataNotMatch】": 0,
   	"id【DataTotal】": 5,
   	"id【MatchRate】": "100.00%",
   	"id【TypeNotMatch】": 0
   }
   
   jobFailReason : a_int target is 【TRUE】 , result is 【FALSE】
   
   **测量日志说明:**
   jobFailReason :任务失败原因(只要有测量参数设置了是否失败为true,任何字段匹配了就会失败)
   a_int这个字段的期望目标是true,结果是false,所以任务失败。
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services