Posted to commits@griffin.apache.org by gu...@apache.org on 2018/10/16 06:19:51 UTC

incubator-griffin git commit: update measure documents

Repository: incubator-griffin
Updated Branches:
  refs/heads/master 1b2b3e352 -> 3545a71c7


update measure documents

Author: Lionel Liu <bh...@163.com>

Closes #438 from bhlx3lyx7/doc-update.


Project: http://git-wip-us.apache.org/repos/asf/incubator-griffin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-griffin/commit/3545a71c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-griffin/tree/3545a71c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-griffin/diff/3545a71c

Branch: refs/heads/master
Commit: 3545a71c7c63623869f20aa414eb617fcb0786c0
Parents: 1b2b3e3
Author: Lionel Liu <bh...@163.com>
Authored: Tue Oct 16 14:19:44 2018 +0800
Committer: William Guo <gu...@apache.org>
Committed: Tue Oct 16 14:19:44 2018 +0800

----------------------------------------------------------------------
 griffin-doc/measure/dsl-guide.md                |  6 +-
 griffin-doc/measure/measure-batch-sample.md     | 14 +++--
 .../measure/measure-configuration-guide.md      | 55 ++++++++----------
 griffin-doc/measure/measure-streaming-sample.md | 59 ++++++++++----------
 .../configuration/dqdefinition/DQConfig.scala   |  3 -
 5 files changed, 66 insertions(+), 71 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-griffin/blob/3545a71c/griffin-doc/measure/dsl-guide.md
----------------------------------------------------------------------
diff --git a/griffin-doc/measure/dsl-guide.md b/griffin-doc/measure/dsl-guide.md
index 4eb294e..5296176 100644
--- a/griffin-doc/measure/dsl-guide.md
+++ b/griffin-doc/measure/dsl-guide.md
@@ -24,14 +24,14 @@ Griffin DSL is designed for DQ measurement, as a SQL-like language, which descri
 Griffin DSL syntax is easy to learn as it's SQL-like, case insensitive.
 
 ### Supporting process
-- logical operation: `not, and, or, in, between, like, is null, is nan, =, !=, <>, <=, >=, <, >`
+- logical operation: `not, and, or, in, between, like, rlike, is null, is nan, =, !=, <>, <=, >=, <, >`
 - mathematical operation: `+, -, *, /, %`
 - sql statement: `as, where, group by, having, order by, limit`
 
 ### Keywords
 - `null, nan, true, false`
 - `not, and, or`
-- `in, between, like, is`
+- `in, between, like, rlike, is`
 - `select, distinct, from, as, where, group, by, having, order, desc, asc, limit`
 
 ### Operators
@@ -79,6 +79,8 @@ Griffin DSL syntax is easy to learn as it's SQL-like, case insensitive.
 	e.g. `source.age between 3 and 30`, `source.age between (3, 30)`
 - **like**: like clause like sql.  
 	e.g. `source.name like "%abc%"`
+- **rlike**: rlike clause like spark sql.  
+	e.g. `source.name rlike "^abc.*$"`
 - **is null**: is null operator like sql.  
 	e.g. `source.desc is not null`
 - **is nan**: check if the value is not a number, the syntax like `is null`  

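The dsl-guide change above documents the new `rlike` operator. As a quick illustration, here is a hypothetical griffin-dsl profiling rule using `rlike` in a where clause, written in the style of the sample configs below; the output name, column, and regex are illustrative assumptions, not taken from this patch:

```
{
  "dsl.type": "griffin-dsl",
  "dq.type": "profiling",
  "out.dataframe.name": "name_match",
  "rule": "select count(*) as `abc_name_cnt` from source where name rlike \"^abc.*$\""
}
```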
http://git-wip-us.apache.org/repos/asf/incubator-griffin/blob/3545a71c/griffin-doc/measure/measure-batch-sample.md
----------------------------------------------------------------------
diff --git a/griffin-doc/measure/measure-batch-sample.md b/griffin-doc/measure/measure-batch-sample.md
index 1a9bc41..7867ea5 100644
--- a/griffin-doc/measure/measure-batch-sample.md
+++ b/griffin-doc/measure/measure-batch-sample.md
@@ -59,7 +59,7 @@ Apache Griffin measures consist of batch measure and streaming measure, this doc
       {
         "dsl.type": "griffin-dsl",
         "dq.type": "accuracy",
-        "name": "accu",
+        "out.dataframe.name": "accu",
         "rule": "source.user_id = target.user_id AND upper(source.first_name) = upper(target.first_name) AND source.last_name = target.last_name AND source.address = target.address AND source.email = target.email AND source.phone = target.phone AND source.post_code = target.post_code",
         "details": {
           "source": "source",
@@ -80,7 +80,9 @@ Apache Griffin measures consist of batch measure and streaming measure, this doc
         ]        
       }
     ]
-  }
+  },
+  
+  "sinks": ["CONSOLE", "ELASTICSEARCH"]
 }
 ```
 Above is the configure file of batch accuracy job.  
@@ -121,7 +123,7 @@ The miss records of source will be persisted as record.
       {
         "dsl.type": "griffin-dsl",
         "dq.type": "profiling",
-        "name": "prof",
+        "out.dataframe.name": "prof",
         "rule": "select max(age) as `max_age`, min(age) as `min_age` from source",
         "out": [
           {
@@ -133,7 +135,7 @@ The miss records of source will be persisted as record.
       {
         "dsl.type": "griffin-dsl",
         "dq.type": "profiling",
-        "name": "name_grp",
+        "out.dataframe.name": "name_grp",
         "rule": "select name, count(*) as cnt from source group by name",
         "out": [
           {
@@ -144,7 +146,9 @@ The miss records of source will be persisted as record.
         ]
       }
     ]
-  }
+  },
+   
+  "sinks": ["CONSOLE", "ELASTICSEARCH"]
 }
 ```
 Above is the configure file of batch profiling job.  

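One practical consequence of renaming "name" to "out.dataframe.name" in the rule blocks above: the named output table of one rule can be consumed by a later rule in the same job. A minimal hypothetical fragment, assuming the same `source` data source as the sample; the table and column names are illustrative:

```
"rules": [
  {
    "dsl.type": "griffin-dsl",
    "dq.type": "profiling",
    "out.dataframe.name": "name_grp",
    "rule": "select name, count(*) as cnt from source group by name"
  },
  {
    "dsl.type": "spark-sql",
    "out.dataframe.name": "top_names",
    "rule": "select * from name_grp order by cnt desc limit 3"
  }
]
```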
http://git-wip-us.apache.org/repos/asf/incubator-griffin/blob/3545a71c/griffin-doc/measure/measure-configuration-guide.md
----------------------------------------------------------------------
diff --git a/griffin-doc/measure/measure-configuration-guide.md b/griffin-doc/measure/measure-configuration-guide.md
index 13774e1..43cb8ac 100644
--- a/griffin-doc/measure/measure-configuration-guide.md
+++ b/griffin-doc/measure/measure-configuration-guide.md
@@ -43,7 +43,7 @@ Apache Griffin measure module needs two configuration files to define the parame
 
   "sinks": [
     {
-      "type": "log",
+      "type": "console",
       "config": {
         "max.log.lines": 100
       }
@@ -56,7 +56,7 @@ Apache Griffin measure module needs two configuration files to define the parame
     }
   ],
 
-  "info.cache": [
+  "griffin.checkpoint": [
     {
       "type": "zk",
       "config": {
@@ -79,30 +79,30 @@ Above lists environment parameters.
 	+ batch.interval: Interval of dumping streaming data, for streaming mode.
 	+ process.interval: Interval of processing dumped streaming data, for streaming mode.
 	+ config: Configuration of spark parameters.
-- **persist**: This field configures list of metrics persist parameters, multiple persist ways are supported. Details of persist configuration [here](#persist).
-- **info.cache**: This field configures list of information cache parameters, multiple cache ways are supported. It is only for streaming dq case. Details of info cache configuration [here](#info-cache).
+- **sinks**: This field configures the list of metrics sink parameters; multiple sink types are supported. Details of sink configuration [here](#sinks).
+- **griffin.checkpoint**: This field configures the list of griffin checkpoint parameters; multiple checkpoint types are supported. It is only for the streaming dq case. Details of checkpoint configuration [here](#griffin-checkpoint).
 
 ### <a name="sinks"></a>Sinks
-- **type**: Metrics and records persist type, "log", "hdfs", "http", "mongo". 
-- **config**: Configure parameters of each persist type.
-	+ log persist (aliases: "console")
+- **type**: Metrics and records sink type, "console", "hdfs", "http", "mongo". 
+- **config**: Configure parameters of each sink type.
+	+ console sink (aliases: "log")
 		* max.log.lines: the max lines of log.
-	+ hdfs persist
-		* path: hdfs path to persist metrics
-		* max.persist.lines: the max lines of total persist data.
-		* max.lines.per.file: the max lines of each persist file.
-	+ http persist (aliases: "es", "elasticsearch")
-		* api: api to submit persist metrics.
+	+ hdfs sink
+		* path: hdfs path to sink metrics
+		* max.persist.lines: the max lines of total sink data.
+		* max.lines.per.file: the max lines of each sink file.
+	+ http sink (aliases: "es", "elasticsearch")
+		* api: api to submit sink metrics.
 		* method: http method, "post" default.
-    + mongo persist
+    + mongo sink
         * url: url of mongo db.
         * database: database name.
         * collection: collection name. 
 
-### <a name="info-cache"></a>Info Cache
-- **type**: Information cache type, "zk" for zookeeper cache.
-- **config**: Configure parameters of info cache type.
-	+ zookeeper cache
+### <a name="griffin-checkpoint"></a>Griffin Checkpoint
+- **type**: Griffin checkpoint type, "zk" for zookeeper checkpoint.
+- **config**: Configure parameters of griffin checkpoint type.
+	+ zookeeper checkpoint
 		* hosts: zookeeper hosts list as a string, separated by comma.
 		* namespace: namespace of cache info, "" as default.
 		* lock.path: path of lock info, "lock" as default.
@@ -150,7 +150,7 @@ Above lists environment parameters.
       {
         "dsl.type": "griffin-dsl",
         "dq.type": "accuracy",
-        "name": "accu",
+        "out.dataframe.name": "accu",
         "rule": "source.user_id = target.user_id AND upper(source.first_name) = upper(target.first_name) AND source.last_name = target.last_name AND source.address = target.address AND source.email = target.email AND source.phone = target.phone AND source.post_code = target.post_code",
         "details": {
           "source": "source",
@@ -206,9 +206,10 @@ Above lists DQ job configure parameters.
 		* done.file: 
 
 ### <a name="rule"></a>Rule
-- **dsl.type**: Rule dsl type, "spark-sql", "df-opr" and "griffin-dsl".
+- **dsl.type**: Rule dsl type, "spark-sql", "df-ops" and "griffin-dsl".
 - **dq.type**: DQ type of this rule, only for "griffin-dsl" type. Supported types: "accuracy", "profiling", "timeliness", "uniqueness", "completeness".
-- **name** (step information): Result table name of this rule, optional for "griffin-dsl" type.
+- **out.dataframe.name** (step information): Output table name of this rule, which can be used in the following rules.
+- **in.dataframe.name** (step information): Input table name of this rule, only used for "df-ops" type.
 - **rule**: The rule string.
 - **details**: Details of this rule, optional.
   + accuracy dq type detail configuration
@@ -219,14 +220,6 @@ Above lists DQ job configure parameters.
     * matched: the matched count name in metric, optional.
   + profiling dq type detail configuration
     * source: the data source name which as source in profiling, default is the name of first data source in "data.sources" if not configured. If the griffin-dsl rule contains from clause, this parameter is ignored.
-  + uniqueness dq type detail configuration
-    * source: name of data source to measure uniqueness.
-    * target: name of data source to compare with. It is always the same as source, or more than source.
-    * unique: the unique count name in metric, optional.
-    * total: the total count name in metric, optional.
-    * dup: the duplicate count name in metric, optional.
-    * num: the duplicate number name in metric, optional.
-    * duplication.array: optional, if set as a non-empty string, the duplication metric will be computed, and the group metric name is this string.
   + distinctness dq type detail configuration
     * source: name of data source to measure uniqueness.
     * target: name of data source to compare with. It is always the same as source, or more than source.
@@ -241,7 +234,7 @@ Above lists DQ job configure parameters.
     * source: name of data source to measure timeliness.
     * latency: the latency column name in metric, optional.
     * threshold: optional, if set as a time string like "1h", the items with latency more than 1 hour will be record.
-- **out**: Lits of output sinks for the job.
+- **out**: List of output sinks for the job.
   + Metric output.
     * type: "metric"
     * name: Metric name, semantics depends on "flatten" field value.   
@@ -250,7 +243,7 @@ Above lists DQ job configure parameters.
+      - entries: sends first row of data frame as metric results, like `{"agg_col": "value"}`
       - array: wraps all metrics into a map, like `{"my_out_name": [{"agg_col": "value"}]}`
       - map: wraps first row of data frame into a map, like `{"my_out_name": {"agg_col": "value"}}`
-  + Record output. Currenly handled only by HDFS sink.
+  + Record output. Currently handled only by HDFS sink.
     * type: "record"
     * name: File name within sink output folder to dump files to.   
   + Data source cache update for streaming jobs.

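Pulling the renamed fields together, here is a minimal sketch of an environment config under the new naming, with a "console" sink (formerly "log") and a "griffin.checkpoint" block (formerly "info.cache"); hosts, paths, and limits are placeholder values:

```
{
  "spark": {
    "log.level": "WARN",
    "config": { "spark.master": "local[*]" }
  },
  "sinks": [
    {
      "type": "console",
      "config": { "max.log.lines": 100 }
    },
    {
      "type": "hdfs",
      "config": {
        "path": "hdfs://localhost/griffin/persist",
        "max.persist.lines": 10000,
        "max.lines.per.file": 10000
      }
    }
  ],
  "griffin.checkpoint": [
    {
      "type": "zk",
      "config": {
        "hosts": "localhost:2181",
        "namespace": "griffin/infocache"
      }
    }
  ]
}
```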
http://git-wip-us.apache.org/repos/asf/incubator-griffin/blob/3545a71c/griffin-doc/measure/measure-streaming-sample.md
----------------------------------------------------------------------
diff --git a/griffin-doc/measure/measure-streaming-sample.md b/griffin-doc/measure/measure-streaming-sample.md
index 1d9f70e..30ed718 100644
--- a/griffin-doc/measure/measure-streaming-sample.md
+++ b/griffin-doc/measure/measure-streaming-sample.md
@@ -48,17 +48,15 @@ Apache Griffin measures consist of batch measure and streaming measure, this doc
           },
           "pre.proc": [
             {
-              "dsl.type": "df-opr",
-              "name": "${s1}",
-              "rule": "from_json",
-              "details": {
-                "df.name": "${this}"
-              }
+              "dsl.type": "df-ops",
+              "in.dataframe.name": "this",
+              "out.dataframe.name": "s1",
+              "rule": "from_json"
             },
             {
               "dsl.type": "spark-sql",
-              "name": "${this}",
-              "rule": "select name, age from ${s1}"
+              "out.dataframe.name": "this",
+              "rule": "select name, age from s1"
             }
           ]
         }
@@ -68,7 +66,8 @@ Apache Griffin measures consist of batch measure and streaming measure, this doc
         "info.path": "source",
         "ready.time.interval": "10s",
         "ready.time.delay": "0",
-        "time.range": ["-2m", "0"]
+        "time.range": ["-2m", "0"],
+        "updatable": true
       }
     }, {
       "name": "target",
@@ -89,17 +88,15 @@ Apache Griffin measures consist of batch measure and streaming measure, this doc
           },
           "pre.proc": [
             {
-              "dsl.type": "df-opr",
-              "name": "${t1}",
-              "rule": "from_json",
-              "details": {
-                "df.name": "${this}"
-              }
+              "dsl.type": "df-ops",
+              "in.dataframe.name": "this",
+              "out.dataframe.name": "t1",
+              "rule": "from_json"
             },
             {
               "dsl.type": "spark-sql",
-              "name": "${this}",
-              "rule": "select name, age from ${t1}"
+              "out.dataframe.name": "this",
+              "rule": "select name, age from t1"
             }
           ]
         }
@@ -119,7 +116,7 @@ Apache Griffin measures consist of batch measure and streaming measure, this doc
       {
         "dsl.type": "griffin-dsl",
         "dq.type": "accuracy",
-        "name": "accu",
+        "out.dataframe.name": "accu",
         "rule": "source.name = target.name and source.age = target.age",
         "details": {
           "source": "source",
@@ -140,7 +137,9 @@ Apache Griffin measures consist of batch measure and streaming measure, this doc
         ]
       }
     ]
-  }
+  },
+   
+  "sinks": ["CONSOLE","ELASTICSEARCH"]
 }
 ```
 Above is the configure file of streaming accuracy job.  
@@ -199,17 +198,15 @@ The miss records of source will be persisted as record.
           },
           "pre.proc": [
             {
-              "dsl.type": "df-opr",
-              "name": "${s1}",
-              "rule": "from_json",
-              "details": {
-                "df.name": "${this}"
-              }
+              "dsl.type": "df-ops",
+              "in.dataframe.name": "this",
+              "out.dataframe.name": "s1",
+              "rule": "from_json"
             },
             {
               "dsl.type": "spark-sql",
-              "name": "${this}",
-              "rule": "select name, age from ${s1}"
+              "out.dataframe.name": "this",
+              "rule": "select name, age from s1"
             }
           ]
         }
@@ -229,7 +226,7 @@ The miss records of source will be persisted as record.
       {
         "dsl.type": "griffin-dsl",
         "dq.type": "profiling",
-        "name": "prof",
+        "out.dataframe.name": "prof",
         "rule": "select count(name) as `cnt`, max(age) as `max`, min(age) as `min` from source",
         "out": [
           {
@@ -241,7 +238,7 @@ The miss records of source will be persisted as record.
       {
         "dsl.type": "griffin-dsl",
         "dq.type": "profiling",
-        "name": "grp",
+        "out.dataframe.name": "grp",
         "rule": "select name, count(*) as `cnt` from source group by name",
         "out": [
           {
@@ -252,7 +249,9 @@ The miss records of source will be persisted as record.
         ]        
       }
     ]
-  }
+  },
+      
+  "sinks": ["CONSOLE","ELASTICSEARCH"]
 }
 ```
 Above is the configure file of streaming profiling job.  

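To summarize the pre.proc change in one place: the old "df-opr" steps passed the input table through a "details" block with `${...}` placeholders, while the new "df-ops" form names its input and output tables directly and drops the placeholders. A condensed sketch of the new form, where `parsed` is an arbitrary intermediate table name:

```
"pre.proc": [
  {
    "dsl.type": "df-ops",
    "in.dataframe.name": "this",
    "out.dataframe.name": "parsed",
    "rule": "from_json"
  },
  {
    "dsl.type": "spark-sql",
    "out.dataframe.name": "this",
    "rule": "select name, age from parsed"
  }
]
```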
http://git-wip-us.apache.org/repos/asf/incubator-griffin/blob/3545a71c/measure/src/main/scala/org/apache/griffin/measure/configuration/dqdefinition/DQConfig.scala
----------------------------------------------------------------------
diff --git a/measure/src/main/scala/org/apache/griffin/measure/configuration/dqdefinition/DQConfig.scala b/measure/src/main/scala/org/apache/griffin/measure/configuration/dqdefinition/DQConfig.scala
index a4cdfc1..d41abf3 100644
--- a/measure/src/main/scala/org/apache/griffin/measure/configuration/dqdefinition/DQConfig.scala
+++ b/measure/src/main/scala/org/apache/griffin/measure/configuration/dqdefinition/DQConfig.scala
@@ -141,9 +141,6 @@ case class EvaluateRuleParam( @JsonProperty("rules") private val rules: List[Rul
   * @param details    detail config of rule (optional)
   * @param cache      cache the result for multiple usage (optional, valid for "spark-sql" and "df-ops" mode)
   * @param outputs    output ways configuration (optional)
-//  * @param metric     config for metric output (optional)
-//  * @param record     config for record output (optional)
-//  * @param dsCacheUpdate    config for data source cache update output (optional, valid in streaming mode)
   */
 @JsonInclude(Include.NON_NULL)
 case class RuleParam(@JsonProperty("dsl.type") private val dslType: String,
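With the commented-out metric/record/dsCacheUpdate parameters removed, `RuleParam` carries all output configuration in its single `outputs` field, which corresponds to the "out" list described in the configuration guide above. A hedged sketch of that list for a profiling rule; the names and the "array" flatten choice are illustrative, not taken from this patch:

```
"out": [
  {
    "type": "metric",
    "name": "prof",
    "flatten": "array"
  },
  {
    "type": "record",
    "name": "prof_records"
  }
]
```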