You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@griffin.apache.org by gu...@apache.org on 2018/09/13 14:23:29 UTC
incubator-griffin-site git commit: Updated asf-site site from master
(32c1e7a277a05be49c06b0bee96a54bca4f8f3cd)
Repository: incubator-griffin-site
Updated Branches:
refs/heads/asf-site 8cb12ee3c -> 7af1690fb
Updated asf-site site from master (32c1e7a277a05be49c06b0bee96a54bca4f8f3cd)
Project: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/commit/7af1690f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/tree/7af1690f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/diff/7af1690f
Branch: refs/heads/asf-site
Commit: 7af1690fb4cc15d3659378ebf05680f1844487b7
Parents: 8cb12ee
Author: William Guo <gu...@apache.org>
Authored: Thu Sep 13 22:23:23 2018 +0800
Committer: William Guo <gu...@apache.org>
Committed: Thu Sep 13 22:23:23 2018 +0800
----------------------------------------------------------------------
data/create-table.hql | 27 +++++++++++
data/gen_delta_src.sh | 12 +++++
data/gen_demo_data.sh | 14 ++++++
data/gen_hive_data.sh | 54 ++++++++++++++++++++++
data/insert-data.hql.template | 2 +
docs/quickstart.html | 92 +++++++++++++++++++++++++++++++-------
6 files changed, 185 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/create-table.hql
----------------------------------------------------------------------
diff --git a/data/create-table.hql b/data/create-table.hql
new file mode 100644
index 0000000..e117cd6
--- /dev/null
+++ b/data/create-table.hql
@@ -0,0 +1,27 @@
+--replace data location with your own path
+
+CREATE EXTERNAL TABLE `demo_src`(
+ `id` bigint,
+ `age` int,
+ `desc` string)
+PARTITIONED BY (
+ `dt` string,
+ `hour` string)
+ROW FORMAT DELIMITED
+ FIELDS TERMINATED BY '|'
+LOCATION
+ 'hdfs:///griffin/data/batch/demo_src';
+
+--replace data location with your own path
+
+CREATE EXTERNAL TABLE `demo_tgt`(
+ `id` bigint,
+ `age` int,
+ `desc` string)
+PARTITIONED BY (
+ `dt` string,
+ `hour` string)
+ROW FORMAT DELIMITED
+ FIELDS TERMINATED BY '|'
+LOCATION
+ 'hdfs:///griffin/data/batch/demo_tgt';
http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/gen_delta_src.sh
----------------------------------------------------------------------
diff --git a/data/gen_delta_src.sh b/data/gen_delta_src.sh
new file mode 100644
index 0000000..29fc96b
--- /dev/null
+++ b/data/gen_delta_src.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Regenerates ./delta_src with 1000 rows of the form "<id>|<n>|<n>", n random in 1..2000.
+file=delta_src
+id=124
+
+rm -f -- "${file}"  # -f: no error on the first run, when the file does not exist yet
+
+for i in {1..1000}
+do
+  idx=$(shuf -i1-2000 -n1)
+  echo "${id}|${idx}|${idx}" >> "${file}"
+done
http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/gen_demo_data.sh
----------------------------------------------------------------------
diff --git a/data/gen_demo_data.sh b/data/gen_demo_data.sh
new file mode 100644
index 0000000..55a975c
--- /dev/null
+++ b/data/gen_demo_data.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Rebuilds ./demo_src and ./demo_tgt: each is demo_basic followed by its delta file.
+./gen_delta_src.sh  # refreshes delta_src; delta_tgt is not regenerated by this script
+
+src=demo_src
+tgt=demo_tgt
+
+rm -f -- "${src}"  # -f: tolerate a missing file on the first run
+cat demo_basic >> "${src}"
+cat delta_src >> "${src}"
+
+rm -f -- "${tgt}"
+cat demo_basic >> "${tgt}"
+cat delta_tgt >> "${tgt}"
http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/gen_hive_data.sh
----------------------------------------------------------------------
diff --git a/data/gen_hive_data.sh b/data/gen_hive_data.sh
new file mode 100644
index 0000000..5d7816d
--- /dev/null
+++ b/data/gen_hive_data.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Creates the demo Hive tables, then loads generated data into hourly partitions until killed.
+# Create the demo_src / demo_tgt tables.
+hive -f create-table.hql
+echo "create table done"
+
+# Current hour: load generated data, then touch a _DONE file in each partition directory.
+./gen_demo_data.sh
+cur_date=$(date +%Y%m%d%H)
+dt=${cur_date:0:8}
+hour=${cur_date:8:2}
+partition_date="dt='$dt',hour='$hour'"
+sed "s/PARTITION_DATE/${partition_date}/" ./insert-data.hql.template > insert-data.hql
+hive -f insert-data.hql
+src_done_path=/griffin/data/batch/demo_src/dt=${dt}/hour=${hour}/_DONE
+tgt_done_path=/griffin/data/batch/demo_tgt/dt=${dt}/hour=${hour}/_DONE
+hadoop fs -touchz "${src_done_path}"
+hadoop fs -touchz "${tgt_done_path}"
+echo "insert data [$partition_date] done"
+
+# Previous hour: same steps for the partition one hour back.
+./gen_demo_data.sh
+cur_date=$(date -d '1 hour ago' +%Y%m%d%H)
+dt=${cur_date:0:8}
+hour=${cur_date:8:2}
+partition_date="dt='$dt',hour='$hour'"
+sed "s/PARTITION_DATE/${partition_date}/" ./insert-data.hql.template > insert-data.hql
+hive -f insert-data.hql
+src_done_path=/griffin/data/batch/demo_src/dt=${dt}/hour=${hour}/_DONE
+tgt_done_path=/griffin/data/batch/demo_tgt/dt=${dt}/hour=${hour}/_DONE
+hadoop fs -touchz "${src_done_path}"
+hadoop fs -touchz "${tgt_done_path}"
+echo "insert data [$partition_date] done"
+
+# Following hours: once per hour, load data into the partition for the next hour.
+set +e  # errexit is never enabled above, so a failed load does not stop the loop
+while true
+do
+  ./gen_demo_data.sh
+  # The partition is derived from the time one hour ahead of now.
+  next_date=$(date -d "+1hour" '+%Y%m%d%H')
+  dt=${next_date:0:8}
+  hour=${next_date:8:2}
+  partition_date="dt='$dt',hour='$hour'"
+  sed "s/PARTITION_DATE/${partition_date}/" ./insert-data.hql.template > insert-data.hql
+  hive -f insert-data.hql
+  src_done_path=/griffin/data/batch/demo_src/dt=${dt}/hour=${hour}/_DONE
+  tgt_done_path=/griffin/data/batch/demo_tgt/dt=${dt}/hour=${hour}/_DONE
+  hadoop fs -touchz "${src_done_path}"
+  hadoop fs -touchz "${tgt_done_path}"
+  echo "insert data [$partition_date] done"
+  sleep 3600
+done
+set -e  # not reached: the loop above has no exit path
http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/insert-data.hql.template
----------------------------------------------------------------------
diff --git a/data/insert-data.hql.template b/data/insert-data.hql.template
new file mode 100644
index 0000000..4e4039a
--- /dev/null
+++ b/data/insert-data.hql.template
@@ -0,0 +1,2 @@
+LOAD DATA LOCAL INPATH 'demo_src' INTO TABLE demo_src PARTITION (PARTITION_DATE);
+LOAD DATA LOCAL INPATH 'demo_tgt' INTO TABLE demo_tgt PARTITION (PARTITION_DATE);
http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/docs/quickstart.html
----------------------------------------------------------------------
diff --git a/docs/quickstart.html b/docs/quickstart.html
index dee377e..1fdc75b 100644
--- a/docs/quickstart.html
+++ b/docs/quickstart.html
@@ -126,30 +126,85 @@ under the License.
</div>
<div class="col-xs-6 col-sm-9 page-main-content" style="margin-left: -15px" id="loadcontent">
<h1 class="page-header" style="margin-top: 0px">Quick Start</h1>
- <h2 id="environment-preparation">Environment Preparation</h2>
-<p>Prepare the environment for Apache Griffin.
-You can use our pre-built docker images as the environment.
-Follow the <a href="https://github.com/apache/incubator-griffin/blob/master/griffin-doc/docker/griffin-docker-guide.md#environment-preparation">docker guide</a> to start up the docker images, and login to the griffin container.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>docker exec -it <griffin docker container id> bash
-cd ~/measure
-</code></pre></div></div>
-
-<h2 id="data-preparation">Data Preparation</h2>
-<p>Prepare the test data in Hive.
-In the docker image, we’ve prepared two Hive tables named <code class="highlighter-rouge">demo_src</code> and <code class="highlighter-rouge">demo_tgt</code>, and the test data is generated hourly.
-The schema is like this:</p>
+ <h2 id="user-story">User Story</h2>
+<p>Say we have two Hive tables (demo_src and demo_tgt), and we need to know the data quality of the target table, based on the source table.</p>
+<p>For simplicity, suppose both tables have the same schema, as follows:</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>id bigint
age int
desc string
dt string
hour string
</code></pre></div></div>
+<p>dt and hour are partition columns: each date has one dt partition (like 20180912), and within each date there are 24 hour partitions (like 01, 02, …).</p>
+
+<h2 id="environment-preparation">Environment Preparation</h2>
+<p>You need to prepare the environment for the Apache Griffin measure module, including the following software:</p>
+<ul>
+ <li>JDK (1.8+)</li>
+ <li>Hadoop (2.6.0+)</li>
+ <li>Spark (2.2.1+)</li>
+ <li>Hive (2.2.0)</li>
+</ul>
-<p>In which <code class="highlighter-rouge">dt</code> and <code class="highlighter-rouge">hour</code> are the partition columns, with string values like <code class="highlighter-rouge">20180912</code> and <code class="highlighter-rouge">06</code>.</p>
+<h2 id="build-griffin-measure-module">Build Griffin Measure Module</h2>
+<ol>
+ <li>Download Griffin source package <a href="https://www.apache.org/dist/incubator/griffin/0.3.0-incubating">here</a>.</li>
+ <li>Unzip the source package.
+ <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>unzip griffin-0.3.0-incubating-source-release.zip
+cd griffin-0.3.0-incubating-source-release
+</code></pre></div> </div>
+ </li>
+ <li>Build Griffin jars.
+ <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>mvn clean install
+</code></pre></div> </div>
+
+ <p>Move the built griffin measure jar to your work path.</p>
+
+ <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>mv measure/target/measure-0.3.0-incubating.jar <work path>/griffin-measure.jar
+</code></pre></div> </div>
+ </li>
+</ol>
+
+<h2 id="data-preparation">Data Preparation</h2>
-<h2 id="configuration-files">Configuration Files</h2>
+<p>For our quick start, we will create two Hive tables, demo_src and demo_tgt.</p>
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>--create hive tables here. hql script
+--Note: replace hdfs location with your own path
+CREATE EXTERNAL TABLE `demo_src`(
+ `id` bigint,
+ `age` int,
+ `desc` string)
+PARTITIONED BY (
+ `dt` string,
+ `hour` string)
+ROW FORMAT DELIMITED
+ FIELDS TERMINATED BY '|'
+LOCATION
+ 'hdfs:///griffin/data/batch/demo_src';
+
+--Note: replace hdfs location with your own path
+CREATE EXTERNAL TABLE `demo_tgt`(
+ `id` bigint,
+ `age` int,
+ `desc` string)
+PARTITIONED BY (
+ `dt` string,
+ `hour` string)
+ROW FORMAT DELIMITED
+ FIELDS TERMINATED BY '|'
+LOCATION
+ 'hdfs:///griffin/data/batch/demo_tgt';
+
+</code></pre></div></div>
+<p>Then we will load data into both tables every hour.</p>
+
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>#load data here...
+</code></pre></div></div>
+
+<h2 id="define-data-quality-measure">Define data quality measure</h2>
+
+<h4 id="griffin-env-configuration">Griffin env configuration</h4>
<p>The environment config file: env.json</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>{
"spark": {
@@ -176,6 +231,7 @@ hour string
}
</code></pre></div></div>
+<h4 id="define-griffin-data-quality">Define griffin data quality</h4>
<p>The DQ config file: dq.json</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>{
@@ -241,7 +297,7 @@ hour string
}
</code></pre></div></div>
-<h2 id="submit-measure-job">Submit Measure Job</h2>
+<h2 id="measure-data-quality">Measure data quality</h2>
<p>Submit the measure job to Spark, with config file paths as parameters.</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>spark-submit --class org.apache.griffin.measure.Application --master yarn --deploy-mode client --queue default \
@@ -250,8 +306,12 @@ hour string
<path>/env.json <path>/batch-accu-config.json
</code></pre></div></div>
+<h2 id="report-data-quality-metrics">Report data quality metrics</h2>
<p>Then you can see the calculation log in the console; after the job finishes, the result metrics are printed. The metrics will also be saved in hdfs: <code class="highlighter-rouge">hdfs:///griffin/persist/<job name>/<timestamp>/_METRICS</code>.</p>
+<h2 id="refine-data-quality-report">Refine Data Quality report</h2>
+<p>Depending on your business needs, you might need to refine your data quality measure further until you are satisfied.</p>
+
<h2 id="more-details">More Details</h2>
<p>For more details about griffin measures, you can visit our documents in <a href="https://github.com/apache/incubator-griffin/tree/master/griffin-doc">github</a>.</p>