You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@griffin.apache.org by gu...@apache.org on 2018/09/13 14:23:29 UTC

incubator-griffin-site git commit: Updated asf-site site from master (32c1e7a277a05be49c06b0bee96a54bca4f8f3cd)

Repository: incubator-griffin-site
Updated Branches:
  refs/heads/asf-site 8cb12ee3c -> 7af1690fb


Updated asf-site site from master (32c1e7a277a05be49c06b0bee96a54bca4f8f3cd)


Project: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/commit/7af1690f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/tree/7af1690f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/diff/7af1690f

Branch: refs/heads/asf-site
Commit: 7af1690fb4cc15d3659378ebf05680f1844487b7
Parents: 8cb12ee
Author: William Guo <gu...@apache.org>
Authored: Thu Sep 13 22:23:23 2018 +0800
Committer: William Guo <gu...@apache.org>
Committed: Thu Sep 13 22:23:23 2018 +0800

----------------------------------------------------------------------
 data/create-table.hql         | 27 +++++++++++
 data/gen_delta_src.sh         | 12 +++++
 data/gen_demo_data.sh         | 14 ++++++
 data/gen_hive_data.sh         | 54 ++++++++++++++++++++++
 data/insert-data.hql.template |  2 +
 docs/quickstart.html          | 92 +++++++++++++++++++++++++++++++-------
 6 files changed, 185 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/create-table.hql
----------------------------------------------------------------------
diff --git a/data/create-table.hql b/data/create-table.hql
new file mode 100644
index 0000000..e117cd6
--- /dev/null
+++ b/data/create-table.hql
@@ -0,0 +1,27 @@
+--replace data location with your own path
+
+CREATE EXTERNAL TABLE `demo_src`(
+  `id` bigint,
+  `age` int,
+  `desc` string)
+PARTITIONED BY (
+  `dt` string,
+  `hour` string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '|'
+LOCATION
+  'hdfs:///griffin/data/batch/demo_src';
+
+--replace data location with your own path
+
+CREATE EXTERNAL TABLE `demo_tgt`(
+  `id` bigint,
+  `age` int,
+  `desc` string)
+PARTITIONED BY (
+  `dt` string,
+  `hour` string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '|'
+LOCATION
+  'hdfs:///griffin/data/batch/demo_tgt';

http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/gen_delta_src.sh
----------------------------------------------------------------------
diff --git a/data/gen_delta_src.sh b/data/gen_delta_src.sh
new file mode 100644
index 0000000..29fc96b
--- /dev/null
+++ b/data/gen_delta_src.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+# Generate 1000 random "id|age|desc" delta rows for the demo source table.
+file=delta_src
+id=124
+rm -f -- "${file}"
+
+for i in {1..1000}
+do
+  idx=$(shuf -i1-2000 -n1)
+  echo "${id}|${idx}|${idx}" >> "${file}"
+done

http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/gen_demo_data.sh
----------------------------------------------------------------------
diff --git a/data/gen_demo_data.sh b/data/gen_demo_data.sh
new file mode 100644
index 0000000..55a975c
--- /dev/null
+++ b/data/gen_demo_data.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+./gen_delta_src.sh
+
+src=demo_src
+tgt=demo_tgt
+
+rm ${src}
+cat demo_basic >> ${src}
+cat delta_src >> ${src}
+
+rm ${tgt}
+cat demo_basic >> ${tgt}
+cat delta_tgt >> ${tgt}

http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/gen_hive_data.sh
----------------------------------------------------------------------
diff --git a/data/gen_hive_data.sh b/data/gen_hive_data.sh
new file mode 100644
index 0000000..5d7816d
--- /dev/null
+++ b/data/gen_hive_data.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+
+#create table
+hive -f create-table.hql
+echo "create table done"
+
+#current hour: generate demo data and load it into dt=/hour= partitions
+./gen_demo_data.sh
+cur_date=$(date +%Y%m%d%H)
+dt=${cur_date:0:8}
+hour=${cur_date:8:2}
+partition_date="dt='$dt',hour='$hour'"
+# quote the sed script: the replacement contains commas and single quotes
+sed "s/PARTITION_DATE/$partition_date/" ./insert-data.hql.template > insert-data.hql
+hive -f insert-data.hql
+# _DONE marker files signal downstream jobs that the partition is complete
+src_done_path=/griffin/data/batch/demo_src/dt=${dt}/hour=${hour}/_DONE
+tgt_done_path=/griffin/data/batch/demo_tgt/dt=${dt}/hour=${hour}/_DONE
+hadoop fs -touchz "${src_done_path}"
+hadoop fs -touchz "${tgt_done_path}"
+echo "insert data [$partition_date] done"
+
+#last hour: backfill the previous partition as well
+./gen_demo_data.sh
+cur_date=$(date -d '1 hour ago' +%Y%m%d%H)
+dt=${cur_date:0:8}
+hour=${cur_date:8:2}
+partition_date="dt='$dt',hour='$hour'"
+sed "s/PARTITION_DATE/$partition_date/" ./insert-data.hql.template > insert-data.hql
+hive -f insert-data.hql
+src_done_path=/griffin/data/batch/demo_src/dt=${dt}/hour=${hour}/_DONE
+tgt_done_path=/griffin/data/batch/demo_tgt/dt=${dt}/hour=${hour}/_DONE
+hadoop fs -touchz "${src_done_path}"
+hadoop fs -touchz "${tgt_done_path}"
+echo "insert data [$partition_date] done"
+
+#next hours: pre-load the upcoming partition once per hour, forever
+set +e
+while true
+do
+  ./gen_demo_data.sh
+  next_date=$(date -d '+1 hour' +%Y%m%d%H)
+  dt=${next_date:0:8}
+  hour=${next_date:8:2}
+  partition_date="dt='$dt',hour='$hour'"
+  sed "s/PARTITION_DATE/$partition_date/" ./insert-data.hql.template > insert-data.hql
+  hive -f insert-data.hql
+  src_done_path=/griffin/data/batch/demo_src/dt=${dt}/hour=${hour}/_DONE
+  tgt_done_path=/griffin/data/batch/demo_tgt/dt=${dt}/hour=${hour}/_DONE
+  hadoop fs -touchz "${src_done_path}"
+  hadoop fs -touchz "${tgt_done_path}"
+  echo "insert data [$partition_date] done"
+  sleep 3600
+done

http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/data/insert-data.hql.template
----------------------------------------------------------------------
diff --git a/data/insert-data.hql.template b/data/insert-data.hql.template
new file mode 100644
index 0000000..4e4039a
--- /dev/null
+++ b/data/insert-data.hql.template
@@ -0,0 +1,2 @@
+LOAD DATA LOCAL INPATH 'demo_src' INTO TABLE demo_src PARTITION (PARTITION_DATE);
+LOAD DATA LOCAL INPATH 'demo_tgt' INTO TABLE demo_tgt PARTITION (PARTITION_DATE);

http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/7af1690f/docs/quickstart.html
----------------------------------------------------------------------
diff --git a/docs/quickstart.html b/docs/quickstart.html
index dee377e..1fdc75b 100644
--- a/docs/quickstart.html
+++ b/docs/quickstart.html
@@ -126,30 +126,85 @@ under the License.
       </div>
       <div class="col-xs-6 col-sm-9 page-main-content" style="margin-left: -15px" id="loadcontent">
         <h1 class="page-header" style="margin-top: 0px">Quick Start</h1>
-        <h2 id="environment-preparation">Environment Preparation</h2>
-<p>Prepare the environment for Apache Griffin. 
-You can use our pre-built docker images as the environment.
-Follow the <a href="https://github.com/apache/incubator-griffin/blob/master/griffin-doc/docker/griffin-docker-guide.md#environment-preparation">docker guide</a> to start up the docker images, and login to the griffin container.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>docker exec -it &lt;griffin docker container id&gt; bash
-cd ~/measure
-</code></pre></div></div>
-
-<h2 id="data-preparation">Data Preparation</h2>
-<p>Prepare the test data in Hive.
-In the docker image, we’ve prepared two Hive tables named <code class="highlighter-rouge">demo_src</code> and <code class="highlighter-rouge">demo_tgt</code>, and the test data is generated hourly.
-The schema is like this:</p>
+        <h2 id="user-story">User Story</h2>
+<p>Say we have two Hive tables (demo_src, demo_tgt); we need to know what the data quality is for the target table, based on the source table.</p>
 
+<p>For simplicity, suppose both tables have the same schema, as follows:</p>
 <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>id                      bigint                                      
 age                     int                                         
 desc                    string                                      
 dt                      string                                      
 hour                    string 
 </code></pre></div></div>
+<p>dt and hour are partition columns: each date has one partition dt (like 20180912), and within each date there are 24 hour partitions (like 01, 02, …).</p>
+
+<h2 id="environment-preparation">Environment Preparation</h2>
+<p>You need to prepare the environment for the Apache Griffin measure module, including the following software:</p>
+<ul>
+  <li>JDK (1.8+)</li>
+  <li>Hadoop (2.6.0+)</li>
+  <li>Spark (2.2.1+)</li>
+  <li>Hive (2.2.0)</li>
+</ul>
 
-<p>In which <code class="highlighter-rouge">dt</code> and <code class="highlighter-rouge">hour</code> are the partition columns, with string values like <code class="highlighter-rouge">20180912</code> and <code class="highlighter-rouge">06</code>.</p>
+<h2 id="build-griffin-measure-module">Build Griffin Measure Module</h2>
+<ol>
+  <li>Download Griffin source package <a href="https://www.apache.org/dist/incubator/griffin/0.3.0-incubating">here</a>.</li>
+  <li>Unzip the source package.
+    <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>unzip griffin-0.3.0-incubating-source-release.zip
+cd griffin-0.3.0-incubating-source-release
+</code></pre></div>    </div>
+  </li>
+  <li>Build Griffin jars.
+    <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>mvn clean install
+</code></pre></div>    </div>
+
+    <p>Move the built griffin measure jar to your work path.</p>
+
+    <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>mv measure/target/measure-0.3.0-incubating.jar &lt;work path&gt;/griffin-measure.jar
+</code></pre></div>    </div>
+  </li>
+</ol>
+
+<h2 id="data-preparation">Data Preparation</h2>
 
-<h2 id="configuration-files">Configuration Files</h2>
+<p>For our quick start, we will generate two Hive tables, demo_src and demo_tgt.</p>
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>--create hive tables here. hql script
+--Note: replace hdfs location with your own path
+CREATE EXTERNAL TABLE `demo_src`(
+  `id` bigint,
+  `age` int,
+  `desc` string) 
+PARTITIONED BY (
+  `dt` string,
+  `hour` string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '|'
+LOCATION
+  'hdfs:///griffin/data/batch/demo_src';
+
+--Note: replace hdfs location with your own path
+CREATE EXTERNAL TABLE `demo_tgt`(
+  `id` bigint,
+  `age` int,
+  `desc` string) 
+PARTITIONED BY (
+  `dt` string,
+  `hour` string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '|'
+LOCATION
+  'hdfs:///griffin/data/batch/demo_tgt';
+
+</code></pre></div></div>
+<p>and we will load data into both tables every hour.</p>
+
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>#load data here...
+</code></pre></div></div>
+
+<h2 id="define-data-quality-measure">Define data quality measure</h2>
+
+<h4 id="griffin-env-configuration">Griffin env configuration</h4>
 <p>The environment config file: env.json</p>
 <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>{
   "spark": {
@@ -176,6 +231,7 @@ hour                    string
 }
 </code></pre></div></div>
 
+<h4 id="define-griffin-data-quality">Define griffin data quality</h4>
 <p>The DQ config file: dq.json</p>
 
 <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>{
@@ -241,7 +297,7 @@ hour                    string
 }
 </code></pre></div></div>
 
-<h2 id="submit-measure-job">Submit Measure Job</h2>
+<h2 id="measure-data-quality">Measure data quality</h2>
 <p>Submit the measure job to Spark, with config file paths as parameters.</p>
 
 <div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>spark-submit --class org.apache.griffin.measure.Application --master yarn --deploy-mode client --queue default \
@@ -250,8 +306,12 @@ hour                    string
 &lt;path&gt;/env.json &lt;path&gt;/batch-accu-config.json
 </code></pre></div></div>
 
+<h2 id="report-data-quality-metrics">Report data quality metrics</h2>
 <p>Then you can get the calculation log in console, after the job finishes, you can get the result metrics printed. The metrics will also be saved in hdfs: <code class="highlighter-rouge">hdfs:///griffin/persist/&lt;job name&gt;/&lt;timestamp&gt;/_METRICS</code>.</p>
 
+<h2 id="refine-data-quality-report">Refine Data Quality report</h2>
+<p>Depending on your business, you might need to refine your data quality measure further until you are satisfied.</p>
+
 <h2 id="more-details">More Details</h2>
 <p>For more details about griffin measures, you can visit our documents in <a href="https://github.com/apache/incubator-griffin/tree/master/griffin-doc">github</a>.</p>