You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2017/02/26 13:59:58 UTC
svn commit: r1784444 - in /kylin/site: ./ blog/ blog/2017/02/23/ blog/2017/02/23/by-layer-spark-cubing/ images/blog/

Author: lidong
Date: Sun Feb 26 13:59:58 2017
New Revision: 1784444

URL: http://svn.apache.org/viewvc?rev=1784444&view=rev
Log:
add post for spark cubing

Added:
    kylin/site/blog/2017/02/23/
    kylin/site/blog/2017/02/23/by-layer-spark-cubing/
    kylin/site/blog/2017/02/23/by-layer-spark-cubing/index.html
    kylin/site/images/blog/spark-cubing-layer.png   (with props)
    kylin/site/images/blog/spark-dag.png   (with props)
    kylin/site/images/blog/spark-mr-layer.png   (with props)
    kylin/site/images/blog/spark-mr-performance.png   (with props)
Modified:
    kylin/site/blog/index.html
    kylin/site/feed.xml

Added: kylin/site/blog/2017/02/23/by-layer-spark-cubing/index.html
URL: http://svn.apache.org/viewvc/kylin/site/blog/2017/02/23/by-layer-spark-cubing/index.html?rev=1784444&view=auto
==============================================================================
--- kylin/site/blog/2017/02/23/by-layer-spark-cubing/index.html (added)
+++ kylin/site/blog/2017/02/23/by-layer-spark-cubing/index.html Sun Feb 26 13:59:58 2017
@@ -0,0 +1,307 @@
+<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+<!doctype html>
+<html>
+	<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+
+  <title>Apache Kylin | By-layer Spark Cubing</title>
+  <meta name="description" content="Before v2.0, Apache Kylin uses Hadoop MapReduce as the framework to build Cubes over huge dataset. The MapReduce framework is simple, stable and can fulfill ...">
+  <meta name="author"      content="Apache Kylin">
+  <link rel="shortcut icon" href="fav.png" type="image/png">
+
+
+
+<link rel="stylesheet" href="/assets/css/animate.css">
+<!-- Bootstrap -->
+<link rel="stylesheet" href="/assets/css/bootstrap.min.css">
+
+<!-- Fonts -->
+<!-- <link rel="stylesheet" href="http://fonts.googleapis.com/css?family=Alice|Open+Sans:400,300,700"> -->
+
+<!-- Icons -->
+<link rel="stylesheet" href="/assets/css/font-awesome.min.css">
+
+  <!-- Custom styles -->
+  <link rel="stylesheet" href="/assets/css/styles.css">
+  <link rel="stylesheet" href="/assets/css/docs.css">
+  <link rel="stylesheet" href="/assets/css/pygments.css">
+
+  <link rel="canonical" href="http://kylin.apache.org/blog/2017/02/23/by-layer-spark-cubing/">
+  <link rel="alternate" type="application/rss+xml" title="Apache Kylin" href="http://kylin.apache.org/feed.xml" />
+
+<!--[if lt IE 9]> <script src="assets/js/html5shiv.js"></script> <![endif]-->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+  //original tracker for kylin.io
+  ga('create', 'UA-55534813-1', 'auto');
+  //new tracker for kylin.apache.org
+  ga('create', 'UA-55534813-2', 'auto', {'name':'toplevel'});
+
+  ga('send', 'pageview');
+  ga('toplevel.send', 'pageview');
+
+
+</script>
+<script type="text/javascript" src="/assets/js/jquery-1.9.1.min.js"></script>
+<script type="text/javascript" src="/assets/js/nside.js"></script>
+<script type="text/javascript" src="/assets/js/nnav.js"></script>
+</head>
+
+	<body>
+		<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+
+<header id="header" >
+  
+  <div id="head" class="parallax" parallax-speed="3" >
+    <div id="logo" class="text-center"> <img class="img-circle" id="circlelogo" src="/assets/images/kylin_logo.jpg"> <span class="title" >Apache Kylin™</span> <span class="tagline">Extreme OLAP Engine for Big Data</span> 
+    </div>
+  </div>
+  
+
+  <!-- Main Menu -->
+  <nav class="navbar navbar-default" role="navigation" id="nav-wrapper">
+  <div class="container-fluid" id="nav">
+    <!--
+    <img class="img-circle" width="40px" height="40px" id="circlelogo" src="/assets/images/kylin_logo.jpg">
+    -->
+    <!-- Brand and toggle get grouped for better mobile display -->
+    <div class="navbar-header">
+      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
+        <span class="sr-only">Toggle navigation</span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+      </button>
+     
+    </div>
+
+    <!-- Collect the nav links, forms, and other content for toggling -->
+    <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
+      <ul class="nav navbar-nav">
+     <li><a href="/">Home</a></li>
+          <li><a href="/docs16" >Docs</a></li>
+          <li><a href="/download">Download</a></li>
+          <li><a href="/community" >Community</a></li>
+          <li><a href="/development" >Development</a></li>
+          <li><a href="/blog">Blog</a></li>
+          <li><a href="/cn" >中文版</a></li>  
+          <li><a href="https://twitter.com/apachekylin" target="_blank" class="fa fa-twitter fa-lg" title="Twitter: @ApacheKylin" ></a></li>
+          <li><a href="https://github.com/apache/kylin" target="_blank" class="fa fa-github-alt fa-lg" title="Github: apache/kylin" ></a></li>          
+          <li><a href="https://www.facebook.com/kylinio" target="_blank" class="fa fa-facebook fa-lg" title="Facebook: kylin.io" ></a></li>   
+      </ul>      
+    </div><!-- /.navbar-collapse -->
+  </div><!-- /.container-fluid -->
+</nav>
+ </header>
+
+		<div class="page-content">
+			<header style=" padding:2em 0 0 0">
+			<div class="container" >
+				<h4 class="section-title"><span>Apache Kylin™ Technical Blog</span></h4>
+			</div></header>
+		</div>
+
+		<div class="container">
+			<div>
+				<article class="post-content" >	
+				<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+
+<div class="post" style=" padding:2em 4em 4em 4em">
+
+  <header class="post-header">
+    <h1 class="post-title">By-layer Spark Cubing</h1>
+    <p class="post-meta" >Feb 23, 2017 • Shaofeng Shi</p>
+  </header>
+
+  <article class="post-content" >
+    <p>Before v2.0, Apache Kylin used Hadoop MapReduce as the framework to build Cubes over huge datasets. The MapReduce framework is simple and stable, and can fulfill Kylin’s needs very well except for performance. In order to get better performance, we introduced the “fast cubing” algorithm in Kylin v1.5, which tries to do as many aggregations as possible on the map side within memory, so as to avoid disk and network I/O; but not all data models can benefit from it, and it still runs on MR, which means on-disk sorting and shuffling.</p>
+
+<p>Now Spark comes; Apache Spark is an open-source cluster-computing framework, which provides programmers with an application programming interface centered on a data structure called RDD; it runs in-memory on the cluster, this makes repeated access to the same data much faster. Spark provides flexible and fancy APIs. You are not tied to Hadoop’s MapReduce two-stage paradigm.</p>
+
+<p>Before introducing how to calculate a Cube with Spark, let’s see how Kylin does that with MR. Figure 1 illustrates how a 4-dimension Cube gets calculated with the classic “by-layer” algorithm: the first round of MR aggregates the base (4-D) cuboid from the source data; the second round aggregates on the base cuboid to get the 3-D cuboids; with N+1 rounds of MR, all layers’ cuboids get calculated.</p>
+
+<p><img src="/images/blog/spark-mr-layer.png" alt="MapReduce Cubing by Layer" /></p>
+
+<p>The “by-layer” Cubing divides a big task into a couple of steps, and each step is based on the previous step’s output, so it can reuse the previous calculation and also avoid calculating from the very beginning when there is a failure in between. This makes it a reliable algorithm. When moving to Spark, we decided to keep this algorithm; that’s why we call this feature “By-layer Spark Cubing”.</p>
+
+<p>As we know, RDD (Resilient Distributed Dataset) is a basic concept in Spark. A collection of N-Dimension cuboids can be well described as an RDD, so an N-Dimension Cube will have N+1 RDDs. These RDDs have a parent/child relationship, as the parent can be used to generate the children. With the parent RDD cached in memory, the child RDD’s generation can be much more efficient than reading from disk. Figure 2 describes this process.</p>
+
+<p><img src="/images/blog/spark-cubing-layer.png" alt="Spark Cubing by Layer" /></p>
+
+<p>Figure 3 is the DAG of Cubing in Spark; it illustrates the process in detail. In “Stage 5”, Kylin uses a HiveContext to read the intermediate Hive table, and then does a “map” operation, which is a one-to-one map, to encode the original values into K-V bytes. On completion, Kylin gets an intermediate encoded RDD. In “Stage 6”, the intermediate RDD is aggregated with a “reduceByKey” operation to get RDD-1, which is the base cuboid. Next, it does a “flatMap” (one-to-many map) on RDD-1, because the base cuboid has N child cuboids. And so on, until all levels’ RDDs are calculated. These RDDs are persisted to the distributed file system on completion, but are also cached in memory for the next level’s calculation. Once an RDD’s children have been generated, it is removed from the cache.</p>
+
+<p><img src="/images/blog/spark-dag.png" alt="DAG of Spark Cubing" /></p>
+
+<p>We did a test to see how much performance improvement can be gained from Spark:</p>
+
+<p>Environment</p>
+
+<ul>
+  <li>4 nodes Hadoop cluster; each node has 28 GB RAM and 12 cores;</li>
+  <li>YARN has 48GB RAM and 30 cores in total;</li>
+  <li>CDH 5.8, Apache Kylin 2.0 beta.</li>
+</ul>
+
+<p>Spark</p>
+
+<ul>
+  <li>Spark 1.6.3 on YARN</li>
+  <li>6 executors, each has 4 cores, 4GB +1GB (overhead) memory</li>
+</ul>
+
+<p>Test Data</p>
+
+<ul>
+  <li>Airline data, total 160 million rows</li>
+  <li>Cube: 10 dimensions, 5 measures (SUM)</li>
+</ul>
+
+<p>Test Scenarios</p>
+
+<ul>
+  <li>Build the cube at different source data level: 3 million, 50 million and 160 million source rows; Compare the build time with MapReduce (by layer) and Spark. No compression enabled.<br />
+The time only covers the cube-building step, not including data preparation and subsequent steps.</li>
+</ul>
+
+<p><img src="/images/blog/spark-mr-performance.png" alt="Spark vs MR performance" /></p>
+
+<p>Spark is faster than MR in all 3 scenarios, and overall it can cut the cubing time by about half.</p>
+
+<p>Now you can download a 2.0.0 beta build from Kylin’s download page, and then follow this <a href="https://kylin.apache.org/blog/2017/02/25/v2.0.0-beta-ready/">post</a> to build a cube with Spark engine. If you have any comments or inputs, please discuss in the community.</p>
+
+
+  </article>
+
+</div>
+
+
+
+
+
+				</article>
+			</div>
+		</div>		
+		<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+
+<footer id="underfooter">
+    <div class="container">
+        <div class="row">
+            <div class="col-md-12 widget">
+                <div class="widget-body" style="text-align:center">
+                    <a href="http://www.apache.org">
+                        <img id="asf-logo" alt="Apache Software Foundation" src="/assets/images/feather-small.gif">
+                    </a>
+
+                    <div>
+                        The contents of this website are © 2015 Apache Software Foundation under the terms of the <a
+                            href="http://www.apache.org/licenses/LICENSE-2.0"> Apache License v2 </a>. Apache Kylin and
+                        its logo are trademarks of the Apache Software Foundation.
+                    </div>
+
+                </div>
+            </div>
+        </div>
+        <!-- /row of widgets -->
+
+    </div>
+    <div></div>
+
+</footer>
+
+	<script src="/assets/js/jquery-1.9.1.min.js"></script> 
+	<script src="/assets/js/bootstrap.min.js"></script> 
+	<script src="/assets/js/main.js"></script>
+	</body>
+</html>
+
+
+
+

Modified: kylin/site/blog/index.html
URL: http://svn.apache.org/viewvc/kylin/site/blog/index.html?rev=1784444&r1=1784443&r2=1784444&view=diff
==============================================================================
--- kylin/site/blog/index.html (original)
+++ kylin/site/blog/index.html Sun Feb 26 13:59:58 2017
@@ -186,6 +186,12 @@
     
             <li>
         <h2 align="left" style="margin:0px">
+          <a class="post-link" href="/blog/2017/02/23/by-layer-spark-cubing/">By-layer Spark Cubing</a></h2><div align="left" class="post-meta">posted: Feb 23, 2017</div>
+        
+      </li>
+    
+            <li>
+        <h2 align="left" style="margin:0px">
           <a class="post-link" href="/cn/blog/2016/12/04/release-v1.6.0/">Apache Kylin v1.6.0 正式发布</a></h2><div align="left" class="post-meta">posted: Dec 4, 2016</div>
         
       </li>
@@ -300,13 +306,13 @@
     
             <li>
         <h2 align="left" style="margin:0px">
-          <a class="post-link" href="/blog/2016/03/16/release-v1.3.0/">Apache Kylin v1.3.0 Release Announcement</a></h2><div align="left" class="post-meta">posted: Mar 16, 2016</div>
+          <a class="post-link" href="/cn/blog/2016/03/16/release-v1.3.0/">Apache Kylin v1.3.0 正式发布</a></h2><div align="left" class="post-meta">posted: Mar 16, 2016</div>
         
       </li>
     
             <li>
         <h2 align="left" style="margin:0px">
-          <a class="post-link" href="/cn/blog/2016/03/16/release-v1.3.0/">Apache Kylin v1.3.0 正式发布</a></h2><div align="left" class="post-meta">posted: Mar 16, 2016</div>
+          <a class="post-link" href="/blog/2016/03/16/release-v1.3.0/">Apache Kylin v1.3.0 Release Announcement</a></h2><div align="left" class="post-meta">posted: Mar 16, 2016</div>
         
       </li>
     
@@ -324,13 +330,13 @@
     
             <li>
         <h2 align="left" style="margin:0px">
-          <a class="post-link" href="/cn/blog/2015/12/25/support-powerbi-tableau9/">Apache Kylin增加对Tableau 9及微软Excel, Power BI的支持</a></h2><div align="left" class="post-meta">posted: Dec 25, 2015</div>
+          <a class="post-link" href="/blog/2015/12/25/support-powerbi-tableau9/">Apache Kylin supports Tableau 9 and MS Excel, Power BI now</a></h2><div align="left" class="post-meta">posted: Dec 25, 2015</div>
         
       </li>
     
             <li>
         <h2 align="left" style="margin:0px">
-          <a class="post-link" href="/blog/2015/12/25/support-powerbi-tableau9/">Apache Kylin supports Tableau 9 and MS Excel, Power BI now</a></h2><div align="left" class="post-meta">posted: Dec 25, 2015</div>
+          <a class="post-link" href="/cn/blog/2015/12/25/support-powerbi-tableau9/">Apache Kylin增加对Tableau 9及微软Excel, Power BI的支持</a></h2><div align="left" class="post-meta">posted: Dec 25, 2015</div>
         
       </li>
     

Modified: kylin/site/feed.xml
URL: http://svn.apache.org/viewvc/kylin/site/feed.xml?rev=1784444&r1=1784443&r2=1784444&view=diff
==============================================================================
--- kylin/site/feed.xml (original)
+++ kylin/site/feed.xml Sun Feb 26 13:59:58 2017
@@ -19,8 +19,8 @@
     <description>Apache Kylin Home</description>
     <link>http://kylin.apache.org/</link>
     <atom:link href="http://kylin.apache.org/feed.xml" rel="self" type="application/rss+xml"/>
-    <pubDate>Sat, 25 Feb 2017 05:55:03 -0800</pubDate>
-    <lastBuildDate>Sat, 25 Feb 2017 05:55:03 -0800</lastBuildDate>
+    <pubDate>Sun, 26 Feb 2017 05:59:06 -0800</pubDate>
+    <lastBuildDate>Sun, 26 Feb 2017 05:59:06 -0800</lastBuildDate>
     <generator>Jekyll v2.5.3</generator>
     
       <item>
@@ -156,6 +156,73 @@
       </item>
     
       <item>
+        <title>By-layer Spark Cubing</title>
+        <description>&lt;p&gt;Before v2.0, Apache Kylin uses Hadoop MapReduce as the framework to build Cubes over huge dataset. The MapReduce framework is simple, stable and can fulfill Kylin’s need very well except the performance. In order to get better performance, we introduced the “fast cubing” algorithm in Kylin v1.5, tries to do as much as possible aggregations at map side within memory, so to avoid the disk and network I/O; but not all data models can benefit from it, and it still runs on MR which means on-disk sorting and shuffling.&lt;/p&gt;
+
+&lt;p&gt;Now Spark comes; Apache Spark is an open-source cluster-computing framework, which provides programmers with an application programming interface centered on a data structure called RDD; it runs in-memory on the cluster, this makes repeated access to the same data much faster. Spark provides flexible and fancy APIs. You are not tied to Hadoop’s MapReduce two-stage paradigm.&lt;/p&gt;
+
+&lt;p&gt;Before introducing how calculate Cube with Spark, let’s see how Kylin do that with MR; Figure 1 illustrates how a 4-dimension Cube get calculated with the classic “by-layer” algorithm: the first round MR aggregates the base (4-D) cuboid from source data; the second MR aggregates on the base cuboid to get the 3-D cuboids; With N+1 round MR all layers’ cuboids get calculated.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/blog/spark-mr-layer.png&quot; alt=&quot;MapReduce Cubing by Layer&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The “by-layer” Cubing divides a big task into a couple steps, and each step bases on the previous step’s output, so it can reuse the previous calculation and also avoid calculating from very beginning when there is a failure in between. These makes it as a reliable algorithm. When moving to Spark, we decide to keep this algorithm, that’s why we call this feature as “By layer Spark Cubing”.&lt;/p&gt;
+
+&lt;p&gt;As we know, RDD (Resilient Distributed Dataset) is a basic concept in Spark. A collection of N-Dimension cuboids can be well described as an RDD, a N-Dimension Cube will have N+1 RDD. These RDDs have the parent/child relationship as the parent can be used to generate the children. With the parent RDD cached in memory, the child RDD’s generation can be much efficient than reading from disk. Figure 2 describes this process.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/blog/spark-cubing-layer.png&quot; alt=&quot;Spark Cubing by Layer&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Figure 3 is the DAG of Cubing in Spark, it illustrates the process in detail: In “Stage 5”, Kylin uses a HiveContext to read the intermediate Hive table, and then do a “map” operation, which is an one to one map, to encode the origin values into K-V bytes. On complete Kylin gets an intermediate encoded RDD. In “Stage 6”, the intermediate RDD is aggregated with a “reduceByKey” operation to get RDD-1, which is the base cuboid. Nextly, do an “flatMap” (one to many map) on RDD-1, because the base cuboid has N children cuboids. And so on, all levels’ RDDs get calculated. These RDDs will be persisted to distributed file system on complete, but be cached in memory for next level’s calculation. When child be generated, it will be removed from cache.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/blog/spark-dag.png&quot; alt=&quot;DAG of Spark Cubing&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;We did a test to see how much performance improvement can gain from Spark:&lt;/p&gt;
+
+&lt;p&gt;Environment&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;4 nodes Hadoop cluster; each node has 28 GB RAM and 12 cores;&lt;/li&gt;
+  &lt;li&gt;YARN has 48GB RAM and 30 cores in total;&lt;/li&gt;
+  &lt;li&gt;CDH 5.8, Apache Kylin 2.0 beta.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;Spark&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Spark 1.6.3 on YARN&lt;/li&gt;
+  &lt;li&gt;6 executors, each has 4 cores, 4GB +1GB (overhead) memory&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;Test Data&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Airline data, total 160 million rows&lt;/li&gt;
+  &lt;li&gt;Cube: 10 dimensions, 5 measures (SUM)&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;Test Scenarios&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Build the cube at different source data level: 3 million, 50 million and 160 million source rows; Compare the build time with MapReduce (by layer) and Spark. No compression enabled.&lt;br /&gt;
+The time only cover the building cube step, not including data preparations and subsequent steps.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/blog/spark-mr-performance.png&quot; alt=&quot;Spark vs MR performance&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Spark is faster than MR in all the 3 scenarios, and overall it can reduce about half time in the cubing.&lt;/p&gt;
+
+&lt;p&gt;Now you can download a 2.0.0 beta build from Kylin’s download page, and then follow this &lt;a href=&quot;https://kylin.apache.org/blog/2017/02/25/v2.0.0-beta-ready/&quot;&gt;post&lt;/a&gt; to build a cube with Spark engine. If you have any comments or inputs, please discuss in the community.&lt;/p&gt;
+
+</description>
+        <pubDate>Thu, 23 Feb 2017 09:30:00 -0800</pubDate>
+        <link>http://kylin.apache.org/blog/2017/02/23/by-layer-spark-cubing/</link>
+        <guid isPermaLink="true">http://kylin.apache.org/blog/2017/02/23/by-layer-spark-cubing/</guid>
+        
+        
+        <category>blog</category>
+        
+      </item>
+    
+      <item>
         <title>Apache Kylin v1.6.0 正式发布</title>
         <description>&lt;p&gt;Apache Kylin社区非常高兴宣布Apache Kylin v1.6.0正式发布。&lt;/p&gt;
 
@@ -721,57 +788,6 @@ kylin.job.mr.config.override.mapreduce.m
         
         
         <category>blog</category>
-        
-      </item>
-    
-      <item>
-        <title>Apache Kylin v1.5.3 正式发布</title>
-        <description>&lt;p&gt;Apache Kylin社区非常高兴宣布Apache Kylin v1.5.3正式发布。&lt;/p&gt;
-
-&lt;p&gt;Apache Kylin是一个开源的分布式分析引擎，提供Hadoop之上的SQL查询接口及多维分析（OLAP）能力以支持超大规模数据，最初由eBay Inc. 开发并贡献至开源社区。&lt;/p&gt;
-
-&lt;p&gt;下载Apache Kylin v1.5.3源代码及二进制安装包，&lt;br /&gt;
-请访问&lt;a href=&quot;http://kylin.apache.org/cn/download/&quot;&gt;下载&lt;/a&gt;页面.&lt;/p&gt;
-
-&lt;p&gt;这是一个主要的版本发布带来了更稳定，健壮及更好管理的版本，Apache Kylin社区解决了84个issue，包括Bug修复，功能增强及一些新特性等。&lt;/p&gt;
-
-&lt;h2 id=&quot;section&quot;&gt;主要变化&lt;/h2&gt;
-
-&lt;ul&gt;
-  &lt;li&gt;采用标准API获取Hadoop任务的状态 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1319&quot;&gt;KYLIN-1319&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;全局的（扩展性更好的）字典编码方法 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1705&quot;&gt;KYLIN-1705&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;更稳定的精确去重(count distinct)度量 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1379&quot;&gt;KYLIN-1379&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;通过指定每个Mapper处理纪录的数量，从而提高Cube构建性能 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1656&quot;&gt;KYLIN-1656&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;在创建Hive平表时按某些列（UHC）列来分散数据 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1677&quot;&gt;KYLIN-1677&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;允许在Cube级别覆盖MR任务的属性 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1706&quot;&gt;KYLIN-1706&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;允许非管理员用户编辑修改Cube向导的“高级设置”页 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1731&quot;&gt;KYLIN-1731&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;计算全0组合（mandantory维度除外） cuboids &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1747&quot;&gt;KYLIN-1747&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;允许全部维度都是mandatory &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1749&quot;&gt;KYLIN-1749&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;修复“当连接类型时inner时不能使用view做维度表”的问题 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1789&quot;&gt;KYLIN-1789&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;HBase coprocessor出错时将Exception传回查询线程 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1645&quot;&gt;KYLIN-1645&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;精简JDBC driver的依赖 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1846&quot;&gt;KYLIN-1846&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;TopN度量支持使用非字典的编码方式 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1478&quot;&gt;KYLIN-1478&lt;/a&gt;&lt;/li&gt;
-&lt;/ul&gt;
-
-&lt;p&gt;&lt;strong&gt;升级&lt;/strong&gt;&lt;/p&gt;
-
-&lt;p&gt;参见&lt;a href=&quot;/docs15/howto/howto_upgrade.html&quot;&gt;升级指南&lt;/a&gt;.&lt;/p&gt;
-
-&lt;p&gt;&lt;strong&gt;支持&lt;/strong&gt;&lt;/p&gt;
-
-&lt;p&gt;升级和使用过程中有任何问题，请：&lt;br /&gt;
-提交至Kylin的JIRA: &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN/&quot;&gt;https://issues.apache.org/jira/browse/KYLIN/&lt;/a&gt;&lt;br /&gt;
-或者&lt;br /&gt;
-发送邮件到Apache Kylin邮件列表: &lt;a href=&quot;&amp;#109;&amp;#097;&amp;#105;&amp;#108;&amp;#116;&amp;#111;:&amp;#100;&amp;#101;&amp;#118;&amp;#064;&amp;#107;&amp;#121;&amp;#108;&amp;#105;&amp;#110;&amp;#046;&amp;#097;&amp;#112;&amp;#097;&amp;#099;&amp;#104;&amp;#101;&amp;#046;&amp;#111;&amp;#114;&amp;#103;&quot;&gt;&amp;#100;&amp;#101;&amp;#118;&amp;#064;&amp;#107;&amp;#121;&amp;#108;&amp;#105;&amp;#110;&amp;#046;&amp;#097;&amp;#112;&amp;#097;&amp;#099;&amp;#104;&amp;#101;&amp;#046;&amp;#111;&amp;#114;&amp;#103;&lt;/a&gt;&lt;/p&gt;
-
-&lt;p&gt;&lt;em&gt;感谢每一位朋友的参与和贡献!&lt;/em&gt;&lt;/p&gt;
-</description>
-        <pubDate>Thu, 28 Jul 2016 14:00:00 -0700</pubDate>
-        <link>http://kylin.apache.org/cn/blog/2016/07/28/release-v1.5.3/</link>
-        <guid isPermaLink="true">http://kylin.apache.org/cn/blog/2016/07/28/release-v1.5.3/</guid>
-        
-        
-        <category>blog</category>
         
       </item>
     

Added: kylin/site/images/blog/spark-cubing-layer.png
URL: http://svn.apache.org/viewvc/kylin/site/images/blog/spark-cubing-layer.png?rev=1784444&view=auto
==============================================================================
Binary file - no diff available.

Propchange: kylin/site/images/blog/spark-cubing-layer.png
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: kylin/site/images/blog/spark-dag.png
URL: http://svn.apache.org/viewvc/kylin/site/images/blog/spark-dag.png?rev=1784444&view=auto
==============================================================================
Binary file - no diff available.

Propchange: kylin/site/images/blog/spark-dag.png
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: kylin/site/images/blog/spark-mr-layer.png
URL: http://svn.apache.org/viewvc/kylin/site/images/blog/spark-mr-layer.png?rev=1784444&view=auto
==============================================================================
Binary file - no diff available.

Propchange: kylin/site/images/blog/spark-mr-layer.png
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: kylin/site/images/blog/spark-mr-performance.png
URL: http://svn.apache.org/viewvc/kylin/site/images/blog/spark-mr-performance.png?rev=1784444&view=auto
==============================================================================
Binary file - no diff available.

Propchange: kylin/site/images/blog/spark-mr-performance.png
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream