You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2016/03/21 03:05:11 UTC

svn commit: r1735917 - in /kylin/site: blog/2016/03/19/ blog/2016/03/19/approximate-topn-measure/ blog/2016/03/19/approximate-topn-measure/index.html blog/index.html feed.xml

Author: lidong
Date: Mon Mar 21 02:05:10 2016
New Revision: 1735917

URL: http://svn.apache.org/viewvc?rev=1735917&view=rev
Log:
add topn blog

Added:
    kylin/site/blog/2016/03/19/
    kylin/site/blog/2016/03/19/approximate-topn-measure/
    kylin/site/blog/2016/03/19/approximate-topn-measure/index.html
Modified:
    kylin/site/blog/index.html
    kylin/site/feed.xml

Added: kylin/site/blog/2016/03/19/approximate-topn-measure/index.html
URL: http://svn.apache.org/viewvc/kylin/site/blog/2016/03/19/approximate-topn-measure/index.html?rev=1735917&view=auto
==============================================================================
--- kylin/site/blog/2016/03/19/approximate-topn-measure/index.html (added)
+++ kylin/site/blog/2016/03/19/approximate-topn-measure/index.html Mon Mar 21 02:05:10 2016
@@ -0,0 +1,631 @@
+<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+<!doctype html>
+<html>
+	<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+
+  <title>Apache Kylin | Approximate Top-N support in Kylin</title>
+  <meta name="description" content="Background">
+  <meta name="author"      content="Apache Kylin">
+  <link rel="shortcut icon" href="fav.png" type="image/png">
+
+
+
+<link rel="stylesheet" href="/assets/css/animate.css">
+<!-- Bootstrap -->
+<link rel="stylesheet" href="/assets/css/bootstrap.min.css">
+
+<!-- Fonts -->
+<!-- <link rel="stylesheet" href="http://fonts.googleapis.com/css?family=Alice|Open+Sans:400,300,700"> -->
+
+<!-- Icons -->
+<link rel="stylesheet" href="/assets/css/font-awesome.min.css">
+
+  <!-- Custom styles -->
+  <link rel="stylesheet" href="/assets/css/styles.css">
+  <link rel="stylesheet" href="/assets/css/docs.css">
+  <link rel="stylesheet" href="/assets/css/pygments.css">
+
+  <link rel="canonical" href="http://kylin.apache.org/blog/2016/03/19/approximate-topn-measure/">
+  <link rel="alternate" type="application/rss+xml" title="Apache Kylin" href="http://kylin.apache.org/feed.xml" />
+
+<!--[if lt IE 9]> <script src="assets/js/html5shiv.js"></script> <![endif]-->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+  //oringal tracker for kylin.io
+  ga('create', 'UA-55534813-1', 'auto');
+  //new tracker for kylin.apache.org
+  ga('create', 'UA-55534813-2', 'auto', {'name':'toplevel'});
+
+  ga('send', 'pageview');
+  ga('toplevel.send', 'pageview');
+
+
+</script>
+<script type="text/javascript" src="/assets/js/jquery-1.9.1.min.js"></script>
+<script type="text/javascript" src="/assets/js/nside.js"></script> </script>
+<script type="text/javascript" src="/assets/js/nnav.js"></script> </script>
+</head>
+
+	<body>
+		<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+
+<header id="header" >
+  
+  <div id="head" class="parallax" parallax-speed="3" >
+    <div id="logo" class="text-center"> <img class="img-circle" id="circlelogo" src="/assets/images/kylin_logo.jpg"> <span class="title" >Apache Kylin™</span> <span class="tagline">Extreme OLAP Engine for Big Data</span> 
+    </div>
+  </div>
+  
+
+  <!-- Main Menu -->
+  <nav class="navbar navbar-default" role="navigation" id="nav-wrapper">
+  <div class="container-fluid" id="nav">
+    <!--
+    <img class="img-circle" width="40px" height="40px" id="circlelogo" src="/assets/images/kylin_logo.jpg">
+    -->
+    <!-- Brand and toggle get grouped for better mobile display -->
+    <div class="navbar-header">
+      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
+        <span class="sr-only">Toggle navigation</span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+      </button>
+     
+    </div>
+
+    <!-- Collect the nav links, forms, and other content for toggling -->
+    <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
+      <ul class="nav navbar-nav">
+     <li><a href="/">Home</a></li>
+          <li><a href="/docs15" >Docs</a></li>
+          <li><a href="/download">Download</li>
+          <li><a href="/community" >Community</a></li>
+          <li><a href="/development" >Development</a></li>
+          <li><a href="/blog">Blog</li>
+          <li><a href="/cn" >中文版</a></li>  
+          <li><a href="https://twitter.com/apachekylin" target="_blank" class="fa fa-twitter fa-lg" title="Twitter: @ApacheKylin" ></a></li>
+          <li><a href="https://github.com/apache/kylin" target="_blank" class="fa fa-github-alt fa-lg" title="Github: apache/kylin" ></a></li>          
+          <li><a href="https://www.facebook.com/kylinio" target="_blank" class="fa fa-facebook fa-lg" title="Facebook: kylin.io" ></a></li>   
+      </ul>      
+    </div><!-- /.navbar-collapse -->
+  </div><!-- /.container-fluid -->
+</nav>
+ </header>
+
+		<div class="page-content">
+			<header style=" padding:2em 0 0 0">
+			<div class="container" >
+				<h4 class="section-title"><span>Apache Kylin™ Technical Blog</span></h4>
+			</div>
+		</div>
+
+		<div class="container">
+			<div>
+				<article class="post-content" >	
+				<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+
+<div class="post" style=" padding:2em 4em 4em 4em">
+
+  <header class="post-header">
+    <h1 class="post-title">Approximate Top-N support in Kylin</h1>
+    <p class="post-meta" >Mar 19, 2016 • Shaofeng Shi</p>
+  </header>
+
+  <article class="post-content" >
+    <h2 id="background">Background</h2>
+
+<p>Find the Top-N (or Top-K) entities from a dataset is a common scenario and requirement in data minding; We often see the reports or news like “Top 100 companies in the world”, “Most popular 20 electronics sold on eBay”, etc. Exploring and analysising the top entities can always find some high value information.</p>
+
+<p>Within the era of big data, this need is much stronger than ever before, as both the raw dataset and the number of entities can be vast; Without certain pre-calculation, get the Top-K entities among a distributed big dataset may take a long time, makes the ad-hoc query inefficient.</p>
+
+<p>In v1.5.0, Apache Kylin introduces the “Top-N” measure, aiming to pre-calculate the top entities during the cube build phase; in the query phase,  Kylin can quickly fetch and return the top records. The performance would be much better than a cube without “Top-N”, giving the analyst more power to inspect data.</p>
+
+<p>Please note, this “Top-N” measure is an approximate realization, to use it well you need have a good understanding with the algorithm as well as the data distribution.</p>
+
+<h2 id="top-n-query">Top-N query</h2>
+
+<p>Let’s start with the sample table that shipped in Kylin binary package. If you haven’t run that, follow this tutorial to create it: <a href="https://kylin.apache.org/docs15/tutorial/kylin_sample.html">Quick Start with Sample Cube</a>.</p>
+
+<p>The sample fact table “default.kylin_sales” mock the transactions on an online marketplace. It has a couple of dimension and measure columns. To be simple, here we only use four: “PART_DT”, “LSTG_SITE_ID”, “SELLER_ID” and “PRICE”. Bellow table is the concept of these columns, with a rough cardinality, the “SELLER_ID” is a high cardinality column.</p>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: left">Column</th>
+      <th style="text-align: center">Description</th>
+      <th style="text-align: center">Cardinality</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: left">PART_DT</td>
+      <td style="text-align: center">Transaction Date</td>
+      <td style="text-align: center">730: two years</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">LSTG_SITE_ID</td>
+      <td style="text-align: center">Site ID, 0 represents ‘US’</td>
+      <td style="text-align: center">50</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SELLER_ID</td>
+      <td style="text-align: center">Seller ID</td>
+      <td style="text-align: center">About one million</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">PRICE</td>
+      <td style="text-align: center">Sold amount</td>
+      <td style="text-align: center">-</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Very often this online marketplace company need to identify the top sellers  (say top 100) in a given time period in some countries. The query looks like:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>SELECT SELLER_ID, SUM(PRICE) FROM KYLIN_SALES
+ WHERE 
+	PART_DT &gt;= date'2016-02-18' AND PART_DT &lt; date'2016-03-18' 
+		AND LSTG_SITE_ID in (0) 
+	group by SELLER_ID 
+	order by SUM(PRICE) DESC limit 100;
+</code></pre>
+</div>
+
+<h2 id="without-top-n-pre-calculation">Without Top-N pre-calculation</h2>
+
+<p>Before Kylin v1.5.0, all the “group by” columns need be as dimension, we come of a design that use PART_DT, LSTG_SITE_ID and SELLER_ID as dimensions, and define SUM(PRICE) as the measure. After build, the base cubiod of the cube will be like:</p>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: left">Rowkey of base cuboid</th>
+      <th style="text-align: center">SUM(PRICE)</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: left">20140318_00_seller0000001</td>
+      <td style="text-align: center">xx.xx</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">20140318_00_seller0000002</td>
+      <td style="text-align: center">xx.xx</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">…</td>
+      <td style="text-align: center">…</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">20140318_00_seller0999999</td>
+      <td style="text-align: center">xx.xx</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">20140318_01_seller0999999</td>
+      <td style="text-align: center">xx.xx</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">…</td>
+      <td style="text-align: center">…</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">…</td>
+      <td style="text-align: center">…</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">20160318_49_seller0999999</td>
+      <td style="text-align: center">xx.xx</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Assume these dimensions are independent. The number of rows in base cuboid is 730*50*1million = 36.5 billion. Other cuboids which include “SELLER_ID” will also has millions of rows. At this moment you may notice that the cube expansion rate is high, the situation would be worse if there are more dimensions or the cardinality is higher. But the real challenge is not here.</p>
+
+<p>Soon you will find the Top-N query couldn’t work, or took an unacceptable long time. Assume you want the top sellers in past 30 days in US, it need read 30 million rows from storage, aggregate and sort, finally return the top 100 ones.</p>
+
+<p>Now we see, due to no pre-calculation, although the final result set is small, the memory footprint and I/Os in between is heavy.</p>
+
+<h2 id="with-top-n-pre-calculation">With Top-N pre-calculation</h2>
+
+<p>With the Top-N measure, Kylin will pre-calculate the top entities for each dimension combination duing the cube build, saving the result (both entity ID and measure value) as a column in storage. The entity ID (“SELLER_ID” in this case) now can be moved from dimension to the measure, which doesn’t participate in the rowkey. For the sample scenario described above, the newly designed cube will have 2 dimensions (PART_DT, LSTG_SITE_ID), and 1 Top-N measure.</p>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: left">Rowkey of base cuboid</th>
+      <th style="text-align: center">Top-N measure</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: left">20140318_00</td>
+      <td style="text-align: center">seller0010091:xx.xx, seller0005002:xx.xx, …, seller0001789:xx.xx</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">20140318_01</td>
+      <td style="text-align: center">seller0032036:xx.xx, seller0010091:xx.xx, …, seller000699:xx.xx</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">…</td>
+      <td style="text-align: center">…</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">20160318_49</td>
+      <td style="text-align: center">seller0061016:xx.xx, seller0665091:xx.xx, …, seller000699:xx.xx</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>The base cuboid will have 730 * 50 = 36.5 k rows now. In the measure cell, the Top certain records be stored in a container in descending order, those tail entities have been filtered out.</p>
+
+<p>For the same query, “Top sellers in past 30 days in US” now only need read 30 rows from storage. The measure object, also called as counter containers will be further aggregated/merged at the storage side, finally only one container is returned to Kylin. Kylin extract the “SELLER_ID” and “SUM(PRICE)” from it before returns to client. The cost is much lighter than before, the performance gets highly improved.</p>
+
+<h2 id="algorithm">Algorithm</h2>
+
+<p>Kylin’s Top-N implementation referred to <a href="https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/StreamSummary.java">stream-lib</a>, which is based on the Space-Saving algorithm and the Stream-Summary data structure as described in <i>[1]Efficient Computation of Frequent and Top-k Elements in Data Streams</i> by Metwally, Agrawal, and Abbadi.</p>
+
+<p>A couple of modifications are made to let it better fit with Kylin:</p>
+
+<ul>
+  <li>Use double as the counter data type;</li>
+  <li>Simplfy the data strucutre, using one linked list for all entries;</li>
+  <li>Use a more compact serializer;</li>
+</ul>
+
+<p>Besides, in order to run SpaceSaving in parallel on Hadoop, we make it mergable with the algorithm introduced in <i>[2] A parallel space saving algorithm for frequent items and the Hurwitz zeta distribution</i>.</p>
+
+<h2 id="accuracy">Accuracy</h2>
+
+<p>Although the experiments in paper [1] has proved SpaceSaving’s efficiency and accuracy for realistic Zipfian data, it doesn’t ensure 100% correctness for all cases. SpaceSaving uses a fixed space to put the most frequent candidates; when the size exceeds the space, the tail elements will be truncated, causing data loss. The parallel algorithm will merge multiple SpaceSavings into one, at that moment for the elements appeared in one but not in the other it had some assumptions, this will also cause some data loss. Finally, the result from Top-N measure may have minor difference with the real result.</p>
+
+<p>A couple of factors can affect the accuracy:</p>
+
+<ul>
+  <li>Zipfian distribution</li>
+</ul>
+
+<p>Many rankings in the world follows the <strong>[3] Zipfian distribution</strong>, such as the population ranks of cities in various countries, corporation sizes, income rankings, etc. But the exponent of the distribution varies in different scenarios, this will affect the correctness of the result. The higher the exponent is (the distribution is more sharp), the more accurate answer will get. When using SpaceSaving, you’d better have an calculation on your data distribution.</p>
+
+<ul>
+  <li>Space in SpaceSaving</li>
+</ul>
+
+<p>As mentioned above, SpaceSaving use a small space to put the most frequent elements. Giving more space it will provide more accurate answer. For example, to calculate Top N elements, using 100 * N space would provide more accurate answer than 50 * N space. But more space will take more CPU, memory and storage, this need be balanced.</p>
+
+<ul>
+  <li>Element cardinality</li>
+</ul>
+
+<p>Element cardinality is also a factor to consider. Calculating Top 100 among 10 thousands is easiser than among 10 million.</p>
+
+<h2 id="statistics">Statistics</h2>
+
+<p>We designed a test case to calculate the top 100 elements using the parallel SpaceSaving among a data set; The element’s occurancy follows the Zipfian distribution, adjust the Zipfian exponent, space, and cardinality time to times, compare the result with the accurate result to collect the statistics, we get a rough accuracy report in below.</p>
+
+<p>The first column is the element cardinality, means among how many elements to identify the top 100 elements; The other three columns represent how much space using in the algorithm: 20X means using 2,000, 50X means use 5,000. Each cell of the table shows how many records are exactly matched with the real result. The calculation is executed in parallel with 10 threads.</p>
+
+<h3 id="test-1-calculate-top-100-in-1-million-records-zif-exponent--05">Test 1. Calculate top-100 in 1 million records, zif exponent = 0.5</h3>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: right">Element cardinality</th>
+      <th style="text-align: center">20X space</th>
+      <th style="text-align: center">50X space</th>
+      <th style="text-align: center">100X space</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: right">10,000</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">20,000</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">100,000</td>
+      <td style="text-align: center">70%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">1,000,000</td>
+      <td style="text-align: center">8%</td>
+      <td style="text-align: center">45%</td>
+      <td style="text-align: center">98%</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Test 1: More space can get better accuracy.</p>
+
+<h3 id="test-2-calculate-top-100-in-100-million-records-zif-exponent--05">Test 2. Calculate top-100 in 100 million records, zif exponent = 0.5</h3>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: right">Element cardinality</th>
+      <th style="text-align: center">20X space</th>
+      <th style="text-align: center">50X space</th>
+      <th style="text-align: center">100X space</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: right">10,000</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">20,000</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">100,000</td>
+      <td style="text-align: center">60%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">1,000,000</td>
+      <td style="text-align: center">8%</td>
+      <td style="text-align: center">56%</td>
+      <td style="text-align: center">96%</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Test 2: The data size doesn’t impact much.</p>
+
+<h3 id="test-3-calculate-top-100-in-1-million-records-zif-exponent--06">Test 3. Calculate top-100 in 1 million records, zif exponent = 0.6</h3>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: right">Element cardinality</th>
+      <th style="text-align: center">20X space</th>
+      <th style="text-align: center">50X space</th>
+      <th style="text-align: center">100X space</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: right">10,000</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">20,000</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">100,000</td>
+      <td style="text-align: center">94%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">1,000,000</td>
+      <td style="text-align: center">31%</td>
+      <td style="text-align: center">93%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Test 3: more sharp the elements distribute, the better answer it prvoides</p>
+
+<h3 id="test-4-calculate-top-100-in-1-million-records-zif-exponent--07">Test 4. Calculate top-100 in 1 million records, zif exponent = 0.7</h3>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: right">Element cardinality</th>
+      <th style="text-align: center">20X space</th>
+      <th style="text-align: center">50X space</th>
+      <th style="text-align: center">100X space</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: right">10,000</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">20,000</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">100,000</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+    <tr>
+      <td style="text-align: right">1,000,000</td>
+      <td style="text-align: center">62%</td>
+      <td style="text-align: center">100%</td>
+      <td style="text-align: center">100%</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Test 4: same conclusion as test 3.</p>
+
+<p>These statistics matches with what we expected above. It just gives us a rough estimation on the result correctness. To use this feature well in Kylin, you need know about all these variables, and do some pilots before publish it to the analysts.</p>
+
+<p>##Futher works</p>
+
+<p>This feature in v1.5.0 is a basic version, which may solve 80% cases; While it has some limitations or hard-codings that deserve your attention:</p>
+
+<ul>
+  <li>use SUM() as the default aggregation function;</li>
+  <li>sort in descending order always;</li>
+  <li>use 50X space always;</li>
+  <li>use dictionary encoding for the literal column;</li>
+  <li>the UI only allow selecting top 10, 100 and 1000;</li>
+</ul>
+
+<p>Whether or not to support more aggregations/sortings/encodings are totally based on user need. If you have any comment or suggestion, please subscribe and then drop email to our dev mailing list <a href="&#109;&#097;&#105;&#108;&#116;&#111;:&#100;&#101;&#118;&#064;&#107;&#121;&#108;&#105;&#110;&#046;&#097;&#112;&#097;&#099;&#104;&#101;&#046;&#111;&#114;&#103;">&#100;&#101;&#118;&#064;&#107;&#121;&#108;&#105;&#110;&#046;&#097;&#112;&#097;&#099;&#104;&#101;&#046;&#111;&#114;&#103;</a>, thanks for your feedbak.</p>
+
+<p>##References</p>
+
+<p>[1] <a href="https://dl.acm.org/citation.cfm?id=2131596">Efficient Computation of Frequent and Top-k Elements in Data Streams</a></p>
+
+<p>[2] <a href="http://arxiv.org/pdf/1401.0702.pdf">A parallel space saving algorithm for frequent items<br />
+and the Hurwitz zeta distribution</a></p>
+
+<p>[3] <a href="https://en.wikipedia.org/wiki/Zipf%27s_law">Zipfian law on wikipedia</a></p>
+
+  </article>
+
+</div>
+
+
+
+
+
+				</article>
+			</div>
+		</div>		
+		<!--
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+-->
+
+<footer id="underfooter">
+    <div class="container">
+        <div class="row">
+            <div class="col-md-12 widget">
+                <div class="widget-body" style="text-align:center">
+                    <a href="http://www.apache.org">
+                        <img id="asf-logo" alt="Apache Software Foundation" src="/assets/images/feather-small.gif">
+                    </a>
+
+                    <div>
+                        The contents of this website are © 2015 Apache Software Foundation under the terms of the <a
+                            href="http://www.apache.org/licenses/LICENSE-2.0"> Apache License v2 </a>. Apache Kylin and
+                        its logo are trademarks of the Apache Software Foundation.
+                    </div>
+
+                </div>
+            </div>
+        </div>
+        <!-- /row of widgets -->
+
+    </div>
+    <div></div>
+
+</footer>
+
+	<script src="/assets/js/jquery-1.9.1.min.js"></script> 
+	<script src="/assets/js/bootstrap.min.js"></script> 
+	<script src="/assets/js/main.js"></script>
+	</body>
+</html>
+
+
+
+

Modified: kylin/site/blog/index.html
URL: http://svn.apache.org/viewvc/kylin/site/blog/index.html?rev=1735917&r1=1735916&r2=1735917&view=diff
==============================================================================
--- kylin/site/blog/index.html (original)
+++ kylin/site/blog/index.html Mon Mar 21 02:05:10 2016
@@ -174,6 +174,12 @@
             
             <li>
         <h2 align="left" style="margin:0px">
+          <a class="post-link" href="/blog/2016/03/19/approximate-topn-measure/">Approximate Top-N support in Kylin</a></h2><div align="left" class="post-meta">posted: Mar 19, 2016</div>
+        
+      </li>
+    
+            <li>
+        <h2 align="left" style="margin:0px">
           <a class="post-link" href="/cn/blog/2016/03/17/release-v1.5.0/">Apache Kylin v1.5.0 正式发布</a></h2><div align="left" class="post-meta">posted: Mar 17, 2016</div>
         
       </li>
@@ -222,13 +228,13 @@
     
             <li>
         <h2 align="left" style="margin:0px">
-          <a class="post-link" href="/cn/blog/2015/12/23/release-v1.2/">Apache Kylin v1.2 正式发布</a></h2><div align="left" class="post-meta">posted: Dec 23, 2015</div>
+          <a class="post-link" href="/blog/2015/12/23/release-v1.2/">Apache Kylin v1.2 Release Announcement</a></h2><div align="left" class="post-meta">posted: Dec 23, 2015</div>
         
       </li>
     
             <li>
         <h2 align="left" style="margin:0px">
-          <a class="post-link" href="/blog/2015/12/23/release-v1.2/">Apache Kylin v1.2 Release Announcement</a></h2><div align="left" class="post-meta">posted: Dec 23, 2015</div>
+          <a class="post-link" href="/cn/blog/2015/12/23/release-v1.2/">Apache Kylin v1.2 正式发布</a></h2><div align="left" class="post-meta">posted: Dec 23, 2015</div>
         
       </li>
     

Modified: kylin/site/feed.xml
URL: http://svn.apache.org/viewvc/kylin/site/feed.xml?rev=1735917&r1=1735916&r2=1735917&view=diff
==============================================================================
--- kylin/site/feed.xml (original)
+++ kylin/site/feed.xml Mon Mar 21 02:05:10 2016
@@ -19,11 +19,402 @@
     <description>Apache Kylin Home</description>
     <link>http://kylin.apache.org/</link>
     <atom:link href="http://kylin.apache.org/feed.xml" rel="self" type="application/rss+xml"/>
-    <pubDate>Sat, 19 Mar 2016 06:59:13 -0700</pubDate>
-    <lastBuildDate>Sat, 19 Mar 2016 06:59:13 -0700</lastBuildDate>
+    <pubDate>Mon, 21 Mar 2016 03:03:44 -0700</pubDate>
+    <lastBuildDate>Mon, 21 Mar 2016 03:03:44 -0700</lastBuildDate>
     <generator>Jekyll v2.5.3</generator>
     
       <item>
+        <title>Approximate Top-N support in Kylin</title>
+        <description>&lt;h2 id=&quot;background&quot;&gt;Background&lt;/h2&gt;
+
+&lt;p&gt;Find the Top-N (or Top-K) entities from a dataset is a common scenario and requirement in data minding; We often see the reports or news like “Top 100 companies in the world”, “Most popular 20 electronics sold on eBay”, etc. Exploring and analysising the top entities can always find some high value information.&lt;/p&gt;
+
+&lt;p&gt;Within the era of big data, this need is much stronger than ever before, as both the raw dataset and the number of entities can be vast; Without certain pre-calculation, get the Top-K entities among a distributed big dataset may take a long time, makes the ad-hoc query inefficient.&lt;/p&gt;
+
+&lt;p&gt;In v1.5.0, Apache Kylin introduces the “Top-N” measure, aiming to pre-calculate the top entities during the cube build phase; in the query phase,  Kylin can quickly fetch and return the top records. The performance would be much better than a cube without “Top-N”, giving the analyst more power to inspect data.&lt;/p&gt;
+
+&lt;p&gt;Please note, this “Top-N” measure is an approximate realization, to use it well you need have a good understanding with the algorithm as well as the data distribution.&lt;/p&gt;
+
+&lt;h2 id=&quot;top-n-query&quot;&gt;Top-N query&lt;/h2&gt;
+
+&lt;p&gt;Let’s start with the sample table that shipped in Kylin binary package. If you haven’t run that, follow this tutorial to create it: &lt;a href=&quot;https://kylin.apache.org/docs15/tutorial/kylin_sample.html&quot;&gt;Quick Start with Sample Cube&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;The sample fact table “default.kylin_sales” mock the transactions on an online marketplace. It has a couple of dimension and measure columns. To be simple, here we only use four: “PART_DT”, “LSTG_SITE_ID”, “SELLER_ID” and “PRICE”. Bellow table is the concept of these columns, with a rough cardinality, the “SELLER_ID” is a high cardinality column.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: left&quot;&gt;Column&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Description&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Cardinality&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;PART_DT&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;Transaction Date&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;730: two years&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;LSTG_SITE_ID&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;Site ID, 0 represents ‘US’&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;50&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;SELLER_ID&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;Seller ID&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;About one million&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;PRICE&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;Sold amount&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;-&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Very often this online marketplace company need to identify the top sellers  (say top 100) in a given time period in some countries. The query looks like:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;SELECT SELLER_ID, SUM(PRICE) FROM KYLIN_SALES
+ WHERE 
+	PART_DT &amp;gt;= date&#39;2016-02-18&#39; AND PART_DT &amp;lt; date&#39;2016-03-18&#39; 
+		AND LSTG_SITE_ID in (0) 
+	group by SELLER_ID 
+	order by SUM(PRICE) DESC limit 100;
+&lt;/code&gt;&lt;/pre&gt;
+&lt;/div&gt;
+
+&lt;h2 id=&quot;without-top-n-pre-calculation&quot;&gt;Without Top-N pre-calculation&lt;/h2&gt;
+
+&lt;p&gt;Before Kylin v1.5.0, all the “group by” columns need be as dimension, we come of a design that use PART_DT, LSTG_SITE_ID and SELLER_ID as dimensions, and define SUM(PRICE) as the measure. After build, the base cubiod of the cube will be like:&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: left&quot;&gt;Rowkey of base cuboid&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;SUM(PRICE)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;20140318_00_seller0000001&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;xx.xx&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;20140318_00_seller0000002&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;xx.xx&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;…&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;…&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;20140318_00_seller0999999&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;xx.xx&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;20140318_01_seller0999999&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;xx.xx&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;…&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;…&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;…&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;…&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;20160318_49_seller0999999&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;xx.xx&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Assume these dimensions are independent. The number of rows in base cuboid is 730*50*1million = 36.5 billion. Other cuboids which include “SELLER_ID” will also has millions of rows. At this moment you may notice that the cube expansion rate is high, the situation would be worse if there are more dimensions or the cardinality is higher. But the real challenge is not here.&lt;/p&gt;
+
+&lt;p&gt;Soon you will find the Top-N query couldn’t work, or took an unacceptable long time. Assume you want the top sellers in past 30 days in US, it need read 30 million rows from storage, aggregate and sort, finally return the top 100 ones.&lt;/p&gt;
+
+&lt;p&gt;Now we see, due to no pre-calculation, although the final result set is small, the memory footprint and I/Os in between is heavy.&lt;/p&gt;
+
+&lt;h2 id=&quot;with-top-n-pre-calculation&quot;&gt;With Top-N pre-calculation&lt;/h2&gt;
+
+&lt;p&gt;With the Top-N measure, Kylin will pre-calculate the top entities for each dimension combination duing the cube build, saving the result (both entity ID and measure value) as a column in storage. The entity ID (“SELLER_ID” in this case) now can be moved from dimension to the measure, which doesn’t participate in the rowkey. For the sample scenario described above, the newly designed cube will have 2 dimensions (PART_DT, LSTG_SITE_ID), and 1 Top-N measure.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: left&quot;&gt;Rowkey of base cuboid&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Top-N measure&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;20140318_00&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;seller0010091:xx.xx, seller0005002:xx.xx, …, seller0001789:xx.xx&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;20140318_01&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;seller0032036:xx.xx, seller0010091:xx.xx, …, seller000699:xx.xx&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;…&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;…&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: left&quot;&gt;20160318_49&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;seller0061016:xx.xx, seller0665091:xx.xx, …, seller000699:xx.xx&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;The base cuboid will have 730 * 50 = 36.5 k rows now. In the measure cell, the Top certain records be stored in a container in descending order, those tail entities have been filtered out.&lt;/p&gt;
+
+&lt;p&gt;For the same query, “Top sellers in past 30 days in US” now only need read 30 rows from storage. The measure object, also called as counter containers will be further aggregated/merged at the storage side, finally only one container is returned to Kylin. Kylin extract the “SELLER_ID” and “SUM(PRICE)” from it before returns to client. The cost is much lighter than before, the performance gets highly improved.&lt;/p&gt;
+
+&lt;h2 id=&quot;algorithm&quot;&gt;Algorithm&lt;/h2&gt;
+
+&lt;p&gt;Kylin’s Top-N implementation referred to &lt;a href=&quot;https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/StreamSummary.java&quot;&gt;stream-lib&lt;/a&gt;, which is based on the Space-Saving algorithm and the Stream-Summary data structure as described in &lt;i&gt;[1]Efficient Computation of Frequent and Top-k Elements in Data Streams&lt;/i&gt; by Metwally, Agrawal, and Abbadi.&lt;/p&gt;
+
+&lt;p&gt;A couple of modifications are made to let it better fit with Kylin:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Use double as the counter data type;&lt;/li&gt;
+  &lt;li&gt;Simplfy the data strucutre, using one linked list for all entries;&lt;/li&gt;
+  &lt;li&gt;Use a more compact serializer;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;Besides, in order to run SpaceSaving in parallel on Hadoop, we make it mergable with the algorithm introduced in &lt;i&gt;[2] A parallel space saving algorithm for frequent items and the Hurwitz zeta distribution&lt;/i&gt;.&lt;/p&gt;
+
+&lt;h2 id=&quot;accuracy&quot;&gt;Accuracy&lt;/h2&gt;
+
+&lt;p&gt;Although the experiments in paper [1] has proved SpaceSaving’s efficiency and accuracy for realistic Zipfian data, it doesn’t ensure 100% correctness for all cases. SpaceSaving uses a fixed space to put the most frequent candidates; when the size exceeds the space, the tail elements will be truncated, causing data loss. The parallel algorithm will merge multiple SpaceSavings into one, at that moment for the elements appeared in one but not in the other it had some assumptions, this will also cause some data loss. Finally, the result from Top-N measure may have minor difference with the real result.&lt;/p&gt;
+
+&lt;p&gt;A couple of factors can affect the accuracy:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Zipfian distribution&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;Many rankings in the world follows the &lt;strong&gt;[3] Zipfian distribution&lt;/strong&gt;, such as the population ranks of cities in various countries, corporation sizes, income rankings, etc. But the exponent of the distribution varies in different scenarios, this will affect the correctness of the result. The higher the exponent is (the distribution is more sharp), the more accurate answer will get. When using SpaceSaving, you’d better have an calculation on your data distribution.&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Space in SpaceSaving&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;As mentioned above, SpaceSaving use a small space to put the most frequent elements. Giving more space it will provide more accurate answer. For example, to calculate Top N elements, using 100 * N space would provide more accurate answer than 50 * N space. But more space will take more CPU, memory and storage, this need be balanced.&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Element cardinality&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;Element cardinality is also a factor to consider. Calculating Top 100 among 10 thousands is easiser than among 10 million.&lt;/p&gt;
+
+&lt;h2 id=&quot;statistics&quot;&gt;Statistics&lt;/h2&gt;
+
+&lt;p&gt;We designed a test case to calculate the top 100 elements using the parallel SpaceSaving among a data set; The element’s occurancy follows the Zipfian distribution, adjust the Zipfian exponent, space, and cardinality time to times, compare the result with the accurate result to collect the statistics, we get a rough accuracy report in below.&lt;/p&gt;
+
+&lt;p&gt;The first column is the element cardinality, means among how many elements to identify the top 100 elements; The other three columns represent how much space using in the algorithm: 20X means using 2,000, 50X means use 5,000. Each cell of the table shows how many records are exactly matched with the real result. The calculation is executed in parallel with 10 threads.&lt;/p&gt;
+
+&lt;h3 id=&quot;test-1-calculate-top-100-in-1-million-records-zif-exponent--05&quot;&gt;Test 1. Calculate top-100 in 1 million records, zif exponent = 0.5&lt;/h3&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: right&quot;&gt;Element cardinality&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;20X space&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;50X space&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;100X space&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;10,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;20,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;100,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;70%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;1,000,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;45%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;98%&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Test 1: More space can get better accuracy.&lt;/p&gt;
+
+&lt;h3 id=&quot;test-2-calculate-top-100-in-100-million-records-zif-exponent--05&quot;&gt;Test 2. Calculate top-100 in 100 million records, zif exponent = 0.5&lt;/h3&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: right&quot;&gt;Element cardinality&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;20X space&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;50X space&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;100X space&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;10,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;20,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;100,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;60%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;1,000,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;56%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;96%&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Test 2: The data size doesn’t impact much.&lt;/p&gt;
+
+&lt;h3 id=&quot;test-3-calculate-top-100-in-1-million-records-zif-exponent--06&quot;&gt;Test 3. Calculate top-100 in 1 million records, zif exponent = 0.6&lt;/h3&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: right&quot;&gt;Element cardinality&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;20X space&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;50X space&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;100X space&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;10,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;20,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;100,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;94%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;1,000,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;31%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;93%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Test 3: more sharp the elements distribute, the better answer it prvoides&lt;/p&gt;
+
+&lt;h3 id=&quot;test-4-calculate-top-100-in-1-million-records-zif-exponent--07&quot;&gt;Test 4. Calculate top-100 in 1 million records, zif exponent = 0.7&lt;/h3&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: right&quot;&gt;Element cardinality&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;20X space&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;50X space&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;100X space&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;10,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;20,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;100,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: right&quot;&gt;1,000,000&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;62%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;100%&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Test 4: same conclusion as test 3.&lt;/p&gt;
+
+&lt;p&gt;These statistics matches with what we expected above. It just gives us a rough estimation on the result correctness. To use this feature well in Kylin, you need know about all these variables, and do some pilots before publish it to the analysts.&lt;/p&gt;
+
+&lt;p&gt;##Futher works&lt;/p&gt;
+
+&lt;p&gt;This feature in v1.5.0 is a basic version, which may solve 80% cases; While it has some limitations or hard-codings that deserve your attention:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;use SUM() as the default aggregation function;&lt;/li&gt;
+  &lt;li&gt;sort in descending order always;&lt;/li&gt;
+  &lt;li&gt;use 50X space always;&lt;/li&gt;
+  &lt;li&gt;use dictionary encoding for the literal column;&lt;/li&gt;
+  &lt;li&gt;the UI only allow selecting top 10, 100 and 1000;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;Whether or not to support more aggregations/sortings/encodings are totally based on user need. If you have any comment or suggestion, please subscribe and then drop email to our dev mailing list &lt;a href=&quot;&amp;#109;&amp;#097;&amp;#105;&amp;#108;&amp;#116;&amp;#111;:&amp;#100;&amp;#101;&amp;#118;&amp;#064;&amp;#107;&amp;#121;&amp;#108;&amp;#105;&amp;#110;&amp;#046;&amp;#097;&amp;#112;&amp;#097;&amp;#099;&amp;#104;&amp;#101;&amp;#046;&amp;#111;&amp;#114;&amp;#103;&quot;&gt;&amp;#100;&amp;#101;&amp;#118;&amp;#064;&amp;#107;&amp;#121;&amp;#108;&amp;#105;&amp;#110;&amp;#046;&amp;#097;&amp;#112;&amp;#097;&amp;#099;&amp;#104;&amp;#101;&amp;#046;&amp;#111;&amp;#114;&amp;#103;&lt;/a&gt;, thanks for your feedbak.&lt;/p&gt;
+
+&lt;p&gt;##References&lt;/p&gt;
+
+&lt;p&gt;[1] &lt;a href=&quot;https://dl.acm.org/citation.cfm?id=2131596&quot;&gt;Efficient Computation of Frequent and Top-k Elements in Data Streams&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;[2] &lt;a href=&quot;http://arxiv.org/pdf/1401.0702.pdf&quot;&gt;A parallel space saving algorithm for frequent items&lt;br /&gt;
+and the Hurwitz zeta distribution&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;[3] &lt;a href=&quot;https://en.wikipedia.org/wiki/Zipf%27s_law&quot;&gt;Zipfian law on wikipedia&lt;/a&gt;&lt;/p&gt;
+</description>
+        <pubDate>Sat, 19 Mar 2016 09:30:00 -0700</pubDate>
+        <link>http://kylin.apache.org/blog/2016/03/19/approximate-topn-measure/</link>
+        <guid isPermaLink="true">http://kylin.apache.org/blog/2016/03/19/approximate-topn-measure/</guid>
+        
+        
+        <category>blog</category>
+        
+      </item>
+    
+      <item>
         <title>Apache Kylin v1.5.0 正式发布</title>
         <description>&lt;p&gt;Apache Kylin社区非常高兴宣布Apache Kylin v1.5.0正式发布。&lt;/p&gt;
 
@@ -527,67 +918,6 @@ With sub-seconds query latency feature o
         
         
         <category>blog</category>
-        
-      </item>
-    
-      <item>
-        <title>Apache Kylin v1.2 Release Announcement</title>
-        <description>&lt;p&gt;The Apache Kylin community is pleased to announce the release of Apache Kylin v1.2, the first release after graduation.&lt;/p&gt;
-
-&lt;p&gt;Apache Kylin is an open source Distributed Analytics Engine designed to provide SQL interface and multi-dimensional analysis (OLAP) on Hadoop supporting extremely large datasets, original contributed from eBay Inc.&lt;/p&gt;
-
-&lt;p&gt;To download Apache Kylin v1.2 source code or binary package: &lt;br /&gt;
-please visit the &lt;a href=&quot;http://kylin.apache.org/download&quot;&gt;download&lt;/a&gt; page.&lt;/p&gt;
-
-&lt;p&gt;This is a major release which brings more stable, robust and well management version, Apache Kylin community resolved about 44 issues including bug fixes, improvements, and few new features.&lt;/p&gt;
-
-&lt;h2 id=&quot;change-highlights&quot;&gt;Change Highlights&lt;/h2&gt;
-
-&lt;p&gt;&lt;strong&gt;Kylin Core Improvement&lt;/strong&gt;&lt;/p&gt;
-
-&lt;ul&gt;
-  &lt;li&gt;Support Excel, Power BI and Tableau 9.1 &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-596&quot;&gt;KYLIN-596&lt;/a&gt;,&lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1065&quot;&gt;KYLIN-1065&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Improve small file management on HDFS &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-702&quot;&gt;KYLIN-702&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Env shell script enhance for Hive HCatalog &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1081&quot;&gt;KYLIN-1081&lt;/a&gt;, &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1119&quot;&gt;KYLIN-1119&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Dimenion column supports high cardinality over 10 million &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1099&quot;&gt;KYLIN-1099&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Enhance job page loading performance &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1154&quot;&gt;KYLIN-1154&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Make memory budget per query configurable &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1190&quot;&gt;KYLIN-1190&lt;/a&gt;&lt;/li&gt;
-&lt;/ul&gt;
-
-&lt;p&gt;&lt;strong&gt;Main Bug Fixes&lt;/strong&gt;&lt;/p&gt;
-
-&lt;ul&gt;
-  &lt;li&gt;Save cube issue in edit model &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1168&quot;&gt;KYLIN-1168&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Couldn’t change a cube’s name after it be created &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-693&quot;&gt;KYLIN-693&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Cube list missing under project &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-930&quot;&gt;KYLIN-930&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Error when join two sub-query &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1033&quot;&gt;KYLIN-1033&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Filter like (A or false) yields wrong result &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1039&quot;&gt;KYLIN-1039&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Support get MapReduce Job status for ResourceManager HA Env &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1067&quot;&gt;KYLIN-1067&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Can not send email caused by Build Base Cuboid Data step failed &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1106&quot;&gt;KYLIN-1106&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;ResourceTool download/upload does not work in binary package &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1121&quot;&gt;KYLIN-1121&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Kylin’s sample cube “kylin_sales_cube” couldn’t be saved &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1140&quot;&gt;KYLIN-1140&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Unit test with minicluster doesn’t work on 1.x &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1155&quot;&gt;KYLIN-1155&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Can’t parse DateFormat like ‘YYYYMMDD’ correctly in query &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN-1216&quot;&gt;KYLIN-1216&lt;/a&gt;&lt;/li&gt;
-&lt;/ul&gt;
-
-&lt;p&gt;&lt;strong&gt;Upgrade&lt;/strong&gt;  &lt;br /&gt;
-We recommend to upgrade to this version for better performance, stability and bug fixes.&lt;br /&gt;
-Also to keep up to date with community with latest features and supports.&lt;/p&gt;
-
-&lt;p&gt;&lt;strong&gt;Support&lt;/strong&gt;  &lt;br /&gt;
-Any issue or question during upgrade, please &lt;br /&gt;
-open JIRA to Kylin project: &lt;a href=&quot;https://issues.apache.org/jira/browse/KYLIN/&quot;&gt;https://issues.apache.org/jira/browse/KYLIN/&lt;/a&gt;  &lt;br /&gt;
-or  &lt;br /&gt;
-send mail to Apache Kylin dev mailing list: &lt;a href=&quot;&amp;#109;&amp;#097;&amp;#105;&amp;#108;&amp;#116;&amp;#111;:&amp;#100;&amp;#101;&amp;#118;&amp;#064;&amp;#107;&amp;#121;&amp;#108;&amp;#105;&amp;#110;&amp;#046;&amp;#097;&amp;#112;&amp;#097;&amp;#099;&amp;#104;&amp;#101;&amp;#046;&amp;#111;&amp;#114;&amp;#103;&quot;&gt;&amp;#100;&amp;#101;&amp;#118;&amp;#064;&amp;#107;&amp;#121;&amp;#108;&amp;#105;&amp;#110;&amp;#046;&amp;#097;&amp;#112;&amp;#097;&amp;#099;&amp;#104;&amp;#101;&amp;#046;&amp;#111;&amp;#114;&amp;#103;&lt;/a&gt;&lt;/p&gt;
-
-&lt;p&gt;&lt;em&gt;Great thanks to everyone who contributed!&lt;/em&gt;&lt;/p&gt;
-</description>
-        <pubDate>Wed, 23 Dec 2015 14:28:00 -0800</pubDate>
-        <link>http://kylin.apache.org/blog/2015/12/23/release-v1.2/</link>
-        <guid isPermaLink="true">http://kylin.apache.org/blog/2015/12/23/release-v1.2/</guid>
-        
-        
-        <category>blog</category>
         
       </item>