Posted to commits@hudi.apache.org by vi...@apache.org on 2019/03/14 05:25:26 UTC

[incubator-hudi] 03/03: Refreshing site content based on docs folder

This is an automated email from the ASF dual-hosted git repository.

vinoth pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/incubator-hudi.git

commit c7d3523ee78e7744a85dbf82ad0ebf815a535524
Author: Vinoth Chandar <vi...@uber.com>
AuthorDate: Wed Mar 13 22:23:23 2019 -0700

    Refreshing site content based on docs folder
---
 content/404.html                               |  32 +-
 content/admin_guide.html                       |  52 +-
 content/community.html                         |  48 +-
 content/comparison.html                        |  32 +-
 content/concepts.html                          |  32 +-
 content/configurations.html                    |  86 +---
 content/contributing.html                      |  32 +-
 content/feed.xml                               |   4 +-
 content/gcs_hoodie.html                        |  32 +-
 content/index.html                             |  32 +-
 content/js/mydoc_scroll.html                   |  32 +-
 content/migration_guide.html                   |  32 +-
 content/news_archive.html                      |  32 +-
 content/{privacy.html => performance.html}     | 150 +++---
 content/powered_by.html                        |  32 +-
 content/privacy.html                           |  32 +-
 content/{s3_hoodie.html => querying_data.html} | 126 ++---
 content/quickstart.html                        |  63 +--
 content/s3_hoodie.html                         |  32 +-
 content/search.json                            |  62 +--
 content/sitemap.xml                            |  22 +-
 content/strata-talk.html                       |  32 +-
 content/use_cases.html                         |  32 +-
 content/writing_data.html                      | 678 +++++++++++++++++++++++++
 24 files changed, 1060 insertions(+), 679 deletions(-)

diff --git a/content/404.html b/content/404.html
index fedef9b..ce0faf8 100644
--- a/content/404.html
+++ b/content/404.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/admin_guide.html b/content/admin_guide.html
index 3625cee..c7cf00f 100644
--- a/content/admin_guide.html
+++ b/content/admin_guide.html
@@ -5,7 +5,7 @@
 <meta name="viewport" content="width=device-width, initial-scale=1">
 <meta name="description" content="This section offers an overview of tools available to operate an ecosystem of Hudi datasets">
 <meta name="keywords" content="hudi, administration, operation, devops">
-<title>Admin Guide | Hudi</title>
+<title>Administering Hudi Pipelines | Hudi</title>
 <link rel="stylesheet" href="css/syntax.css">
 
 
@@ -162,7 +162,7 @@
 
 
 
-  <a class="email" title="Submit feedback" href="#" onclick="javascript:window.location='mailto:dev@hudi.apache.org?subject=Hudi Documentation feedback&body=I have some feedback about the Admin Guide page: ' + window.location.href;"><i class="fa fa-envelope-o"></i> Feedback</a>
+  <a class="email" title="Submit feedback" href="#" onclick="javascript:window.location='mailto:dev@hudi.apache.org?subject=Hudi Documentation feedback&body=I have some feedback about the Administering Hudi Pipelines page: ' + window.location.href;"><i class="fa fa-envelope-o"></i> Feedback</a>
 
 <li>
 
@@ -180,7 +180,7 @@
                                 searchInput: document.getElementById('search-input'),
                                 resultsContainer: document.getElementById('results-container'),
                                 dataSource: 'search.json',
-                                searchResultTemplate: '<li><a href="{url}" title="Admin Guide">{title}</a></li>',
+                                searchResultTemplate: '<li><a href="{url}" title="Administering Hudi Pipelines">{title}</a></li>',
                     noResultsText: 'No results found.',
                             limit: 10,
                             fuzzy: true,
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li class="active"><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li class="active"><a href="admin_guide.html">Administering</a></li>
             
             
             
@@ -331,7 +317,7 @@
     <!-- Content Column -->
     <div class="col-md-9">
         <div class="post-header">
-   <h1 class="post-title-main">Admin Guide</h1>
+   <h1 class="post-title-main">Administering Hudi Pipelines</h1>
 </div>
 
 
@@ -350,12 +336,12 @@
   <p>Admins/ops can gain visibility into Hudi datasets/pipelines in the following ways</p>
 
 <ul>
-  <li>Administering via the Admin CLI</li>
-  <li>Graphite metrics</li>
-  <li>Spark UI of the Hudi Application</li>
+  <li><a href="#admin-cli">Administering via the Admin CLI</a></li>
+  <li><a href="#metrics">Graphite metrics</a></li>
+  <li><a href="#spark-ui">Spark UI of the Hudi Application</a></li>
 </ul>
 
-<p>This section provides a glimpse into each of these, with some general guidance on troubleshooting</p>
+<p>This section provides a glimpse into each of these, with some general guidance on <a href="#troubleshooting">troubleshooting</a></p>
 
 <h2 id="admin-cli">Admin CLI</h2>
 
@@ -728,7 +714,7 @@ Compaction successfully repaired
 
 <figure><img class="docimage" src="images/hudi_commit_duration.png" alt="hudi_commit_duration.png" style="max-width: 1000px" /></figure>
 
-<h2 id="troubleshooting-failures">Troubleshooting Failures</h2>
+<h2 id="troubleshooting">Troubleshooting Failures</h2>
 
 <p>Section below generally aids in debugging Hudi failures. Off the bat, the following metadata is added to every record to help triage  issues easily using standard Hadoop SQL engines (Hive/Presto/Spark)</p>
 
@@ -756,7 +742,7 @@ If you do find errors, then the record was not actually written by Hudi, but han
   <li>if duplicates span multiple files within the same partitionpath, please engage with mailing list. This should not happen. You can use the <code class="highlighter-rouge">records deduplicate</code> command to fix your data.</li>
 </ul>
 
-<h4 id="spark-failures">Spark failures</h4>
+<h4 id="spark-ui">Spark failures</h4>
 
 <p>Typical upsert() DAG looks like below. Note that Hudi client also caches intermediate RDDs to intelligently profile workload and size files and spark parallelism.
 Also Spark UI shows sortByKey twice due to the probe job also being shown, nonetheless its just a single sort.</p>
diff --git a/content/community.html b/content/community.html
index 34196f3..991cc39 100644
--- a/content/community.html
+++ b/content/community.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="configurations.html">Configurations</a></li>
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
+            <li><a href="performance.html">Performance</a></li>
             
             
             
             
             
             
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
@@ -363,7 +349,7 @@
     </tr>
     <tr>
       <td>For reporting bugs or issues or discover known issues</td>
-      <td>Please use <a href="https://issues.apache.org/jira/projects/HUDI/summary">ASF Hudi JIRA</a></td>
+      <td>Please use <a href="https://issues.apache.org/jira/projects/HUDI/summary">ASF Hudi JIRA</a>. See <a href="#accounts">#here</a> for access</td>
     </tr>
     <tr>
       <td>For quick pings &amp; 1-1 chats</td>
@@ -371,7 +357,7 @@
     </tr>
     <tr>
       <td>For proposing large features, changes</td>
-      <td>Start a Hudi Improvement Process (HIP). Instructions coming soon.</td>
+      <td>Start a Hudi Improvement Process (HIP). Instructions coming soon. See <a href="#accounts">#here</a> for access</td>
     </tr>
     <tr>
       <td>For stream of commits, pull requests etc</td>
@@ -399,11 +385,21 @@ Apache Hudi follows the typical Apache vulnerability handling <a href="https://a
   <li>Contributing code to the project (<a href="https://issues.apache.org/jira/issues/?jql=project+%3D+HUDI+AND+component+%3D+newbie">newbie JIRAs</a>)</li>
 </ul>
 
-<h4 id="code-contributions">Code Contributions</h4>
+<h3 id="code-contributions">Code Contributions</h3>
 
 <p>Useful resources for contributing can be found under the “Developers” top menu.
 Specifically, please refer to the detailed <a href="contributing.html">contribution guide</a>.</p>
 
+<h2 id="accounts">Accounts</h2>
+
+<p>It’s useful to obtain few accounts to be able to effectively contribute to Hudi.</p>
+
+<ul>
+  <li>Github account is needed to send pull requests to Hudi</li>
+  <li>Sign-up/in to the Apache <a href="https://issues.apache.org/jira">JIRA</a>. Then please email the dev mailing list with your username, asking to be added as a contributor to the project. This enables you to assign/be-assigned tickets and comment on them.</li>
+  <li>Sign-up/in to the Apache <a href="https://cwiki.apache.org/confluence/signup.action">cWiki</a>, to be able to contribute to the wiki pages/HIPs.</li>
+</ul>
+
 
     <div class="tags">
         
diff --git a/content/comparison.html b/content/comparison.html
index 59bcf75..21a4e6f 100644
--- a/content/comparison.html
+++ b/content/comparison.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li class="active"><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li class="active"><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/concepts.html b/content/concepts.html
index 22754c4..a7578e6 100644
--- a/content/concepts.html
+++ b/content/concepts.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/configurations.html b/content/configurations.html
index 73f66c9..b6a8d9d 100644
--- a/content/configurations.html
+++ b/content/configurations.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
-            
-            
-            
-            <li class="active"><a href="configurations.html">Configurations</a></li>
-            
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
             
+            <li class="active"><a href="configurations.html">Configuration</a></li>
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
-            <li><a href="admin_guide.html">Admin Guide</a></li>
             
             
             
             
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
@@ -705,60 +691,6 @@ Property: <code class="highlighter-rouge">hoodie.consistency.check.enabled</code
   </li>
 </ul>
 
-<h3 id="tuning">Tuning</h3>
-
-<p>Writing data via Hudi happens as a Spark job and thus general rules of spark debugging applies here too. Below is a list of things to keep in mind, if you are looking to improving performance or reliability.</p>
-
-<p><strong>Input Parallelism</strong> : By default, Hudi tends to over-partition input (i.e <code class="highlighter-rouge">withParallelism(1500)</code>), to ensure each Spark partition stays within the 2GB limit for inputs upto 500GB. Bump this up accordingly if you have larger inputs. We recommend having shuffle parallelism <code class="highlighter-rouge">hoodie.[insert|upsert|bulkinsert].shuffle.parallelism</code> such that its atleast input_data_size/500MB</p>
-
-<p><strong>Off-heap memory</strong> : Hudi writes parquet files and that needs good amount of off-heap memory proportional to schema width. Consider setting something like <code class="highlighter-rouge">spark.yarn.executor.memoryOverhead</code> or <code class="highlighter-rouge">spark.yarn.driver.memoryOverhead</code>, if you are running into such failures.</p>
-
-<p><strong>Spark Memory</strong> : Typically, hudi needs to be able to read a single file into memory to perform merges or compactions and thus the executor memory should be sufficient to accomodate this. In addition, Hoodie caches the input to be able to intelligently place data and thus leaving some <code class="highlighter-rouge">spark.storage.memoryFraction</code> will generally help boost performance.</p>
-
-<p><strong>Sizing files</strong> : Set <code class="highlighter-rouge">limitFileSize</code> above judiciously, to balance ingest/write latency vs number of files &amp; consequently metadata overhead associated with it.</p>
-
-<p><strong>Timeseries/Log data</strong> : Default configs are tuned for database/nosql changelogs where individual record sizes are large. Another very popular class of data is timeseries/event/log data that tends to be more volumnious with lot more records per partition. In such cases
-    - Consider tuning the bloom filter accuracy via <code class="highlighter-rouge">.bloomFilterFPP()/bloomFilterNumEntries()</code> to achieve your target index look up time
-    - Consider making a key that is prefixed with time of the event, which will enable range pruning &amp; significantly speeding up index lookup.</p>
-
-<p><strong>GC Tuning</strong> : Please be sure to follow garbage collection tuning tips from Spark tuning guide to avoid OutOfMemory errors
-[Must] Use G1/CMS Collector. Sample CMS Flags to add to spark.executor.extraJavaOptions :</p>
-
-<div class="highlighter-rouge"><pre class="highlight"><code>-XX:NewSize=1g -XX:SurvivorRatio=2 -XX:+UseCompressedOops -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:CMSInitiatingOccupancyFraction=70 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintTenuringDistribution -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/hoodie-heapdump.hprof
-</code></pre>
-</div>
-
-<p>If it keeps OOMing still, reduce spark memory conservatively: <code class="highlighter-rouge">spark.memory.fraction=0.2, spark.memory.storageFraction=0.2</code> allowing it to spill rather than OOM. (reliably slow vs crashing intermittently)</p>
-
-<p>Below is a full working production config</p>
-
-<div class="highlighter-rouge"><pre class="highlight"><code> spark.driver.extraClassPath    /etc/hive/conf
- spark.driver.extraJavaOptions    -XX:+PrintTenuringDistribution -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/hoodie-heapdump.hprof
- spark.driver.maxResultSize    2g
- spark.driver.memory    4g
- spark.executor.cores    1
- spark.executor.extraJavaOptions    -XX:+PrintFlagsFinal -XX:+PrintReferenceGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintAdaptiveSizePolicy -XX:+UnlockDiagnosticVMOptions -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/hoodie-heapdump.hprof
- spark.executor.id    driver
- spark.executor.instances    300
- spark.executor.memory    6g
- spark.rdd.compress true
-
- spark.kryoserializer.buffer.max    512m
- spark.serializer    org.apache.spark.serializer.KryoSerializer
- spark.shuffle.memoryFraction    0.2
- spark.shuffle.service.enabled    true
- spark.sql.hive.convertMetastoreParquet    false
- spark.storage.memoryFraction    0.6
- spark.submit.deployMode    cluster
- spark.task.cpus    1
- spark.task.maxFailures    4
-
- spark.yarn.driver.memoryOverhead    1024
- spark.yarn.executor.memoryOverhead    3072
- spark.yarn.max.executor.failures    100
-
-</code></pre>
-</div>
 
 
     <div class="tags">
diff --git a/content/contributing.html b/content/contributing.html
index 9c9e61d..b875ff1 100644
--- a/content/contributing.html
+++ b/content/contributing.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/feed.xml b/content/feed.xml
index cd76d50..a6dfac9 100644
--- a/content/feed.xml
+++ b/content/feed.xml
@@ -5,8 +5,8 @@
         <description>Apache Hudi (pronounced “Hoodie”) provides upserts and incremental processing capaibilities on Big Data</description>
         <link>http://0.0.0.0:4000/</link>
         <atom:link href="http://0.0.0.0:4000/feed.xml" rel="self" type="application/rss+xml"/>
-        <pubDate>Sat, 09 Mar 2019 21:08:53 +0000</pubDate>
-        <lastBuildDate>Sat, 09 Mar 2019 21:08:53 +0000</lastBuildDate>
+        <pubDate>Thu, 14 Mar 2019 05:18:29 +0000</pubDate>
+        <lastBuildDate>Thu, 14 Mar 2019 05:18:29 +0000</lastBuildDate>
         <generator>Jekyll v3.3.1</generator>
         
         <item>
diff --git a/content/gcs_hoodie.html b/content/gcs_hoodie.html
index cb96011..149a8b1 100644
--- a/content/gcs_hoodie.html
+++ b/content/gcs_hoodie.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/index.html b/content/index.html
index 1a1c5ff..56fdd90 100644
--- a/content/index.html
+++ b/content/index.html
@@ -222,28 +222,28 @@
             
             
             
-            <li class="active"><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/js/mydoc_scroll.html b/content/js/mydoc_scroll.html
index ee70719..45111cc 100644
--- a/content/js/mydoc_scroll.html
+++ b/content/js/mydoc_scroll.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/migration_guide.html b/content/migration_guide.html
index 03ea8a1..84ff4d1 100644
--- a/content/migration_guide.html
+++ b/content/migration_guide.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li class="active"><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/news_archive.html b/content/news_archive.html
index d1986b5..aea14ce 100644
--- a/content/news_archive.html
+++ b/content/news_archive.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/privacy.html b/content/performance.html
similarity index 61%
copy from content/privacy.html
copy to content/performance.html
index 1804b9f..19cb6a3 100644
--- a/content/privacy.html
+++ b/content/performance.html
@@ -4,8 +4,8 @@
 <meta http-equiv="X-UA-Compatible" content="IE=edge">
 <meta name="viewport" content="width=device-width, initial-scale=1">
 <meta name="description" content="">
-<meta name="keywords" content="hudi, privacy">
-<title>Privacy Policy | Hudi</title>
+<meta name="keywords" content="hudi, index, storage, compaction, cleaning, implementation">
+<title>Implementation | Hudi</title>
 <link rel="stylesheet" href="css/syntax.css">
 
 
@@ -162,7 +162,7 @@
 
 
 
-  <a class="email" title="Submit feedback" href="#" onclick="javascript:window.location='mailto:dev@hudi.apache.org?subject=Hudi Documentation feedback&body=I have some feedback about the Privacy Policy page: ' + window.location.href;"><i class="fa fa-envelope-o"></i> Feedback</a>
+  <a class="email" title="Submit feedback" href="#" onclick="javascript:window.location='mailto:dev@hudi.apache.org?subject=Hudi Documentation feedback&body=I have some feedback about the Implementation page: ' + window.location.href;"><i class="fa fa-envelope-o"></i> Feedback</a>
 
 <li>
 
@@ -180,7 +180,7 @@
                                 searchInput: document.getElementById('search-input'),
                                 resultsContainer: document.getElementById('results-container'),
                                 dataSource: 'search.json',
-                                searchResultTemplate: '<li><a href="{url}" title="Privacy Policy">{title}</a></li>',
+                                searchResultTemplate: '<li><a href="{url}" title="Implementation">{title}</a></li>',
                     noResultsText: 'No results found.',
                             limit: 10,
                             fuzzy: true,
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
             
+            <li class="active"><a href="performance.html">Performance</a></li>
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
             
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
@@ -331,7 +317,7 @@
     <!-- Content Column -->
     <div class="col-md-9">
         <div class="post-header">
-   <h1 class="post-title-main">Privacy Policy</h1>
+   <h1 class="post-title-main">Implementation</h1>
 </div>
 
 
@@ -341,51 +327,97 @@
    
 
     
+
+
     
-<!-- this handles the automatic toc. use ## for subheads to auto-generate the on-page minitoc. if you use html tags, you must supply an ID for the heading element in order for it to appear in the minitoc. -->
-<script>
-$( document ).ready(function() {
-  // Handler for .ready() called.
 
-$('#toc').toc({ minimumHeaders: 0, listType: 'ul', showSpeed: 0, headers: 'h2,h3,h4' });
+  <h2 id="performance">Performance</h2>
 
-/* this offset helps account for the space taken up by the floating toolbar. */
-$('#toc').on('click', 'a', function() {
-  var target = $(this.getAttribute('href'))
-    , scroll_target = target.offset().top
+<p>In this section, we go over some real-world performance numbers for Hudi upserts and incremental pull, and compare them against
+the conventional alternatives for achieving these tasks. The following shows the speedup obtained for NoSQL ingestion,
+by switching from bulk loads off HBase into Parquet to incrementally upserting into a Hudi dataset, across 5 tables ranging from small to huge.</p>
 
-  $(window).scrollTop(scroll_target - 10);
-  return false
-})
-  
-});
-</script>
+<figure><img class="docimage" src="images/hudi_upsert_perf1.png" alt="hudi_upsert_perf1.png" style="max-width: 1000px" /></figure>
 
-<div id="toc"></div>
+<p>Since Hudi can build the dataset incrementally, it also opens the door to scheduling ingestion more frequently, thus reducing latency, with
+significant savings in overall compute cost.</p>
 
-    
+<figure><img class="docimage" src="images/hudi_upsert_perf2.png" alt="hudi_upsert_perf2.png" style="max-width: 1000px" /></figure>
 
+<p>Hudi upserts have been stress tested up to 4TB in a single commit on the t1 table.</p>
 
-    
+<h3 id="tuning">Tuning</h3>
 
-  <p>Information about your use of this website is collected using server access logs and a tracking cookie.
-The collected information consists of the following:</p>
+<p>Writing data via Hudi happens as a Spark job, so the general rules of Spark debugging apply here as well. Below is a list of things to keep in mind if you are looking to improve performance or reliability.</p>
 
-<ul>
-  <li>The IP address from which you access the website;</li>
-  <li>The type of browser and operating system you use to access our site;</li>
-  <li>The date and time you access our site;</li>
-  <li>The pages you visit;</li>
-  <li>The addresses of pages from where you followed a link to our site.</li>
-</ul>
+<p><strong>Input Parallelism</strong> : By default, Hudi tends to over-partition input (i.e. <code class="highlighter-rouge">withParallelism(1500)</code>), to ensure each Spark partition stays within the 2GB limit for inputs up to 500GB. Bump this up accordingly if you have larger inputs. We recommend setting the shuffle parallelism <code class="highlighter-rouge">hoodie.[insert|upsert|bulkinsert].shuffle.parallelism</code> such that it is at least input_data_size/500MB.</p>
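As a rough illustration of the rule of thumb above (a hypothetical helper for sizing, not part of Hudi's API):

```python
import math

def recommended_shuffle_parallelism(input_size_bytes, target_partition_bytes=500 * 1024**2):
    """Rule of thumb from above: at least input_data_size / 500MB shuffle partitions."""
    return max(1, math.ceil(input_size_bytes / target_partition_bytes))

# A 1 TB input suggests roughly 2098 shuffle partitions
print(recommended_shuffle_parallelism(1024**4))
```

The result would then be supplied via the `hoodie.[insert|upsert|bulkinsert].shuffle.parallelism` configs.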
+
+<p><strong>Off-heap memory</strong> : Hudi writes parquet files, which needs a good amount of off-heap memory proportional to the schema width. Consider increasing <code class="highlighter-rouge">spark.yarn.executor.memoryOverhead</code> or <code class="highlighter-rouge">spark.yarn.driver.memoryOverhead</code> if you are running into such failures.</p>
+
+<p><strong>Spark Memory</strong> : Typically, Hudi needs to read a single file into memory to perform merges or compactions, so the executor memory should be sufficient to accommodate this. In addition, Hudi caches the input in order to intelligently place data, so leaving some <code class="highlighter-rouge">spark.storage.memoryFraction</code> headroom will generally help boost performance.</p>
+
+<p><strong>Sizing files</strong> : Set <code class="highlighter-rouge">limitFileSize</code> judiciously, to balance ingest/write latency against the number of files and, consequently, the metadata overhead associated with them.</p>
+
+<p><strong>Timeseries/Log data</strong> : Default configs are tuned for database/NoSQL changelogs, where individual record sizes are large. Another very popular class of data is timeseries/event/log data, which tends to be more voluminous, with many more records per partition. In such cases:
+    - Consider tuning the bloom filter accuracy via <code class="highlighter-rouge">.bloomFilterFPP()/bloomFilterNumEntries()</code> to achieve your target index lookup time.
+    - Consider making a key that is prefixed with the time of the event, which enables range pruning and significantly speeds up index lookups.</p>
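For intuition on the bloom filter tradeoff, the textbook sizing formula m = -n * ln(p) / (ln 2)^2 can estimate the per-file index overhead for a chosen false-positive rate (illustrative math only; the entry count and FPP below are example values, not necessarily Hudi defaults):

```python
import math

def bloom_filter_bits(num_entries, fpp):
    """Bits required by a standard bloom filter holding num_entries keys
    at false-positive probability fpp: m = -n * ln(p) / (ln 2)^2."""
    return math.ceil(-num_entries * math.log(fpp) / (math.log(2) ** 2))

# e.g. 60,000 entries per file at a 1e-9 FPP: roughly 2.6 million bits (~316 KB) per file
print(bloom_filter_bits(60000, 1e-9))
```

Raising `bloomFilterNumEntries` or lowering `bloomFilterFPP` grows this footprint, so high-volume timeseries data may warrant relaxing the FPP instead.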
+
+<p><strong>GC Tuning</strong> : Please be sure to follow the garbage collection tuning tips from the Spark tuning guide to avoid OutOfMemory errors.
+[Must] Use the G1 or CMS collector. Sample CMS flags to add to spark.executor.extraJavaOptions :</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>-XX:NewSize=1g -XX:SurvivorRatio=2 -XX:+UseCompressedOops -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:CMSInitiatingOccupancyFraction=70 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintTenuringDistribution -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/hoodie-heapdump.hprof
+</code></pre>
+</div>
+
+<p>If OutOfMemory errors still persist, reduce Spark memory conservatively: <code class="highlighter-rouge">spark.memory.fraction=0.2, spark.memory.storageFraction=0.2</code>, allowing it to spill rather than OOM (reliably slow beats crashing intermittently).</p>
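The conservative fallback above could, for instance, be passed as explicit `--conf` overrides at submit time (a sketch; the property keys are standard Spark settings):

```python
# Conservative memory fractions that prefer spilling to disk over OutOfMemory
# (standard Spark properties, rendered as spark-submit arguments).
fallback_conf = {
    "spark.memory.fraction": "0.2",
    "spark.memory.storageFraction": "0.2",
}

args = [arg for key, value in fallback_conf.items() for arg in ("--conf", f"{key}={value}")]
print("spark-submit " + " ".join(args))
```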
+
+<p>Below is a full working production config:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code> spark.driver.extraClassPath    /etc/hive/conf
+ spark.driver.extraJavaOptions    -XX:+PrintTenuringDistribution -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/hoodie-heapdump.hprof
+ spark.driver.maxResultSize    2g
+ spark.driver.memory    4g
+ spark.executor.cores    1
+ spark.executor.extraJavaOptions    -XX:+PrintFlagsFinal -XX:+PrintReferenceGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintAdaptiveSizePolicy -XX:+UnlockDiagnosticVMOptions -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/hoodie-heapdump.hprof
+ spark.executor.id    driver
+ spark.executor.instances    300
+ spark.executor.memory    6g
+ spark.rdd.compress true
+
+ spark.kryoserializer.buffer.max    512m
+ spark.serializer    org.apache.spark.serializer.KryoSerializer
+ spark.shuffle.memoryFraction    0.2
+ spark.shuffle.service.enabled    true
+ spark.sql.hive.convertMetastoreParquet    false
+ spark.storage.memoryFraction    0.6
+ spark.submit.deployMode    cluster
+ spark.task.cpus    1
+ spark.task.maxFailures    4
+
+ spark.yarn.driver.memoryOverhead    1024
+ spark.yarn.executor.memoryOverhead    3072
+ spark.yarn.max.executor.failures    100
+
+</code></pre>
+</div>
+
+<h4 id="read-optimized-query-performance">Read Optimized Query Performance</h4>
+
+<p>The major design goal of the read-optimized view is to achieve the latency reduction and efficiency gains of the previous section,
+with no impact on queries. The following charts compare Hudi vs non-Hudi datasets across Hive/Presto/Spark queries and demonstrate this.</p>
+
+<p><strong>Hive</strong></p>
+
+<figure><img class="docimage" src="images/hudi_query_perf_hive.png" alt="hudi_query_perf_hive.png" style="max-width: 800px" /></figure>
+
+<p><strong>Spark</strong></p>
 
-<p>Part of this information is gathered using a tracking cookie set by the <a href="http://www.google.com/analytics">Google Analytics</a> service and handled by Google as described in their <a href="http://www.google.com/privacy.html">privacy policy</a>. See your browser documentation for instructions on how to disable the cookie if you prefer not to share this data with Google.</p>
+<figure><img class="docimage" src="images/hudi_query_perf_spark.png" alt="hudi_query_perf_spark.png" style="max-width: 1000px" /></figure>
 
-<p>We use the gathered information to help us make our site more useful to visitors and to better understand how and when our site is used. We do not track or collect personally identifiable information or associate gathered data with any personally identifying information from other sources.</p>
+<p><strong>Presto</strong></p>
 
-<p>By using this website, you consent to the collection of this data in the manner and for the purpose described above.</p>
+<figure><img class="docimage" src="images/hudi_query_perf_presto.png" alt="hudi_query_perf_presto.png" style="max-width: 1000px" /></figure>
 
-<p>The Hudi development community welcomes your questions or comments regarding this Privacy Policy. Send them to dev@hudi.apache.org</p>
 
 
     <div class="tags">
diff --git a/content/powered_by.html b/content/powered_by.html
index 99991ca..75c0a61 100644
--- a/content/powered_by.html
+++ b/content/powered_by.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li class="active"><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li class="active"><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/privacy.html b/content/privacy.html
index 1804b9f..45bd799 100644
--- a/content/privacy.html
+++ b/content/privacy.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/s3_hoodie.html b/content/querying_data.html
similarity index 70%
copy from content/s3_hoodie.html
copy to content/querying_data.html
index 0366721..efd1bc0 100644
--- a/content/s3_hoodie.html
+++ b/content/querying_data.html
@@ -3,9 +3,9 @@
     <meta charset="utf-8">
 <meta http-equiv="X-UA-Compatible" content="IE=edge">
 <meta name="viewport" content="width=device-width, initial-scale=1">
-<meta name="description" content="In this page, we go over how to configure Hudi with S3 filesystem.">
-<meta name="keywords" content="hudi, hive, aws, s3, spark, presto">
-<title>S3 Filesystem | Hudi</title>
+<meta name="description" content="In this page, we go over how to enable SQL queries on Hudi built tables.">
+<meta name="keywords" content="hudi, hive, spark, sql, presto">
+<title>Querying Hudi Datasets | Hudi</title>
 <link rel="stylesheet" href="css/syntax.css">
 
 
@@ -162,7 +162,7 @@
 
 
 
-  <a class="email" title="Submit feedback" href="#" onclick="javascript:window.location='mailto:dev@hudi.apache.org?subject=Hudi Documentation feedback&body=I have some feedback about the S3 Filesystem page: ' + window.location.href;"><i class="fa fa-envelope-o"></i> Feedback</a>
+  <a class="email" title="Submit feedback" href="#" onclick="javascript:window.location='mailto:dev@hudi.apache.org?subject=Hudi Documentation feedback&body=I have some feedback about the Querying Hudi Datasets page: ' + window.location.href;"><i class="fa fa-envelope-o"></i> Feedback</a>
 
 <li>
 
@@ -180,7 +180,7 @@
                                 searchInput: document.getElementById('search-input'),
                                 resultsContainer: document.getElementById('results-container'),
                                 dataSource: 'search.json',
-                                searchResultTemplate: '<li><a href="{url}" title="S3 Filesystem">{title}</a></li>',
+                                searchResultTemplate: '<li><a href="{url}" title="Querying Hudi Datasets">{title}</a></li>',
                     noResultsText: 'No results found.',
                             limit: 10,
                             fuzzy: true,
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="configurations.html">Configurations</a></li>
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
+            <li class="active"><a href="querying_data.html">Querying Data</a></li>
             
             
             
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
+            <li><a href="performance.html">Performance</a></li>
             
             
             
             
             
             
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
@@ -331,7 +317,7 @@
     <!-- Content Column -->
     <div class="col-md-9">
         <div class="post-header">
-   <h1 class="post-title-main">S3 Filesystem</h1>
+   <h1 class="post-title-main">Querying Hudi Datasets</h1>
 </div>
 
 
@@ -339,7 +325,7 @@
 <div class="post-content">
 
    
-    <div class="summary">In this page, we go over how to configure Hudi with S3 filesystem.</div>
+    <div class="summary">In this page, we go over how to enable SQL queries on Hudi built tables.</div>
    
 
     
@@ -347,76 +333,56 @@
 
     
 
-  <p>In this page, we explain how to get your Hudi spark job to store into AWS S3.</p>
+  <p>Hudi registers the dataset into the Hive metastore backed by <code class="highlighter-rouge">HoodieInputFormat</code>. This makes the data accessible to
+Hive, Spark &amp; Presto automatically. To be able to run normal SQL queries on such a dataset, we need to get the individual query engines
+to call <code class="highlighter-rouge">HoodieInputFormat.getSplits()</code> during query planning, so that the right versions of files are exposed to them.</p>
 
-<h2 id="aws-configs">AWS configs</h2>
+<p>In the following sections, we cover the configs needed across different query engines to achieve this.</p>
 
-<p>There are two configurations required for Hudi-S3 compatibility:</p>
+<div class="bs-callout bs-callout-info">Instructions are currently only for Copy-on-write storage</div>
 
-<ul>
-  <li>Adding AWS Credentials for Hudi</li>
-  <li>Adding required Jars to classpath</li>
-</ul>
+<h2 id="hive">Hive</h2>
 
-<h3 id="aws-credentials">AWS Credentials</h3>
+<p>For HiveServer2 access, <a href="https://www.cloudera.com/documentation/enterprise/5-6-x/topics/cm_mc_hive_udf.html#concept_nc3_mms_lr">install</a>
+the hoodie-hadoop-mr-bundle-x.y.z-SNAPSHOT.jar into the aux jars path; HiveServer2 will then recognize Hudi tables and query them correctly.</p>
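+
+<p>For example (the jar path below is a placeholder), the bundle can be registered via Hive's auxiliary jars configuration in hive-site.xml:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>&lt;property&gt;
+  &lt;name&gt;hive.aux.jars.path&lt;/name&gt;
+  &lt;value&gt;file:///path/to/hoodie-hadoop-mr-bundle-x.y.z-SNAPSHOT.jar&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+</div>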
 
-<p>Simplest way to use Hudi with S3, is to configure your <code class="highlighter-rouge">SparkSession</code> or <code class="highlighter-rouge">SparkContext</code> with S3 credentials. Hudi will automatically pick this up and talk to S3.</p>
+<p>For beeline access, the <code class="highlighter-rouge">hive.input.format</code> variable needs to be set to the fully qualified name of the input format, <code class="highlighter-rouge">com.uber.hoodie.hadoop.HoodieInputFormat</code>.
+For Tez, <code class="highlighter-rouge">hive.tez.input.format</code> additionally needs to be set to <code class="highlighter-rouge">org.apache.hadoop.hive.ql.io.HiveInputFormat</code>.</p>
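+
+<p>For example, in a beeline session (the connection string below is a placeholder), these can be set as:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>0: jdbc:hive2://hiveserver:10000&gt; set hive.input.format=com.uber.hoodie.hadoop.HoodieInputFormat;
+0: jdbc:hive2://hiveserver:10000&gt; set hive.tez.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+</code></pre>
+</div>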
 
-<p>Alternatively, add the required configs in your core-site.xml from where Hudi can fetch them. Replace the <code class="highlighter-rouge">fs.defaultFS</code> with your S3 bucket name and Hudi should be able to read/write from the bucket.</p>
+<h2 id="spark">Spark</h2>
 
-<div class="highlighter-rouge"><pre class="highlight"><code>  &lt;property&gt;
-      &lt;name&gt;fs.defaultFS&lt;/name&gt;
-      &lt;value&gt;s3://ysharma&lt;/value&gt;
-  &lt;/property&gt;
+<p>There are two ways of running Spark SQL on Hudi datasets.</p>
 
-  &lt;property&gt;
-      &lt;name&gt;fs.s3.impl&lt;/name&gt;
-      &lt;value&gt;org.apache.hadoop.fs.s3native.NativeS3FileSystem&lt;/value&gt;
-  &lt;/property&gt;
+<p>The first method involves setting <code class="highlighter-rouge">spark.sql.hive.convertMetastoreParquet=false</code>, forcing Spark to fall back
+to using the Hive SerDe to read the data (planning/execution is still handled by Spark). This turns off Spark's optimizations
+for Parquet reading, which we will address in the next method based on path filters.
+However, benchmarks have not revealed any real performance degradation with Hudi &amp; SparkSQL, compared to native support.</p>
 
-  &lt;property&gt;
-      &lt;name&gt;fs.s3.awsAccessKeyId&lt;/name&gt;
-      &lt;value&gt;AWS_KEY&lt;/value&gt;
-  &lt;/property&gt;
+<p>A sample command to spin up the Spark shell is provided below.</p>
 
-  &lt;property&gt;
-       &lt;name&gt;fs.s3.awsSecretAccessKey&lt;/name&gt;
-       &lt;value&gt;AWS_SECRET&lt;/value&gt;
-  &lt;/property&gt;
+<div class="highlighter-rouge"><pre class="highlight"><code>$ spark-shell --jars hoodie-spark-bundle-x.y.z-SNAPSHOT.jar --driver-class-path /etc/hive/conf  --packages com.databricks:spark-avro_2.11:4.0.0 --conf spark.sql.hive.convertMetastoreParquet=false --num-executors 10 --driver-memory 7g --executor-memory 2g  --master yarn-client
 
-  &lt;property&gt;
-       &lt;name&gt;fs.s3n.awsAccessKeyId&lt;/name&gt;
-       &lt;value&gt;AWS_KEY&lt;/value&gt;
-  &lt;/property&gt;
+scala&gt; sqlContext.sql("select count(*) from uber.trips where datestr = '2016-10-02'").show()
 
-  &lt;property&gt;
-       &lt;name&gt;fs.s3n.awsSecretAccessKey&lt;/name&gt;
-       &lt;value&gt;AWS_SECRET&lt;/value&gt;
-  &lt;/property&gt;
 </code></pre>
 </div>
 
-<p>Utilities such as hoodie-cli or deltastreamer tool, can pick up s3 creds via environmental variable prefixed with <code class="highlighter-rouge">HOODIE_ENV_</code>. For e.g below is a bash snippet to setup
-such variables and then have cli be able to work on datasets stored in s3</p>
+<p>For scheduled Spark jobs, a dependency to <a href="https://mvnrepository.com/artifact/com.uber.hoodie/hoodie-hadoop-mr">hoodie-hadoop-mr</a> and <a href="https://mvnrepository.com/artifact/com.uber.hoodie/hoodie-client">hoodie-client</a> modules needs to be added
+and the same config needs to be set on <code class="highlighter-rouge">SparkConf</code> or conveniently via <code class="highlighter-rouge">HoodieReadClient.addHoodieSupport(conf)</code></p>
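+
+<p>A minimal sketch of wiring this into a scheduled job (the surrounding code is illustrative; only <code class="highlighter-rouge">HoodieReadClient.addHoodieSupport</code> is from the Hudi API):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>import com.uber.hoodie.HoodieReadClient
+import org.apache.spark.SparkConf
+
+val conf = new SparkConf().setAppName("hudi-sql-job")
+// sets the Hudi-related configs (such as spark.sql.hive.convertMetastoreParquet=false) on the SparkConf
+HoodieReadClient.addHoodieSupport(conf)
+</code></pre>
+</div>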
 
-<div class="highlighter-rouge"><pre class="highlight"><code>export HOODIE_ENV_fs_DOT_s3a_DOT_access_DOT_key=$accessKey
-export HOODIE_ENV_fs_DOT_s3a_DOT_secret_DOT_key=$secretKey
-export HOODIE_ENV_fs_DOT_s3_DOT_awsAccessKeyId=$accessKey
-export HOODIE_ENV_fs_DOT_s3_DOT_awsSecretAccessKey=$secretKey
-export HOODIE_ENV_fs_DOT_s3n_DOT_awsAccessKeyId=$accessKey
-export HOODIE_ENV_fs_DOT_s3n_DOT_awsSecretAccessKey=$secretKey
-export HOODIE_ENV_fs_DOT_s3n_DOT_impl=org.apache.hadoop.fs.s3a.S3AFileSystem
-</code></pre>
+<div class="bs-callout bs-callout-warning">Don’t instantiate a HoodieWriteClient against a table you don’t own. Hudi is a single writer &amp; multiple reader system as of now. You may accidentally cause incidents otherwise.
 </div>
 
-<h3 id="aws-libs">AWS Libs</h3>
+<p>The second method uses a new feature in Spark 2.x, which allows the work of HoodieInputFormat to be done via a path filter, as below. This method uses Spark's built-in optimizations for
+reading Parquet files, just like queries on non-Hudi tables.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>spark.sparkContext.hadoopConfiguration.setClass("mapreduce.input.pathFilter.class", classOf[com.uber.hoodie.hadoop.HoodieROTablePathFilter], classOf[org.apache.hadoop.fs.PathFilter]);
+</code></pre>
+</div>
 
-<p>AWS hadoop libraries to add to our classpath</p>
+<h2 id="presto">Presto</h2>
 
-<ul>
-  <li>com.amazonaws:aws-java-sdk:1.10.34</li>
-  <li>org.apache.hadoop:hadoop-aws:2.7.3</li>
-</ul>
+<p>Presto requires the <code class="highlighter-rouge">hoodie-presto-bundle</code> jar to be placed into <code class="highlighter-rouge">&lt;presto_install&gt;/plugin/hive-hadoop2/</code> on every node of the installation.</p>
 
 
     <div class="tags">
diff --git a/content/quickstart.html b/content/quickstart.html
index b7781b3..b3d9f41 100644
--- a/content/quickstart.html
+++ b/content/quickstart.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li class="active"><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li class="active"><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
-            
-            
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
@@ -347,9 +333,7 @@
 
   <h2 id="download-hudi">Download Hudi</h2>
 
-<p>Check out code and pull it into Intellij as a normal maven project.</p>
-
-<p>Normally build the maven project, from command line</p>
+<p>Check out the code and import it into IntelliJ as a normal Maven project. Then build the project from the command line:</p>
 
 <div class="highlighter-rouge"><pre class="highlight"><code>$ mvn clean install -DskipTests -DskipITs
 </code></pre>
@@ -360,9 +344,10 @@
 $ mvn clean install -DskipTests -DskipITs -Dhive11
 </code></p>
 
-<div class="bs-callout bs-callout-info">You might want to add your spark jars folder to project dependencies under ‘Module Setttings’, to be able to run Spark from IDE</div>
+<div class="bs-callout bs-callout-info">You might want to add your spark jars folder to project dependencies under ‘Module Setttings’, to be able to run Spark from IDE. 
+Setup your local hadoop/hive test environment, so you can play with entire ecosystem.</div>
 
-<div class="alert alert-info" role="alert"><i class="fa fa-info-circle"></i> <b>Note:</b> Setup your local hadoop/hive test environment, so you can play with entire ecosystem. See <a href="http://www.bytearray.io/2016/05/setting-up-hadoopyarnsparkhive-on-mac.html">this</a> for reference</div>
+<p><br />Please refer to the <a href="migration_guide.html">migration guide</a> for recommended ways to migrate your existing dataset to Hudi.</p>
 
 <h2 id="version-compatibility">Version Compatibility</h2>
 
@@ -619,7 +604,7 @@ scala&gt; sqlContext.sql("select count(*) from hoodie_test").show(10000)
 <p>Checkout the ‘master’ branch on OSS Presto, build it, and place your installation somewhere.</p>
 
 <ul>
-  <li>Copy the hoodie-hadoop-mr-* jar into $PRESTO_INSTALL/plugin/hive-hadoop2/</li>
+  <li>Copy the hudi/packaging/hoodie-presto-bundle/target/hoodie-presto-bundle-*.jar into $PRESTO_INSTALL/plugin/hive-hadoop2/</li>
   <li>Startup your server and you should be able to query the same Hive table via Presto</li>
 </ul>
 
@@ -1276,21 +1261,23 @@ beeline -u jdbc:hive2://hiveserver:10000 –hiveconf hive.input.format=org.apach
 No rows affected (0.009 seconds)
 0: jdbc:hive2://hiveserver:10000&gt;  set hoodie.stock_ticks_cow.consume.max.commits=3;
 No rows affected (0.009 seconds)
-0: jdbc:hive2://hiveserver:10000&gt; set hoodie.stock_ticks_cow.consume.start.timestamp=20180924064621;</p>
+0: jdbc:hive2://hiveserver:10000&gt; set hoodie.stock_ticks_cow.consume.start.timestamp=20180924064621;
+```</p>
 
-<h1 id="with-the-above-setting-file-ids-that-do-not-have-any-updates-from-the-commit-20180924065039-is-filtered-out-without-scanning">With the above setting, file-ids that do not have any updates from the commit 20180924065039 is filtered out without scanning.</h1>
-<p># Here is the incremental query :</p>
+<p>With the above setting, file-ids that do not have any updates from the commit 20180924065039 are filtered out without scanning.
+Here is the incremental query:</p>
 
-<p>0: jdbc:hive2://hiveserver:10000&gt;
-0: jdbc:hive2://hiveserver:10000&gt; select <code class="highlighter-rouge">_hoodie_commit_time</code>, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = ‘GOOG’ and <code class="highlighter-rouge">_hoodie_commit_time</code> &gt; ‘20180924064621’;
-+———————-+———+———————-+———+————+———–+–+
+<div class="highlighter-rouge"><pre class="highlight"><code>0: jdbc:hive2://hiveserver:10000&gt;
+0: jdbc:hive2://hiveserver:10000&gt; select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG' and `_hoodie_commit_time` &gt; '20180924064621';
++----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
-+———————-+———+———————-+———+————+———–+–+
++----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924065039       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
-+———————-+———+———————-+———+————+———–+–+
++----------------------+---------+----------------------+---------+------------+-----------+--+
 1 row selected (0.83 seconds)
 0: jdbc:hive2://hiveserver:10000&gt;
-```</p>
+</code></pre>
+</div>
 
 <h5 id="incremental-query-with-spark-sql">Incremental Query with Spark SQL:</h5>
 <div class="highlighter-rouge"><pre class="highlight"><code>docker exec -it adhoc-1 /bin/bash
diff --git a/content/s3_hoodie.html b/content/s3_hoodie.html
index 0366721..be4f70f 100644
--- a/content/s3_hoodie.html
+++ b/content/s3_hoodie.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/search.json b/content/search.json
index 0473b34..072a0ef 100644
--- a/content/search.json
+++ b/content/search.json
@@ -4,7 +4,7 @@
 
 
 {
-"title": "Admin Guide",
+"title": "Administering Hudi Pipelines",
 "tags": "",
 "keywords": "hudi, administration, operation, devops",
 "url": "admin_guide.html",
@@ -83,28 +83,6 @@
 
 
 {
-"title": "Implementation",
-"tags": "",
-"keywords": "hudi, index, storage, compaction, cleaning, implementation",
-"url": "implementation.html",
-"summary": ""
-}
-,
-
-
-
-{
-"title": "Incremental Processing",
-"tags": "",
-"keywords": "hudi, incremental, batch, stream, processing, Hive, ETL, Spark SQL",
-"url": "incremental_processing.html",
-"summary": "In this page, we will discuss some available tools for ingesting data incrementally & consuming the changes."
-}
-,
-
-
-
-{
 "title": "What is Hudi?",
 "tags": "getting_started",
 "keywords": "big data, stream processing, cloud, hdfs, storage, upserts, change capture",
@@ -160,6 +138,17 @@
 
 
 {
+"title": "Implementation",
+"tags": "",
+"keywords": "hudi, index, storage, compaction, cleaning, implementation",
+"url": "performance.html",
+"summary": ""
+}
+,
+
+
+
+{
 "title": "Talks &amp; Powered By",
 "tags": "",
 "keywords": "hudi, talks, presentation",
@@ -182,6 +171,17 @@
 
 
 {
+"title": "Querying Hudi Datasets",
+"tags": "",
+"keywords": "hudi, hive, spark, sql, presto",
+"url": "querying_data.html",
+"summary": "In this page, we go over how to enable SQL queries on Hudi built tables."
+}
+,
+
+
+
+{
 "title": "Quickstart",
 "tags": "quickstart",
 "keywords": "hudi, quickstart",
@@ -208,22 +208,22 @@
 
 
 {
-"title": "SQL Queries",
+"title": "Use Cases",
 "tags": "",
-"keywords": "hudi, hive, spark, sql, presto",
-"url": "sql_queries.html",
-"summary": "In this page, we go over how to enable SQL queries on Hudi built tables."
+"keywords": "hudi, data ingestion, etl, real time, use cases",
+"url": "use_cases.html",
+"summary": "Following are some sample use-cases for Hudi, which illustrate the benefits in terms of faster processing & increased efficiency"
 }
 ,
 
 
 
 {
-"title": "Use Cases",
+"title": "Writing Hudi Datasets",
 "tags": "",
-"keywords": "hudi, data ingestion, etl, real time, use cases",
-"url": "use_cases.html",
-"summary": "Following are some sample use-cases for Hudi, which illustrate the benefits in terms of faster processing & increased efficiency"
+"keywords": "hudi, incremental, batch, stream, processing, Hive, ETL, Spark SQL",
+"url": "writing_data.html",
+"summary": "In this page, we will discuss some available tools for ingesting data incrementally & consuming the changes."
 }
 ,
 
diff --git a/content/sitemap.xml b/content/sitemap.xml
index 14f34a7..09d9c63 100644
--- a/content/sitemap.xml
+++ b/content/sitemap.xml
@@ -64,55 +64,55 @@
   
   
   <url>
-    <loc>http://0.0.0.0:4000/implementation.html</loc>
+    <loc>http://0.0.0.0:4000/index.html</loc>
   </url>
   
   
   
   <url>
-    <loc>http://0.0.0.0:4000/incremental_processing.html</loc>
+    <loc>http://0.0.0.0:4000/migration_guide.html</loc>
   </url>
   
   
   
   <url>
-    <loc>http://0.0.0.0:4000/index.html</loc>
+    <loc>http://0.0.0.0:4000/js/mydoc_scroll.html</loc>
   </url>
   
   
   
   <url>
-    <loc>http://0.0.0.0:4000/migration_guide.html</loc>
+    <loc>http://0.0.0.0:4000/news.html</loc>
   </url>
   
   
   
   <url>
-    <loc>http://0.0.0.0:4000/js/mydoc_scroll.html</loc>
+    <loc>http://0.0.0.0:4000/news_archive.html</loc>
   </url>
   
   
   
   <url>
-    <loc>http://0.0.0.0:4000/news.html</loc>
+    <loc>http://0.0.0.0:4000/performance.html</loc>
   </url>
   
   
   
   <url>
-    <loc>http://0.0.0.0:4000/news_archive.html</loc>
+    <loc>http://0.0.0.0:4000/powered_by.html</loc>
   </url>
   
   
   
   <url>
-    <loc>http://0.0.0.0:4000/powered_by.html</loc>
+    <loc>http://0.0.0.0:4000/privacy.html</loc>
   </url>
   
   
   
   <url>
-    <loc>http://0.0.0.0:4000/privacy.html</loc>
+    <loc>http://0.0.0.0:4000/querying_data.html</loc>
   </url>
   
   
@@ -134,13 +134,13 @@
   
   
   <url>
-    <loc>http://0.0.0.0:4000/sql_queries.html</loc>
+    <loc>http://0.0.0.0:4000/use_cases.html</loc>
   </url>
   
   
   
   <url>
-    <loc>http://0.0.0.0:4000/use_cases.html</loc>
+    <loc>http://0.0.0.0:4000/writing_data.html</loc>
   </url>
   
   
diff --git a/content/strata-talk.html b/content/strata-talk.html
index 58b6f8a..68f2912 100644
--- a/content/strata-talk.html
+++ b/content/strata-talk.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/use_cases.html b/content/use_cases.html
index dcdf403..a89b1be 100644
--- a/content/use_cases.html
+++ b/content/use_cases.html
@@ -222,28 +222,28 @@
             
             
             
-            <li><a href="index.html">Overview</a></li>
+            <li><a href="quickstart.html">Quickstart</a></li>
             
             
             
             
             
             
-            <li><a href="quickstart.html">Quickstart</a></li>
+            <li class="active"><a href="use_cases.html">Use Cases</a></li>
             
             
             
             
             
             
-            <li class="active"><a href="use_cases.html">Use Cases</a></li>
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
             
             
             
             
             
             
-            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            <li><a href="comparison.html">Comparison</a></li>
             
             
             
@@ -265,49 +265,35 @@
             
             
             
-            <li><a href="implementation.html">Implementation</a></li>
-            
-            
-            
+            <li><a href="writing_data.html">Writing Data</a></li>
             
             
             
-            <li><a href="configurations.html">Configurations</a></li>
             
             
             
+            <li><a href="querying_data.html">Querying Data</a></li>
             
             
             
-            <li><a href="sql_queries.html">SQL Queries</a></li>
             
             
             
+            <li><a href="configurations.html">Configuration</a></li>
             
             
             
-            <li><a href="migration_guide.html">Migration Guide</a></li>
             
             
             
+            <li><a href="performance.html">Performance</a></li>
             
             
             
-            <li><a href="incremental_processing.html">Incremental Processing</a></li>
             
             
             
-            
-            
-            
-            <li><a href="admin_guide.html">Admin Guide</a></li>
-            
-            
-            
-            
-            
-            
-            <li><a href="comparison.html">Comparison</a></li>
+            <li><a href="admin_guide.html">Administering</a></li>
             
             
             
diff --git a/content/writing_data.html b/content/writing_data.html
new file mode 100644
index 0000000..6cbdad6
--- /dev/null
+++ b/content/writing_data.html
@@ -0,0 +1,678 @@
+<!DOCTYPE html>
+<head>
+    <meta charset="utf-8">
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<meta name="description" content="In this page, we will discuss some available tools for ingesting data incrementally & consuming the changes.">
+<meta name="keywords" content="hudi, incremental, batch, stream, processing, Hive, ETL, Spark SQL">
+<title>Writing Hudi Datasets | Hudi</title>
+<link rel="stylesheet" href="css/syntax.css">
+
+
+<link rel="stylesheet" type="text/css" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css">
+<!--<link rel="stylesheet" type="text/css" href="css/bootstrap.min.css">-->
+<link rel="stylesheet" href="css/modern-business.css">
+<link rel="stylesheet" href="css/lavish-bootstrap.css">
+<link rel="stylesheet" href="css/customstyles.css">
+<link rel="stylesheet" href="css/theme-blue.css">
+
+<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.1.4/jquery.min.js"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery-cookie/1.4.1/jquery.cookie.min.js"></script>
+<script src="js/jquery.navgoco.min.js"></script>
+
+
+<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/anchor-js/2.0.0/anchor.min.js"></script>
+<script src="js/toc.js"></script>
+<script src="js/customscripts.js"></script>
+
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
+
+  ga('create', 'UA-93561550-1', 'auto');
+  ga('send', 'pageview');
+
+</script>
+
+<link rel="shortcut icon" href="images/favicon.ico">
+
+<!-- HTML5 Shim and Respond.js IE8 support of HTML5 elements and media queries -->
+<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
+<!--[if lt IE 9]>
+<script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
+<script src="https://oss.maxcdn.com/libs/respond.js/1.4.2/respond.min.js"></script>
+<![endif]-->
+
+<link rel="alternate" type="application/rss+xml" title="" href="http://0.0.0.0:4000feed.xml">
+
+    <script>
+        $(document).ready(function() {
+            // Initialize navgoco with default options
+            $("#mysidebar").navgoco({
+                caretHtml: '',
+                accordion: true,
+                openClass: 'active', // open
+                save: false, // leave false or nav highlighting doesn't work right
+                cookie: {
+                    name: 'navgoco',
+                    expires: false,
+                    path: '/'
+                },
+                slide: {
+                    duration: 400,
+                    easing: 'swing'
+                }
+            });
+
+            $("#collapseAll").click(function(e) {
+                e.preventDefault();
+                $("#mysidebar").navgoco('toggle', false);
+            });
+
+            $("#expandAll").click(function(e) {
+                e.preventDefault();
+                $("#mysidebar").navgoco('toggle', true);
+            });
+
+        });
+
+    </script>
+    <script>
+        $(function () {
+            $('[data-toggle="tooltip"]').tooltip()
+        })
+    </script>
+    
+
+</head>
+<body>
+<!-- Navigation -->
+
+<nav class="navbar navbar-inverse navbar-fixed-top">
+    <div class="container topnavlinks">
+        <div class="navbar-header">
+            <button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
+                <span class="sr-only">Toggle navigation</span>
+                <span class="icon-bar"></span>
+                <span class="icon-bar"></span>
+                <span class="icon-bar"></span>
+            </button>
+
+            <a class="fa fa-lg navbar-brand" href="index.html">&nbsp;<span class="projectTitle">
+              <img src="images/hudi_site_logo.png" alt="Hudi logo"/>
+              <!--Hudi-->
+            </span><br/>
+            <p class="navbar-incubate">(Incubating)</p></a>
+        </div>
+        <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
+            <ul class="nav navbar-nav navbar-right">
+                <!-- entries without drop-downs appear here -->
+                
+                
+                
+                <li><a href="news">News</a></li>
+                
+                
+                
+                <li><a href="community.html">Community</a></li>
+                
+                
+                
+                <li><a href="https://github.com/uber/hoodie" target="_blank">Code</a></li>
+                
+                
+                
+                <!-- entries with drop-downs appear here -->
+                <!-- conditional logic to control which topnav appears for the audience defined in the configuration file.-->
+                
+                
+                <li class="dropdown">
+                    <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b class="caret"></b></a>
+                    <ul class="dropdown-menu">
+                        
+                        
+                        <li><a href="contributing.html">Contributing</a></li>
+                        
+                        
+                        
+                        <li><a href="https://cwiki.apache.org/confluence/display/HUDI" target="_blank">Wiki/Designs</a></li>
+                        
+                        
+                        
+                        <li><a href="https://issues.apache.org/jira/projects/HUDI/summary" target="_blank">Issues</a></li>
+                        
+                        
+                        
+                        <li><a href="https://cwiki.apache.org/confluence/pages/viewrecentblogposts.action?key=HUDI" target="_blank">Blog</a></li>
+                        
+                        
+                        
+                        <li><a href="https://projects.apache.org/project.html?incubator-hudi" target="_blank">Team</a></li>
+                        
+                        
+                    </ul>
+                </li>
+                
+                
+                
+			<li>
+  <a class="email" title="Submit feedback" href="#" onclick="javascript:window.location='mailto:dev@hudi.apache.org?subject=Hudi Documentation feedback&body=I have some feedback about the Writing Hudi Datasets page: ' + window.location.href;"><i class="fa fa-envelope-o"></i> Feedback</a>
+</li>
+
+		
+                <!--comment out this block if you want to hide search-->
+                <li>
+                    <!--start search-->
+                    <div id="search-demo-container">
+                        <input type="text" id="search-input" placeholder="search...">
+                        <ul id="results-container"></ul>
+                    </div>
+                    <script src="js/jekyll-search.js" type="text/javascript"></script>
+                    <script type="text/javascript">
+                            SimpleJekyllSearch.init({
+                                searchInput: document.getElementById('search-input'),
+                                resultsContainer: document.getElementById('results-container'),
+                                dataSource: 'search.json',
+                                searchResultTemplate: '<li><a href="{url}" title="Writing Hudi Datasets">{title}</a></li>',
+                    noResultsText: 'No results found.',
+                            limit: 10,
+                            fuzzy: true,
+                    })
+                    </script>
+                    <!--end search-->
+                </li>
+            </ul>
+        </div>
+        </div>
+        <!-- /.container -->
+</nav>
+
+<!-- Page Content -->
+<div class="container">
+    <div class="col-lg-12">&nbsp;</div>
+    <!-- Content Row -->
+    <div class="row">
+        <!-- Sidebar Column -->
+        <div class="col-md-3">
+
+          
+
+
+
+
+
+
+
+
+<ul id="mysidebar" class="nav">
+    <li class="sidebarTitle">Latest Version</li>
+    
+    
+    
+    <li>
+        <a href="#">Getting Started</a>
+        <ul>
+            
+            
+            
+            <li><a href="quickstart.html">Quickstart</a></li>
+            
+            
+            
+            
+            
+            
+            <li><a href="use_cases.html">Use Cases</a></li>
+            
+            
+            
+            
+            
+            
+            <li><a href="powered_by.html">Talks & Powered By</a></li>
+            
+            
+            
+            
+            
+            
+            <li><a href="comparison.html">Comparison</a></li>
+            
+            
+            
+            
+        </ul>
+        
+        
+    
+    <li>
+        <a href="#">Documentation</a>
+        <ul>
+            
+            
+            
+            <li><a href="concepts.html">Concepts</a></li>
+            
+            
+            
+            
+            
+            
+            <li class="active"><a href="writing_data.html">Writing Data</a></li>
+            
+            
+            
+            
+            
+            
+            <li><a href="querying_data.html">Querying Data</a></li>
+            
+            
+            
+            
+            
+            
+            <li><a href="configurations.html">Configuration</a></li>
+            
+            
+            
+            
+            
+            
+            <li><a href="performance.html">Performance</a></li>
+            
+            
+            
+            
+            
+            
+            <li><a href="admin_guide.html">Administering</a></li>
+            
+            
+            
+            
+        </ul>
+        
+        
+        
+        <!-- if you aren't using the accordion, uncomment this block:
+           <p class="external">
+               <a href="#" id="collapseAll">Collapse All</a> | <a href="#" id="expandAll">Expand All</a>
+           </p>
+           -->
+    </li>
+</ul>
+</div>
+
+<!-- this highlights the active parent class in the navgoco sidebar. this is critical so that the parent expands when you're viewing a page. This must appear below the sidebar code above. Otherwise, if placed inside customscripts.js, the script runs before the sidebar code runs and the class never gets inserted.-->
+<script>$("li.active").parents('li').toggleClass("active");</script>
+
+    <!-- Content Column -->
+    <div class="col-md-9">
+        <div class="post-header">
+   <h1 class="post-title-main">Writing Hudi Datasets</h1>
+</div>
+
+
+
+<div class="post-content">
+
+   
+    <div class="summary">In this page, we will discuss some available tools for ingesting data incrementally & consuming the changes.</div>
+   
+
+    
+
+
+    
+
+  <p>As discussed in the concepts section, the two basic primitives needed for <a href="https://www.oreilly.com/ideas/ubers-case-for-incremental-processing-on-hadoop">incrementally processing</a>,
+data using Hudi are <code class="highlighter-rouge">upserts</code> (to apply changes to a dataset) and <code class="highlighter-rouge">incremental pulls</code> (to obtain a change stream/log from a dataset). This section
+discusses a few tools that can be used to achieve these on different contexts.</p>
+
+<h2 id="incremental-ingestion">Incremental Ingestion</h2>
+
+<p>The following means can be used to apply a delta or an incremental change to a Hudi dataset. For e.g., the incremental changes could come from a Kafka topic, from files uploaded to DFS, or
+even from changes pulled from another Hudi dataset.</p>
+
+<h4 id="deltastreamer-tool">DeltaStreamer Tool</h4>
+
+<p>The <code class="highlighter-rouge">HoodieDeltaStreamer</code> utility provides a way to achieve all of these, using the capabilities of <code class="highlighter-rouge">HoodieWriteClient</code>, and supports simple row-by-row ingestion (no transformations)
+from different sources such as DFS or Kafka.</p>
+
+<p>The tool is a Spark job (part of hoodie-utilities) that provides the following functionality:</p>
+
+<ul>
+  <li>Ability to consume new events from Kafka, incremental imports from Sqoop or output of <code class="highlighter-rouge">HiveIncrementalPuller</code> or files under a folder on DFS</li>
+  <li>Support for json, avro or custom payload types for the incoming data</li>
+  <li>Pick up avro schemas from DFS or Confluent <a href="https://github.com/confluentinc/schema-registry">schema registry</a>.</li>
+  <li>New data is written to a Hudi dataset, with support for checkpointing and registered onto Hive</li>
+</ul>
+
+<p>The command line options describe its capabilities in more detail (first build hoodie-utilities using <code class="highlighter-rouge">mvn clean package</code>).</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>[hoodie]$ spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer `ls hoodie-utilities/target/hoodie-utilities-*-SNAPSHOT.jar` --help
+Usage: &lt;main class&gt; [options]
+  Options:
+    --help, -h
+
+    --key-generator-class
+      Subclass of com.uber.hoodie.KeyGenerator to generate a HoodieKey from
+      the given avro record. Built in: SimpleKeyGenerator (uses provided field
+      names as recordkey &amp; partitionpath. Nested fields specified via dot
+      notation, e.g: a.b.c)
+      Default: com.uber.hoodie.SimpleKeyGenerator
+    --op
+      Takes one of these values : UPSERT (default), INSERT (use when input is
+      purely new data/inserts to gain speed)
+      Default: UPSERT
+      Possible Values: [UPSERT, INSERT, BULK_INSERT]
+    --payload-class
+      subclass of HoodieRecordPayload, that works off a GenericRecord.
+      Implement your own, if you want to do something other than overwriting
+      existing value
+      Default: com.uber.hoodie.OverwriteWithLatestAvroPayload
+    --props
+      path to properties file on localfs or dfs, with configurations for
+      Hudi client, schema provider, key generator and data source. For
+      Hudi client props, sane defaults are used, but recommend use to
+      provide basic things like metrics endpoints, hive configs etc. For
+      sources, referto individual classes, for supported properties.
+      Default: file:///Users/vinoth/bin/hoodie/src/test/resources/delta-streamer-config/dfs-source.properties
+    --schemaprovider-class
+      subclass of com.uber.hoodie.utilities.schema.SchemaProvider to attach
+      schemas to input &amp; target table data, built in options:
+      FilebasedSchemaProvider
+      Default: com.uber.hoodie.utilities.schema.FilebasedSchemaProvider
+    --source-class
+      Subclass of com.uber.hoodie.utilities.sources to read data. Built-in
+      options: com.uber.hoodie.utilities.sources.{JsonDFSSource (default),
+      AvroDFSSource, JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}
+      Default: com.uber.hoodie.utilities.sources.JsonDFSSource
+    --source-limit
+      Maximum amount of data to read from source. Default: No limit For e.g:
+      DFSSource =&gt; max bytes to read, KafkaSource =&gt; max events to read
+      Default: 9223372036854775807
+    --source-ordering-field
+      Field within source record to decide how to break ties between records
+      with same key in input data. Default: 'ts' holding unix timestamp of
+      record
+      Default: ts
+    --spark-master
+      spark master to use.
+      Default: local[2]
+  * --target-base-path
+      base path for the target Hudi dataset. (Will be created if did not
+      exist first time around. If exists, expected to be a Hudi dataset)
+  * --target-table
+      name of the target table in Hive
+    --transformer-class
+      subclass of com.uber.hoodie.utilities.transform.Transformer. UDF to
+      transform raw source dataset to a target dataset (conforming to target
+      schema) before writing. Default : Not set. E:g -
+      com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer (which
+      allows a SQL query template to be passed as a transformation function)
+
+</code></pre>
+</div>
+
+<p>The tool takes a hierarchically composed property file and has pluggable interfaces for extracting data, key generation and providing schema. Sample configs for ingesting from kafka and dfs are
+provided under <code class="highlighter-rouge">hoodie-utilities/src/test/resources/delta-streamer-config</code>.</p>
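+
+<p>As an illustration, a minimal properties file for a Kafka source might look like the sketch below. The property names shown are assumptions for illustration only; treat the sample configs under <code class="highlighter-rouge">delta-streamer-config</code> as the authoritative reference.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code># hypothetical kafka-source.properties sketch; verify names against the bundled samples
+# key generation for the target dataset
+hoodie.datasource.write.recordkey.field=impressionid
+hoodie.datasource.write.partitionpath.field=userid
+# source topic and schema registry endpoint (names assumed)
+hoodie.deltastreamer.source.kafka.topic=impressions
+hoodie.deltastreamer.schemaprovider.registry.url=http://localhost:8081/subjects/impressions-value/versions/latest
+bootstrap.servers=localhost:9092
+</code></pre>
+</div>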
+
+<p>For e.g: once you have Confluent Kafka and the Schema Registry up &amp; running, produce some test data using (<a href="https://docs.confluent.io/current/ksql/docs/tutorials/generate-custom-test-data.html">impressions.avro</a>, provided by the schema-registry repo)</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>[confluent-5.0.0]$ bin/ksql-datagen schema=../impressions.avro format=avro topic=impressions key=impressionid
+</code></pre>
+</div>
+
+<p>and then ingest it as follows.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>[hoodie]$ spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer `ls hoodie-utilities/target/hoodie-utilities-*-SNAPSHOT.jar` \
+  --props file://${PWD}/hoodie-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \
+  --schemaprovider-class com.uber.hoodie.utilities.schema.SchemaRegistryProvider \
+  --source-class com.uber.hoodie.utilities.sources.AvroKafkaSource \
+  --source-ordering-field impressiontime \
+  --target-base-path file:///tmp/hoodie-deltastreamer-op --target-table uber.impressions \
+  --op BULK_INSERT
+</code></pre>
+</div>
+
+<p>In some cases, you may want to convert your existing dataset into Hudi, before you can begin ingesting new data. This can be accomplished using the <code class="highlighter-rouge">hdfsparquetimport</code> command on the <code class="highlighter-rouge">hoodie-cli</code>.
+Currently, there is support for converting parquet datasets.</p>
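+
+<p>A sketch of such a conversion via <code class="highlighter-rouge">hoodie-cli</code> is shown below; the exact flags here are assumptions, so run <code class="highlighter-rouge">help hdfsparquetimport</code> inside the CLI to confirm them for your version.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>hoodie-&gt;hdfsparquetimport
+    --srcPath /user/parquet/dataset
+    --targetPath /user/hoodie/dataset
+    --tableName hoodie_table
+    --tableType COPY_ON_WRITE
+    --rowKeyField _row_key
+    --partitionPathField partitionStr
+    --schemaFilePath /path/to/schema.avsc
+    --parallelism 1500
+</code></pre>
+</div>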
+
+<h4 id="via-custom-spark-job">Via Custom Spark Job</h4>
+
+<p>The <code class="highlighter-rouge">hoodie-spark</code> module offers the DataSource API to write any data frame into a Hudi dataset. Following is how we can upsert a dataframe, while specifying the field names that need to be used
+for <code class="highlighter-rouge">recordKey =&gt; _row_key</code>, <code class="highlighter-rouge">partitionPath =&gt; partition</code> and <code class="highlighter-rouge">precombineKey =&gt; timestamp</code></p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>inputDF.write()
+       .format("com.uber.hoodie")
+       .options(clientOpts) // any of the Hudi client opts can be passed in as well
+       .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
+       .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
+       .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
+       .option(HoodieWriteConfig.TABLE_NAME, tableName)
+       .mode(SaveMode.Append)
+       .save(basePath);
+</code></pre>
+</div>
+
+<p>Please refer to <a href="configurations.html">configurations</a> section, to view all datasource options.</p>
+
+<h4 id="syncing-to-hive">Syncing to Hive</h4>
+
+<p>Once new data is written to a Hudi dataset via tools like the above, we need the ability to sync with Hive and update the table schema, so that queries can pick up new columns and partitions. To do this, Hudi provides a <code class="highlighter-rouge">HiveSyncTool</code>, which can be
+invoked as below, once you have built the hoodie-hive module.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code> [hoodie-hive]$ java -cp target/hoodie-hive-0.3.6-SNAPSHOT-jar-with-dependencies.jar:target/jars/* com.uber.hoodie.hive.HiveSyncTool --help
+Usage: &lt;main class&gt; [options]
+  Options:
+  * --base-path
+       Basepath of Hudi dataset to sync
+  * --database
+       name of the target database in Hive
+    --help, -h
+
+       Default: false
+  * --jdbc-url
+       Hive jdbc connect url
+  * --pass
+       Hive password
+  * --table
+       name of the target table in Hive
+  * --user
+       Hive username
+
+
+</code></pre>
+</div>
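+
+<p>For example, an invocation filling in the required options above might look like the following (the connection values are placeholders):</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>[hoodie-hive]$ java -cp target/hoodie-hive-0.3.6-SNAPSHOT-jar-with-dependencies.jar:target/jars/* com.uber.hoodie.hive.HiveSyncTool \
+  --base-path hdfs:///tmp/hoodie-deltastreamer-op \
+  --database default --table uber_impressions \
+  --jdbc-url jdbc:hive2://hiveserver:10000 \
+  --user hive --pass hive
+</code></pre>
+</div>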
+
+<h2 id="incrementally-pulling">Incrementally Pulling</h2>
+
+<p>Hudi datasets can be pulled incrementally, which means you can get ALL and ONLY the updated &amp; new rows since a specified commit timestamp.
+This, together with upserts, is particularly useful for building data pipelines where one or more source Hudi tables are incrementally pulled (streams/facts) and
+joined with other tables (datasets/dimensions) to produce deltas to a target Hudi dataset. Then, using the delta streamer tool, these deltas can be upserted into the
+target Hudi dataset to complete the pipeline.</p>
+
+<h4 id="via-spark-job">Via Spark Job</h4>
+<p>The <code class="highlighter-rouge">hoodie-spark</code> module's DataSource API offers a more elegant way to pull data from a Hudi dataset (plus more) and process it via Spark.
+It can be used within existing Spark jobs and offers the following functionality.</p>
+
+<p>A sample incremental pull, that will obtain all records written since <code class="highlighter-rouge">beginInstantTime</code>, looks like below.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code> Dataset&lt;Row&gt; hoodieIncViewDF = spark.read()
+     .format("com.uber.hoodie")
+     .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
+             DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
+     .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(),
+            &lt;beginInstantTime&gt;)
+     .load(tablePath); // For incremental view, pass in the root/base path of dataset
+</code></pre>
+</div>
+
+<p>Please refer to <a href="configurations.html">configurations</a> section, to view all datasource options.</p>
+
+<p>Additionally, <code class="highlighter-rouge">HoodieReadClient</code> offers the following functionality using Hudi’s implicit indexing.</p>
+
+<table>
+  <tbody>
+    <tr>
+      <td><strong>API</strong></td>
+      <td><strong>Description</strong></td>
+    </tr>
+    <tr>
+      <td>read(keys)</td>
+      <td>Read out the data corresponding to the keys as a DataFrame, using Hudi’s own index for faster lookup</td>
+    </tr>
+    <tr>
+      <td>filterExists()</td>
+      <td>Filter out already existing records from the provided RDD[HoodieRecord]. Useful for de-duplication</td>
+    </tr>
+    <tr>
+      <td>checkExists(keys)</td>
+      <td>Check if the provided keys exist in a Hudi dataset</td>
+    </tr>
+  </tbody>
+</table>
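+
+<p>For instance, the APIs above can be combined to de-duplicate incoming records before an insert. The snippet below is only a sketch; in particular, the constructor signature is an assumption, so consult the <code class="highlighter-rouge">HoodieReadClient</code> javadocs for the exact form.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>// sketch: assumes a JavaSparkContext `jsc` and the dataset's base path
+HoodieReadClient readClient = new HoodieReadClient(jsc, basePath);
+// drop records whose keys already exist in the dataset (de-duplication)
+JavaRDD&lt;HoodieRecord&gt; newRecords = readClient.filterExists(incomingRecords);
+// or, look up specific keys as a DataFrame using Hudi's own index
+Dataset&lt;Row&gt; rows = readClient.read(keys);
+</code></pre>
+</div>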
+
+<h4 id="hiveincrementalpuller-tool">HiveIncrementalPuller Tool</h4>
+<p><code class="highlighter-rouge">HiveIncrementalPuller</code> allows the above to be done via HiveQL, combining the benefits of Hive (reliably processing complex SQL queries) and incremental primitives
+(speeding up queries by pulling tables incrementally instead of scanning them fully). The tool uses Hive JDBC to run the Hive query, saving its results in a temp table
+that can later be upserted. The upsert utility (<code class="highlighter-rouge">HoodieDeltaStreamer</code>) has all the state it needs from the directory structure to know what the commit time on the target table should be,
+e.g: <code class="highlighter-rouge">/app/incremental-hql/intermediate/{source_table_name}_temp/{last_commit_included}</code>. The registered Delta Hive table will be of the form <code class="highlighter-rouge">{tmpdb}.{source_table}_{last_commit_included}</code>.</p>
+
+<p>The following are the configuration options for HiveIncrementalPuller</p>
+
+<table>
+  <tbody>
+    <tr>
+      <td><strong>Config</strong></td>
+      <td><strong>Description</strong></td>
+      <td><strong>Default</strong></td>
+    </tr>
+    <tr>
+      <td>hiveUrl</td>
+      <td>Hive Server 2 URL to connect to</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>hiveUser</td>
+      <td>Hive Server 2 Username</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>hivePass</td>
+      <td>Hive Server 2 Password</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>queue</td>
+      <td>YARN Queue name</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>tmp</td>
+      <td>Directory where the temporary delta data is stored in DFS. The directory structure will follow conventions. Please see the below section.</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>extractSQLFile</td>
+      <td>The SQL to execute on the source table to extract the data. The data extracted will be all the rows that changed since a particular point in time.</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>sourceTable</td>
+      <td>Source Table Name. Needed to set hive environment properties.</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>targetTable</td>
+      <td>Target Table Name. Needed for the intermediate storage directory structure.</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>sourceDataPath</td>
+      <td>Source DFS Base Path. This is where the Hudi metadata will be read.</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>targetDataPath</td>
+      <td>Target DFS Base path. This is needed to compute the fromCommitTime. This is not needed if fromCommitTime is specified explicitly.</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>tmpdb</td>
+      <td>The database in which the intermediate temp delta table will be created</td>
+      <td>hoodie_temp</td>
+    </tr>
+    <tr>
+      <td>fromCommitTime</td>
+      <td>This is the most important parameter. This is the point in time from which the changed records are pulled.</td>
+      <td> </td>
+    </tr>
+    <tr>
+      <td>maxCommits</td>
+      <td>Number of commits to include in the pull. Setting this to -1 will include all the commits from fromCommitTime. Setting this to a value &gt; 0, will include records that ONLY changed in the specified number of commits after fromCommitTime. This may be needed if you need to catch up say 2 commits at a time.</td>
+      <td>3</td>
+    </tr>
+    <tr>
+      <td>help</td>
+      <td>Utility Help</td>
+      <td> </td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Setting fromCommitTime=0 and maxCommits=-1 will pull in the entire source dataset and can be used to initiate backfills. If the target dataset is a Hudi dataset,
+the utility can determine whether the target dataset has no commits or is behind by more than 24 hours (this is configurable),
+and will automatically use the backfill configuration, since applying the last 24 hours incrementally could take more time than doing a backfill. The current limitation of the tool
+is the lack of support for self-joining the same table in mixed mode (normal and incremental modes).</p>
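+
+<p>A backfill run wiring together the configs above might be sketched as below. The class name and flag spellings are assumptions based on the config table, so verify them against the hoodie-utilities jar before use.</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code># hypothetical invocation; class name and flag names assumed from the config table above
+java -cp `ls hoodie-utilities/target/hoodie-utilities-*-SNAPSHOT.jar` com.uber.hoodie.utilities.HiveIncrementalPuller \
+  --hiveUrl jdbc:hive2://hiveserver:10000 --hiveUser hive --hivePass hive \
+  --extractSQLFile /path/to/extract.sql \
+  --sourceTable source_db.fact_table --targetTable target_db.derived_table \
+  --tmpdb hoodie_temp --fromCommitTime 0 --maxCommits -1
+</code></pre>
+</div>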
+
+
+    <div class="tags">
+        
+    </div>
+
+    
+
+</div>
+
+<hr class="shaded"/>
+
+<footer>
+            <div class="row">
+                <div class="col-lg-12 footer">
+                  <p>
+                  Copyright &copy; <span id="copyright-year">2019</span> <a href="https://apache.org">The Apache Software Foundation</a>,
+                  Licensed under the Apache License, Version 2.0<br>
+                  Apache and the Apache feather logo are trademarks of The Apache Software Foundation.| <a href="/privacy">Privacy Policy</a><br>
+                  <a class="footer-link-img" href="https://apache.org">
+                    <img src="images/asf_logo.svg" alt="The Apache Software Foundation" height="100px" width="50px"></a>
+                  </p>
+                  <p>
+                  Apache Hudi is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the <a href="http://incubator.apache.org/">Apache Incubator</a>.
+                  Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have
+                  stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a
+                  reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.
+                  </p>
+                </div>
+            </div>
+</footer>
+
+
+    </div>
+    <!-- /.row -->
+</div>
+<!-- /.container -->
+    </div>
+
+</body>
+
+</html>
\ No newline at end of file