Posted to commits@hudi.apache.org by vi...@apache.org on 2021/08/03 02:59:57 UTC

[hudi] branch asf-site updated: [HUDI-1985] Migrate the hudi site to docusaurus platform (website complete re-design) (#3366)

This is an automated email from the ASF dual-hosted git repository.

vinoth pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 299ce0c  [HUDI-1985] Migrate the hudi site to docusaurus platform (website complete re-design) (#3366)
299ce0c is described below

commit 299ce0c6073339444b105c91be532d5682ad9916
Author: Vinoth Govindarajan <vi...@uber.com>
AuthorDate: Mon Aug 2 19:59:40 2021 -0700

    [HUDI-1985] Migrate the hudi site to docusaurus platform (website complete re-design) (#3366)
    
    * Migrate the hudi site to docusaurus platform
    
    * Fixed the doc_root in travis file
    
    * Fixed the travis build script.
    
    * Added the website folder correctly.
    
    * Replaced yarn commands with npm to avoid conflicts with hadoop yarn command
    
    * new build files
    
    * Simplified the travis build script
    
    * Fixed the travis build script and styling
    
    * Restored content to resolve merge conflicts.
    
    * Rebased to resolve merge conflicts
    
    * Fixed the config pages layout and build script
    
    * Updated footer link from roadmap to streaming data lake + added robinhood to powered-by page
    
    * added robinhood to powered-by page
    
    * Updated the site as per the review comments
---
 .travis.yml                                        |   21 +-
 README.md                                          |  193 +--
 website/.gitignore                                 |   28 +
 website/babel.config.js                            |    3 +
 website/blog/2016-12-30-strata-talk-2017.md        |   10 +
 website/blog/2019-01-18-asf-incubation.md          |    9 +
 website/blog/2019-03-07-batch-vs-incremental.md    |    8 +
 .../blog/2019-05-14-registering-dataset-to-hive.md |   86 ++
 .../blog/2019-09-09-ingesting-database-changes.md  |   45 +
 website/blog/2020-01-15-delete-support-in-hudi.md  |  189 +++
 .../blog/2020-01-20-change-capture-using-aws.md    |  202 ++++
 website/blog/2020-03-22-exporting-hudi-datasets.md |  102 ++
 .../blog/2020-04-27-apache-hudi-apache-zepplin.md  |   65 +
 ...0-05-28-monitoring-hudi-metrics-with-datadog.md |   65 +
 ...18-hudi-incremental-processing-on-data-lakes.md |  275 +++++
 ...-efficient-migration-of-large-parquet-tables.md |  175 +++
 ...2020-08-21-async-compaction-deployment-model.md |   99 ++
 ...2020-08-22-ingest-multiple-tables-using-hudi.md |  104 ++
 ...020-10-06-cdc-solution-using-hudi-by-nclouds.md |    8 +
 .../2020-10-15-apache-hudi-meets-apache-flink.md   |  196 ++++
 .../2020-10-19-hudi-meets-aws-emr-and-aws-dms.md   |    8 +
 .../blog/2020-11-11-hudi-indexing-mechanisms.md    |  124 ++
 ...gh-perf-data-lake-with-hudi-and-alluxio-t3go.md |  100 ++
 website/blog/2021-01-27-hudi-clustering-intro.md   |  132 +++
 website/blog/2021-02-13-hudi-key-generators.md     |  192 +++
 website/blog/2021-03-01-hudi-file-sizing.md        |   85 ++
 ...loying-right-configurations-for-hudi-cleaner.md |  107 ++
 .../2021-07-21-streaming-data-lake-platform.md     |  151 +++
 website/contribute/developer-setup.md              |  292 +++++
 website/contribute/get-involved.md                 |   24 +
 website/contribute/how-to-contribute.md            |   44 +
 website/contribute/report-security-issues.md       |   28 +
 website/contribute/team.md                         |   33 +
 website/docs/azure_hoodie.md                       |   50 +
 website/docs/bos_hoodie.md                         |   57 +
 website/docs/cloud.md                              |   27 +
 website/docs/comparison.md                         |   56 +
 website/docs/concepts.md                           |  172 +++
 website/docs/concurrency_control.md                |  149 +++
 website/docs/configurations.md                     |  433 +++++++
 website/docs/cos_hoodie.md                         |   71 ++
 website/docs/deployment.md                         |  578 +++++++++
 website/docs/docker_demo.md                        | 1235 +++++++++++++++++++
 website/docs/flink-quick-start-guide.md            |  167 +++
 website/docs/gcs_hoodie.md                         |   60 +
 website/docs/ibm_cos_hoodie.md                     |   77 ++
 website/docs/metrics.md                            |  160 +++
 website/docs/migration_guide.md                    |   70 ++
 website/docs/oss_hoodie.md                         |   70 ++
 website/docs/overview.md                           |  172 +++
 website/docs/performance.md                        |   64 +
 website/docs/privacy.md                            |   22 +
 website/docs/querying_data.md                      |  273 +++++
 website/docs/quick-start-guide.md                  |  594 ++++++++++
 website/docs/s3_hoodie.md                          |   80 ++
 website/docs/structure.md                          |   20 +
 website/docs/use_cases.md                          |   81 ++
 website/docs/writing_data.md                       |  614 ++++++++++
 website/docusaurus.config.js                       |  415 +++++++
 website/i18n/cn/code.json                          |  166 +++
 .../cn/docusaurus-plugin-content-docs/current.json |    6 +
 .../current/azure_hoodie.md                        |   51 +
 .../current/bos_hoodie.md                          |   58 +
 .../current/cloud.md                               |   28 +
 .../current/comparison.md                          |   48 +
 .../current/concepts.md                            |  154 +++
 .../current/configurations.md                      |  604 ++++++++++
 .../current/cos_hoodie.md                          |   73 ++
 .../current/deployment.md                          |  433 +++++++
 .../current/docker_demo.md                         | 1155 ++++++++++++++++++
 .../current/docs-versions.md                       |   12 +
 .../current/gcs_hoodie.md                          |   61 +
 .../current/ibm_cos_hoodie.md                      |   78 ++
 .../current/migration_guide.md                     |   72 ++
 .../current/oss_hoodie.md                          |   71 ++
 .../current/performance.md                         |   62 +
 .../current/powered_by.md                          |   88 ++
 .../current/privacy.md                             |   23 +
 .../current/querying_data.md                       |  224 ++++
 .../current/s3_hoodie.md                           |   81 ++
 .../current/spark_quick-start-guide.md             |  449 +++++++
 .../current/use_cases.md                           |   67 ++
 .../current/writing_data.md                        |  222 ++++
 .../version-0.5.0/admin_guide.md                   |  434 +++++++
 .../version-0.5.0/comparison.md                    |   49 +
 .../version-0.5.0/concepts.md                      |  155 +++
 .../version-0.5.0/configurations.md                |  469 ++++++++
 .../version-0.5.0/docker_demo.md                   | 1153 ++++++++++++++++++
 .../version-0.5.0/gcs_hoodie.md                    |   62 +
 .../version-0.5.0/migration_guide.md               |   73 ++
 .../version-0.5.0/performance.md                   |   63 +
 .../version-0.5.0/powered_by.md                    |   58 +
 .../version-0.5.0/privacy.md                       |   24 +
 .../version-0.5.0/querying_data.md                 |  147 +++
 .../version-0.5.0/quick-start-guide.md             |  161 +++
 .../version-0.5.0/s3_hoodie.md                     |   82 ++
 .../version-0.5.0/use_cases.md                     |   68 ++
 .../version-0.5.0/writing_data.md                  |  223 ++++
 .../version-0.5.1/comparison.md                    |   49 +
 .../version-0.5.1/concepts.md                      |  155 +++
 .../version-0.5.1/configurations.md                |  469 ++++++++
 .../version-0.5.1/deployment.md                    |  434 +++++++
 .../version-0.5.1/docker_demo.md                   | 1153 ++++++++++++++++++
 .../version-0.5.1/gcs_hoodie.md                    |   62 +
 .../version-0.5.1/migration_guide.md               |   73 ++
 .../version-0.5.1/performance.md                   |   63 +
 .../version-0.5.1/powered_by.md                    |   58 +
 .../version-0.5.1/privacy.md                       |   24 +
 .../version-0.5.1/querying_data.md                 |  177 +++
 .../version-0.5.1/quick-start-guide.md             |  161 +++
 .../version-0.5.1/s3_hoodie.md                     |   82 ++
 .../version-0.5.1/use_cases.md                     |   68 ++
 .../version-0.5.1/writing_data.md                  |  223 ++++
 .../version-0.5.2/comparison.md                    |   49 +
 .../version-0.5.2/concepts.md                      |  155 +++
 .../version-0.5.2/configurations.md                |  473 ++++++++
 .../version-0.5.2/deployment.md                    |  434 +++++++
 .../version-0.5.2/docker_demo.md                   | 1153 ++++++++++++++++++
 .../version-0.5.2/gcs_hoodie.md                    |   62 +
 .../version-0.5.2/migration_guide.md               |   73 ++
 .../version-0.5.2/performance.md                   |   63 +
 .../version-0.5.2/powered_by.md                    |   58 +
 .../version-0.5.2/privacy.md                       |   24 +
 .../version-0.5.2/querying_data.md                 |  206 ++++
 .../version-0.5.2/quick-start-guide.md             |  161 +++
 .../version-0.5.2/s3_hoodie.md                     |   82 ++
 .../version-0.5.2/use_cases.md                     |   68 ++
 .../version-0.5.2/writing_data.md                  |  223 ++++
 .../version-0.5.3/azure_hoodie.md                  |   52 +
 .../version-0.5.3/cloud.md                         |   21 +
 .../version-0.5.3/comparison.md                    |   49 +
 .../version-0.5.3/concepts.md                      |  155 +++
 .../version-0.5.3/configurations.md                |  538 +++++++++
 .../version-0.5.3/deployment.md                    |  434 +++++++
 .../version-0.5.3/docker_demo.md                   | 1153 ++++++++++++++++++
 .../version-0.5.3/gcs_hoodie.md                    |   62 +
 .../version-0.5.3/migration_guide.md               |   73 ++
 .../version-0.5.3/oss_hoodie.md                    |   72 ++
 .../version-0.5.3/performance.md                   |   63 +
 .../version-0.5.3/powered_by.md                    |   67 ++
 .../version-0.5.3/privacy.md                       |   24 +
 .../version-0.5.3/querying_data.md                 |  206 ++++
 .../version-0.5.3/quick-start-guide.md             |  420 +++++++
 .../version-0.5.3/s3_hoodie.md                     |   82 ++
 .../version-0.5.3/use_cases.md                     |   68 ++
 .../version-0.5.3/writing_data.md                  |  223 ++++
 .../version-0.6.0/azure_hoodie.md                  |   52 +
 .../version-0.6.0/cloud.md                         |   24 +
 .../version-0.6.0/comparison.md                    |   49 +
 .../version-0.6.0/concepts.md                      |  155 +++
 .../version-0.6.0/configurations.md                |  532 +++++++++
 .../version-0.6.0/cos_hoodie.md                    |   74 ++
 .../version-0.6.0/deployment.md                    |  434 +++++++
 .../version-0.6.0/docker_demo.md                   | 1153 ++++++++++++++++++
 .../version-0.6.0/gcs_hoodie.md                    |   62 +
 .../version-0.6.0/migration_guide.md               |   73 ++
 .../version-0.6.0/oss_hoodie.md                    |   72 ++
 .../version-0.6.0/performance.md                   |   63 +
 .../version-0.6.0/powered_by.md                    |   72 ++
 .../version-0.6.0/privacy.md                       |   24 +
 .../version-0.6.0/querying_data.md                 |  223 ++++
 .../version-0.6.0/quick-start-guide.md             |  163 +++
 .../version-0.6.0/s3_hoodie.md                     |   82 ++
 .../version-0.6.0/use_cases.md                     |   68 ++
 .../version-0.6.0/writing_data.md                  |  223 ++++
 .../version-0.7.0/azure_hoodie.md                  |   52 +
 .../version-0.7.0/cloud.md                         |   26 +
 .../version-0.7.0/comparison.md                    |   49 +
 .../version-0.7.0/configurations.md                |  601 ++++++++++
 .../version-0.7.0/cos_hoodie.md                    |   74 ++
 .../version-0.7.0/deployment.md                    |  434 +++++++
 .../version-0.7.0/docker_demo.md                   | 1156 ++++++++++++++++++
 .../version-0.7.0/gcs_hoodie.md                    |   62 +
 .../version-0.7.0/ibm_cos_hoodie.md                |   79 ++
 .../version-0.7.0/migration_guide.md               |   73 ++
 .../version-0.7.0/oss_hoodie.md                    |   72 ++
 .../version-0.7.0/overview.md                      |  155 +++
 .../version-0.7.0/performance.md                   |   63 +
 .../version-0.7.0/powered_by.md                    |   89 ++
 .../version-0.7.0/privacy.md                       |   24 +
 .../version-0.7.0/querying_data.md                 |  223 ++++
 .../version-0.7.0/quick-start-guide.md             |  429 +++++++
 .../version-0.7.0/s3_hoodie.md                     |   82 ++
 .../version-0.7.0/use_cases.md                     |   68 ++
 .../version-0.7.0/writing_data.md                  |  223 ++++
 .../version-0.8.0/azure_hoodie.md                  |   52 +
 .../version-0.8.0/cloud.md                         |   26 +
 .../version-0.8.0/comparison.md                    |   49 +
 .../version-0.8.0/configurations.md                |  601 ++++++++++
 .../version-0.8.0/cos_hoodie.md                    |   74 ++
 .../version-0.8.0/deployment.md                    |  434 +++++++
 .../version-0.8.0/docker_demo.md                   | 1156 ++++++++++++++++++
 .../version-0.8.0/gcs_hoodie.md                    |   62 +
 .../version-0.8.0/ibm_cos_hoodie.md                |   79 ++
 .../version-0.8.0/migration_guide.md               |   73 ++
 .../version-0.8.0/oss_hoodie.md                    |   72 ++
 .../version-0.8.0/overview.md                      |  155 +++
 .../version-0.8.0/performance.md                   |   63 +
 .../version-0.8.0/powered_by.md                    |   89 ++
 .../version-0.8.0/privacy.md                       |   24 +
 .../version-0.8.0/querying_data.md                 |  225 ++++
 .../version-0.8.0/s3_hoodie.md                     |   82 ++
 .../version-0.8.0/spark_quick-start-guide.md       |  449 +++++++
 .../version-0.8.0/use_cases.md                     |   68 ++
 .../version-0.8.0/writing_data.md                  |  223 ++++
 .../cn/docusaurus-plugin-content-pages/activity.md |    6 +
 .../contributing.md                                |  169 +++
 .../developer-setup.md                             |  293 +++++
 .../cn/docusaurus-plugin-content-pages/download.md |   34 +
 .../get-involved.md                                |   25 +
 .../how-to-contribute.md                           |   45 +
 .../older_releases.md                              |   94 ++
 .../cn/docusaurus-plugin-content-pages/releases.md |  237 ++++
 .../report-security-issues.md                      |   29 +
 .../cn/docusaurus-plugin-content-pages/security.md |   28 +
 .../cn/docusaurus-plugin-content-pages/team.md     |   34 +
 .../i18n/cn/docusaurus-theme-classic/footer.json   |   74 ++
 .../i18n/cn/docusaurus-theme-classic/navbar.json   |   50 +
 website/package.json                               |   46 +
 website/releases/download.md                       |   36 +
 website/releases/older-releases.md                 |  119 ++
 website/releases/release-0.5.3.md                  |   56 +
 website/releases/release-0.6.0.md                  |   62 +
 website/releases/release-0.7.0.md                  |   86 ++
 website/releases/release-0.8.0.md                  |   50 +
 website/scripts/build-site.sh                      |   32 +
 website/sidebars.js                                |   63 +
 website/sidebarsContribute.js                      |   24 +
 website/sidebarsReleases.js                        |   19 +
 website/src/css/custom.css                         |  173 +++
 website/src/pages/404.md                           |    7 +
 website/src/pages/index.js                         |  130 ++
 website/src/pages/index.module.css                 |   25 +
 website/src/pages/powered-by.md                    |  114 ++
 website/src/pages/roadmap.md                       |   27 +
 website/src/pages/talks-articles.md                |  108 ++
 website/src/theme/prism-include-languages.js       |   23 +
 website/static/.nojekyll                           |    0
 website/static/assets/images/asf_logo.svg          |  210 ++++
 website/static/assets/images/async_compac_1.png    |  Bin 0 -> 60344 bytes
 website/static/assets/images/async_compac_2.png    |  Bin 0 -> 54164 bytes
 website/static/assets/images/async_compac_3.png    |  Bin 0 -> 70516 bytes
 website/static/assets/images/async_compac_4.png    |  Bin 0 -> 66932 bytes
 .../blog/2020-05-28-datadog-metrics-demo.png       |  Bin 0 -> 105729 bytes
 .../assets/images/blog/2020-08-20-per-record.png   |  Bin 0 -> 5762 bytes
 .../assets/images/blog/2020-08-20-skeleton.png     |  Bin 0 -> 25778 bytes
 .../blog/2020-12-01-t3go-architecture-alluxio.png  |  Bin 0 -> 123624 bytes
 .../images/blog/2020-12-01-t3go-architecture.png   |  Bin 0 -> 72891 bytes
 .../images/blog/2020-12-01-t3go-microbenchmark.png |  Bin 0 -> 56321 bytes
 .../assets/images/blog/batch_vs_incremental.png    |  Bin 0 -> 22104 bytes
 .../images/blog/change-capture-architecture.png    |  Bin 0 -> 16807 bytes
 .../assets/images/blog/change-logs-mysql.png       |  Bin 0 -> 114403 bytes
 .../clustering/Query_Plan_After_Clustering.png     |  Bin 0 -> 97289 bytes
 .../clustering/Query_Plan_Before_Clustering.png    |  Bin 0 -> 96605 bytes
 .../blog/clustering/example_perf_improvement.png   |  Bin 0 -> 416119 bytes
 .../Hudi_design_diagram_-_Page_2_1.png             |  Bin 0 -> 52035 bytes
 .../Screen_Shot_2021-07-20_at_5.35.47_PM.png       |  Bin 0 -> 163959 bytes
 .../images/blog/datalake-platform/hudi-comic.png   |  Bin 0 -> 93630 bytes
 .../datalake-platform/hudi-data-lake-platform.png  |  Bin 0 -> 128340 bytes
 .../hudi-data-lake-platform_-_Copy_of_Page_1_3.png |  Bin 0 -> 130359 bytes
 .../hudi-data-lake-platform_-_Page_2_4.png         |  Bin 0 -> 282177 bytes
 .../hudi-design-diagram_-incr-read.png             |  Bin 0 -> 57567 bytes
 .../hudi-design-diagrams-table-format.png          |  Bin 0 -> 42148 bytes
 .../hudi-design-diagrams_-_Page_2_1.png            |  Bin 0 -> 70552 bytes
 .../hudi-design-diagrams_-_Page_4.png              |  Bin 0 -> 81348 bytes
 .../hudi-design-diagrams_-_Page_5.png              |  Bin 0 -> 123834 bytes
 .../hudi-design-diagrams_-_Page_6.png              |  Bin 0 -> 74018 bytes
 .../hudi-design-diagrams_-_Page_7.png              |  Bin 0 -> 95019 bytes
 .../hudi-design-diagrams_-_Page_8.png              |  Bin 0 -> 39783 bytes
 .../static/assets/images/blog/dms-demo-files.png   |  Bin 0 -> 52683 bytes
 website/static/assets/images/blog/dms-task.png     |  Bin 0 -> 27532 bytes
 .../blog/hoodie-cleaner/Initial_timeline.png       |  Bin 0 -> 141789 bytes
 .../blog/hoodie-cleaner/Retain_latest_commits.png  |  Bin 0 -> 145891 bytes
 .../blog/hoodie-cleaner/Retain_latest_versions.png |  Bin 0 -> 146622 bytes
 .../blog/hudi-file-sizing/adding_new_files.png     |  Bin 0 -> 44237 bytes
 .../bin_packing_existing_data_files.png            |  Bin 0 -> 23955 bytes
 .../blog/hudi-file-sizing/initial_layout.png       |  Bin 0 -> 34742 bytes
 .../images/blog/hudi-indexes/Dimension20tables.gif |  Bin 0 -> 577717 bytes
 .../images/blog/hudi-indexes/Event20tables.gif     |  Bin 0 -> 558858 bytes
 .../images/blog/hudi-indexes/Fact20tables.gif      |  Bin 0 -> 595395 bytes
 .../hudi-indexes/Hudi_Index_Blog_Event_table.png   |  Bin 0 -> 20851 bytes
 .../hudi-indexes/Hudi_Index_Blog_Fact_table.png    |  Bin 0 -> 20910 bytes
 .../Hudi_Index_Blog_dimensions_table.png           |  Bin 0 -> 21011 bytes
 .../blog/hudi-indexes/with-and-without-index.png   |  Bin 0 -> 139025 bytes
 .../assets/images/blog/hudi-meets-flink/image1.png |  Bin 0 -> 161393 bytes
 .../assets/images/blog/hudi-meets-flink/image2.png |  Bin 0 -> 123298 bytes
 .../assets/images/blog/hudi-meets-flink/image3.png |  Bin 0 -> 175346 bytes
 .../assets/images/blog/incr-processing/image1.png  |  Bin 0 -> 59805 bytes
 .../assets/images/blog/incr-processing/image2.png  |  Bin 0 -> 385336 bytes
 .../assets/images/blog/incr-processing/image3.png  |  Bin 0 -> 167680 bytes
 .../assets/images/blog/incr-processing/image4.jpg  |  Bin 0 -> 19807 bytes
 .../assets/images/blog/incr-processing/image5.png  |  Bin 0 -> 225670 bytes
 .../assets/images/blog/incr-processing/image6.png  |  Bin 0 -> 67083 bytes
 .../assets/images/blog/incr-processing/image7.png  |  Bin 0 -> 44297 bytes
 .../assets/images/blog/incr-processing/image8.png  |  Bin 0 -> 209792 bytes
 .../assets/images/blog/read_optimized_view.png     |  Bin 0 -> 134293 bytes
 .../static/assets/images/blog/real_time_view.png   |  Bin 0 -> 134366 bytes
 .../images/blog/s3-endpoint-configuration-1.png    |  Bin 0 -> 38385 bytes
 .../images/blog/s3-endpoint-configuration-2.png    |  Bin 0 -> 40796 bytes
 .../images/blog/s3-endpoint-configuration.png      |  Bin 0 -> 38582 bytes
 .../static/assets/images/blog/s3-endpoint-list.png |  Bin 0 -> 26945 bytes
 .../assets/images/blog/s3-migration-task-1.png     |  Bin 0 -> 35653 bytes
 .../assets/images/blog/s3-migration-task-2.png     |  Bin 0 -> 43549 bytes
 .../assets/images/blog/spark_edit_properties.png   |  Bin 0 -> 9342 bytes
 .../images/blog/spark_read_optimized_view.png      |  Bin 0 -> 38582 bytes
 .../assets/images/blog/spark_real_time_view.png    |  Bin 0 -> 38416 bytes
 .../images/contributing/IDE_setup_copyright_1.png  |  Bin 0 -> 710312 bytes
 .../images/contributing/IDE_setup_copyright_2.png  |  Bin 0 -> 581227 bytes
 .../images/contributing/IDE_setup_maven_1.png      |  Bin 0 -> 608854 bytes
 .../images/contributing/IDE_setup_maven_2.png      |  Bin 0 -> 588765 bytes
 website/static/assets/images/favicon.ico           |  Bin 0 -> 4286 bytes
 website/static/assets/images/github.png            |  Bin 0 -> 1104 bytes
 website/static/assets/images/hudi-lake.png         |  Bin 0 -> 150248 bytes
 website/static/assets/images/hudi.png              |  Bin 0 -> 14848 bytes
 .../static/assets/images/hudi_commit_duration.png  |  Bin 0 -> 252950 bytes
 website/static/assets/images/hudi_cow.png          |  Bin 0 -> 48994 bytes
 .../static/assets/images/hudi_graphite_metrics.png |  Bin 0 -> 165146 bytes
 website/static/assets/images/hudi_intro_1.png      |  Bin 0 -> 53386 bytes
 website/static/assets/images/hudi_intro_big.png    |  Bin 0 -> 53397 bytes
 website/static/assets/images/hudi_jxm_metrics.png  |  Bin 0 -> 243761 bytes
 .../static/assets/images/hudi_log_format_v2.png    |  Bin 0 -> 223676 bytes
 website/static/assets/images/hudi_mor.png          |  Bin 0 -> 92073 bytes
 .../static/assets/images/hudi_query_perf_hive.png  |  Bin 0 -> 158481 bytes
 .../assets/images/hudi_query_perf_presto.png       |  Bin 0 -> 33727 bytes
 .../static/assets/images/hudi_query_perf_spark.png |  Bin 0 -> 29384 bytes
 website/static/assets/images/hudi_site_logo.png    |  Bin 0 -> 13170 bytes
 website/static/assets/images/hudi_timeline.png     |  Bin 0 -> 23093 bytes
 website/static/assets/images/hudi_upsert_dag.png   |  Bin 0 -> 496203 bytes
 website/static/assets/images/hudi_upsert_perf1.png |  Bin 0 -> 15984 bytes
 website/static/assets/images/hudi_upsert_perf2.png |  Bin 0 -> 18954 bytes
 website/static/assets/images/logo-big-2.png        |  Bin 0 -> 66543 bytes
 website/static/assets/images/logo-big-bak.png      |  Bin 0 -> 7238 bytes
 website/static/assets/images/logo-big.png          |  Bin 0 -> 7238 bytes
 website/static/assets/images/powers/37.PNG         |  Bin 0 -> 46397 bytes
 website/static/assets/images/powers/H3C.JPG        |  Bin 0 -> 76097 bytes
 website/static/assets/images/powers/alibaba.png    |  Bin 0 -> 7247 bytes
 website/static/assets/images/powers/aws.jpg        |  Bin 0 -> 6937 bytes
 website/static/assets/images/powers/bmw.png        |  Bin 0 -> 29210 bytes
 website/static/assets/images/powers/clinbrain.png  |  Bin 0 -> 9452 bytes
 website/static/assets/images/powers/emis.jpg       |  Bin 0 -> 7113 bytes
 website/static/assets/images/powers/kyligence.png  |  Bin 0 -> 1379 bytes
 website/static/assets/images/powers/lingyue.png    |  Bin 0 -> 2953 bytes
 website/static/assets/images/powers/moveworks.png  |  Bin 0 -> 5911 bytes
 website/static/assets/images/powers/qq.png         |  Bin 0 -> 3532 bytes
 website/static/assets/images/powers/robinhood.png  |  Bin 0 -> 10517 bytes
 website/static/assets/images/powers/shunfeng.png   |  Bin 0 -> 5871 bytes
 website/static/assets/images/powers/tathastu.png   |  Bin 0 -> 7771 bytes
 website/static/assets/images/powers/tongcheng.png  |  Bin 0 -> 4207 bytes
 website/static/assets/images/powers/uber.png       |  Bin 0 -> 7034 bytes
 website/static/assets/images/powers/yield.png      |  Bin 0 -> 823 bytes
 website/static/assets/images/powers/yotpo.png      |  Bin 0 -> 1697 bytes
 website/static/assets/images/roadmap.png           |  Bin 0 -> 146918 bytes
 website/static/assets/images/slack.png             |  Bin 0 -> 914 bytes
 website/static/assets/images/twitter.png           |  Bin 0 -> 666 bytes
 website/static/assets/images/workflowarrow.png     |  Bin 0 -> 3595 bytes
 .../versioned_docs/version-0.5.0/admin_guide.md    |  444 +++++++
 website/versioned_docs/version-0.5.0/comparison.md |   57 +
 website/versioned_docs/version-0.5.0/concepts.md   |  171 +++
 .../versioned_docs/version-0.5.0/configurations.md |  432 +++++++
 .../versioned_docs/version-0.5.0/docker_demo.md    | 1152 ++++++++++++++++++
 website/versioned_docs/version-0.5.0/gcs_hoodie.md |   61 +
 .../version-0.5.0/migration_guide.md               |   72 ++
 .../versioned_docs/version-0.5.0/performance.md    |   65 +
 website/versioned_docs/version-0.5.0/powered_by.md |   69 ++
 website/versioned_docs/version-0.5.0/privacy.md    |   23 +
 .../versioned_docs/version-0.5.0/querying_data.md  |  146 +++
 .../version-0.5.0/quick-start-guide.md             |  174 +++
 website/versioned_docs/version-0.5.0/s3_hoodie.md  |   81 ++
 website/versioned_docs/version-0.5.0/structure.md  |   21 +
 website/versioned_docs/version-0.5.0/use_cases.md  |   67 ++
 .../versioned_docs/version-0.5.0/writing_data.md   |  222 ++++
 website/versioned_docs/version-0.5.1/comparison.md |   57 +
 website/versioned_docs/version-0.5.1/concepts.md   |  172 +++
 .../versioned_docs/version-0.5.1/configurations.md |  432 +++++++
 website/versioned_docs/version-0.5.1/deployment.md |  597 ++++++++++
 .../versioned_docs/version-0.5.1/docker_demo.md    | 1162 ++++++++++++++++++
 website/versioned_docs/version-0.5.1/gcs_hoodie.md |   61 +
 .../version-0.5.1/migration_guide.md               |   71 ++
 .../versioned_docs/version-0.5.1/performance.md    |   65 +
 website/versioned_docs/version-0.5.1/powered_by.md |   69 ++
 website/versioned_docs/version-0.5.1/privacy.md    |   23 +
 .../versioned_docs/version-0.5.1/querying_data.md  |  200 ++++
 .../version-0.5.1/quick-start-guide.md             |  219 ++++
 website/versioned_docs/version-0.5.1/s3_hoodie.md  |   81 ++
 website/versioned_docs/version-0.5.1/structure.md  |   21 +
 website/versioned_docs/version-0.5.1/use_cases.md  |   67 ++
 .../versioned_docs/version-0.5.1/writing_data.md   |  252 ++++
 website/versioned_docs/version-0.5.2/comparison.md |   57 +
 website/versioned_docs/version-0.5.2/concepts.md   |  172 +++
 .../versioned_docs/version-0.5.2/configurations.md |  436 +++++++
 website/versioned_docs/version-0.5.2/deployment.md |  597 ++++++++++
 .../versioned_docs/version-0.5.2/docker_demo.md    | 1231 +++++++++++++++++++
 website/versioned_docs/version-0.5.2/gcs_hoodie.md |   61 +
 .../version-0.5.2/migration_guide.md               |   71 ++
 .../versioned_docs/version-0.5.2/performance.md    |   65 +
 website/versioned_docs/version-0.5.2/powered_by.md |   69 ++
 website/versioned_docs/version-0.5.2/privacy.md    |   23 +
 .../versioned_docs/version-0.5.2/querying_data.md  |  201 ++++
 .../version-0.5.2/quick-start-guide.md             |  219 ++++
 website/versioned_docs/version-0.5.2/s3_hoodie.md  |   81 ++
 website/versioned_docs/version-0.5.2/structure.md  |   21 +
 website/versioned_docs/version-0.5.2/use_cases.md  |   67 ++
 .../versioned_docs/version-0.5.2/writing_data.md   |  252 ++++
 .../versioned_docs/version-0.5.3/azure_hoodie.md   |   51 +
 website/versioned_docs/version-0.5.3/cloud.md      |   21 +
 website/versioned_docs/version-0.5.3/comparison.md |   57 +
 website/versioned_docs/version-0.5.3/concepts.md   |  172 +++
 .../versioned_docs/version-0.5.3/configurations.md |  500 ++++++++
 website/versioned_docs/version-0.5.3/deployment.md |  597 ++++++++++
 .../versioned_docs/version-0.5.3/docker_demo.md    | 1234 +++++++++++++++++++
 website/versioned_docs/version-0.5.3/gcs_hoodie.md |   61 +
 .../version-0.5.3/migration_guide.md               |   71 ++
 website/versioned_docs/version-0.5.3/oss_hoodie.md |   71 ++
 .../versioned_docs/version-0.5.3/performance.md    |   65 +
 website/versioned_docs/version-0.5.3/powered_by.md |  122 ++
 website/versioned_docs/version-0.5.3/privacy.md    |   23 +
 .../versioned_docs/version-0.5.3/querying_data.md  |  201 ++++
 .../version-0.5.3/quick-start-guide.md             |  451 +++++++
 website/versioned_docs/version-0.5.3/s3_hoodie.md  |   81 ++
 website/versioned_docs/version-0.5.3/structure.md  |   21 +
 website/versioned_docs/version-0.5.3/use_cases.md  |   67 ++
 .../versioned_docs/version-0.5.3/writing_data.md   |  252 ++++
 .../versioned_docs/version-0.6.0/1_2_structure.md  |   21 +
 .../versioned_docs/version-0.6.0/2_8_metrics.md    |  161 +++
 .../versioned_docs/version-0.6.0/azure_hoodie.md   |   51 +
 website/versioned_docs/version-0.6.0/cloud.md      |   24 +
 website/versioned_docs/version-0.6.0/comparison.md |   57 +
 website/versioned_docs/version-0.6.0/concepts.md   |  172 +++
 .../versioned_docs/version-0.6.0/configurations.md |  638 ++++++++++
 website/versioned_docs/version-0.6.0/cos_hoodie.md |   72 ++
 website/versioned_docs/version-0.6.0/deployment.md |  579 +++++++++
 .../versioned_docs/version-0.6.0/docker_demo.md    | 1234 +++++++++++++++++++
 website/versioned_docs/version-0.6.0/gcs_hoodie.md |   61 +
 .../version-0.6.0/migration_guide.md               |   71 ++
 website/versioned_docs/version-0.6.0/oss_hoodie.md |   71 ++
 .../versioned_docs/version-0.6.0/performance.md    |   65 +
 website/versioned_docs/version-0.6.0/powered_by.md |  134 +++
 website/versioned_docs/version-0.6.0/privacy.md    |   23 +
 .../versioned_docs/version-0.6.0/querying_data.md  |  221 ++++
 .../version-0.6.0/quick-start-guide.md             |  457 ++++++++
 website/versioned_docs/version-0.6.0/s3_hoodie.md  |   81 ++
 website/versioned_docs/version-0.6.0/use_cases.md  |   67 ++
 .../versioned_docs/version-0.6.0/writing_data.md   |  390 ++++++
 .../versioned_docs/version-0.7.0/azure_hoodie.md   |   51 +
 website/versioned_docs/version-0.7.0/cloud.md      |   26 +
 website/versioned_docs/version-0.7.0/comparison.md |   57 +
 website/versioned_docs/version-0.7.0/concepts.md   |  172 +++
 .../versioned_docs/version-0.7.0/configurations.md |  778 ++++++++++++
 website/versioned_docs/version-0.7.0/cos_hoodie.md |   72 ++
 website/versioned_docs/version-0.7.0/deployment.md |  579 +++++++++
 .../versioned_docs/version-0.7.0/docker_demo.md    | 1236 ++++++++++++++++++++
 website/versioned_docs/version-0.7.0/gcs_hoodie.md |   61 +
 .../versioned_docs/version-0.7.0/ibm_cos_hoodie.md |   78 ++
 website/versioned_docs/version-0.7.0/metrics.md    |  161 +++
 .../version-0.7.0/migration_guide.md               |   71 ++
 website/versioned_docs/version-0.7.0/oss_hoodie.md |   71 ++
 website/versioned_docs/version-0.7.0/overview.md   |  173 +++
 .../versioned_docs/version-0.7.0/performance.md    |   65 +
 website/versioned_docs/version-0.7.0/powered_by.md |  174 +++
 website/versioned_docs/version-0.7.0/privacy.md    |   23 +
 .../versioned_docs/version-0.7.0/querying_data.md  |  221 ++++
 .../version-0.7.0/quick-start-guide.md             |  529 +++++++++
 website/versioned_docs/version-0.7.0/s3_hoodie.md  |   81 ++
 website/versioned_docs/version-0.7.0/structure.md  |   21 +
 website/versioned_docs/version-0.7.0/use_cases.md  |   82 ++
 .../versioned_docs/version-0.7.0/writing_data.md   |  392 +++++++
 .../versioned_docs/version-0.8.0/azure_hoodie.md   |   51 +
 website/versioned_docs/version-0.8.0/cloud.md      |   26 +
 website/versioned_docs/version-0.8.0/comparison.md |   57 +
 website/versioned_docs/version-0.8.0/concepts.md   |  172 +++
 .../version-0.8.0/concurrency_control.md           |  145 +++
 .../versioned_docs/version-0.8.0/configurations.md |  904 ++++++++++++++
 website/versioned_docs/version-0.8.0/cos_hoodie.md |   72 ++
 website/versioned_docs/version-0.8.0/deployment.md |  579 +++++++++
 .../versioned_docs/version-0.8.0/docker_demo.md    | 1236 ++++++++++++++++++++
 .../version-0.8.0/flink-quick-start-guide.md       |  169 +++
 website/versioned_docs/version-0.8.0/gcs_hoodie.md |   61 +
 .../versioned_docs/version-0.8.0/ibm_cos_hoodie.md |   78 ++
 website/versioned_docs/version-0.8.0/metrics.md    |  161 +++
 .../version-0.8.0/migration_guide.md               |   71 ++
 website/versioned_docs/version-0.8.0/oss_hoodie.md |   71 ++
 website/versioned_docs/version-0.8.0/overview.md   |  173 +++
 .../versioned_docs/version-0.8.0/performance.md    |   65 +
 website/versioned_docs/version-0.8.0/powered_by.md |  180 +++
 website/versioned_docs/version-0.8.0/privacy.md    |   23 +
 .../versioned_docs/version-0.8.0/querying_data.md  |  264 +++++
 .../version-0.8.0/quick-start-guide.md             |  594 ++++++++++
 website/versioned_docs/version-0.8.0/s3_hoodie.md  |   81 ++
 website/versioned_docs/version-0.8.0/structure.md  |   21 +
 website/versioned_docs/version-0.8.0/use_cases.md  |   82 ++
 .../versioned_docs/version-0.8.0/writing_data.md   |  426 +++++++
 .../versioned_sidebars/version-0.5.0-sidebars.json |   58 +
 .../versioned_sidebars/version-0.5.1-sidebars.json |   62 +
 .../versioned_sidebars/version-0.5.2-sidebars.json |   62 +
 .../versioned_sidebars/version-0.5.3-sidebars.json |   74 ++
 .../versioned_sidebars/version-0.6.0-sidebars.json |   78 ++
 .../versioned_sidebars/version-0.7.0-sidebars.json |   90 ++
 .../versioned_sidebars/version-0.8.0-sidebars.json |  105 ++
 website/versions.json                              |    9 +
 website/versionsArchived.json                      |    1 +
 500 files changed, 71571 insertions(+), 91 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index d429378..facb8f5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,6 @@
-language: ruby
-rvm:
-  - 2.6.3
+language: node_js
+node_js:
+  - '16'
 
 env:
   global:
@@ -9,7 +9,7 @@ env:
     - GIT_REPO="apache"
     - GIT_PROJECT="hudi"
     - GIT_BRANCH="asf-site"
-    - DOCS_ROOT="`pwd`/docs"
+    - DOCS_ROOT="`pwd`/website"
 
 before_install:
   - if [ "$(git show -s --format=%ae)" = "${GIT_EMAIL}" ]; then echo "avoid recursion, ignore ..."; exit 0; fi
@@ -17,25 +17,24 @@ before_install:
   - git config --global user.email ${GIT_EMAIL}
   - git remote add hudi https://${GIT_TOKEN}@github.com/${GIT_REPO}/${GIT_PROJECT}.git
   - git checkout -b pr
-  - git pull --rebase hudi asf-site
 
 script:
   - pushd ${DOCS_ROOT}
-  - gem install bundler:2.0.2
-  - bundle install
-  - bundle update --bundler
-  - bundle exec jekyll build _config.yml --source . --destination _site
+  - npm install
+  - npm run build
   - popd
 
 after_success:
   - echo $TRAVIS_PULL_REQUEST
   - 'if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then echo "ignore push build result for per submit"; exit 0; fi'
   - 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then echo "pushing build result ..."; fi'
-  - mkdir -p content && \cp -rf ${DOCS_ROOT}/_site/* content
+  - rm -rf content
+  - cp -R ${DOCS_ROOT}/build content
   - git add -A
   - git commit -am "Travis CI build asf-site"
   - git push hudi pr:asf-site
 
 branches:
   only:
-    - asf-site
\ No newline at end of file
+    - asf-site
+
diff --git a/README.md b/README.md
index 5fc5581..b414dca 100644
--- a/README.md
+++ b/README.md
@@ -1,121 +1,154 @@
-## Site Documentation
+# Apache Hudi Website Source Code
 
-This folder contains resources that build the [Apache Hudi website](https://hudi.apache.org)
+This repo hosts the source code of the [Apache Hudi official website](https://hudi.apache.org/).
 
+## Prerequisites
 
-### Building docs
+Install [npm](https://treehouse.github.io/installation-guides/mac/node-mac.html) if you don't already have it.
 
-The site is based on a [Jekyll](https://jekyllrb.com/) theme hosted [here](https://github.com/mmistakes/minimal-mistakes/) with detailed instructions.
+## Testing the Website
 
-#### Docker
-
-Simply run `docker-compose build --no-cache && docker-compose up` from the `docs` folder and the site should be up & running at `http://localhost:4000`
+Build the site from source:
+```bash
+./website/scripts/build-site.sh
+```
 
-To see edits reflect on the site, you may have to bounce the container
+The generated site is moved to the `content` directory.
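+
+For reference, the script is roughly equivalent to the following steps (a minimal sketch inferred from the Travis configuration above, not the verbatim contents of `build-site.sh`):
+
+```bash
+# Sketch only -- see website/scripts/build-site.sh for the real script
+cd website
+npm install             # install the Docusaurus dependencies
+npm run build           # generate the static site under website/build
+rm -rf ../content
+cp -R build ../content  # stage the generated site in content/ for publishing
+```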
 
- - Stop existing container by `ctrl+c` the docker-compose program
- - (or) alternatively via `docker stop docs_server_1`
- - Bring up container again using `docker-compose up`
+## Installation
 
-#### Host OS
+```console
+cd website
+npm install
+```
 
-To build directly on host OS (\*nix), first you need to install
+## Local Development
 
-- gem, ruby (using apt-get/brew)
-- bundler (`gem install bundler`)
-- jekyll (`gem install jekyll`)
-- Update bundler `bundle update --bundler`
+```console
+cd website
+npm start
+```
 
-and then run the following commands from `docs` folder to install dependencies
+This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server.
 
-`bundle install`
+## Build
 
-and serve a local site
+```console
+cd website
+npm run build
+```
 
-`bundle exec jekyll serve`
+This command generates static content into the `build` directory, which can be served by any static content hosting service.
 
-### Submitting changes
+## Testing your Build Locally {#testing-build-locally}
 
-To submit changes to the docs, please make the changes on the `asf-site` branch, build the site locally, test it out and submit a pull request with the changes to .md and theme files under `docs`
+It is important to test your build locally before deploying to production.
 
-### Updating site
+```console
+cd website
+npm run serve
+```
 
-Once a pull request merged, Travis CI will regenerate the site and move the generated site from `_site` to `docs/../content`, and then submit changes as a PR automatically.
+## Adding a New Docs Version
+
+The sections below explain how Docusaurus versioning works on this site and how to add a new docs version.
+
+## Directory structure {#directory-structure}
+
+```shell
+website
+├── sidebars.js          # sidebar for master (next) version
+├── docs                 # docs directory for master (next) version
+│   └── hello.md         # https://mysite.com/docs/next/hello
+├── versions.json        # file to indicate what versions are available
+├── versioned_docs
+│   ├── version-0.7.0
+│   │   └── hello.md     # https://mysite.com/docs/0.7.0/hello
+│   └── version-0.8.0
+│       └── hello.md     # https://mysite.com/docs/hello
+├── versioned_sidebars
+│   ├── version-0.7.0-sidebars.json
+│   └── version-0.8.0-sidebars.json
+├── docusaurus.config.js
+└── package.json
+```
 
-### Adding docs for version
+The table below explains how a versioned file maps to its version and the generated URL.
 
-During each release, we must preserve the old version's docs so users on that version can refer to it. 
-Below documents the steps needed to do that. 
+| Path                                    | Version        | URL               |
+| --------------------------------------- | -------------- | ----------------- |
+| `versioned_docs/version-0.7.0/hello.md` | 0.7.0          | /docs/0.7.0/hello |
+| `versioned_docs/version-0.8.0/hello.md` | 0.8.0 (latest) | /docs/hello       |
+| `docs/hello.md`                         | next           | /docs/next/hello  |
 
-#### Make a copy of current docs 
+### Tagging a new version {#tagging-a-new-version}
 
-Copy the docs as-is into another folder
+1. First, make sure your content in the `docs` directory is ready to be frozen as a version. A version should always be based on master.
+1. Enter a new version number.
 
-```
-cd docs/_docs
-export VERSION=0.5.0
-mkdir -p $VERSION && cp *.md $VERSION/
+```bash
+npm run docusaurus docs:version 0.8.0
 ```
 
-#### Rewrite links & add version to each page
+When tagging a new version, the document versioning mechanism will:
 
-This step changes the permalink (location where these pages would be placed) with a version prefix and also changes links to each other.
+- Copy the full `docs/` folder contents into a new `versioned_docs/version-<version>/` folder.
+- Create a versioned sidebars file based on your current sidebar configuration (`sidebars.js`), saved as `versioned_sidebars/version-<version>-sidebars.json`.
+- Append the new version number to `versions.json`.
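+
+For illustration, `versions.json` then ends up as a plain JSON array of the released doc versions, newest first (a sketch only; the checked-in `website/versions.json` is the source of truth):
+
+```json
+["0.8.0", "0.7.0", "0.6.0", "0.5.3", "0.5.2", "0.5.1", "0.5.0"]
+```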
 
-Mac users please use these commands:
-```
-cd $VERSION
-sed -i '' -e "s/permalink: \/docs\//permalink: \/docs\/${VERSION}-/g" *.md
-sed -i '' -e "s/permalink: \/cn\/docs\//permalink: \/cn\/docs\/${VERSION}-/g" *.cn.md
-sed -i '' -e "s/](\/docs\//](\/docs\/${VERSION}-/g" *.md
-sed -i '' -e "s/](\/cn\/docs\//](\/cn\/docs\/${VERSION}-/g" *.cn.md
-for f in *.md; do [ -f $f ] &&  sed -i '' -e "1s/^//p; 1s/^.*/version: ${VERSION}/" $f; done
-```
+## Docs {#docs}
 
-Non Mac please use these:
-```
-cd $VERSION
-sed -i "s/permalink: \/docs\//permalink: \/docs\/${VERSION}-/g" *.md
-sed -i "s/permalink: \/cn\/docs\//permalink: \/cn\/docs\/${VERSION}-/g" *.cn.md
-sed -i "s/](\/docs\//](\/docs\/${VERSION}-/g" *.md
-sed -i "s/](\/cn\/docs\//](\/cn\/docs\/${VERSION}-/g" *.cn.md
-sed -i "0,/---/s//---\nversion: ${VERSION}/" *.md
-```
+### Creating new docs {#creating-new-docs}
 
-#### Reworking site navigation
+1. Place the new file into the corresponding version folder.
+1. Add a reference to the new file in the corresponding sidebar file for that version.
 
-In `_config.yml`, add a new author section similar to `0.5.0_author`. Then, change `quick_link.html` with a if block to use this navigation, when the new version's page is rendered
-  
-```
-{%- if page.language == "0.5.0" -%}
-  {%- assign author = site.0.5.0_author -%}
-{%- else -%}
-  {%- assign author = site.author -%}
-{%- endif -%}
+**Master docs**
+
+```shell
+# The new file.
+docs/new.md
+
+# Edit the corresponding sidebar file.
+sidebars.js
 ```
 
-Then in `navigation.yml`, add a new section similar to `0.5.0_docs` (or the last release), with each link pointing to pages starting with `$VERSION-`. Change `nav_list` with else-if to 
-render the new version's equivalent navigation links. 
+**Older docs**
+
+```shell
+# The new file.
+versioned_docs/version-0.7.0/new.md
 
+# Edit the corresponding sidebar file.
+versioned_sidebars/version-0.7.0-sidebars.json
 ```
-{% if page.version %}
-    {% if page.version == "0.5.0" %}
-        {% assign navigation = site.data.navigation["0.5.0_docs"] %}
-    {% endif %}
-{% endif %}
+
+### Linking docs {#linking-docs}
+
+- Remember to include the `.md` extension.
+- Files will be linked to the correct corresponding version.
+- Relative paths work as well.
+
+```md
+The [@hello](hello.md#paginate) document is great!
+
+See the [Tutorial](../getting-started/tutorial.md) for more info.
 ```
 
-Final steps:
- - In `_config.yml` add a new subsection under `previous_docs: ` for this version similar to `  - version: 0.5.0`
- - Edit `docs/_pages.index.md` to point to the latest release. Change the text of latest release and edit the href 
- link to point to the release tag in github.
- - in `docs/_pages/releases.md` Add a new section on the very top for this release. Refer to `Release 0.5.0-incubating` 
- for reference. Ensure the links for github release tag, docs, source release, raw release notes are pointing to this 
- latest release. Also include following subsections - `Download Information`, `Release Highlights` and `Raw Release Notes`.
- - Update `docs/_pages/download.md` to include the download links.
- 
-#### Link to this version's doc
+## Versions {#versions}
+
+Each directory in `versioned_docs/` will represent a documentation version.
+
+### Updating an existing version {#updating-an-existing-version}
 
+You can update multiple docs versions at the same time because each directory in `versioned_docs/` represents specific routes when published.
 
+1. Edit any file.
+1. Commit and push changes.
+1. They will be published to that version.
 
+Example: When you change any file in `versioned_docs/version-0.7.0/`, it will only affect the docs for version `0.7.0`.
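+
+As a concrete (hypothetical) example of that workflow:
+
+```shell
+# Hypothetical example: fix a typo that only affects the 0.7.0 docs
+vim versioned_docs/version-0.7.0/querying_data.md
+git add versioned_docs/version-0.7.0/querying_data.md
+git commit -m "Fix typo in the 0.7.0 querying docs"
+git push     # once merged to asf-site, CI rebuilds and republishes the site
+```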
 
+## Maintainer
 
+Apache Hudi Community
diff --git a/website/.gitignore b/website/.gitignore
new file mode 100644
index 0000000..dea10e4
--- /dev/null
+++ b/website/.gitignore
@@ -0,0 +1,28 @@
+# Dependencies
+/node_modules
+package-lock.json
+yarn.lock
+.node_modules/
+# Production
+/build
+
+# Generated files
+.docusaurus
+.cache-loader
+
+# Misc
+.DS_Store
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+
+# IDE
+.vscode
+.idea
+*.code-workspace
+.changelog
\ No newline at end of file
diff --git a/website/babel.config.js b/website/babel.config.js
new file mode 100644
index 0000000..e00595d
--- /dev/null
+++ b/website/babel.config.js
@@ -0,0 +1,3 @@
+module.exports = {
+  presets: [require.resolve('@docusaurus/core/lib/babel/preset')],
+};
diff --git a/website/blog/2016-12-30-strata-talk-2017.md b/website/blog/2016-12-30-strata-talk-2017.md
new file mode 100644
index 0000000..8405d6d
--- /dev/null
+++ b/website/blog/2016-12-30-strata-talk-2017.md
@@ -0,0 +1,10 @@
+---
+title:  "Connect with us at Strata San Jose March 2017"
+author: admin
+date: 2016-12-30
+category: blog
+---
+
+We will be presenting Hudi & general concepts around how incremental processing works at Uber.
+Catch our talk **"Incremental Processing on Hadoop At Uber"**
+
diff --git a/website/blog/2019-01-18-asf-incubation.md b/website/blog/2019-01-18-asf-incubation.md
new file mode 100644
index 0000000..dde651e
--- /dev/null
+++ b/website/blog/2019-01-18-asf-incubation.md
@@ -0,0 +1,9 @@
+---
+title: "Hudi entered Apache Incubator"
+author: admin
+date: 2019-01-18
+category: blog
+---
+
+In the coming weeks, we will be moving into our new home at the Apache Incubator.
+
diff --git a/website/blog/2019-03-07-batch-vs-incremental.md b/website/blog/2019-03-07-batch-vs-incremental.md
new file mode 100644
index 0000000..2273227
--- /dev/null
+++ b/website/blog/2019-03-07-batch-vs-incremental.md
@@ -0,0 +1,8 @@
+---
+title: "Big Batch vs Incremental Processing"
+author: vinoth
+category: blog
+---
+
+![](/assets/images/blog/batch_vs_incremental.png)
+
diff --git a/website/blog/2019-05-14-registering-dataset-to-hive.md b/website/blog/2019-05-14-registering-dataset-to-hive.md
new file mode 100644
index 0000000..93faadf7
--- /dev/null
+++ b/website/blog/2019-05-14-registering-dataset-to-hive.md
@@ -0,0 +1,86 @@
+---
+title: "Registering sample dataset to Hive via beeline"
+excerpt: "How to manually register HUDI dataset into Hive using beeline"
+author: vinoth
+category: blog
+---
+
+The Hudi Hive sync tool typically handles registering the dataset in the Hive metastore. If the quickstart runs into issues around this, the commands below can be used to register it manually via beeline.
+
+<!--truncate-->
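+
+For reference, the sync tool itself is normally invoked along these lines (a hedged sketch -- the module path, script name and flags vary across Hudi releases, so treat it as illustrative only):
+
+```bash
+# Illustrative only: adjust the module path and flags to match your Hudi release
+cd hudi-hive-sync
+./run_sync_tool.sh \
+  --jdbc-url jdbc:hive2://hiveserver:10000 \
+  --user hive --pass hive \
+  --base-path hdfs:///tmp/hoodie/sample-table \
+  --database default --table hoodie_test \
+  --partitioned-by datestr
+```
+
+If that is not an option, the manual beeline route below works as well.
+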
+Add the Hudi Hive bundle jar, _packaging/hoodie-hive-bundle/target/hoodie-hive-bundle-0.4.6-SNAPSHOT.jar_, so that Hive can read the Hudi dataset and answer the query.
+
+```sql
+hive> set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+hive> set hive.stats.autogather=false;
+hive> add jar file:///path/to/hoodie-hive-bundle-0.5.2-SNAPSHOT.jar;
+Added [file:///path/to/hoodie-hive-bundle-0.5.2-SNAPSHOT.jar] to class path
+Added resources: [file:///path/to/hoodie-hive-bundle-0.5.2-SNAPSHOT.jar]
+```
+
+
+Then, you need to create a *ReadOptimized* Hive table as below and register the sample partitions
+
+```sql
+DROP TABLE hoodie_test;
+CREATE EXTERNAL TABLE hoodie_test(`_row_key` string,
+    `_hoodie_commit_time` string,
+    `_hoodie_commit_seqno` string,
+    rider string,
+    driver string,
+    begin_lat double,
+    begin_lon double,
+    end_lat double,
+    end_lon double,
+    fare double)
+    PARTITIONED BY (`datestr` string)
+    ROW FORMAT SERDE
+    'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+    STORED AS INPUTFORMAT
+    'com.uber.hoodie.hadoop.HoodieInputFormat'
+    OUTPUTFORMAT
+    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+    LOCATION
+    'hdfs:///tmp/hoodie/sample-table';
+     
+ALTER TABLE `hoodie_test` ADD IF NOT EXISTS PARTITION (datestr='2016-03-15') LOCATION 'hdfs:///tmp/hoodie/sample-table/2016/03/15';
+ALTER TABLE `hoodie_test` ADD IF NOT EXISTS PARTITION (datestr='2015-03-16') LOCATION 'hdfs:///tmp/hoodie/sample-table/2015/03/16';
+ALTER TABLE `hoodie_test` ADD IF NOT EXISTS PARTITION (datestr='2015-03-17') LOCATION 'hdfs:///tmp/hoodie/sample-table/2015/03/17';
+     
+set mapreduce.framework.name=yarn;
+```
+
+And you can add a *Realtime* Hive table, as below
+
+```sql
+DROP TABLE hoodie_rt;
+CREATE EXTERNAL TABLE hoodie_rt(
+    `_hoodie_commit_time` string,
+    `_hoodie_commit_seqno` string,
+    `_hoodie_record_key` string,
+    `_hoodie_partition_path` string,
+    `_hoodie_file_name` string,
+    timestamp double,
+    `_row_key` string,
+    rider string,
+    driver string,
+    begin_lat double,
+    begin_lon double,
+    end_lat double,
+    end_lon double,
+    fare double)
+    PARTITIONED BY (`datestr` string)
+    ROW FORMAT SERDE
+    'com.uber.hoodie.hadoop.realtime.HoodieParquetSerde'
+    STORED AS INPUTFORMAT
+    'com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat'
+    OUTPUTFORMAT
+    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+    LOCATION
+    'file:///tmp/hoodie/sample-table';
+     
+ALTER TABLE `hoodie_rt` ADD IF NOT EXISTS PARTITION (datestr='2016-03-15') LOCATION 'file:///tmp/hoodie/sample-table/2016/03/15';
+ALTER TABLE `hoodie_rt` ADD IF NOT EXISTS PARTITION (datestr='2015-03-16') LOCATION 'file:///tmp/hoodie/sample-table/2015/03/16';
+ALTER TABLE `hoodie_rt` ADD IF NOT EXISTS PARTITION (datestr='2015-03-17') LOCATION 'file:///tmp/hoodie/sample-table/2015/03/17';
+```
+
diff --git a/website/blog/2019-09-09-ingesting-database-changes.md b/website/blog/2019-09-09-ingesting-database-changes.md
new file mode 100644
index 0000000..4a295d0
--- /dev/null
+++ b/website/blog/2019-09-09-ingesting-database-changes.md
@@ -0,0 +1,45 @@
+---
+title: "Ingesting Database changes via Sqoop/Hudi"
+excerpt: "Learn how to ingesting changes from a HUDI dataset using Sqoop/Hudi"
+author: vinoth
+category: blog
+---
+
+Very simple in just 2 steps.
+
+**Step 1**: Extract new changes to the users table in MySQL as Avro data files on DFS
+<!--truncate-->
+```bash
+# Command to extract incrementals using sqoop
+bin/sqoop import \
+  -Dmapreduce.job.user.classpath.first=true \
+  --connect jdbc:mysql://localhost/users \
+  --username root \
+  --password ******* \
+  --table users \
+  --as-avrodatafile \
+  --target-dir s3:///tmp/sqoop/import-1/users
+```
+
+**Step 2**: Use your favorite datasource to read the extracted data and directly "upsert" the users table on DFS/Hive
+
+```scala
+// Spark Datasource
+import org.apache.hudi.DataSourceWriteOptions._
+// Imports needed for HoodieWriteConfig.TABLE_NAME and SaveMode.Append used below
+import org.apache.hudi.config.HoodieWriteConfig
+import org.apache.spark.sql.SaveMode
+// Use Spark datasource to read avro
+val inputDataset = spark.read.avro("s3://tmp/sqoop/import-1/users/*");
+     
+// save it as a Hudi dataset
+inputDataset.write.format("org.apache.hudi")
+  .option(HoodieWriteConfig.TABLE_NAME, "hoodie.users")
+  .option(RECORDKEY_FIELD_OPT_KEY(), "userID")
+  .option(PARTITIONPATH_FIELD_OPT_KEY(),"country")
+  .option(PRECOMBINE_FIELD_OPT_KEY(), "last_mod")
+  .option(OPERATION_OPT_KEY(), UPSERT_OPERATION_OPT_VAL())
+  .mode(SaveMode.Append)
+  .save("/path/on/dfs");
+```
+
+Alternatively, you can also use the Hudi [DeltaStreamer](https://hudi.apache.org/writing_data#deltastreamer) tool with the DFSSource.
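+
+For reference, a DeltaStreamer run for this use case would look roughly like the following (a hedged sketch -- the bundle jar, class names and flags depend on the Hudi release you are on, so consult the docs for your version):
+
+```bash
+# Illustrative only: upsert the sqoop-extracted avro files into a Hudi table
+spark-submit \
+  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
+  hudi-utilities-bundle.jar \
+  --table-type COPY_ON_WRITE \
+  --source-class org.apache.hudi.utilities.sources.AvroDFSSource \
+  --source-ordering-field last_mod \
+  --target-base-path /path/on/dfs \
+  --target-table users
+```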
+
diff --git a/website/blog/2020-01-15-delete-support-in-hudi.md b/website/blog/2020-01-15-delete-support-in-hudi.md
new file mode 100644
index 0000000..95afa4f
--- /dev/null
+++ b/website/blog/2020-01-15-delete-support-in-hudi.md
@@ -0,0 +1,189 @@
+---
+title: "Delete support in Hudi"
+excerpt: "Deletes are supported at a record level in Hudi with 0.5.1 release. This blog is a “how to” blog on how to delete records in hudi."
+author: shivnarayan
+category: blog
+---
+
+Deletes are supported at a record level in Hudi with the 0.5.1 release. This blog is a "how to" on deleting records in Hudi. Deletes can be performed in three ways: with the Hudi RDD APIs, with the Spark datasource, and with DeltaStreamer.
+<!--truncate-->
+### Delete using RDD Level APIs
+
+If you have an embedded _HoodieWriteClient_, then deletion is as simple as passing a _JavaRDD&#60;HoodieKey&#62;_ to the delete API.
+
+```java
+// Fetch list of HoodieKeys from elsewhere that needs to be deleted
+// convert to JavaRDD if required. JavaRDD<HoodieKey> toBeDeletedKeys
+List<WriteStatus> statuses = writeClient.delete(toBeDeletedKeys, commitTime);
+```
+
+### Deletion with Datasource
+
+Now we will walk through an example of how to perform deletes on a sample dataset using the Datasource API. Quick Start has the same example as below. Feel free to check it out.
+
+**Step 1** : Launch spark shell
+
+```bash
+bin/spark-shell --packages org.apache.hudi:hudi-spark-bundle:0.5.1-incubating \
+  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+```
+**Step 2** : Import as required and set up table name, etc for sample dataset
+
+```scala
+import org.apache.hudi.QuickstartUtils._
+import scala.collection.JavaConversions._
+import org.apache.spark.sql.SaveMode._
+import org.apache.hudi.DataSourceReadOptions._
+import org.apache.hudi.DataSourceWriteOptions._
+import org.apache.hudi.config.HoodieWriteConfig._
+     
+val tableName = "hudi_cow_table"
+val basePath = "file:///tmp/hudi_cow_table"
+val dataGen = new DataGenerator
+```
+
+**Step 3** : Insert data. Generate some new trips, load them into a DataFrame and write the DataFrame into the Hudi dataset as below.
+
+```scala
+val inserts = convertToStringList(dataGen.generateInserts(10))
+val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
+df.write.format("org.apache.hudi").
+  options(getQuickstartWriteConfigs).
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  mode(Overwrite).
+  save(basePath);
+```
+
+**Step 4** : Query data. Load the data files into a DataFrame.
+
+```scala
+val roViewDF = spark.read.
+  format("org.apache.hudi").
+  load(basePath + "/*/*/*/*")
+roViewDF.createOrReplaceTempView("hudi_ro_table")
+spark.sql("select count(*) from hudi_ro_table").show() // should return 10 (number of records inserted above)
+val riderValue = spark.sql("select distinct rider from hudi_ro_table").show()
+// copy the value displayed to be used in next step
+```
+
+**Step 5** : Fetch the records that need to be deleted, using the rider value from above. This example only illustrates how to delete; in the real world, you would use a Spark SQL select query to fetch the records that need to be deleted and invoke deletes on the result as shown below. The example rider value used is "rider-213".
+
+```scala
+val df = spark.sql("select uuid, partitionPath from hudi_ro_table where rider = 'rider-213'")
+```
+
+Replace the above query with any other query that will fetch the records to be deleted.
+
+**Step 6** : Issue deletes
+
+```scala
+val deletes = dataGen.generateDeletes(df.collectAsList())
+val df = spark.read.json(spark.sparkContext.parallelize(deletes, 2));
+df.write.format("org.apache.hudi").
+  options(getQuickstartWriteConfigs).
+  option(OPERATION_OPT_KEY,"delete").
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  mode(Append).
+  save(basePath);
+```
+
+**Step 7** : Reload the table and verify that the records are deleted
+
+```scala
+val roViewDFAfterDelete = spark.
+  read.
+  format("org.apache.hudi").
+  load(basePath + "/*/*/*/*")
+roViewDFAfterDelete.createOrReplaceTempView("hudi_ro_table")
+spark.sql("select uuid, partitionPath from hudi_ro_table where rider = 'rider-213'").show() // should not return any rows
+```
+
+### Deletion with HoodieDeltaStreamer
+
+Deletion with `HoodieDeltaStreamer` takes the same path as upsert, and so it relies on a specific boolean field called "*_hoodie_is_deleted*" in each record.
+
+-   If a record has this field set to _false_, or the field is not present, it is considered a regular upsert.
+-   If the field is set to _true_, the record is considered a delete.
+
+This essentially means that the source schema has to be changed to add this field, and all incoming records are expected to have it set. We will be working to relax this requirement in future releases.
+
+Let's say the original schema is:
+
+```json
+{
+  "type":"record",
+  "name":"example_tbl",
+  "fields":[{
+     "name": "uuid",
+     "type": "String"
+  }, {
+     "name": "ts",
+     "type": "string"
+  },  {
+     "name": "partitionPath",
+     "type": "string"
+  }, {
+     "name": "rank",
+     "type": "long"
+  }
+]}
+```
+
+To leverage deletion capabilities of `DeltaStreamer`, you have to change the schema as below.
+
+```json
+{
+  "type":"record",
+  "name":"example_tbl",
+  "fields":[{
+     "name": "uuid",
+     "type": "String"
+  }, {
+     "name": "ts",
+     "type": "string"
+  },  {
+     "name": "partitionPath",
+     "type": "string"
+  }, {
+     "name": "rank",
+     "type": "long"
+  }, {
+    "name" : "_hoodie_is_deleted",
+    "type" : "boolean",
+    "default" : false
+  }
+]}
+```
+
+Example incoming record for upsert
+
+```json
+{
+  "ts": 0.0,
+  "uuid":"69cdb048-c93e-4532-adf9-f61ce6afe605",
+  "rank": 1034,
+  "partitionpath":"americas/brazil/sao_paulo",
+  "_hoodie_is_deleted":false
+}
+```
+      
+
+Example incoming record that needs to be deleted
+```json
+{
+  "ts": 0.0,
+  "uuid": "19tdb048-c93e-4532-adf9-f61ce6afe10",
+  "rank": 1045,
+  "partitionpath":"americas/brazil/sao_paulo",
+  "_hoodie_is_deleted":true
+}
+```
+
+These are one-time changes. Once they are in place, the DeltaStreamer pipeline will handle both upserts and deletions within every batch. Each batch could contain a mix of upserts and deletes, and no additional steps or changes are required after this. Note that this performs a hard delete rather than a soft delete.
+
diff --git a/website/blog/2020-01-20-change-capture-using-aws.md b/website/blog/2020-01-20-change-capture-using-aws.md
new file mode 100644
index 0000000..b1ebe4e
--- /dev/null
+++ b/website/blog/2020-01-20-change-capture-using-aws.md
@@ -0,0 +1,202 @@
+---
+title: "Change Capture Using AWS Database Migration Service and Hudi"
+excerpt: "In this blog, we will build an end-end solution for capturing changes from a MySQL instance running on AWS RDS to a Hudi table on S3, using capabilities in the Hudi 0.5.1 release."
+author: vinoth
+category: blog
+---
+
+One of the core use-cases for Apache Hudi is enabling seamless, efficient database ingestion to your data lake. Even though this model has been talked about a lot, and many users have already adopted it, content on how to go about it is sparse.
+
+In this blog, we will build an end-end solution for capturing changes from a MySQL instance running on AWS RDS to a Hudi table on S3, using capabilities in the Hudi  **0.5.1 release**.
+<!--truncate-->
+  
+
+We can break up the problem into two pieces.
+
+1.  **Extracting change logs from MySQL**  : Surprisingly, this is still a pretty tricky problem to solve, and Hudi users often get stuck here. Thankfully, at least for AWS users, there is the  [Database Migration Service](https://aws.amazon.com/dms/)  (DMS for short), which performs this change capture and uploads the changes as parquet files on S3
+2.  **Applying these change logs to your data lake table**  : Once there are change logs in some form, the next step is to apply them incrementally to your table. This mundane task can be fully automated using the Hudi  [DeltaStreamer](http://hudi.apache.org/docs/writing_data#deltastreamer)  tool.
+
+  
+
+The actual end-end architecture looks something like this.
+![enter image description here](/assets/images/blog/change-capture-architecture.png)
+
+Let's now illustrate how one can accomplish this using a simple _orders_ table, stored in MySQL (these instructions should broadly apply to other database engines like Postgres or Aurora as well, though the SQL/syntax may change).
+
+```sql
+CREATE DATABASE hudi_dms;
+USE hudi_dms;
+     
+CREATE TABLE orders(
+   order_id INTEGER,
+   order_qty INTEGER,
+   customer_name VARCHAR(100),
+   updated_at TIMESTAMP DEFAULT NOW() ON UPDATE NOW(),
+   created_at TIMESTAMP DEFAULT NOW(),
+   CONSTRAINT orders_pk PRIMARY KEY(order_id)
+);
+ 
+INSERT INTO orders(order_id, order_qty, customer_name) VALUES(1, 10, 'victor');
+INSERT INTO orders(order_id, order_qty, customer_name) VALUES(2, 10, 'peter');
+```
+
+In the table, _order_id_ is the primary key, which will be enforced on the Hudi table as well. Since a batch of change records can contain changes to the same primary key, we also include _updated_at_ and _created_at_ fields, which are kept up to date as writes happen to the table.
+
+### Extracting Change logs from MySQL
+
+Before we can configure DMS, we first need to [prepare the MySQL instance](https://aws.amazon.com/premiumsupport/knowledge-center/enable-binary-logging-aurora/)  for change capture, by ensuring backups are enabled and binlog is turned on.
+![](/assets/images/blog/change-logs-mysql.png)
+
+Now, proceed to create endpoints in DMS that capture MySQL data and  [store it in S3, as parquet files](https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Target.S3).
+
+-   The source _hudi-source-db_ endpoint points to the DB server and provides basic authentication details
+-   The target _parquet-s3_ endpoint points to the bucket and folder on S3 where the change log records are stored as parquet files
+![](/assets/images/blog/s3-endpoint-configuration-1.png)
+![](/assets/images/blog/s3-endpoint-configuration-2.png)
+![](/assets/images/blog/s3-endpoint-list.png)
+
+Then proceed to create a migration task, as below. Give it a name, connect the source to the target, and be sure to pick the right _Migration type_ as shown below, to ensure ongoing changes are continuously replicated to S3. Also make sure to specify the rules DMS uses to decide which MySQL schemas/tables to replicate. In this example, we simply whitelist the _orders_ table under the _hudi_dms_ schema, as specified in the SQL above.
+
+![](/assets/images/blog/s3-migration-task-1.png)
+![](/assets/images/blog/s3-migration-task-2.png)
+
+Starting the DMS task should result in an initial load, like below.
+
+![](/assets/images/blog/dms-task.png)
+
+Simply reading the raw initial load file should give the same values as the upstream table
+
+```scala
+scala> spark.read.parquet("s3://hudi-dms-demo/orders/hudi_dms/orders/*").sort("updated_at").show
+ 
++--------+---------+-------------+-------------------+-------------------+
+|order_id|order_qty|customer_name|         updated_at|         created_at|
++--------+---------+-------------+-------------------+-------------------+
+|       2|       10|        peter|2020-01-20 20:12:22|2020-01-20 20:12:22|
+|       1|       10|       victor|2020-01-20 20:12:31|2020-01-20 20:12:31|
++--------+---------+-------------+-------------------+-------------------+
+
+```
+
+### Applying Change Logs using Hudi DeltaStreamer
+
+Now, we are ready to start consuming the change logs. Hudi DeltaStreamer runs as a Spark job on your favorite workflow scheduler (it also supports a continuous mode using the _--continuous_ flag, where it runs as a long-running Spark job), tailing a given path on S3 (or any DFS implementation) for new files and issuing an _upsert_ to a target Hudi dataset. The tool automatically checkpoints itself, so to ingest repeatedly, all one needs to do is to keep executing the DeltaStreamer periodically.
+
+With an initial load already on S3, we then run the following command (the deltastreamer command, from here on) to first ingest the full load and create a Hudi dataset on S3.
+
+```bash
+spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer  \
+  --packages org.apache.spark:spark-avro_2.11:2.4.4 \
+  --master yarn --deploy-mode client \
+  hudi-utilities-bundle_2.11-0.5.1-SNAPSHOT.jar \
+  --table-type COPY_ON_WRITE \
+  --source-ordering-field updated_at \
+  --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+  --target-base-path s3://hudi-dms-demo/hudi_orders --target-table hudi_orders \
+  --transformer-class org.apache.hudi.utilities.transform.AWSDmsTransformer \
+  --payload-class org.apache.hudi.payload.AWSDmsAvroPayload \
+  --hoodie-conf hoodie.datasource.write.recordkey.field=order_id,hoodie.datasource.write.partitionpath.field=customer_name,hoodie.deltastreamer.source.dfs.root=s3://hudi-dms-demo/orders/hudi_dms/orders
+```
+
+A few things are going on here:
+
+-   First, we specify the _--table-type_ as COPY_ON_WRITE. Hudi also supports a _MERGE_ON_READ_ table type, which you can choose instead.
+-   To handle cases where the input parquet files contain multiple updates/deletes or inserts/updates to the same record, we use _updated_at_ as the ordering field. This ensures that the change record with the latest timestamp is the one reflected in Hudi.
+-   We specify a target base path and a target table name, both needed for creating and writing to the Hudi table.
+-   We use a special payload class, _AWSDmsAvroPayload_, to handle the different change operations correctly. The parquet files generated by DMS have an _Op_ field that indicates whether a given change record is an insert (I), delete (D) or update (U), and the payload implementation uses this field to decide how to handle a given change record.
+-   You may also notice a special transformer class, _AWSDmsTransformer_, being specified. The reason here is tactical, but important: the initial load files do not contain an _Op_ field, so this transformer adds one to match the Hudi table schema.
+-   Finally, we specify the record key for the Hudi table to be the same as in the upstream table. Then we specify partitioning by _customer_name_ and also the root folder of the DMS output.
+
+Once the command is run, the Hudi table should be created and have the same records as the upstream table (along with all the _hoodie_ metadata fields as well).
+
+```scala
+scala> spark.read.format("org.apache.hudi").load("s3://hudi-dms-demo/hudi_orders/*/*.parquet").show
++-------------------+--------------------+------------------+----------------------+--------------------+--------+---------+-------------+-------------------+-------------------+---+
+|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|order_id|order_qty|customer_name|         updated_at|         created_at| Op|
++-------------------+--------------------+------------------+----------------------+--------------------+--------+---------+-------------+-------------------+-------------------+---+
+|     20200120205028|  20200120205028_0_1|                 2|                 peter|af9a2525-a486-40e...|       2|       10|        peter|2020-01-20 20:12:22|2020-01-20 20:12:22|   |
+|     20200120205028|  20200120205028_1_1|                 1|                victor|8e431ece-d51c-4c7...|       1|       10|       victor|2020-01-20 20:12:31|2020-01-20 20:12:31|   |
++-------------------+--------------------+------------------+----------------------+--------------------+--------+---------+-------------+-------------------+-------------------+---+
+```
+
+Now, let's do an insert and an update
+
+```sql
+INSERT INTO orders(order_id, order_qty, customer_name) VALUES(3, 30, 'sandy');
+UPDATE orders set order_qty = 20 where order_id = 2;
+```
+
+This will add a new parquet file to the DMS output folder, and when the deltastreamer command is run again, it will apply these changes to the Hudi table.
+
+So, querying the Hudi table now would yield 3 rows, and the _hoodie_commit_time_ accurately reflects when these writes happened. Notice that order_qty for order_id=2 is updated from 10 to 20!
+
+```bash
++-------------------+--------------------+------------------+----------------------+--------------------+---+--------+---------+-------------+-------------------+-------------------+
+|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name| Op|order_id|order_qty|customer_name|         updated_at|         created_at|
++-------------------+--------------------+------------------+----------------------+--------------------+---+--------+---------+-------------+-------------------+-------------------+
+|     20200120211526|  20200120211526_0_1|                 2|                 peter|af9a2525-a486-40e...|  U|       2|       20|        peter|2020-01-20 21:11:47|2020-01-20 20:12:22|
+|     20200120211526|  20200120211526_1_1|                 3|                 sandy|566eb34a-e2c5-44b...|  I|       3|       30|        sandy|2020-01-20 21:11:24|2020-01-20 21:11:24|
+|     20200120205028|  20200120205028_1_1|                 1|                victor|8e431ece-d51c-4c7...|   |       1|       10|       victor|2020-01-20 20:12:31|2020-01-20 20:12:31|
++-------------------+--------------------+------------------+----------------------+--------------------+---+--------+---------+-------------+-------------------+-------------------+
+```
+
+A nice debugging aid would be to read all of the DMS output now and sort it by updated_at, which should give us the sequence of changes that happened on the upstream table. As we can see, the Hudi table above is a compacted snapshot of this raw change log.
+
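+For reference, a query along these lines could produce the view below. This is a hedged sketch: it assumes the same demo bucket layout as above and relies on parquet schema merging, since the initial load files have no _Op_ column.
+
+```scala
+// Merge schemas across the initial-load and CDC files, then order by updated_at
+// to reconstruct the sequence of changes on the upstream table.
+spark.read.
+  option("mergeSchema", "true").
+  parquet("s3://hudi-dms-demo/orders/hudi_dms/orders/*").
+  select("Op", "order_id", "order_qty", "customer_name", "updated_at", "created_at").
+  sort("updated_at").
+  show()
+```
+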
+```bash
++----+--------+---------+-------------+-------------------+-------------------+
+|  Op|order_id|order_qty|customer_name|         updated_at|         created_at|
++----+--------+---------+-------------+-------------------+-------------------+
+|null|       2|       10|        peter|2020-01-20 20:12:22|2020-01-20 20:12:22|
+|null|       1|       10|       victor|2020-01-20 20:12:31|2020-01-20 20:12:31|
+|   I|       3|       30|        sandy|2020-01-20 21:11:24|2020-01-20 21:11:24|
+|   U|       2|       20|        peter|2020-01-20 21:11:47|2020-01-20 20:12:22|
++----+--------+---------+-------------+-------------------+-------------------+
+```
+
+Initial load with no _Op_ field value, followed by an insert and an update.
+
+Now, let's do a delete and some inserts
+
+```sql
+DELETE FROM orders WHERE order_id = 2;
+INSERT INTO orders(order_id, order_qty, customer_name) VALUES(4, 40, 'barry');
+INSERT INTO orders(order_id, order_qty, customer_name) VALUES(5, 50, 'nathan');
+```
+
+This should result in more files on S3 written by DMS, which the DeltaStreamer command will continue to process incrementally (i.e., only the newly written files are read each time)
+
+![](/assets/images/blog/dms-demo-files.png)
+
+Running the deltastreamer command again would result in the following state for the Hudi table. Notice the two new records, and that _order_id=2_ is now gone
+
+```bash
++-------------------+--------------------+------------------+----------------------+--------------------+---+--------+---------+-------------+-------------------+-------------------+
+|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name| Op|order_id|order_qty|customer_name|         updated_at|         created_at|
++-------------------+--------------------+------------------+----------------------+--------------------+---+--------+---------+-------------+-------------------+-------------------+
+|     20200120212522|  20200120212522_1_1|                 5|                nathan|3da94b20-c70b-457...|  I|       5|       50|       nathan|2020-01-20 21:23:00|2020-01-20 21:23:00|
+|     20200120212522|  20200120212522_2_1|                 4|                 barry|8cc46715-8f0f-48a...|  I|       4|       40|        barry|2020-01-20 21:22:49|2020-01-20 21:22:49|
+|     20200120211526|  20200120211526_1_1|                 3|                 sandy|566eb34a-e2c5-44b...|  I|       3|       30|        sandy|2020-01-20 21:11:24|2020-01-20 21:11:24|
+|     20200120205028|  20200120205028_1_1|                 1|                victor|8e431ece-d51c-4c7...|   |       1|       10|       victor|2020-01-20 20:12:31|2020-01-20 20:12:31|
++-------------------+--------------------+------------------+----------------------+--------------------+---+--------+---------+-------------+-------------------+-------------------+
+```
+
+Our little informal change log query yields the following.
+
+```bash
++----+--------+---------+-------------+-------------------+-------------------+
+|  Op|order_id|order_qty|customer_name|         updated_at|         created_at|
++----+--------+---------+-------------+-------------------+-------------------+
+|null|       2|       10|        peter|2020-01-20 20:12:22|2020-01-20 20:12:22|
+|null|       1|       10|       victor|2020-01-20 20:12:31|2020-01-20 20:12:31|
+|   I|       3|       30|        sandy|2020-01-20 21:11:24|2020-01-20 21:11:24|
+|   U|       2|       20|        peter|2020-01-20 21:11:47|2020-01-20 20:12:22|
+|   D|       2|       20|        peter|2020-01-20 21:11:47|2020-01-20 20:12:22|
+|   I|       4|       40|        barry|2020-01-20 21:22:49|2020-01-20 21:22:49|
+|   I|       5|       50|       nathan|2020-01-20 21:23:00|2020-01-20 21:23:00|
++----+--------+---------+-------------+-------------------+-------------------+
+```
+
+Note that the delete and the update have the same _updated_at_ value, and thus they could very well be ordered differently here. In short, this way of looking at the changelog has its caveats. For a true changelog of the Hudi table itself, you can issue an [incremental query](http://hudi.apache.org/docs/querying_data).
+
+And Life goes on ..... Hope this was useful to all the data engineers out there!
+
diff --git a/website/blog/2020-03-22-exporting-hudi-datasets.md b/website/blog/2020-03-22-exporting-hudi-datasets.md
new file mode 100644
index 0000000..0811d01
--- /dev/null
+++ b/website/blog/2020-03-22-exporting-hudi-datasets.md
@@ -0,0 +1,102 @@
+---
+title: "Export Hudi datasets as a copy or as different formats"
+excerpt: "Learn how to copy or export HUDI dataset in various formats."
+author: rxu
+category: blog
+---
+
+### Copy to Hudi dataset
+
+Similar to the existing  `HoodieSnapshotCopier`, the Exporter scans the source dataset and then makes a copy of it to the target output path.
+<!--truncate-->
+```bash
+spark-submit \
+  --jars "packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.6.0-SNAPSHOT.jar" \
+  --deploy-mode "client" \
+  --class "org.apache.hudi.utilities.HoodieSnapshotExporter" \
+      packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.6.0-SNAPSHOT.jar \
+  --source-base-path "/tmp/" \
+  --target-output-path "/tmp/exported/hudi/" \
+  --output-format "hudi"
+```
+
+### Export to json or parquet dataset
+The Exporter can also convert the source dataset into other formats. Currently only "json" and "parquet" are supported.
+
+```bash
+spark-submit \
+  --jars "packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.6.0-SNAPSHOT.jar" \
+  --deploy-mode "client" \
+  --class "org.apache.hudi.utilities.HoodieSnapshotExporter" \
+      packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.6.0-SNAPSHOT.jar \
+  --source-base-path "/tmp/" \
+  --target-output-path "/tmp/exported/json/" \
+  --output-format "json"  # or "parquet"
+```
+
+### Re-partitioning
+
+When exporting to a different format, the Exporter takes parameters for custom re-partitioning. By default, if neither of the two parameters below is given, the output dataset will not be partitioned.
+
+#### `--output-partition-field`
+
+This parameter uses an existing non-metadata field as the output partition field. All  `_hoodie_*`  metadata fields will be stripped during export.
+
+```bash
+spark-submit \
+  --jars "packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.6.0-SNAPSHOT.jar" \
+  --deploy-mode "client" \
+  --class "org.apache.hudi.utilities.HoodieSnapshotExporter" \
+      packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.6.0-SNAPSHOT.jar \
+  --source-base-path "/tmp/" \
+  --target-output-path "/tmp/exported/json/" \
+  --output-format "json" \
+  --output-partition-field "symbol"  # assume the source dataset contains a field `symbol`
+```
+
+The output directory will look like this
+
+```bash
+_SUCCESS symbol=AMRS symbol=AYX symbol=CDMO symbol=CRC symbol=DRNA ...
+```
+
+#### `--output-partitioner`
+
+This parameter takes the fully-qualified name of a class that implements  `HoodieSnapshotExporter.Partitioner`. It takes precedence over  `--output-partition-field`, which will be ignored if this is provided.
+
+An example implementation is shown below:
+
+**MyPartitioner.java**
+
+```java
+package com.foo.bar;
+
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.utilities.HoodieSnapshotExporter;
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.DataFrameWriter;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+public class MyPartitioner implements HoodieSnapshotExporter.Partitioner {
+
+  private static final String PARTITION_NAME = "date";
+ 
+  @Override
+  public DataFrameWriter<Row> partition(Dataset<Row> source) {
+    // use the current hoodie partition path as the output partition
+    return source
+        .withColumnRenamed(HoodieRecord.PARTITION_PATH_METADATA_FIELD, PARTITION_NAME)
+        .repartition(new Column(PARTITION_NAME))
+        .write()
+        .partitionBy(PARTITION_NAME);
+  }
+}
+```
+
+After putting this class in `my-custom.jar`, which is then placed on the job classpath, the submit command will look like this:
+
+```bash
+spark-submit \
+  --jars "packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.6.0-SNAPSHOT.jar,my-custom.jar" \
+  --deploy-mode "client" \
+  --class "org.apache.hudi.utilities.HoodieSnapshotExporter" \
+      packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.6.0-SNAPSHOT.jar \
+  --source-base-path "/tmp/" \
+  --target-output-path "/tmp/exported/json/" \
+  --output-format "json" \
+  --output-partitioner "com.foo.bar.MyPartitioner"
+```
+
diff --git a/website/blog/2020-04-27-apache-hudi-apache-zepplin.md b/website/blog/2020-04-27-apache-hudi-apache-zepplin.md
new file mode 100644
index 0000000..7dfcf35
--- /dev/null
+++ b/website/blog/2020-04-27-apache-hudi-apache-zepplin.md
@@ -0,0 +1,65 @@
+---
+title: "Apache Hudi Support on Apache Zeppelin"
+excerpt: "Integrating HUDI's real-time and read-optimized query capabilities into Apache Zeppelin’s notebook"
+author: leesf
+category: blog
+---
+
+
+## 1. Introduction
+Apache Zeppelin is a web-based notebook that provides interactive data analysis. It makes it convenient to produce beautiful documents that are data-driven, interactive, and collaborative, and it supports multiple languages, including Scala (using Apache Spark), Python (Apache Spark), SparkSQL, Hive, Markdown, Shell, and so on. Hive and SparkSQL currently support querying Hudi’s read-optimized view and real-time view, so in theory Zeppelin’s notebook should also have such query capabilities.
+<!--truncate-->
+## 2. The achieved effect
+### 2.1 Hive
+
+#### 2.1.1 Read optimized view
+![Read Optimized View](/assets/images/blog/read_optimized_view.png)
+
+#### 2.1.2 Real-time view
+![Real-time View](/assets/images/blog/real_time_view.png)
+
+### 2.2 Spark SQL
+
+#### 2.2.1 Read optimized view
+![Read Optimized View](/assets/images/blog/spark_read_optimized_view.png)
+
+#### 2.2.2 Real-time view
+![Real-time View](/assets/images/blog/spark_real_time_view.png)
+
+## 3. Common problems
+
+### 3.1 Hudi package adaptation
+Zeppelin loads the packages under its lib directory by default on startup. External dependencies such as Hudi are best placed directly under `zeppelin/lib`, so that Hive or Spark SQL does not fail to find the corresponding Hudi dependency on the cluster.
+
+### 3.2 Parquet jar package adaptation
+The Hudi bundle ships with parquet 1.10, while the parquet version of the CDH cluster here is 1.9, so executing a Hudi table query reports many jar package conflict errors.
+
+**Solution**: upgrade the parquet package to 1.10 in the `spark/jars` directory of the node where Zeppelin is located.
+**Side effects**: Spark jobs other than Zeppelin's that are assigned to the cluster nodes running parquet 1.10 may fail.
+**Suggestions**: Clients other than Zeppelin will also have jar conflicts. Therefore, it is recommended to fully upgrade the cluster's spark jars, parquet jars and related dependent jars to better adapt to Hudi’s capabilities.
+
+### 3.3 Spark Interpreter adaptation
+
+The same SQL run via Spark SQL on Zeppelin returns more records than the Hive query.
+
+**Cause of the problem**: When reading and writing Parquet tables registered in the Hive metastore, Spark SQL uses its own Parquet SerDe (SerDe is short for Serializer/Deserializer) rather than Hive’s SerDe, because Spark SQL’s own SerDe has better performance.
+
+This causes Spark SQL to only query Hudi’s pipeline records, not the final merge result.
+
+**Solution**: set `spark.sql.hive.convertMetastoreParquet=false`
+
+ 1. **Method 1**: Edit the properties directly on the page
+![](/assets/images/blog/spark_edit_properties.png)
+ 2. **Method 2**: Edit `zeppelin/conf/interpreter.json` and add
+
+```json
+"spark.sql.hive.convertMetastoreParquet": {
+  "name": "spark.sql.hive.convertMetastoreParquet",
+  "value": false,
+  "type": "checkbox"
+}
+```
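+
+In addition to the two methods above, the property can also be set per session from a Spark paragraph in the notebook. This is a minimal sketch; whether it also takes effect for the Hive interpreter depends on your Zeppelin setup.
+
+```scala
+// Session-scoped alternative: disable the conversion before querying Hudi tables.
+spark.sql("set spark.sql.hive.convertMetastoreParquet=false")
+```
+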
+## 4. Hudi incremental view
+
+Hudi's incremental view currently only supports pulling data by writing Spark code. Considering that Zeppelin can execute code and shell commands directly in the notebook, we may later consider packaging such notebooks so that Hudi incremental views can be queried in a way that supports SQL.
+
diff --git a/website/blog/2020-05-28-monitoring-hudi-metrics-with-datadog.md b/website/blog/2020-05-28-monitoring-hudi-metrics-with-datadog.md
new file mode 100644
index 0000000..3697754
--- /dev/null
+++ b/website/blog/2020-05-28-monitoring-hudi-metrics-with-datadog.md
@@ -0,0 +1,65 @@
+---
+title: "Monitor Hudi metrics with Datadog"
+excerpt: "Introducing the feature of reporting Hudi metrics via Datadog HTTP API"
+author: rxu
+category: blog
+---
+
+## Availability
+
+**0.6.0 (unreleased)**
+
+## Introduction
+
+[Datadog](https://www.datadoghq.com/) is a popular monitoring service. In the upcoming `0.6.0` release of Apache Hudi, we will introduce the feature of reporting Hudi metrics via Datadog HTTP API, in addition to the current reporter types: Graphite and JMX.
+<!--truncate-->
+## Configurations
+
+Similar to other supported reporters, turning on Datadog reporter requires these 2 properties.
+
+```properties
+hoodie.metrics.on=true
+hoodie.metrics.reporter.type=DATADOG
+```
+
+The following property sets the Datadog API site. It determines whether the requests will be sent to `api.datadoghq.eu` (EU) or `api.datadoghq.com` (US). Set this according to your Datadog account settings.
+
+```properties
+hoodie.metrics.datadog.api.site=EU # or US
+```
+
+The property `hoodie.metrics.datadog.api.key` allows you to set the api key directly from the configuration. 
+
+```properties
+hoodie.metrics.datadog.api.key=<your api key>
+hoodie.metrics.datadog.api.key.supplier=<your api key supplier>
+```
+
+Due to security considerations, in some cases you may prefer to resolve the API key at runtime. To go with this approach, implement `java.util.function.Supplier<String>`, set the implementation's FQCN as `hoodie.metrics.datadog.api.key.supplier`, and make sure `hoodie.metrics.datadog.api.key` is _not_ set, since it would take higher precedence.
+
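+For illustration, a minimal Scala sketch of such a supplier is shown below; the package name, class name and environment variable are made up for this example.
+
+```scala
+package com.foo.bar
+
+import java.util.function.Supplier
+
+// Hypothetical supplier: resolves the Datadog API key at runtime from an
+// environment variable instead of hard-coding it in the properties file.
+class EnvDatadogApiKeySupplier extends Supplier[String] {
+  override def get(): String = sys.env.getOrElse("DD_API_KEY", "")
+}
+```
+
+With such a class on the classpath, setting `hoodie.metrics.datadog.api.key.supplier=com.foo.bar.EnvDatadogApiKeySupplier` would be the only key-related property needed.
+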
+The following property helps segregate metrics by setting different prefixes for different jobs. 
+
+```properties
+hoodie.metrics.datadog.metric.prefix=<your metrics prefix>
+```
+
+Note that it will use `.` to delimit the prefix and the metric name. For example, if the prefix is set to `foo`, then `foo.` will be prepended to the metric name.
+
+There are other optional properties, which are explained in the configuration reference page.
+
+## Demo
+
+In this demo, we ran a `HoodieDeltaStreamer` job with metrics turned on and other configurations set properly. 
+
+![datadog metrics demo](/assets/images/blog/2020-05-28-datadog-metrics-demo.png)
+
+As shown above, we were able to collect Hudi's action-related metrics like
+
+- `<prefix>.<table name>.commit.totalScanTime`
+- `<prefix>.<table name>.clean.duration`
+- `<prefix>.<table name>.index.lookup.duration`
+
+as well as `HoodieDeltaStreamer`-specific metrics
+
+- `<prefix>.<table name>.deltastreamer.duration`
+- `<prefix>.<table name>.deltastreamer.hiveSyncDuration`
diff --git a/website/blog/2020-08-18-hudi-incremental-processing-on-data-lakes.md b/website/blog/2020-08-18-hudi-incremental-processing-on-data-lakes.md
new file mode 100644
index 0000000..5361ad3
--- /dev/null
+++ b/website/blog/2020-08-18-hudi-incremental-processing-on-data-lakes.md
@@ -0,0 +1,275 @@
+---
+title: "Incremental Processing on the Data Lake"
+excerpt: "How Apache Hudi provides ability for incremental data processing."
+author: vinoyang
+category: blog
+---
+
+### NOTE: This article is a translation of the infoq.cn article, found [here](https://www.infoq.cn/article/CAgIDpfJBVcJHKJLSbhe), with minor edits
+
+Apache Hudi is a data lake framework which provides the ability to ingest, manage and query large analytical data sets on a distributed file system/cloud store. 
+Hudi entered the Apache Incubator in January 2019, and was promoted to a top-level Apache project in May 2020. This article mainly discusses the importance 
+of Hudi to the data lake from the perspective of "incremental processing". More information about Apache Hudi's framework functions, features, usage scenarios, and 
+latest developments can be found at [QCon Global Software Development Conference (Shanghai Station) 2020](https://qconplus.infoq.cn/2020/shanghai/presentation/2646).
+<!--truncate-->
+Throughout the development of big data technology, Hadoop has steadily seized the opportunities of this era and has become the de-facto standard for enterprises building big data infrastructure. 
+Among its components, HDFS, the distributed file system underpinning the Hadoop ecosystem, has almost naturally become the standard interface for big data storage systems. In recent years, with the rise of 
+cloud-native architectures, we have seen a wave of newer models embracing low-cost cloud storage, and a number of data lake frameworks that expose HDFS-compatible interfaces 
+while embracing cloud vendor storage have emerged in the industry as well. 
+
+However, we are still processing data pretty much the same way we did 10 years ago. This article will try to talk about the importance of incremental processing to the data lake.
+
+## Traditional data lakes lack the primitives for incremental processing
+
+In the era of the mobile Internet and the Internet of Things, late-arriving data is very common. 
+Two time semantics are involved here: [event time and processing time](https://www.oreilly.com/radar/the-world-beyond-batch-streaming-101/). 
+
+As the name suggests:
+
+ - **Event time:** the time when the event actually occurred;
+ - **Processing time:** the time when an event is observed (processed) in the system;
+
+Ideally, the event time and the processing time are the same, but in reality they may deviate more or less, which we often call "time skew". 
+Whether for low-latency stream computing or common batch processing, handling event time, processing time and late data is a common and difficult problem. 
+In general, in order to ensure correctness, when we strictly follow the "event time" semantics, late data will trigger the 
+[recalculation of the time window](https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/stream/operators/windows#late-elements-considerations) 
+(usually Hive partitions for batch processing), even though the results of these "windows" may have already been calculated or even shown to the end user. 
+For such recalculation, stream processing usually relies on scalable key-value stores, which work incrementally at the record/event level and are optimized 
+for point queries and updates. However, in data lakes, recalculating usually means rewriting the entire (immutable) Hive partition (or simply a folder in DFS), and 
+re-triggering the recalculation of cascading tasks that have consumed that Hive partition.
+
+With data lakes supporting massive amounts of data, many long-tail businesses still have a strong demand for updating cold data. However, for a long time, 
+the data in a single partition in the data lake was designed to be non-updatable. If it needs to be updated, the entire partition needs to be rewritten. 
+This will seriously damage the efficiency of the entire ecosystem. From the perspective of latency and resource utilization, these operations on Hadoop will incur expensive overhead.
+Besides, this overhead is usually also cascaded to the entire Hadoop data processing pipeline, which ultimately leads to an increase in latency by several hours.
+
+In response to the two problems mentioned above, if the data lake supports fine-grained incremental processing, we can incorporate changes into existing Hive partitions 
+more effectively, and provide a way for downstream table consumers to obtain only the changed data. For effectively supporting incremental processing, we can decompose it into the 
+following two primitive operations:
+
+ - **Update insert (upsert):** Conceptually, rewriting the entire partition can be regarded as a very inefficient upsert operation, which will eventually write much more data than the 
+original data itself. Therefore, support for (bulk) upsert is considered a very important feature. [Google's Mesa](https://research.google/pubs/pub42851/) (Google's data warehouse system) also 
+talks about several techniques that can be applied to rapid data ingestion scenarios.
+
+ - **Incremental consumption:** Although upsert can solve the problem of quickly releasing new data to a partition, downstream data consumers do not know 
+ which data has been changed from which time in the past. Usually, consumers can only know the changed data by scanning the entire partition/data table and 
+ recalculating all the data, which requires considerable time and resources. Therefore, we also need a mechanism to more efficiently obtain data records that 
+ have changed since the last time the partition was consumed.
+
+With the above two primitive operations, you can upsert a data set, incrementally consume from it, and create another (also incremental) data set, which solves the two problems 
+mentioned above and supports many common cases, thereby enabling end-to-end incremental processing and reducing end-to-end latency. Combined, these two primitives 
+unlock stream/incremental processing on top of the DFS abstraction.
+
+The storage scale of the data lake far exceeds that of the data warehouse. Although the two have different focuses on the definition of functions, 
+there is still a considerable intersection (of course, there are still disputes and deviations from definition and implementation. 
+This is not the topic this article tries to discuss). In any case, the data lake will support larger analytical data sets with cheaper storage, 
+so incremental processing is also very important for it. Next let's discuss the significance of incremental processing for the data lake.
+
+## The significance of incremental processing for the data lake
+
+### Streaming Semantics
+
+It has long been stated that there is a "[dualism](https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying)" 
+between the change log (that is, the "flow" in the conventional sense we understand) and the table.
+
+![dualism](/assets/images/blog/incr-processing/image4.jpg)
+
+The core of this discussion is: if there is a change log, you can use these changes to generate a data table and get the current status. If you update a table, 
+you can record these changes and publish all "change logs" to the table's status information. This interchangeable nature is called "stream table duality" for short.
+
+A more general understanding of "stream table duality": when the business system modifies the data in a MySQL table, MySQL reflects these changes as a Binlog; 
+if we publish this continuous Binlog (stream) to Kafka, let the downstream processing system subscribe to Kafka, and use a state store to gradually 
+accumulate intermediate results, then the current state of those intermediate results reflects the current snapshot of the table.
+
+If the two primitives mentioned above that support incremental processing can be introduced to the data lake, the above pipeline, which can reflect the 
+"stream table duality", is also applicable on the data lake. Based on the first primitive, the data lake can also ingest the Binlog log streams in Kafka, 
+and then store these Binlog log streams into "tables" on the data lake. Based on the second primitive, these tables recognize the changed records as "Binlog" 
+streams to support the incremental consumption of subsequent cascading tasks.
+
+Of course, as the data in the data lake needs to land on the final file/object storage, considering the trade-off between throughput and write performance, 
+the Binlog on the data lake corresponds to a small batch of change logs accumulated over a period of time on the stream. For example, the Apache Hudi community is further trying to 
+provide an incremental view similar to a Binlog across different commits (a commit refers to a committed batch of written data), 
+as shown in the following figure:
+
+![idu](/assets/images/blog/incr-processing/image1.png)
+
+Remarks in the "Flag" column:
+
+I: Insert;
+D: Delete;
+U: After image of Update;
+X: Before image of Update;
+
+Based on the above discussion, incremental processing and streaming are naturally compatible, and we can naturally connect them on the data lake.
+
+### Warehousing needs Incremental Processing
+
+In the data warehouse, whether it is dimensional modeling or relational modeling theory, it is usually constructed based on the [layered design ideas](https://en.wikipedia.org/wiki/Data_warehouse#Design_methods). 
+In terms of technical implementation, multiple stages (steps) of a long pipeline are formed by connecting multiple levels of ETL tasks through a workflow scheduling engine, 
+as shown in the following figure:
+
+![image2](/assets/images/blog/incr-processing/image2.png)
+
+In the OLAP field, the main application of the data warehouse, there are already frameworks in the industry that provide efficient analysis capabilities for the conventional 
+business scenarios with no or few changes. However, in the Hadoop data warehouse/data lake ecosystem, 
+there is still no good solution for analysis scenarios where business data changes frequently.
+
+For example, let’s consider the scenario of updating the order status of a travel business. This scenario has a typical long-tail effect: 
+you cannot know whether an order will be billed tomorrow, one month later, or one year later. In this scenario, the order table is the main data table, 
+but usually we will derive other derived tables based on this table to support the modeling of various business scenarios. 
+The initial update takes place in the order table at the ODS level, but the derived tables need to be updated in cascade.
+
+For this scenario, in the past, once there was a change, people usually needed to find the partition holding the data to be updated in the Hive order 
+table of the ODS layer and rewrite the entire partition; in addition, the partitions holding the related data in the derived tables needed to be updated in cascade.
+
+Yes, some will surely point out that Kudu's support for upserts can address the old Hive setup's lack of the first incremental primitive. 
+But the Kudu storage engine has its own limitations:
+
+ 1. Performance: it places additional requirements on the hardware itself;
+ 2. Ecosystem: in terms of adapting to mainstream big data computing frameworks and machine learning frameworks, it is far less advantageous than Hive;
+ 3. Cost: it requires its own maintenance costs and expenses;
+ 4. It does not provide the second incremental-processing primitive mentioned above: incremental consumption.
+
+In summary, incremental processing has the following advantages on the data lake:
+
+**Performance improvement:** Ingesting data usually needs to handle updates, deletes, and enforce unique key constraints. Since incremental primitives support record-level updates, 
+they can bring orders-of-magnitude performance improvements to these operations. 
+
+**Faster ETL/derived pipelines:** A ubiquitous next step, once the data has been ingested from external sources, is to build derived data pipelines using 
+Apache Spark/Apache Hive or any other data processing framework to ETL the ingested data for a variety of use-cases like data warehousing, 
+machine learning, or even just analytics. Typically, such processes again rely on batch processing jobs expressed in code or SQL. Such data pipelines can be sped up dramatically 
+by querying one or more input tables using an incremental query instead of a regular snapshot query, resulting in only processing the incremental changes from upstream tables and 
+then upserting into or deleting from the target derived table. Similar to raw data ingestion, in order to reduce the data delay of the modeled table, the ETL job only needs to gradually extract the 
+changed data from the original table and update the previously derived output table, instead of rebuilding the entire output table every few hours.
+
+**Unified storage:** Building on the above two advantages, faster and lighter processing on the existing data lake means that no special storage or data mart is needed 
+just for the purpose of accessing near real-time data.
+
+Next, we use two simple examples to illustrate how [incremental processing](https://www.oreilly.com/content/ubers-case-for-incremental-processing-on-hadoop/) can speed up the processing 
+of pipelines in analytical scenarios. First of all, data projection is the most common and easy to understand case:
+
+![image7](/assets/images/blog/incr-processing/image7.png)
+
+This simple example shows that by upserting new changes into table_1 and building a simple projected table (projected_table) through incremental consumption, we can 
+carry out the projection more simply, more efficiently, and with lower latency, as the sketch below illustrates.
+
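+A hedged Scala sketch of such a projection pipeline on Hudi is shown below; the table paths, field names and the checkpoint handling are assumptions for illustration, and a real job would persist the last consumed commit time somewhere durable.
+
+```scala
+import org.apache.hudi.DataSourceReadOptions._
+import org.apache.hudi.DataSourceWriteOptions._
+import org.apache.hudi.config.HoodieWriteConfig._
+import org.apache.spark.sql.SaveMode._
+
+// Last commit time already consumed by this pipeline (assumed checkpoint).
+val lastCommitTime = "20200818000000"
+
+// 1. Incrementally consume only the records of table_1 that changed since lastCommitTime.
+val changes = spark.read.format("hudi").
+  option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL).
+  option(BEGIN_INSTANTTIME_OPT_KEY, lastCommitTime).
+  load("/data/table_1")
+
+// 2. Project the columns of interest and upsert them into projected_table.
+changes.select("key", "partition", "ts", "col_a", "col_b").
+  write.format("hudi").
+  option(RECORDKEY_FIELD_OPT_KEY, "key").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partition").
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(TABLE_NAME, "projected_table").
+  mode(Append).
+  save("/data/projected_table")
+```
+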
+Next, for a more complex scenario, we can use incremental processing to support the stream-batch joins supported by stream computing frameworks, 
+and even stream-stream joins (which just need some additional logic to align the windows):
+
+![image6](/assets/images/blog/incr-processing/image6.png)
+
+The example in the figure above joins a fact table with multiple dimension tables to create a joined table. This case is one of the rare scenarios where we can save hardware 
+costs while significantly reducing latency.
+
+### Quasi-real-time scenarios, resource/efficiency trade-offs
+
+Incremental processing of new data in mini batches can use resources more efficiently. Let's look at a specific example. We have a Kafka event stream pouring in 
+at a rate of 10,000 events per second, and we want to count the number of messages along some dimensions over the past 15 minutes. Many stream processing pipelines use an external/internal 
+result state store (such as RocksDB, Cassandra, ElasticSearch) to save the aggregated counts, and run their containers continuously in resource managers such as YARN/Mesos, 
+which is very reasonable when the latency window is under five minutes. In fact, the YARN container itself has some startup overhead. In addition, in order to improve the 
+performance of writes to the result store, we usually cache the results before performing batch updates. This kind of protocol requires the containers to run continuously.
+
+However, in quasi-real-time processing scenarios, these choices may not be optimal. To achieve the same effect, you can use short-lived containers and optimize overall 
+resource utilization. For example, a streaming processor may need to perform six million updates to the result storage system in 15 minutes. However, in the incremental 
+batch mode, we only need to perform an in-memory merge on the accumulated data and update the result storage system once, using resource containers for only about 
+five minutes. Compared with the pure stream processing mode, the incremental batch processing mode brings a several-fold improvement in CPU efficiency, and an improvement of several 
+orders of magnitude in the efficiency of updates to the result storage. Basically, this processing method obtains resources on demand, instead of swallowing CPU and 
+memory while waiting for data to be computed in real time.
+
+### Incremental processing facilitates unified data lake architecture
+
+Whether in the data warehouse or in the data lake, data processing is an unavoidable problem. Data processing involves the selection of computing engines and 
+the design of architectures. There are currently two mainstream architectures in the industry: Lambda and Kappa architectures. Each architecture has its own 
+characteristics and existing problems. Derivative versions of these architectures are also [emerging endlessly](https://www.infoq.cn/article/Uo4pFswlMzBVhq*Y2tB9).
+
+In reality, many enterprises still maintain the implementation of the [Lambda architecture]( https://en.wikipedia.org/wiki/Lambda_architecture). 
+The typical Lambda architecture has two modules for the data processing part: the speed layer and the batch layer.
+
+![image5](/assets/images/blog/incr-processing/image5.png)
+
+They are usually two independent implementations (from code to infrastructure). For example, Flink (formerly Storm) is a popular option on the speed layer, 
+while MapReduce/Spark can serve as a batch layer. In fact, people often rely on the speed layer to provide updated results (which may not be accurate), and 
+once the data is considered complete, the results of the speed layer are corrected at a later time through the batch layer. With incremental processing, 
+we have the opportunity to implement the Lambda architecture for batch processing and quasi-real-time processing at the code level and infrastructure level in 
+a unified manner. It typically looks like below:
+
+![image3](/assets/images/blog/incr-processing/image3.png)
+
+As we said, you can use SQL or a batch processing framework like Spark to consistently implement your processing logic. The result table is built incrementally, 
+and SQL is executed on "new data" like streaming to produce a quick view of the results. The same SQL can be executed periodically on the full amount of data to 
+correct any inaccurate results (remember, join operations are always tricky!) and produce a more "complete" view of the results. In both cases, we will use the 
+same infrastructure to perform calculations, which can reduce overall operating costs and complexity.
+
+Setting aside the Lambda architecture, even in the Kappa architecture, the first incremental-processing primitive (upsert) plays an important role. 
+Uber [proposed](https://www.slideshare.net/FlinkForward/flink-forward-san-francisco-2019-moving-from-lambda-and-kappa-architectures-to-kappa-at-uber-roshan-naik) the Kappa+ architecture 
+based on this. The Kappa architecture advocates that a single stream computing layer is sufficient as a general solution 
+for data processing. Although the batch layer is removed in this model, there are still two problems in the service layer:
+
+ 1. Nowadays, many stream processing engines support row-level data processing, which requires that our service layer also support row-level updates;
+ 2. The trade-offs between data ingestion latency, scan performance, computing resources and operational complexity are unavoidable.
+
+![image8](/assets/images/blog/incr-processing/image8.png)
+
+However, if our business scenarios have relaxed latency requirements, for example if we can accept a delay of about 10 minutes, and if we can quickly ingest and prepare data on DFS and 
+effectively join and propagate updates to the upper-level modeled data sets, the Speed Serving in the service layer becomes unnecessary. The service layer can then be unified, 
+greatly reducing the overall complexity and resource consumption of the system.
+
+Above, we introduced the significance of incremental processing for the data lake. Next, we introduce how incremental processing is implemented and supported. 
+Among the three open source data lake frameworks (Apache Hudi, Apache Iceberg and Delta Lake), only Apache Hudi provides good support for incremental processing. 
+This is rooted in the fact that the framework was developed at Uber precisely because it had hit these data analysis pain points on its Hadoop data lake. 
+So, next, let's introduce how Hudi supports incremental processing.
+
+## Hudi's support for incremental processing
+
+Apache Hudi (Hadoop Upserts Deletes and Incrementals) is a top-level project of the Apache Foundation. It allows you to process very large-scale data on 
+top of Hadoop-compatible storage, and it also provides two primitives that enable stream processing on the data lake in addition to classic batch processing.
+
+From the name, in which the letter "I" stands for "Incrementals", we can see that Hudi treats incremental processing as a first-class citizen. 
+The two primitives we mentioned at the beginning of this article that support incremental processing are reflected in the following two aspects of Apache Hudi:
+
+ - **Update/Delete operations:** Hudi provides support for updating/deleting records, using fine-grained file/record level indexes, while providing transactional guarantees for the write operation. Queries process the last such committed snapshot to produce results.
+
+ - **Change streams:** Hudi also provides first-class support for obtaining an incremental stream of all the records that were updated/inserted/deleted in a given table, from a given point in time.
+
+The concrete implementation of the change stream is the "incremental view". Hudi is the only one of the three open source data lake frameworks that supports 
+the incremental query feature, with support for record-level change streams. The following sample code snippet shows how to query the incremental view:
+
+
+```scala
+// spark-shell
+// reload data
+spark.
+  read.
+  format("hudi").
+  load(basePath + "/*/*/*/*").
+  createOrReplaceTempView("hudi_trips_snapshot")
+
+val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from  hudi_trips_snapshot order by commitTime").map(k => k.getString(0)).take(50)
+val beginTime = commits(commits.length - 2) // commit time we are interested in
+
+// incrementally query data
+val tripsIncrementalDF = spark.read.format("hudi").
+  option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL).
+  option(BEGIN_INSTANTTIME_OPT_KEY, beginTime).
+  load(basePath)
+tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental")
+
+spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from  hudi_trips_incremental where fare > 20.0").show()
+
+```
+
+The code snippet above creates an incremental view over the Hudi trips table (hudi_trips_incremental), and then queries all the change records committed after "beginTime" 
+whose "fare" is greater than 20.0. Based on this query, you can create incremental data pipelines on batch data.
+
+## Summary
+
+In this article, we first elaborated many problems caused by the lack of incremental processing primitives in the traditional Hadoop data warehouse due to the trade-off between data integrity 
+and latency, and some long-tail applications that rely heavily on updates. Next, we argued that to support incremental processing, we must have at least two primitives: upsert and 
+incremental consumption, and explained why these two primitives can solve the problems explained above.
+
+Then, we introduced why incremental processing is also important to the data lake. There is much in common between data processing on the data lake and in the data warehouse, 
+and some of the "pain points" caused by the lack of incremental processing in the data warehouse also exist in the data lake. We elaborated its significance to the data lake from four 
+aspects: its natural fit with streaming semantics, the needs of warehousing-style analytical scenarios, resource/efficiency trade-offs in quasi-real-time scenarios, and a unified data lake architecture.
+
+Finally, we introduced the open source data lake storage framework Apache Hudi's support for incremental processing and simple cases.
diff --git a/website/blog/2020-08-20-efficient-migration-of-large-parquet-tables.md b/website/blog/2020-08-20-efficient-migration-of-large-parquet-tables.md
new file mode 100644
index 0000000..cd959ce
--- /dev/null
+++ b/website/blog/2020-08-20-efficient-migration-of-large-parquet-tables.md
@@ -0,0 +1,175 @@
+---
+title: "Efficient Migration of Large Parquet Tables to Apache Hudi"
+excerpt: "Migrating a large parquet table to Apache Hudi without having to rewrite the entire dataset."
+author: vbalaji
+category: blog
+---
+
+We will look at how to migrate a large parquet table to Hudi without having to rewrite the entire dataset. 
+
+<!--truncate-->
+# Motivation:
+
+Apache Hudi maintains per record metadata to perform core operations such as upserts and incremental pull. To take advantage of Hudi’s upsert and incremental processing support, users would need to rewrite their whole dataset to make it an Apache Hudi table.  Hudi 0.6.0 comes with an ***experimental feature*** to support efficient migration of large Parquet tables to Hudi without the need to rewrite the entire dataset.
+
+
+# High Level Idea:
+
+## Per Record Metadata:
+
+Apache Hudi maintains record level metadata for performing efficient upserts and incremental pulls.
+
+![Per Record Metadata](/assets/images/blog/2020-08-20-per-record.png)
+
+An Apache Hudi physical file contains 3 parts:
+
+1. For each record, 5 Hudi metadata fields with column indices 0 to 4
+1. For each record, the original data columns that comprise the record (original data)
+1. Additional Hudi metadata at the file footer for index lookup
+
+Parts (1) and (3) constitute what we term the "Hudi skeleton". The Hudi skeleton contains additional metadata maintained in each physical parquet file to support Hudi primitives. The conceptual idea is to decouple the Hudi skeleton data from the original data (2). The Hudi skeleton can be stored in a Hudi file while the original data is stored in an external, non-Hudi file. A migration of a large parquet table would then only create Hudi skeleton files, without having to rewrite the original data.
+
+![skeleton](/assets/images/blog/2020-08-20-skeleton.png)
+
+# Design Deep Dive:
+
+ For a deep dive on the internals, please take a look at the [RFC document](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+12+%3A+Efficient+Migration+of+Large+Parquet+Tables+to+Apache+Hudi) 
+
+# Migration:
+
+Hudi supports 2 modes when migrating parquet tables. We will use the terms bootstrap and migration interchangeably in this document.  
+
+* METADATA_ONLY : In this mode, record level metadata alone is generated for each source record and stored in the new bootstrap location.
+* FULL_RECORD : In this mode, record level metadata is generated for each source record, and both the original record and its metadata are copied.
+
+You can pick and choose these modes at the partition level. A common strategy would be to use FULL_RECORD mode for a small set of "hot" partitions, which are accessed more frequently, and METADATA_ONLY for a larger set of "warm" partitions. 
+
+
+## Query Engine Support:
+For a METADATA_ONLY bootstrapped table, the Spark data source, Spark-Hive and native Hive query engines are supported. Presto support is in the works.
+
+## Ways To Migrate :
+
+There are 2 ways to migrate a large parquet table to Hudi. 
+
+- Spark Datasource Write
+- Hudi DeltaStreamer
+
+We will look at how to migrate using both these approaches.
+
+## Configurations:
+
+These are bootstrap specific configurations that need to be set in addition to the regular hudi write configurations.
+
+
+|Configuration Name  | Default  | Mandatory ?  |  Description |
+|---|---|---|---|
+|hoodie.bootstrap.base.path| | Yes |Base Path of  source parquet table.|
+|hoodie.bootstrap.parallelism | 1500 | Yes | Spark Parallelism used when running bootstrap |
+|hoodie.bootstrap.keygen.class | |Yes |Key generator class used for generating record keys from the bootstrapped dataset. |
+|hoodie.bootstrap.mode.selector | org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector | Yes |Bootstrap Mode Selector class. By default, Hudi employs METADATA_ONLY bootstrap for all partitions. |
+|hoodie.bootstrap.partitionpath.translator.class |org.apache.hudi.client.bootstrap.translator.IdentityBootstrapPartitionPathTranslator | No | For METADATA_ONLY bootstrap, this class allows customization of partition paths used in the Hudi target dataset. By default, no customization is done and the partition paths reflect what is available in the source parquet table. |
+|hoodie.bootstrap.full.input.provider| org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider|No |For FULL_RECORD bootstrap, this class provides the input RDD of Hudi records to write.  |
+| hoodie.bootstrap.mode.selector.regex.mode |METADATA_ONLY |No |Bootstrap mode used when the partition matches the regex pattern in hoodie.bootstrap.mode.selector.regex. Used only when hoodie.bootstrap.mode.selector is set to BootstrapRegexModeSelector. |
+| hoodie.bootstrap.mode.selector.regex |\.\* |No |Partition regex used when hoodie.bootstrap.mode.selector is set to BootstrapRegexModeSelector. |
+
+## Spark Data Source:
+
+Here, we use a Spark datasource write to perform the bootstrap.
+Below is an example code snippet to perform a METADATA_ONLY bootstrap.
+
+
+```scala
+import org.apache.hudi.{DataSourceWriteOptions, HoodieDataSourceHelpers}
+import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig}
+import org.apache.hudi.keygen.SimpleKeyGenerator
+import org.apache.spark.sql.SaveMode
+ 
+val bootstrapDF = spark.emptyDataFrame
+bootstrapDF.write
+      .format("hudi")
+      .option(HoodieWriteConfig.TABLE_NAME, "hoodie_test")
+      .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
+      .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key")
+      .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "datestr")
+      .option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, srcPath)
+      .option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, classOf[SimpleKeyGenerator].getName)
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+```
+
+Here is an example code snippet that performs METADATA_ONLY bootstrap for the August 20 2020 - August 29 2020 partitions and FULL_RECORD bootstrap for all other partitions.
+
+
+```scala
+import org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider
+import org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector
+import org.apache.hudi.{DataSourceWriteOptions, HoodieDataSourceHelpers}
+import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig}
+import org.apache.hudi.keygen.SimpleKeyGenerator
+import org.apache.spark.sql.SaveMode
+ 
+val bootstrapDF = spark.emptyDataFrame
+bootstrapDF.write
+      .format("hudi")
+      .option(HoodieWriteConfig.TABLE_NAME, "hoodie_test")
+      .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
+      .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key")
+      .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "datestr")
+      .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "timestamp")
+      .option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, srcPath)
+      .option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, classOf[SimpleKeyGenerator].getName)
+      .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR, classOf[BootstrapRegexModeSelector].getName)
+      .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX, "2020/08/2[0-9]")
+      .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX_MODE, "METADATA_ONLY")
+      .option(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER, classOf[SparkParquetBootstrapDataProvider].getName)
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+```
+
+## Hoodie DeltaStreamer:
+
+Hoodie DeltaStreamer allows bootstrap to be performed using the --run-bootstrap command line option.
+
+If you are planning to use DeltaStreamer after the initial bootstrap to incrementally ingest data into the new Hudi dataset, you need to pass either --checkpoint or --initial-checkpoint-provider to set the initial checkpoint for the DeltaStreamer.
+
+Here is an example of running a METADATA_ONLY bootstrap using DeltaStreamer.
+
+```properties
+spark-submit --packages org.apache.hudi:hudi-spark-bundle_2.11:0.6.0 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer  \
+--run-bootstrap \
+--target-base-path <Hudi_Base_Path> \
+--target-table <Hudi_Table_Name> \
+--props <props_file> \
+--checkpoint <initial_checkpoint_if_you_are_going_to_use_deltastreamer_to_incrementally_ingest> \
+--hoodie-conf hoodie.bootstrap.base.path=<Parquet_Source_base_Path> \
+--hoodie-conf hoodie.datasource.write.recordkey.field=_row_key \
+--hoodie-conf hoodie.datasource.write.partitionpath.field=datestr \
+--hoodie-conf hoodie.bootstrap.keygen.class=org.apache.hudi.keygen.SimpleKeyGenerator
+```
+
+Here is an example of using the regex mode selector with DeltaStreamer, mirroring the Spark datasource example above: METADATA_ONLY bootstrap for the August 20 2020 - August 29 2020 partitions and FULL_RECORD bootstrap for the rest.
+
+```properties
+spark-submit --packages org.apache.hudi:hudi-spark-bundle_2.11:0.6.0 \
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
+--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer  \
+--run-bootstrap \
+--target-base-path <Hudi_Base_Path> \
+--target-table <Hudi_Table_Name> \
+--props <props_file> \
+--checkpoint <initial_checkpoint_if_you_are_going_to_use_deltastreamer_to_incrementally_ingest> \
+--hoodie-conf hoodie.bootstrap.base.path=<Parquet_Source_base_Path> \
+--hoodie-conf hoodie.datasource.write.recordkey.field=_row_key \
+--hoodie-conf hoodie.datasource.write.partitionpath.field=datestr \
+--hoodie-conf hoodie.bootstrap.keygen.class=org.apache.hudi.keygen.SimpleKeyGenerator \
+--hoodie-conf hoodie.bootstrap.full.input.provider=org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider \
+--hoodie-conf hoodie.bootstrap.mode.selector=org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector \
+--hoodie-conf hoodie.bootstrap.mode.selector.regex="2020/08/2[0-9]" \
+--hoodie-conf hoodie.bootstrap.mode.selector.regex.mode=METADATA_ONLY
+```
+
+## Known Caveats
+1. Proper defaults are needed for the bootstrap config hoodie.bootstrap.full.input.provider. Here is the [ticket](https://issues.apache.org/jira/browse/HUDI-1213).
+1. DeltaStreamer manages checkpoints inside hoodie commit files and expects checkpoints in previously committed metadata. Users are expected to pass a checkpoint or an initial checkpoint provider when performing bootstrap through DeltaStreamer. Such support is not yet present when bootstrapping using the Spark datasource. Here is the [ticket](https://issues.apache.org/jira/browse/HUDI-1214).
diff --git a/website/blog/2020-08-21-async-compaction-deployment-model.md b/website/blog/2020-08-21-async-compaction-deployment-model.md
new file mode 100644
index 0000000..3ffa1b4
--- /dev/null
+++ b/website/blog/2020-08-21-async-compaction-deployment-model.md
@@ -0,0 +1,99 @@
+---
+title: "Async Compaction Deployment Models"
+excerpt: "Mechanisms for executing compaction jobs in Hudi asynchronously"
+author: vbalaji
+category: blog
+---
+
+We will look at different deployment models for executing compactions asynchronously.
+<!--truncate-->
+# Compaction
+
+For Merge-On-Read tables, data is stored using a combination of columnar (e.g. parquet) and row-based (e.g. avro) file formats.
+Updates are logged to delta files and later compacted to produce new versions of columnar files, either synchronously or
+asynchronously. One of the main motivations behind Merge-On-Read is to reduce data latency when ingesting records.
+Hence, it makes sense to run compaction asynchronously without blocking ingestion.
+
+
+# Async Compaction
+
+Async Compaction is performed in 2 steps:
+
+1. ***Compaction Scheduling***: This is done by the ingestion job. In this step, Hudi scans the partitions and selects **file 
+slices** to be compacted. A compaction plan is finally written to Hudi timeline.
+1. ***Compaction Execution***: A separate process reads the compaction plan and performs compaction of file slices.
+
+  
+# Deployment Models
+
+There are a few ways by which we can execute compactions asynchronously.
+
+## Spark Structured Streaming
+
+With 0.6.0, we now have support for running async compactions in Spark 
+Structured Streaming jobs. Compactions are scheduled and executed asynchronously inside the 
+streaming job. Async compactions are enabled by default for Structured Streaming jobs
+on Merge-On-Read tables.
+
+Here is an example snippet in Java:
+
+```java
+import org.apache.hudi.DataSourceWriteOptions;
+import org.apache.hudi.HoodieDataSourceHelpers;
+import org.apache.hudi.config.HoodieCompactionConfig;
+import org.apache.hudi.config.HoodieWriteConfig;
+
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.streaming.DataStreamWriter;
+import org.apache.spark.sql.streaming.OutputMode;
+import org.apache.spark.sql.streaming.ProcessingTime;
+
+
+ DataStreamWriter<Row> writer = streamingInput.writeStream().format("org.apache.hudi")
+        .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), operationType)
+        .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType)
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
+        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
+        .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
+        .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "10")
+        .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY(), "true")
+        .option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", checkpointLocation)
+        .outputMode(OutputMode.Append());
+ writer.trigger(new ProcessingTime(30000)).start(tablePath);
+```
+
+## DeltaStreamer Continuous Mode
+Hudi DeltaStreamer provides a continuous ingestion mode where a single long running Spark application
+continuously ingests data to a Hudi table from upstream sources. In this mode, Hudi supports managing asynchronous
+compactions. Here is an example snippet for running in continuous mode with async compactions:
+
+```properties
+spark-submit --packages org.apache.hudi:hudi-utilities-bundle_2.11:0.6.0 \
+--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
+--table-type MERGE_ON_READ \
+--target-base-path <hudi_base_path> \
+--target-table <hudi_table> \
+--source-class org.apache.hudi.utilities.sources.JsonDFSSource \
+--source-ordering-field ts \
+--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+--props /path/to/source.properties \
+--continuous
+```
+
+## Hudi CLI
+Hudi CLI is yet another way to execute specific compactions asynchronously. Here is an example 
+
+```properties
+hudi:trips->compaction run --tableName <table_name> --parallelism <parallelism> --compactionInstant <InstantTime>
+...
+```
+
+## Hudi Compactor Script
+Hudi provides a standalone tool to also execute specific compactions asynchronously. Here is an example
+
+```properties
+spark-submit --packages org.apache.hudi:hudi-utilities-bundle_2.11:0.6.0 \
+--class org.apache.hudi.utilities.HoodieCompactor \
+--base-path <base_path> \
+--table-name <table_name> \
+--instant-time <compaction_instant> \
+--schema-file <schema_file>
+```
diff --git a/website/blog/2020-08-22-ingest-multiple-tables-using-hudi.md b/website/blog/2020-08-22-ingest-multiple-tables-using-hudi.md
new file mode 100644
index 0000000..9f68c8c
--- /dev/null
+++ b/website/blog/2020-08-22-ingest-multiple-tables-using-hudi.md
@@ -0,0 +1,104 @@
+---
+title: "Ingest multiple tables using Hudi"
+excerpt: "Ingesting multiple tables using Hudi at a single go is now possible. This blog gives a detailed explanation of how to achieve the same using `HoodieMultiTableDeltaStreamer.java`"
+author: pratyakshsharma
+category: blog
+---
+
+When building a change data capture pipeline for existing or newly created relational databases, one of the most common problems one faces is simplifying the onboarding process for multiple tables. Ingesting multiple tables into Hudi datasets in a single go is now possible using the `HoodieMultiTableDeltaStreamer` class, which is a wrapper on top of the more popular `HoodieDeltaStreamer` class. Currently `HoodieMultiTableDeltaStreamer` supports **COPY_ON_WRITE** storage type only an [...]
+<!--truncate-->
+This blog will guide you through configuring and running `HoodieMultiTableDeltaStreamer`.
+
+### Configuration
+
+ - `HoodieMultiTableDeltaStreamer` expects users to maintain table-wise overridden properties in separate files in a dedicated config folder. Common properties can be configured via a common properties file as well.
+ - By default, hudi datasets are created under the path `<base-path-prefix>/<database_name>/<name_of_table_to_be_ingested>`. You need to provide the names of tables to be ingested via the property `hoodie.deltastreamer.ingestion.tablesToBeIngested` in the format `<database>.<table>`, for example 
+ 
+```java
+hoodie.deltastreamer.ingestion.tablesToBeIngested=db1.table1,db2.table2
+``` 
+ 
+ - If you do not provide a database name, the table is assumed to belong to the default database, and the hudi dataset for the concerned table is created under the path `<base-path-prefix>/default/<name_of_table_to_be_ingested>`. There is also a provision to override the default path for hudi datasets: you can create the hudi dataset for a particular table at a custom location by setting the property `hoodie.deltastreamer.ingestion.targetBasePath` in the table level config file.
+ - There are a lot of properties that one might like to override per table, for example
+ 
+```java
+hoodie.datasource.write.recordkey.field=_row_key
+hoodie.datasource.write.partitionpath.field=created_at
+hoodie.deltastreamer.source.kafka.topic=topic2
+hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP
+hoodie.deltastreamer.keygen.timebased.input.dateformat=yyyy-MM-dd HH:mm:ss.S
+hoodie.datasource.hive_sync.table=short_trip_uber_hive_dummy_table
+hoodie.deltastreamer.ingestion.targetBasePath=s3:///temp/hudi/table1
+```  
+ 
+ - Properties like the above need to be set for every table to be ingested. As already suggested at the beginning, users are expected to maintain separate config files for every table by setting the below property
+ 
+```java
+hoodie.deltastreamer.ingestion.<db>.<table>.configFile=s3:///tmp/config/config1.properties
+``` 
+
+If you do not want to set the above property for every table, you can simply create config files for every table to be ingested under the config folder with the name - `<database>_<table>_config.properties`. For example if you want to ingest table1 and table2 from dummy database, where config folder is set to `s3:///tmp/config`, then you need to create 2 config files on the given paths - `s3:///tmp/config/dummy_table1_config.properties` and `s3:///tmp/config/dummy_table2_config.properties`.
+
+ - Finally you can specify all the common properties in a common properties file. The common properties file does not necessarily have to reside under the config folder, but it is advisable to keep it along with the other config files. This file will contain the below properties
+ 
+```java
+hoodie.deltastreamer.ingestion.tablesToBeIngested=db1.table1,db2.table2
+hoodie.deltastreamer.ingestion.db1.table1.configFile=s3:///tmp/config_table1.properties
+hoodie.deltastreamer.ingestion.db2.table2.configFile=s3:///tmp/config_table2.properties
+``` 
+
+### Run Command
+
+`HoodieMultiTableDeltaStreamer` can be run similarly to how one runs `HoodieDeltaStreamer`. Please refer to the example given below for the command.
+
+
+### Example
+
+Suppose you want to ingest table1 and table2 from db1, and want to ingest both tables under the path `s3:///temp/hudi`. You can ingest them using the below command.
+
+```java
+[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \
+  --props s3:///temp/hudi-ingestion-config/kafka-source.properties \
+  --config-folder s3:///temp/hudi-ingestion-config \
+  --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \
+  --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \
+  --source-ordering-field impressiontime \
+  --base-path-prefix s3:///temp/hudi \ 
+  --target-table dummy_table \
+  --op UPSERT
+```
+
+s3:///temp/hudi-ingestion-config/kafka-source.properties
+
+```java
+hoodie.deltastreamer.ingestion.tablesToBeIngested=db1.table1,db1.table2
+hoodie.deltastreamer.ingestion.db1.table1.configFile=s3:///temp/hudi-ingestion-config/config_table1.properties
+hoodie.deltastreamer.ingestion.db1.table2.configFile=s3:///temp/hudi-ingestion-config/config_table2.properties
+
+#Kafka props
+bootstrap.servers=localhost:9092
+auto.offset.reset=earliest
+schema.registry.url=http://localhost:8081
+
+hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.CustomKeyGenerator
+```
+
+s3:///temp/hudi-ingestion-config/config_table1.properties
+
+```java
+hoodie.datasource.write.recordkey.field=_row_key1
+hoodie.datasource.write.partitionpath.field=created_at
+hoodie.deltastreamer.source.kafka.topic=topic1
+```
+
+s3:///temp/hudi-ingestion-config/config_table2.properties
+
+```java
+hoodie.datasource.write.recordkey.field=_row_key2
+hoodie.datasource.write.partitionpath.field=created_at
+hoodie.deltastreamer.source.kafka.topic=topic2
+```
+
+Contributions are welcome for extending multiple table ingestion support to the **MERGE_ON_READ** storage type and enabling `HoodieMultiTableDeltaStreamer` to ingest multiple tables in parallel.
+
+Happy ingesting! 
\ No newline at end of file
diff --git a/website/blog/2020-10-06-cdc-solution-using-hudi-by-nclouds.md b/website/blog/2020-10-06-cdc-solution-using-hudi-by-nclouds.md
new file mode 100644
index 0000000..7e32a42
--- /dev/null
+++ b/website/blog/2020-10-06-cdc-solution-using-hudi-by-nclouds.md
@@ -0,0 +1,8 @@
+---
+title: "How nClouds Helps Accelerate Data Delivery with Apache Hudi on Amazon EMR"
+excerpt: "Solution to set up a new data and analytics platform using Apache Hudi on Amazon EMR and other managed services, including Amazon QuickSight for data visualization."
+author: nclouds
+category: blog
+---
+
+This [blog](https://aws.amazon.com/blogs/apn/how-nclouds-helps-accelerate-data-delivery-with-apache-hudi-on-amazon-emr/) published by nClouds in partnership with AWS shows how to build a CDC pipeline using Apache Hudi on Amazon EMR and other managed services like Amazon RDS and AWS DMS, including Amazon QuickSight for data visualization.
\ No newline at end of file
diff --git a/website/blog/2020-10-15-apache-hudi-meets-apache-flink.md b/website/blog/2020-10-15-apache-hudi-meets-apache-flink.md
new file mode 100644
index 0000000..93350e4
--- /dev/null
+++ b/website/blog/2020-10-15-apache-hudi-meets-apache-flink.md
@@ -0,0 +1,196 @@
+---
+title: "Apache Hudi meets Apache Flink"
+excerpt: "The design and latest progress of the integration of Apache Hudi and Apache Flink."
+author: wangxianghu
+category: blog
+---
+
+Apache Hudi (Hudi for short) is a data lake framework created at Uber. Hudi entered the Apache Incubator in January 2019, and was promoted to a top-level Apache project in May 2020. It is one of the most popular data lake frameworks.
+<!--truncate-->
+## 1. Why decouple
+
+Hudi has used Spark as its data processing engine since its birth. If users want to use Hudi as their data lake framework, they must introduce Spark into their platform technology stack.
+A few years ago, using Spark as a big data processing engine was very common, even natural. Since Spark can either perform batch processing or use micro-batches to simulate streaming, one engine solves both streaming and batch problems.
+However, in recent years, with the development of big data technology, Flink, which is also a big data processing engine, has gradually entered people's view and has taken a significant share of the computing engine market.
+In the big data community, forums and other channels, the question of whether Hudi supports Flink has come up more and more frequently. Therefore, it is valuable to make Hudi support the Flink engine, and the first step of integrating the Flink engine is decoupling Hudi from Spark.
+
+In addition, looking at the mature, active, and viable frameworks in the big data ecosystem, all of them are elegant in design and can be integrated with other frameworks, leveraging each other's expertise.
+Therefore, decoupling Hudi from Spark and turning it into an engine-independent data lake framework will undoubtedly create more possibilities for integrating Hudi with other components, allowing Hudi to better integrate into the big data ecosystem.
+
+## 2. Challenges
+
+Hudi uses the Spark API internally about as pervasively as everyday code uses a List. From reading data at the source all the way to writing data out to the table, Spark RDDs are used as the main data structure everywhere, and even ordinary utilities are implemented using the Spark API.
+It can be said that Hudi is a general-purpose data lake framework implemented with Spark. Hudi also leverages deep Spark functionality like custom partitioning and in-memory caching to implement indexing and file sizing using workload heuristics.
+For some of these, Flink offers better out-of-the-box support (e.g. using Flink’s state store for indexing) and can, in fact, push Hudi closer and closer to real-time latencies.
+
+In addition, the primary engine integrated after this decoupling is Flink. Flink and Spark differ greatly in their core abstractions. Spark assumes that data is bounded, and its core abstraction is a finite set of data.
+Flink holds that the essence of data is a stream, and its core abstraction, DataStream, contains various operations on data. Hudi has a streaming-first design (record level updates, record level streams) that arguably fits the Flink model more naturally.
+At the same time, there are multiple RDDs operating at the same time in Hudi, and the processing result of one RDD is combined with another RDD.
+This difference in abstraction, and the reuse of intermediate results during implementation, make it difficult for Hudi to use a unified API to operate on both RDD and DataStream when designing the decoupling abstraction.
+
+## 3. Decoupling Spark
+In essence, Hudi uses Spark as its computing engine to leverage Spark's distributed computing power and the rich operator capabilities of RDDs. Apart from distributed computing power, Hudi uses RDD more as a data structure, and an RDD is essentially a bounded data set.
+Therefore, it is theoretically feasible to replace RDD with List (though this may of course sacrifice performance/scale). To preserve the performance and stability of the Hudi Spark version as much as possible, we keep the bounded data set as the basic operation unit.
+Hudi's main operation API remains unchanged, and RDD is extracted as a generic type. The Spark engine implementation still uses RDD, while other engines use List or other bounded data sets according to the actual situation.
+
+### Decoupling principle
+1) Unified generics. The input records `JavaRDD<HoodieRecord>`, the keys of input records `JavaRDD<HoodieKey>`, and the result of write operations `JavaRDD<WriteStatus>` used by the Spark API are replaced with the generics `I, K, O`;
+
+2) De-sparkization. All APIs of the abstraction layer must have nothing to do with Spark. For specific operations that are difficult to implement in the abstraction layer, rewrite them as abstract methods and implement them in Spark subclasses.
+
+For example: Hudi uses the `JavaSparkContext#map()` method in many places. To de-spark, the `JavaSparkContext` needs to be hidden. For this problem, we introduced the `HoodieEngineContext#map()` method, which hides the specific implementation details of `map`, so as to achieve de-sparkization in the abstraction.
+
+3) Minimize changes to the abstraction layer to preserve the original functionality and performance of Hudi;
+
+4) Replace the `JavaSparkContext` with the `HoodieEngineContext` abstract class to provide the running environment context.
+
+In addition, some of the core algorithms in Hudi, like [rollback](https://github.com/apache/hudi/pull/1756), have been reworked so that they no longer need to compute a workload profile ahead of time, which used to rely on Spark caching.
+
+## 4. Flink integration design
+Hudi's write operation is batch processing in nature, and the continuous mode of `DeltaStreamer` is realized by looping batch processing. In order to use a unified API, when integrating Flink, Hudi chooses to collect a batch of data before processing it, and finally commits it in a unified manner (here we use a List to collect data in Flink).
+In Hudi terminology, we stream data for a given commit, but only publish commits every so often, which makes it practical to scale on cloud storage and is also tunable.
+
+The easiest way to implement batch operation is to use a time window. However, when no data flows into a window, there is no output data, and it is difficult for the Flink sink to judge whether all the data from a given batch has been processed.
+Therefore, we use Flink's checkpoint mechanism to collect batches. The data between every two barriers is a batch. When a subtask has no data, mock result data is made up for it.
+In this way, on the sink side, once every subtask has emitted result data, it can be considered that a batch of data has been processed and the commit can be executed.
+
+The DAG is as follows:
+
+![dualism](/assets/images/blog/hudi-meets-flink/image1.png)
+
+ - **Source:** receives Kafka data and converts it into `List<HoodieRecord>`;
+ - **InstantGeneratorOperator:** generates a globally unique instant. When the previous instant is not completed or the current batch has no data, no new instant is created;
+ - **KeyBy partitionPath:** partitions according to `partitionPath` to avoid multiple subtasks from writing the same partition;
+ - **WriteProcessOperator:** performs a write operation. When there is no data in the current partition, it sends empty result data to the downstream to make up the number;
+ - **CommitSink:** receives the calculation results of the upstream task. When receiving the parallelism results, it is considered that all the upstream subtasks are completed and the commit is executed.
+
+Note:
+`InstantGeneratorOperator` and `WriteProcessOperator` are both custom Flink operators. `InstantGeneratorOperator` blocks while checking the state of the previous instant, to ensure that there is only one inflight (or requested) instant globally.
+`WriteProcessOperator` is where the actual write operation is performed; the write operation is triggered at checkpoint time.
+
+### 4.1 Index design based on Flink State
+
+Stateful computing is one of the highlights of the Flink engine. Compared with using external storage, using Flink's built-in `State` can significantly improve the performance of Flink applications. 
+Therefore, it would be a good choice to implement a Hudi index based on Flink's State.
+
+The core of the Hudi index is to maintain the mapping of the Hudi key `HoodieKey` and the location of the Hudi data `HoodieRecordLocation`. 
+Therefore, based on the current design, we can simply maintain a `MapState<HoodieKey, HoodieRecordLocation>` in Flink UDF to map the `HoodieKey` and `HoodieRecordLocation`, and leave the fault tolerance and persistence of State to the Flink framework.
+
+![dualism](/assets/images/blog/hudi-meets-flink/image2.png)
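+
+The sketch below (a hedged illustration in Scala with assumed class and field names; it is not the actual implementation) shows what such a state-backed index could look like inside a keyed Flink function: the `HoodieKey` to `HoodieRecordLocation` mapping lives in `MapState`, keys that already have a location are tagged as updates, and unseen keys fall through as inserts.
+
+```scala
+import org.apache.flink.api.common.state.{MapState, MapStateDescriptor}
+import org.apache.flink.configuration.Configuration
+import org.apache.flink.streaming.api.functions.KeyedProcessFunction
+import org.apache.flink.util.Collector
+import org.apache.hudi.common.model.{HoodieKey, HoodieRecord, HoodieRecordLocation}
+
+// Keyed by partition path; keeps the index mapping in Flink managed state.
+class StateBackedIndexFunction
+    extends KeyedProcessFunction[String, HoodieRecord[_], HoodieRecord[_]] {
+
+  private var indexState: MapState[HoodieKey, HoodieRecordLocation] = _
+
+  override def open(parameters: Configuration): Unit = {
+    indexState = getRuntimeContext.getMapState(
+      new MapStateDescriptor[HoodieKey, HoodieRecordLocation](
+        "hoodie-record-index", classOf[HoodieKey], classOf[HoodieRecordLocation]))
+  }
+
+  override def processElement(
+      record: HoodieRecord[_],
+      ctx: KeyedProcessFunction[String, HoodieRecord[_], HoodieRecord[_]]#Context,
+      out: Collector[HoodieRecord[_]]): Unit = {
+    // An existing location means this is an update to a known file group; otherwise it is an insert.
+    val location = indexState.get(record.getKey)
+    if (location != null) {
+      record.setCurrentLocation(location)
+    }
+    out.collect(record)
+  }
+}
+```
+
+In a real pipeline the state would also need to be updated with the locations of newly written file groups once a write completes.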
+
+## 5. Implementation examples
+### 1) HoodieTable
+
+```java
+/**
+  * Abstract implementation of a HoodieTable.
+  *
+  * @param <T> Sub type of HoodieRecordPayload
+  * @param <I> Type of inputs
+  * @param <K> Type of keys
+  * @param <O> Type of outputs
+  */
+public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implements Serializable {
+
+   protected final HoodieWriteConfig config;
+   protected final HoodieTableMetaClient metaClient;
+   protected final HoodieIndex<T, I, K, O> index;
+
+   public abstract HoodieWriteMetadata<O> upsert(HoodieEngineContext context, String instantTime,
+       I records);
+
+   public abstract HoodieWriteMetadata<O> insert(HoodieEngineContext context, String instantTime,
+       I records);
+
+   public abstract HoodieWriteMetadata<O> bulkInsert(HoodieEngineContext context, String instantTime,
+       I records, Option<BulkInsertPartitioner<I>> bulkInsertPartitioner);
+
+   ...
+}
+```
+
+`HoodieTable` is one of the core abstractions of Hudi, which defines operations such as `insert`, `upsert`, and `bulkInsert` supported by the table. 
+Take `upsert` as an example: the input data is changed from the original `JavaRDD<HoodieRecord> inputRdds` to `I records`, and the runtime `JavaSparkContext jsc` is changed to `HoodieEngineContext context`.
+
+From the class annotations, we can see that `T, I, K, O` represent the payload data type, input data type, primary key type and output data type of Hudi operations, respectively.
+These generics run through the entire abstraction layer.
+
+### 2) HoodieEngineContext
+
+```java
+/**
+ * Base class contains the context information needed by the engine at runtime. It will be extended by different
+ * engine implementation if needed.
+ */
+public abstract class HoodieEngineContext {
+
+  public abstract <I, O> List<O> map(List<I> data, SerializableFunction<I, O> func, int parallelism);
+
+  public abstract <I, O> List<O> flatMap(List<I> data, SerializableFunction<I, Stream<O>> func, int parallelism);
+
+  public abstract <I> void foreach(List<I> data, SerializableConsumer<I> consumer, int parallelism);
+
+  ......
+}
+```
+
+`HoodieEngineContext` plays the role of `JavaSparkContext`. It not only provides all the information that `JavaSparkContext` can provide,
+but also encapsulates many methods such as `map`, `flatMap`, and `foreach`, hiding the specific implementation of `JavaSparkContext#map()`, `JavaSparkContext#flatMap()`, `JavaSparkContext#foreach()` and other methods.
+
+Take the `map` method as an example. In the Spark implementation class `HoodieSparkEngineContext`, the `map` method is as follows:
+
+```java
+  @Override
+  public <I, O> List<O> map(List<I> data, SerializableFunction<I, O> func, int parallelism) {
+    return javaSparkContext.parallelize(data, parallelism).map(func::apply).collect();
+  }
+```
+
+In an engine that operates on List, the implementation can be as follows (different methods need to pay attention to thread-safety issues; use `parallel()` with caution):
+
+```java
+  @Override
+  public <I, O> List<O> map(List<I> data, SerializableFunction<I, O> func, int parallelism) {
+    return data.stream().parallel().map(func::apply).collect(Collectors.toList());
+  }
+```
+
+Note:
+The exception thrown in the map function can be handled by wrapping `SerializableFunction<I, O> func`.
+
+Here is a brief introduction to `SerializableFunction`:
+
+```java
+@FunctionalInterface
+public interface SerializableFunction<I, O> extends Serializable {
+  O apply(I v1) throws Exception;
+}
+```
+
+This interface is actually a variant of `java.util.function.Function`. The difference from `java.util.function.Function` is that `SerializableFunction` can be serialized and can throw exceptions.
+This interface is introduced because the input parameters that the `JavaSparkContext#map()` function can receive must be serializable.
+At the same time, there are many places in Hudi's logic where exceptions need to be thrown, and writing `try-catch` blocks inside lambda expressions would be bloated and not very elegant.
+
+## 6. Current progress and follow-up plan
+
+### 6.1 Work timeline
+
+![dualism](/assets/images/blog/hudi-meets-flink/image3.png)
+
+[T3go](https://www.t3go.cn/)
+[Aliyun](https://cn.aliyun.com/)
+[SF-express](https://www.sf-express.com/cn/sc/)
+
+### 6.2 Follow-up plan
+
+#### 1) Promote the integration of Hudi and Flink
+
+Push the integration of Flink and Hudi to the community as soon as possible. In the initial stage, this feature may only support Kafka data sources.
+
+#### 2) Performance optimization
+
+In order to ensure the stability and performance of the Hudi Spark version, the decoupling did not give much consideration to potential performance problems of the Flink version.
+
+#### 3) Third-party package development, such as flink-connector-hudi
+
+Make the Hudi-Flink binding into a third-party package. Users can use this third-party package to read from/write to Hudi with Flink.
\ No newline at end of file
diff --git a/website/blog/2020-10-19-hudi-meets-aws-emr-and-aws-dms.md b/website/blog/2020-10-19-hudi-meets-aws-emr-and-aws-dms.md
new file mode 100644
index 0000000..6e11d70
--- /dev/null
+++ b/website/blog/2020-10-19-hudi-meets-aws-emr-and-aws-dms.md
@@ -0,0 +1,8 @@
+---
+title: "Apply record level changes from relational databases to Amazon S3 data lake using Apache Hudi on Amazon EMR and AWS Database Migration Service"
+excerpt: "AWS blog showing how to build a CDC pipeline that captures data from an Amazon RDS for MySQL database using AWS DMS and applies those changes to an Amazon S3 dataset using Apache Hudi on Amazon EMR."
+author: aws
+category: blog
+---
+
+This [blog](https://aws.amazon.com/blogs/big-data/apply-record-level-changes-from-relational-databases-to-amazon-s3-data-lake-using-apache-hudi-on-amazon-emr-and-aws-database-migration-service/) published by AWS shows how to build a CDC pipeline that captures data from an Amazon Relational Database Service (Amazon RDS) for MySQL database using AWS Database Migration Service (AWS DMS) and applies those changes to a dataset in Amazon S3 using Apache Hudi on Amazon EMR.
\ No newline at end of file
diff --git a/website/blog/2020-11-11-hudi-indexing-mechanisms.md b/website/blog/2020-11-11-hudi-indexing-mechanisms.md
new file mode 100644
index 0000000..97bdcc1
--- /dev/null
+++ b/website/blog/2020-11-11-hudi-indexing-mechanisms.md
@@ -0,0 +1,124 @@
+---
+title: "Employing the right indexes for fast updates, deletes in Apache Hudi"
+excerpt: "Detailing different indexing mechanisms in Hudi and when to use each of them"
+author: vinoth
+category: blog
+---
+
+Apache Hudi employs an index to locate the file group that an update/delete belongs to. For Copy-On-Write tables, this enables
+fast upsert/delete operations, by avoiding the need to join against the entire dataset to determine which files to rewrite.
+For Merge-On-Read tables, this design allows Hudi to bound the amount of records any given base file needs to be merged against.
+Specifically, a given base file needs to be merged only against updates for records that are part of that base file. In contrast,
+designs without an indexing component (e.g: [Apache Hive ACID](https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions)),
+could end up having to merge all the base files against all incoming updates/delete records.
+<!--truncate-->
+At a high level, an index maps a record key + an optional partition path to a file group ID on storage (explained
+more in detail [here](/docs/concepts)) and during write operations, we lookup this mapping to route an incoming update/delete
+to a log file attached to the base file (MOR) or to the latest base file that now needs to be merged against (COW). The index also enables 
+Hudi to enforce unique constraints based on the record keys.
+
+![Fact table](/assets/images/blog/hudi-indexes/with-and-without-index.png)
+_Figure: Comparison of merge cost for updates (yellow blocks) against base files (white blocks)_
+
+Given that Hudi already supports a few different indexing techniques and is also continuously improving/adding more to its toolkit, the rest of the blog 
+attempts to explain different categories of workloads, from our experience and suggests what index types to use for each. We will also interlace 
+commentary on existing limitations, upcoming work and optimizations/tradeoffs along the way. 
+
+## Index Types in Hudi
+
+Currently, Hudi supports the following indexing options. 
+
+- **Bloom Index (default):** Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges.
+- **Simple Index:** Performs a lean join of the incoming update/delete records against keys extracted from the table on storage.
+- **HBase Index:** Manages the index mapping in an external Apache HBase table.
+
+Writers can pick one of these options using the `hoodie.index.type` config option. Additionally, a custom index implementation can also be employed
+using `hoodie.index.class` and supplying a subclass of `SparkHoodieIndex` (for Apache Spark writers).
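+
+For example, here is a minimal sketch of picking an index type on a Spark datasource write (table, field and path names are placeholders, and `inputDF` is an assumed DataFrame of incoming records; this is not from the original post):
+
+```scala
+// BLOOM is the default; GLOBAL_BLOOM, SIMPLE, GLOBAL_SIMPLE and HBASE
+// can be selected the same way (HBASE needs additional connection configs).
+inputDF.write.format("hudi").
+  option("hoodie.table.name", "hudi_trips").
+  option("hoodie.datasource.write.recordkey.field", "uuid").
+  option("hoodie.datasource.write.partitionpath.field", "datestr").
+  option("hoodie.index.type", "GLOBAL_BLOOM").
+  mode("append").
+  save("/tmp/hudi_trips")
+```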
+
+Another key aspect worth understanding is the difference between global and non-global indexes. Both bloom and simple index have 
+global options - `hoodie.index.type=GLOBAL_BLOOM` and `hoodie.index.type=GLOBAL_SIMPLE` - respectively. HBase index is by nature a global index.
+
+- **Global index:** Global indexes enforce uniqueness of keys across all partitions of a table, i.e. they guarantee that exactly 
+one record exists in the table for a given record key. Global indexes offer stronger guarantees, but the update/delete cost grows
+with the size of the table `O(size of table)`, which might still be acceptable for smaller tables.
+
+- **Non Global index:** On the other hand, the default index implementations enforce this constraint only within a specific partition. 
+As one might imagine, non global indexes depend on the writer to provide the same consistent partition path for a given record key during update/delete, 
+but can deliver much better performance since the index lookup operation becomes `O(number of records updated/deleted)` and 
+scales well with write volume.
+
+Since data comes in at different volumes, velocity and has different access patterns, different indices could be used for different workloads. 
+Next, let’s walk through some typical workloads and see how to leverage the right Hudi index for such use-cases.
+
+## Workload: Late arriving updates to fact tables
+
+Many companies store large volumes of transactional data in NoSQL data stores. For example, trip tables in the case of ride-sharing, buying and selling of shares, 
+orders on an e-commerce site. These tables are usually ever-growing, with random updates on the most recent data and long tail updates going to older data, either
+due to transactions settling at a later date or data corrections. In other words, most updates go into the latest partitions, with a few updates going to older ones.
+
+![Fact table](/assets/images/blog/hudi-indexes/Fact20tables.gif)
+_Figure: Typical update pattern for Fact tables_
+
+For such workloads, the `BLOOM` index performs well, since index look-up will prune a lot of data files based on a well-sized bloom filter.
+Additionally, if the keys can be constructed such that they have a certain ordering, the number of files to be compared is further reduced by range pruning. 
+Hudi constructs an interval tree with all the file key ranges and efficiently filters out the files that don't match any key ranges in the updates/deleted records.
+
+In order to efficiently compare incoming record keys against bloom filters, i.e. with a minimal number of bloom filter reads and uniform distribution of work across
+the executors, Hudi leverages caching of input records and employs a custom partitioner that can iron out data skews using statistics. At times, if the bloom filter 
+false positive ratio is high, it could increase the amount of data shuffled to perform the lookup. Hudi supports dynamic bloom filters 
+(enabled using `hoodie.bloom.index.filter.type=DYNAMIC_V0`), which adjusts its size based on the number of records stored in a given file to deliver the 
+configured false positive ratio. 
+
+In the near future, we plan to introduce a much speedier version of the BLOOM index that tracks bloom filters/ranges in an internal Hudi metadata table, indexed for fast 
+point lookups. This would avoid any current limitations around reading bloom filters/ranges from the base files themselves, to perform the lookup. (see 
+[RFC-15](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+15%3A+HUDI+File+Listing+and+Query+Planning+Improvements?src=contextnavpagetreemode) for the general design)
+
+## Workload: De-Duplication in event tables
+
+Event streaming is everywhere. Events coming from Apache Kafka or similar message buses are typically 10-100x the size of fact tables and often treat "time" (the event's arrival time/processing 
+time) as a first class citizen. For example, IoT event streams, click stream data, ad impressions etc. Inserts and updates only span the last few partitions as this is mostly append only data. 
+Given duplicate events can be introduced anywhere in the end-to-end pipeline, de-duplication before storing on the data lake is a common requirement. 
+
+![Event table](/assets/images/blog/hudi-indexes/Event20tables.gif)
+_Figure showing the spread of updates for Event table._
+
+In general, this is a very challenging problem to solve at low cost. Although we could employ a key-value store to perform this de-duplication a la the HBASE index, the index storage
+costs would grow linearly with the number of events and thus can be prohibitively expensive. In fact, the `BLOOM` index with range pruning is the optimal solution here. One can leverage the fact
+that time is often a first class citizen and construct a key such as `event_ts + event_id` such that the inserted records have monotonically increasing keys. This yields great returns
+by pruning large amounts of files even within the latest table partitions. 
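+
+A minimal sketch of such key construction (assumed column names `event_ts` and `event_id`, and an assumed input DataFrame `events`; not from the original post):
+
+```scala
+import org.apache.spark.sql.functions.{col, concat_ws}
+
+// Prefix the key with the event timestamp so record keys are (roughly) monotonically
+// increasing, letting the bloom index prune files aggressively via key range comparison.
+val keyedEvents = events.withColumn("record_key",
+  concat_ws("_", col("event_ts"), col("event_id")))
+
+keyedEvents.write.format("hudi").
+  option("hoodie.table.name", "events").
+  option("hoodie.datasource.write.recordkey.field", "record_key").
+  option("hoodie.datasource.write.partitionpath.field", "datestr").
+  option("hoodie.index.type", "BLOOM").
+  mode("append").
+  save("/tmp/events")
+```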
+
+## Workload: Random updates/deletes to a dimension table
+
+These types of tables usually contain high dimensional data and hold reference data, e.g. user profiles, merchant information. These are high fidelity tables where the updates are often small but also spread 
+across a lot of partitions and data files, ranging across the dataset from old to new. Oftentimes, these tables are also un-partitioned, since there is no good way to partition them.
+
+![Dimensions table](/assets/images/blog/hudi-indexes/Dimension20tables.gif)
+_Figure showing the spread of updates for Dimensions table._
+
+As discussed before, the `BLOOM` index may not yield benefits if a good number of files cannot be pruned out by comparing ranges/filters. In such a random write workload, updates end up touching 
+most files within the table and thus bloom filters will typically indicate a true positive for all files based on some incoming update. Consequently, we would end up comparing ranges/filters, only
+to finally check the incoming updates against all files. The `SIMPLE` index will be a better fit, as it does not do any upfront pruning but directly joins with the interested fields from every data file. 
+The `HBASE` index can be employed if the operational overhead is acceptable, and it would provide much better lookup times for these tables. 
+
+When using a global index, users should also consider setting `hoodie.bloom.index.update.partition.path=true` or `hoodie.simple.index.update.partition.path=true` to deal with cases where the 
+partition path value could change due to an update e.g users table partitioned by home city; user relocates to a different city. These tables are also excellent candidates for the Merge-On-Read table type.
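+
+As a sketch (placeholder table and field names; adapt to your setup), the relocation behaviour described above is just another pair of write options on top of a global index:
+
+```scala
+// With a global index, let an update that changes the partition path value
+// (e.g. the user's home city) move the record to its new partition.
+usersDF.write.format("hudi").
+  option("hoodie.table.name", "users").
+  option("hoodie.datasource.write.recordkey.field", "user_id").
+  option("hoodie.datasource.write.partitionpath.field", "home_city").
+  option("hoodie.index.type", "GLOBAL_BLOOM").
+  option("hoodie.bloom.index.update.partition.path", "true").
+  mode("append").
+  save("/tmp/users")
+```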
+
+Going forward, we plan to build [record level indexing](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+08+%3A+Record+level+indexing+mechanisms+for+Hudi+datasets?src=contextnavpagetreemode)
+right within Hudi, which will improve the index look-up time and will also avoid additional overhead of maintaining an external system like hbase. 
+
+## Summary 
+
+Without the indexing capabilities in Hudi, it would not have been possible to make upserts/deletes happen at [very large scales](https://eng.uber.com/apache-hudi-graduation/). 
+Hopefully this post gave you enough context on the indexing mechanisms available today and how the different tradeoffs play out. 
+
+Some interesting work underway in this area:
+
+- Apache Flink based writing with a RocksDB state store backed indexing mechanism, unlocking true streaming upserts on data lakes. 
+- A brand new MetadataIndex, which reimagines the bloom index today on top of the metadata table in Hudi.
+- Record level index implementation, as a secondary index using another Hudi table.
+
+Going forward, this will remain an area of active investment for the project. We are always looking for contributors who can drive these roadmap items forward.
+Please [engage](/contribute/get-involved) with our community if you want to get involved.
+ 
+
+
diff --git a/website/blog/2020-12-01-high-perf-data-lake-with-hudi-and-alluxio-t3go.md b/website/blog/2020-12-01-high-perf-data-lake-with-hudi-and-alluxio-t3go.md
new file mode 100644
index 0000000..a58b2e5
--- /dev/null
+++ b/website/blog/2020-12-01-high-perf-data-lake-with-hudi-and-alluxio-t3go.md
@@ -0,0 +1,100 @@
+---
+title: "Building High-Performance Data Lake Using Apache Hudi and Alluxio at T3Go"
+excerpt: "How T3Go’s high-performance data lake using Apache Hudi and Alluxio shortened the time for data ingestion into the lake by up to a factor of 2. Data analysts using Presto, Hudi, and Alluxio in conjunction to query data on the lake saw queries speed up by 10 times faster."
+author: t3go
+category: blog
+---
+
+# Building High-Performance Data Lake Using Apache Hudi and Alluxio at T3Go
+[T3Go](https://www.t3go.cn/)  is China’s first platform for smart travel based on the Internet of Vehicles. In this article, Trevor Zhang and Vino Yang from T3Go describe the evolution of their data lake architecture, built on cloud-native or open-source technologies including Alibaba OSS, Apache Hudi, and Alluxio. Today, their data lake stores petabytes of data, supporting hundreds of pipelines and tens of thousands of tasks daily. It is essential for business units at T3Go including Da [...]
+
+In this blog, you will see how we slashed data ingestion time by half using Hudi and Alluxio. Furthermore, data analysts using Presto, Hudi, and Alluxio saw the queries speed up by 10 times. We built our data lake based on data orchestration for multiple stages of our data pipeline, including ingestion and analytics.
+<!--truncate-->
+# I. T3Go data lake Overview
+
+Prior to the data lake, different business units within T3Go managed their own data processing solutions, utilizing different storage systems, ETL tools, and data processing frameworks. Data for each became siloed from every other unit, significantly increasing cost and complexity. Due to the rapid business expansion of T3Go, this inefficiency became our engineering bottleneck.
+
+We moved to a unified data lake solution based on Alibaba OSS, an object store similar to AWS S3, to provide a centralized location to store structured and unstructured data, following the design principles of  _Multi-cluster Shared-data Architecture_; all the applications access OSS storage as the source of truth, as opposed to different data silos. This architecture allows us to store the data as-is, without having to first structure the data, and run different types of analytics to gu [...]
+
+# II. Efficient Near Real-time Analytics Using Hudi
+
+Our business in smart travel drives the need to process and analyze data in a near real-time manner. With a traditional data warehouse, we faced the following challenges:  
+
+1.  High overhead when updating due to long-tail latency
+2.  High cost of order analysis due to the long window of a business session
+3.  Reduced query accuracy due to late or ad-hoc updates
+4.  Unreliability in data ingestion pipeline
+5.  Data lost in the distributed data pipeline that cannot be reconciled
+6.  High latency to access data storage
+
+As a result, we adopted Apache Hudi on top of OSS to address these issues. The following diagram outlines the architecture:
+
+![architecture](/assets/images/blog/2020-12-01-t3go-architecture.png)
+
+## Enable Near real time data ingestion and analysis
+
+With Hudi, our data lake supports multiple data sources including Kafka, MySQL binlog, GIS, and other business logs in near real time. As a result, more than 60% of the company’s data is stored in the data lake and this proportion continues to increase.
+
+We are also able to speed up the data ingestion time down to a few minutes by introducing Apache Hudi into the data pipeline. Combined with big data interactive query and analysis framework such as Presto and SparkSQL, real-time data analysis and insights are achieved.
+
+## Enable Incremental processing pipeline
+
+With the help of Hudi, it is possible to provide incremental changes to the downstream derived table when the upstream table updates frequently. Even with a large number of interdependent tables, we can quickly run partial data updates. This also effectively avoids updating the full partitions of cold tables in the traditional Hive data warehouse.
+
+## Accessing Data using Hudi as a unified format
+
+Traditional data warehouses often deploy Hadoop to store data and provide batch analysis. Kafka is used separately to distribute Hadoop data to other data processing frameworks, resulting in duplicated data. Hudi helps effectively solve this problem; we always use Spark pipelines to insert new updates into the Hudi tables, then incrementally read the update of Hudi tables. In other words, Hudi tables are used as the unified storage format to access data.
+
+# III. Efficient Data Caching Using Alluxio
+
+In the early version of our data lake without Alluxio, data received from Kafka in real time is processed by Spark and then written to OSS data lake using Hudi DeltaStreamer tasks. With this architecture, Spark often suffered high network latency when writing to OSS directly. Since all data is in OSS storage, OLAP queries on Hudi data may also be slow due to lack of data locality.
+
+To address the latency issue, we deployed Alluxio as a data orchestration layer, co-located with computing engines such as Spark and Presto, and used Alluxio to accelerate read and write on the data lake as shown in the following diagram:
+
+![architecture-alluxio](/assets/images/blog/2020-12-01-t3go-architecture-alluxio.png)
+
+Data in formats such as Hudi, Parquet, ORC, and JSON are stored mostly on OSS, consisting of 95% of the data. Computing engines such as Flink, Spark, Kylin, and Presto are deployed in isolated clusters respectively. When each engine accesses OSS, Alluxio acts as a virtual distributed storage system to accelerate data, being co-located with each of the computing clusters.
+
+Specifically, here are a few applications leveraging Alluxio in the T3Go data lake.
+
+## Data lake ingestion
+
+We mount the corresponding OSS path to the Alluxio file system and set Hudi’s “target-base-path” parameter value to use the alluxio:// scheme in place of the oss:// scheme. Spark pipelines with Hudi continuously ingest data to Alluxio. After data is written to Alluxio, it is asynchronously persisted from the Alluxio cache to the remote OSS every minute. These modifications allow Spark to write to a local Alluxio node instead of writing to remote OSS, significantly reducing the time f [...]
+
+## Data analysis on the lake
+
+We use Presto as an ad-hoc query engine to analyze the Hudi tables in the lake, co-locating Alluxio workers on each Presto worker node. When Presto and Alluxio services are co-located and running, Alluxio caches the input data locally in the Presto worker which greatly benefits Presto for subsequent retrievals. On a cache hit, Presto can read from the local Alluxio worker storage at memory speed without any additional data transfer over the network.
+
+## Concurrent accesses across multiple storage systems
+
+In order to ensure the accuracy of training samples, our machine learning team often synchronizes desensitized data in production to an offline machine learning environment. During synchronization, the data flows across multiple file systems, from production OSS to an offline HDFS followed by another offline Machine Learning HDFS.
+
+This data migration process is not only inefficient but also error-prone for modelers because multiple different storages with varying configurations are involved. Alluxio helps in this specific scenario by mounting the destination storage systems under the same filesystem to be accessed by their corresponding logical paths in the Alluxio namespace. By decoupling the physical storage, this allows applications with different APIs to access and transfer data seamlessly. This data access layout [...]
+
+## Microbenchmark
+
+Overall, we observed the following improvements with Alluxio:
+
+1.  It supports a hierarchical and transparent caching mechanism
+2.  It supports cache promote mode when reading
+3.  It supports asynchronous writing mode
+4.  It supports LRU recycling strategy
+5.  It has pin and TTL features
+
+After comparison and verification, we choose to use Spark SQL as the query engine. Our performance testing queries the Hudi table, comparing Alluxio + OSS together against OSS directly as well as HDFS.
+
+![microbench](/assets/images/blog/2020-12-01-t3go-microbenchmark.png)
+
+In the stress test shown above, after the data volume grows beyond a certain magnitude (2400W, i.e. 24 million records), the query speed using Alluxio+OSS surpasses the HDFS query speed of the hybrid deployment. After the data volume grows beyond 1E (100 million) records, the query speed starts to double. After reaching 6E (600 million) records, it is up to 12 times faster than querying native OSS and 8 times faster than querying native HDFS. The improvement depends on the machine configuration.
+
+Based on our performance benchmarking, we found that the performance can be improved by over 10 times with the help of Alluxio. Furthermore, the larger the data scale, the more prominent the performance improvement.
+
+# IV. Next Step
+
+As T3Go’s data lake ecosystem expands, we will continue facing the critical scenario of compute and storage segregation. With T3Go’s growing data processing needs, our team plans to deploy Alluxio on a larger scale to accelerate our data lake storage.
+
+In addition to the deployment of Alluxio on the data lake computing engine, which currently is mainly SparkSQL, we plan to add a layer of Alluxio to the OLAP cluster using Apache Kylin and to an ad-hoc cluster using Presto. The goal is to have Alluxio cover all computing scenarios, with Alluxio interconnected across scenarios to improve the read and write efficiency of the data lake and the surrounding lake ecosystem.
+
+# V. Conclusion
+
+As mentioned earlier, Hudi and Alluxio cover all scenarios of Hudi’s near real-time ingestion, near real-time analysis, incremental processing, and data distribution on DFS, among many others, and play the role of a powerful accelerator on data ingestion and data analysis on the lake. With Hudi and Alluxio together,  **our R&D engineers shortened the time for data ingestion into the lake by up to a factor of 2. Data analysts using Presto, Hudi, and Alluxio in conjunction to query data  [...]
diff --git a/website/blog/2021-01-27-hudi-clustering-intro.md b/website/blog/2021-01-27-hudi-clustering-intro.md
new file mode 100644
index 0000000..5f47ffe
--- /dev/null
+++ b/website/blog/2021-01-27-hudi-clustering-intro.md
@@ -0,0 +1,132 @@
+---
+title: "Optimize Data lake layout using Clustering in Apache Hudi"
+excerpt: "Introduce clustering feature to change data layout"
+author: satish.kotha
+category: blog
+---
+
+# Background
+
+Apache Hudi brings stream processing to big data, providing fresh data while being an order of magnitude more efficient than traditional batch processing. In a data lake/warehouse, one of the key trade-offs is between ingestion speed and query performance. Data ingestion typically prefers small files to improve parallelism and make data available to queries as soon as possible. However, query performance degrades significantly with a lot of small files. Also, during ingestion, data is typically co-l [...]
+<!--truncate-->
+
+# Clustering Architecture
+
+At a high level, Hudi provides different operations such as insert/upsert/bulk_insert through its write client API to write data to a Hudi table. To choose a trade-off between file size and ingestion speed, Hudi provides a knob `hoodie.parquet.small.file.limit` to configure the smallest allowable file size. Users are able to configure the small file [soft limit](https://hudi.apache.org/docs/configurations#compactionSmallFileSize) to `0` to force new data [...]
+
+  
+
+To be able to support an architecture that allows for fast ingestion without compromising query performance, we have introduced a ‘clustering’ service to rewrite the data to optimize Hudi data lake file layout.
+
+The clustering table service can run asynchronously or synchronously, adding a new action type called “REPLACE” that marks the clustering action in the Hudi metadata timeline.
+
+  
+
+### Overall, there are 2 parts to clustering
+
+1.  Scheduling clustering: Create a clustering plan using a pluggable clustering strategy.
+2.  Execute clustering: Process the plan using an execution strategy to create new files and replace old files.
+    
+
+### Scheduling clustering
+
+The following steps are followed to schedule clustering.
+
+1.  Identify files that are eligible for clustering: Depending on the clustering strategy chosen, the scheduling logic will identify the files eligible for clustering.
+2.  Group files that are eligible for clustering based on specific criteria. Each group is expected to have data size in multiples of ‘targetFileSize’. Grouping is done as part of ‘strategy’ defined in the plan. Additionally, there is an option to put a cap on group size to improve parallelism and avoid shuffling large amounts of data.
+3.  Finally, the clustering plan is saved to the timeline in an avro [metadata format](https://github.com/apache/hudi/blob/master/hudi-common/src/main/avro/HoodieClusteringPlan.avsc).
+    
+
+### Running clustering
+
+1.  Read the clustering plan and get the ‘clusteringGroups’ that mark the file groups that need to be clustered.
+2.  For each group, we instantiate the appropriate strategy class with strategyParams (example: sortColumns) and apply that strategy to rewrite the data.
+3.  Create a “REPLACE” commit and update the metadata in [HoodieReplaceCommitMetadata](https://github.com/apache/hudi/blob/master/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieReplaceCommitMetadata.java).
+    
+
+Clustering Service builds on Hudi’s MVCC based design to allow for writers to continue to insert new data while clustering action runs in the background to reformat data layout, ensuring snapshot isolation between concurrent readers and writers.
+
+NOTE: Clustering can only be scheduled for tables / partitions not receiving any concurrent updates. In the future, concurrent updates use-case will be supported as well.
+
+![Clustering example](/assets/images/blog/clustering/example_perf_improvement.png)
+_Figure: Illustrating query performance improvements by clustering_
+
+### Setting up clustering
+Inline clustering can be set up easily using Spark dataframe options. See the sample below:
+
+```scala
+import org.apache.hudi.QuickstartUtils._
+import scala.collection.JavaConversions._
+import org.apache.spark.sql.SaveMode._
+import org.apache.hudi.DataSourceReadOptions._
+import org.apache.hudi.DataSourceWriteOptions._
+import org.apache.hudi.config.HoodieWriteConfig._
+
+
+val df =  //generate data frame
+df.write.format("org.apache.hudi").
+        options(getQuickstartWriteConfigs).
+        option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+        option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+        option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+        option(TABLE_NAME, "tableName").
+        option("hoodie.parquet.small.file.limit", "0").
+        option("hoodie.clustering.inline", "true").
+        option("hoodie.clustering.inline.max.commits", "4").
+        option("hoodie.clustering.plan.strategy.target.file.max.bytes", "1073741824").
+        option("hoodie.clustering.plan.strategy.small.file.limit", "629145600").
+        option("hoodie.clustering.plan.strategy.sort.columns", "column1,column2"). //optional, if sorting is needed as part of rewriting data
+        mode(Append).
+        save("dfs://location");
+```
+
+For more advanced use cases, an async clustering pipeline can also be set up. See an example [here](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance#RFC19Clusteringdataforfreshnessandqueryperformance-SetupforAsyncclusteringJob).
+
+
+# Table Query Performance
+
+We created a dataset from one partition of a known production style table with ~20M records and on-disk size of ~200GB. The dataset has rows for multiple “sessions”. Users always query this data using a predicate on session. Data for a single session is spread across multiple data files because ingestion groups data based on arrival time. The below experiment shows that by clustering on session, we are able to improve the data locality and reduce query execution time by more than 50%.
+
+Query: 
+```scala
+spark.sql("select  *  from table where session_id=123")
+```
+
+## Before Clustering
+
+The query took 2.2 minutes to complete. Note that the number of output rows in the “scan parquet” part of the query plan includes all 20M rows in the table.
+
+![Query Plan Before Clustering](/assets/images/blog/clustering/Query_Plan_Before_Clustering.png)
+_Figure: Spark SQL query details before clustering_
+
+## After Clustering
+
+The query plan is similar to the one above. But, because of improved data locality and predicate pushdown, Spark is able to prune a lot of rows. After clustering, the same query only outputs 110K rows (out of 20M rows) while scanning parquet files. This cuts query time to less than a minute, from 2.2 minutes.
+
+![Query Plan After Clustering](/assets/images/blog/clustering/Query_Plan_After_Clustering.png)
+_Figure: Spark SQL query details after clustering_
+
+The table below summarizes query performance improvements from experiments run using Spark3
+
+
+| Table State | Query runtime | Num Records Processed | Num files on disk | Size of each file |
+|-------------|---------------|-----------------------|-------------------|-------------------|
+| **Unclustered** | 130,673 ms | ~20M | 13642 | ~150 MB |
+| **Clustered** | 55,963 ms | ~110K | 294 | ~600 MB |
+
+Query runtime is reduced by 60% after clustering. Similar results were observed on other sample datasets. See example query plans and more details at the [RFC-19 performance evaluation](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+19+Clustering+data+for+freshness+and+query+performance#RFC19Clusteringdataforfreshnessandqueryperformance-PerformanceEvaluation).
+
+We expect dramatic speedup for large tables, where the query runtime is almost entirely dominated by actual I/O and not query planning, unlike the example above.
+
+# Summary
+
+Using clustering, we can improve query performance by
+1.  Leveraging concepts such as [space filling curves](https://en.wikipedia.org/wiki/Z-order_curve) to adapt the data lake layout and reduce the amount of data read during queries.
+2.  Stitching small files into larger ones to reduce the total number of files that need to be scanned by the query engine.
+  
+
+Clustering also enables stream processing over big data. Ingestion can write small files to satisfy latency requirements of stream processing. Clustering can be used in the background to stitch these small files into larger files and reduce file count.
+
+Besides this, the clustering framework also provides the flexibility to asynchronously rewrite data based on specific requirements. We foresee many other use-cases adopting the clustering framework with custom pluggable strategies to satisfy on-demand data lake management activities. Some notable use-cases that are actively being solved using clustering:
+1.  Rewrite data and encrypt data at rest.
+2.  Prune unused columns from tables and reduce storage footprint.
diff --git a/website/blog/2021-02-13-hudi-key-generators.md b/website/blog/2021-02-13-hudi-key-generators.md
new file mode 100644
index 0000000..405793e
--- /dev/null
+++ b/website/blog/2021-02-13-hudi-key-generators.md
@@ -0,0 +1,192 @@
+---
+title: "Apache Hudi Key Generators"
+excerpt: "Different key generators available with Apache Hudi"
+author: shivnarayan
+category: blog
+---
+
+Every record in Hudi is uniquely identified by a primary key, which is a pair of record key and the partition path to which
+the record belongs. Using primary keys, Hudi can a) impose a partition-level uniqueness integrity constraint and
+b) enable fast updates and deletes on records. One should choose the partitioning scheme wisely as it could be a
+determining factor for your ingestion and query latency.
+<!--truncate-->
+In general, Hudi supports both partitioned and global indexes. For a dataset with a partitioned index (which is most
+commonly used), each record is uniquely identified by a pair of record key and partition path. But for a dataset with
+a global index, each record is uniquely identified by just the record key. There won't be any duplicate record keys across
+partitions.
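+
+For instance, the choice between a partitioned and a global index can be made via the standard `hoodie.index.type` write config. The snippet below is only a minimal sketch of this under the Spark datasource path; the field names, table name and path are illustrative, not from this post:
+
+```scala
+// Illustrative sketch: switching a write to a global bloom index, so record keys
+// are unique across partitions. df is an existing DataFrame of incoming records;
+// all field names, the table name and the path here are made up.
+df.write.format("org.apache.hudi").
+  option("hoodie.datasource.write.recordkey.field", "uuid").
+  option("hoodie.datasource.write.partitionpath.field", "partitionpath").
+  option("hoodie.table.name", "my_table").
+  option("hoodie.index.type", "GLOBAL_BLOOM"). // default is a partitioned (bloom) index
+  mode("append").
+  save("/tmp/hudi_table")
+```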
+
+## Key Generators
+
+Hudi provides several key generators out of the box that users can use based on their need, while having a pluggable
+implementation for users to implement and use their own KeyGenerator. This blog goes over all different types of key 
+generators that are readily available to use.
+
+[Here](https://github.com/apache/hudi/blob/master/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenerator.java)
+is the interface for KeyGenerator in Hudi for your reference.
+
+Before diving into different types of key generators, let’s go over some of the common configs required to be set for 
+key generators.
+
+| Config        | Meaning/purpose|        
+| ------------- |:-------------:| 
+| ```hoodie.datasource.write.recordkey.field```     | Refers to record key field. This is a mandatory field. | 
+| ```hoodie.datasource.write.partitionpath.field```     | Refers to partition path field. This is a mandatory field. | 
+| ```hoodie.datasource.write.keygenerator.class``` | Refers to the key generator class (including the full path). Could refer to any of the available ones or a user-defined one. This is a mandatory field. | 
+| ```hoodie.datasource.write.partitionpath.urlencode```| When set to true, partition path will be url encoded. Default value is false. |
+| ```hoodie.datasource.write.hive_style_partitioning```| When set to true, uses hive style partitioning. Partition field name will be prefixed to the value. Format: “<partition_path_field_name>=<partition_path_value>”. Default value is false.|
+
+There are a few more configs involved if you are looking to use the TimestampBasedKeyGenerator. We will cover those in the respective section.
+
+Let's go over the different key generators available to be used with Hudi.
+
+### [SimpleKeyGenerator](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/SimpleKeyGenerator.java)
+
+The record key refers to one field (column in the dataframe) by name and the partition path refers to one field (single column in the dataframe)
+by name. This is one of the most commonly used key generators. Values are interpreted as-is from the dataframe and converted to string.
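+
+As a minimal sketch (field names, table name and path are illustrative), configuring SimpleKeyGenerator through the Spark datasource looks roughly like the following, using the common configs from the table above:
+
+```scala
+// Illustrative sketch: SimpleKeyGenerator with one record key field and one
+// partition path field. df is an existing DataFrame; all names are made up.
+df.write.format("org.apache.hudi").
+  option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.SimpleKeyGenerator").
+  option("hoodie.datasource.write.recordkey.field", "uuid").
+  option("hoodie.datasource.write.partitionpath.field", "partitionpath").
+  option("hoodie.table.name", "my_table").
+  mode("append").
+  save("/tmp/hudi_table")
+```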
+
+### [ComplexKeyGenerator](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java)
+Both the record key and the partition path comprise one or more fields by name (a combination of multiple fields). Fields 
+are expected to be comma separated in the config value. For example ```"hoodie.datasource.write.recordkey.field" : “col1,col4”```
+
+### [GlobalDeleteKeyGenerator](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/GlobalDeleteKeyGenerator.java)
+Global index deletes do not require partition value. So this key generator avoids using partition value for generating HoodieKey.
+
+### [TimestampBasedKeyGenerator](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/TimestampBasedKeyGenerator.java)
+This key generator relies on timestamps for the partition field. The field values are interpreted as timestamps 
+and not just converted to string while generating the partition path value for records. The record key is the same as before, chosen by 
+field name. Users are expected to set a few more configs to use this key generator.
+
+Configs to be set:
+
+| Config        | Meaning/purpose |       
+| ------------- | -------------|
+| ```hoodie.deltastreamer.keygen.timebased.timestamp.type```    | One of the timestamp types supported(UNIX_TIMESTAMP, DATE_STRING, MIXED, EPOCHMILLISECONDS, SCALAR) |
+| ```hoodie.deltastreamer.keygen.timebased.output.dateformat```| Output date format | 
+| ```hoodie.deltastreamer.keygen.timebased.timezone```| Timezone of the data format| 
+| ```hoodie.deltastreamer.keygen.timebased.input.dateformat```| Input date format |
+
+Let's go over some example values for TimestampBasedKeyGenerator.
+
+#### Timestamp is GMT
+
+| Config field | Value |
+| ------------- | -------------|
+|```hoodie.deltastreamer.keygen.timebased.timestamp.type```| "EPOCHMILLISECONDS"|
+|```hoodie.deltastreamer.keygen.timebased.output.dateformat``` | "yyyy-MM-dd hh" |
+|```hoodie.deltastreamer.keygen.timebased.timezone```| "GMT+8:00" |
+
+Input Field value: “1578283932000L” <br/>
+Partition path generated from key generator: “2020-01-06 12”
+
+If input field value is null for some rows. <br/>
+Partition path generated from key generator: “1970-01-01 08”
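+
+Expressed as Spark datasource options, the example above would look roughly like the sketch below. Only the key generator related options are shown; the record key field, the timestamp column, the table name and the path are illustrative:
+
+```scala
+// Illustrative sketch: TimestampBasedKeyGenerator over an epoch-millis field,
+// using the config values from the table above. df, the field names and the
+// path are made up; other write options are omitted for brevity.
+df.write.format("org.apache.hudi").
+  option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.TimestampBasedKeyGenerator").
+  option("hoodie.datasource.write.recordkey.field", "uuid").
+  option("hoodie.datasource.write.partitionpath.field", "event_ts"). // epoch millis column (illustrative)
+  option("hoodie.deltastreamer.keygen.timebased.timestamp.type", "EPOCHMILLISECONDS").
+  option("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy-MM-dd hh").
+  option("hoodie.deltastreamer.keygen.timebased.timezone", "GMT+8:00").
+  option("hoodie.table.name", "my_table").
+  mode("append").
+  save("/tmp/hudi_table")
+```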
+
+#### Timestamp is DATE_STRING
+
+| Config field | Value |
+| ------------- | -------------|
+|```hoodie.deltastreamer.keygen.timebased.timestamp.type```|  "DATE_STRING"  |
+|```hoodie.deltastreamer.keygen.timebased.output.dateformat```|  "yyyy-MM-dd hh" | 
+|```hoodie.deltastreamer.keygen.timebased.timezone```|  "GMT+8:00" |
+|```hoodie.deltastreamer.keygen.timebased.input.dateformat```|  "yyyy-MM-dd hh:mm:ss" |
+
+Input field value: “2020-01-06 12:12:12” <br/>
+Partition path generated from key generator: “2020-01-06 12”
+
+If input field value is null for some rows. <br/>
+Partition path generated from key generator: “1970-01-01 12:00:00”
+<br/>
+
+#### Scalar examples
+
+| Config field | Value |
+| ------------- | -------------|
+|```hoodie.deltastreamer.keygen.timebased.timestamp.type```| "SCALAR"|
+|```hoodie.deltastreamer.keygen.timebased.output.dateformat```| "yyyy-MM-dd hh" |
+|```hoodie.deltastreamer.keygen.timebased.timezone```| "GMT" |
+|```hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit```| "days" |
+
+Input field value: “20000L” <br/>
+Partition path generated from key generator: “2024-10-04 12”
+
+If input field value is null. <br/>
+Partition path generated from key generator: “1970-01-02 12”
+
+#### ISO8601WithMsZ with Single Input format
+
+| Config field | Value |
+| ------------- | -------------|
+|```hoodie.deltastreamer.keygen.timebased.timestamp.type```| "DATE_STRING"|
+|```hoodie.deltastreamer.keygen.timebased.input.dateformat```| "yyyy-MM-dd'T'HH:mm:ss.SSSZ" |
+|```hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex```| "" |
+|```hoodie.deltastreamer.keygen.timebased.input.timezone```| "" |
+|```hoodie.deltastreamer.keygen.timebased.output.dateformat```| "yyyyMMddHH" |
+|```hoodie.deltastreamer.keygen.timebased.output.timezone```| "GMT" |
+
+Input field value: "2020-04-01T13:01:33.428Z" <br/>
+Partition path generated from key generator: "2020040113"
+
+#### ISO8601WithMsZ with Multiple Input formats
+
+| Config field | Value |
+| ------------- | -------------|
+|```hoodie.deltastreamer.keygen.timebased.timestamp.type```| "DATE_STRING"|
+|```hoodie.deltastreamer.keygen.timebased.input.dateformat```| "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ" |
+|```hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex```| "" |
+|```hoodie.deltastreamer.keygen.timebased.input.timezone```| "" |
+|```hoodie.deltastreamer.keygen.timebased.output.dateformat```| "yyyyMMddHH" |
+|```hoodie.deltastreamer.keygen.timebased.output.timezone```| "UTC" |
+
+Input field value: "2020-04-01T13:01:33.428Z" <br/>
+Partition path generated from key generator: "2020040113"
+
+#### ISO8601NoMs with offset using multiple input formats
+
+| Config field | Value |
+| ------------- | -------------|
+|```hoodie.deltastreamer.keygen.timebased.timestamp.type```| "DATE_STRING"|
+|```hoodie.deltastreamer.keygen.timebased.input.dateformat```| "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ" |
+|```hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex```| "" |
+|```hoodie.deltastreamer.keygen.timebased.input.timezone```| "" |
+|```hoodie.deltastreamer.keygen.timebased.output.dateformat```| "yyyyMMddHH" |
+|```hoodie.deltastreamer.keygen.timebased.output.timezone```| "UTC" |
+
+Input field value: "2020-04-01T13:01:33-**05:00**" <br/>
+Partition path generated from key generator: "2020040118"
+
+#### Input as short date string and expect date in date format
+
+| Config field | Value |
+| ------------- | -------------|
+|```hoodie.deltastreamer.keygen.timebased.timestamp.type```| "DATE_STRING"|
+|```hoodie.deltastreamer.keygen.timebased.input.dateformat```| "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ,yyyyMMdd" |
+|```hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex```| "" |
+|```hoodie.deltastreamer.keygen.timebased.input.timezone```| "UTC" |
+|```hoodie.deltastreamer.keygen.timebased.output.dateformat```| "MM/dd/yyyy" |
+|```hoodie.deltastreamer.keygen.timebased.output.timezone```| "UTC" |
+
+Input field value: "20200401" <br/>
+Partition path generated from key generator: "04/01/2020"
+
+### [CustomKeyGenerator](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/CustomKeyGenerator.java)
+This is a generic implementation of KeyGenerator where users are able to leverage the benefits of SimpleKeyGenerator, 
+ComplexKeyGenerator and TimestampBasedKeyGenerator all at the same time. One can configure record key and partition 
+paths as a single field or a combination of fields. This key generator is particularly useful if you want to define 
+complex partition paths involving regular fields and timestamp based fields. It expects the value for the prop ```"hoodie.datasource.write.partitionpath.field"``` 
+in a specific format. The format should be "field1:PartitionKeyType1,field2:PartitionKeyType2..."
+
+The complete partition path is created as 
+```<value for field1 basis PartitionKeyType1>/<value for field2 basis PartitionKeyType2> ```
+and so on. Each partition key type could either be SIMPLE or TIMESTAMP.
+
+Example config value: ```“field_3:simple,field_5:timestamp”```
+
+The record key config value is either a single field in the case of SimpleKeyGenerator, or comma separated field names in the case of ComplexKeyGenerator.
+Eg: “col1” or “col3,col4”.
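+
+Putting it together, a small sketch of a CustomKeyGenerator setup through the Spark datasource might look like the following. The field names, output format and path are illustrative; the timestamp-based configs apply to the `timestamp` typed partition field:
+
+```scala
+// Illustrative sketch: CustomKeyGenerator with a simple field and a timestamp
+// based field in the partition path. df, field names and path are made up.
+df.write.format("org.apache.hudi").
+  option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.CustomKeyGenerator").
+  option("hoodie.datasource.write.recordkey.field", "col1").
+  option("hoodie.datasource.write.partitionpath.field", "field_3:simple,field_5:timestamp").
+  option("hoodie.deltastreamer.keygen.timebased.timestamp.type", "EPOCHMILLISECONDS").
+  option("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd").
+  option("hoodie.table.name", "my_table").
+  mode("append").
+  save("/tmp/hudi_table")
+```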
+
+### [NonPartitionedKeyGenerator](https://github.com/apache/hudi/blob/master/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java)
+If your Hudi dataset is not partitioned, you could use this “NonPartitionedKeyGenerator” which will return an empty 
+partition for all records. In other words, all records go to the same partition (which is the empty string “”).
+
+Hope this blog gave you a good understanding of the different types of key generators available in Apache Hudi. Thanks for your continued support of Hudi's community. 
+
diff --git a/website/blog/2021-03-01-hudi-file-sizing.md b/website/blog/2021-03-01-hudi-file-sizing.md
new file mode 100644
index 0000000..3d30496
--- /dev/null
+++ b/website/blog/2021-03-01-hudi-file-sizing.md
@@ -0,0 +1,85 @@
+---
+title: "Streaming Responsibly - How Apache Hudi maintains optimum sized files"
+excerpt: "Maintaining well-sized files can improve query performance significantly"
+author: shivnarayan
+category: blog
+---
+
+Apache Hudi is a data lake platform technology that provides several functionalities needed to build and manage data lakes. 
+One such key feature that Hudi provides is self-managing file sizing so that users don’t need to worry about 
+manual table maintenance. Having a lot of small files will make it harder to achieve good query performance, due to query engines
+having to open/read/close files way too many times to plan and execute queries. But streaming data lake use-cases 
+inherently ingest smaller volumes of writes, which might result in a lot of small files if no special handling is done.
+<!--truncate-->
+# During Write vs After Write
+
+Common approaches of writing very small files and then later stitching them together solve the system scalability issues posed 
+by small files, but might violate query SLAs by exposing small files to queries. In fact, you can easily do such stitching on a Hudi table, 
+by running a clustering operation, as detailed in a [previous blog](/blog/2021/01/27/hudi-clustering-intro). 
+
+In this blog, we discuss file sizing optimizations in Hudi during the initial write time, so we don't have to effectively 
+re-write all data again just for file sizing. If you want to both (a) have self-managed file sizing and 
+(b) avoid exposing small files to queries, the automatic file sizing feature saves the day.
+
+Hudi has the ability to maintain a configured target file size, when performing inserts/upsert operations. 
+(Note: bulk_insert operation does not provide this functionality and is designed as a simpler replacement for 
+normal `spark.write.parquet`).
+
+## Configs
+
+For illustration purposes, we are going to consider only COPY_ON_WRITE table.
+
+Configs of interest before we dive into the algorithm:
+
+- [Max file size](/docs/configurations#limitFileSize): Max size for a given data file. Hudi will try to maintain file sizes to this configured value <br/>
+- [Soft file limit](/docs/configurations#compactionSmallFileSize): Max file size below which a given data file is considered to be a small file <br/>
+- [Insert split size](/docs/configurations#insertSplitSize): Number of inserts grouped for a single partition. This value should match 
+the number of records in a single file (you can determine this based on the max file size and per-record size)
+
+For instance, if your first config value is 120MB and 2nd config value is set to 100MB, any file whose size is < 100MB 
+would be considered a small file.
+
+If you wish to turn off this feature, set the config value for soft file limit to 0.
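+
+As a rough sketch, these knobs can be passed as Spark datasource write options. The config key names below are assumptions based on the linked config reference, and the field names and path are illustrative:
+
+```scala
+// Illustrative sketch of the file sizing knobs for a COPY_ON_WRITE table,
+// matching the 120MB/100MB example that follows. Config key names are assumed
+// from the config reference; df, field names and the path are made up.
+df.write.format("org.apache.hudi").
+  option("hoodie.datasource.write.recordkey.field", "uuid").
+  option("hoodie.datasource.write.partitionpath.field", "partitionpath").
+  option("hoodie.datasource.write.precombine.field", "ts").
+  option("hoodie.table.name", "my_table").
+  option("hoodie.parquet.max.file.size", "125829120").      // 120MB max file size
+  option("hoodie.parquet.small.file.limit", "104857600").   // 100MB soft small file limit
+  option("hoodie.copyonwrite.insert.split.size", "120000"). // inserts assigned per new file
+  mode("append").
+  save("/tmp/hudi_table")
+```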
+
+## Example
+
+Let’s say this is the layout of data files for a given partition.
+
+![Initial layout](/assets/images/blog/hudi-file-sizing/initial_layout.png)
+_Figure: Initial data file sizes for a given partition of interest_
+
+Let’s assume the configured values for max file size and small file size limit are 120MB and 100MB. File_1’s current 
+size is 40MB, File_2’s size is 80MB, File_3’s size is 90MB, File_4’s size is 130MB and File_5’s size is 105MB. Let’s see 
+what happens when a new write happens. 
+
+**Step 1:** Assigning updates to files. In this step, we look up the index to find the tagged location and records are 
+assigned to the respective files. Note that we assume updates are only going to increase the file size and that would simply result
+in a much bigger file. When updates lower the file size (by, say, nulling out a lot of fields), then a subsequent write will deem 
+it a small file.
+
+**Step 2:** Determine small files for each partition path. The soft file limit config value will be leveraged here 
+to determine eligible small files. In our example, given the config value is set to 100MB, the small files are File_1 (40MB),
+File_2 (80MB) and File_3 (90MB).
+
+**Step 3:** Once small files are determined, incoming inserts are assigned to them so that they reach their max capacity of 
+120MB. File_1 will be ingested with 80MB worth of inserts, File_2 will be ingested with 40MB worth of inserts and 
+File_3 will be ingested with 30MB worth of inserts.
+
+![Bin packing small files](/assets/images/blog/hudi-file-sizing/bin_packing_existing_data_files.png)
+_Figure: Incoming records are bin packed to existing small files_
+
+**Step 4:** Once all small files are bin packed to their max capacity and if there are pending inserts unassigned, new file 
+groups/data files are created and inserts are assigned to them. The number of records per new data file is determined from the insert split 
+size config. Assuming the insert split size is configured to 120k records, if there are 300k remaining records, 3 new 
+files will be created, of which 2 (File_6 and File_7) will be filled with 120k records and the last one (File_8)
+will be filled with 60k records (assuming each record is 1000 bytes). In future ingestions, the 3rd new file will be 
+considered a small file to be packed with more data.
+
+![Assigning to new files](/assets/images/blog/hudi-file-sizing/adding_new_files.png)
+_Figure: Remaining records are assigned to new files_
+
+Hudi leverages mechanisms such as custom partitioning for optimized record distribution to different files, executing
+the algorithm above. After this round of ingestion is complete, all files except File_8 are nicely sized to the optimum size. 
+This process is followed during every ingestion to ensure there are no small files in your Hudi tables. 
+
+Hopefully this blog gave you an overview of how Hudi manages small files and assists in boosting your query performance.
diff --git a/website/blog/2021-06-10-employing-right-configurations-for-hudi-cleaner.md b/website/blog/2021-06-10-employing-right-configurations-for-hudi-cleaner.md
new file mode 100644
index 0000000..f095af4
--- /dev/null
+++ b/website/blog/2021-06-10-employing-right-configurations-for-hudi-cleaner.md
@@ -0,0 +1,107 @@
+---
+title: "Employing correct configurations for Hudi's cleaner table service"
+excerpt: "Ensuring isolation between Hudi writers and readers using `HoodieCleaner.java`"
+author: pratyakshsharma
+category: blog
+---
+
+Apache Hudi provides snapshot isolation between writers and readers. This is made possible by Hudi’s MVCC concurrency model. In this blog, we will explain how to employ the right configurations to manage multiple file versions. Furthermore, we will discuss mechanisms available to users on how to maintain just the required number of old file versions so that long running readers do not fail. 
+
+<!--truncate-->
+### Reclaiming space and keeping your data lake storage costs in check
+
+Hudi provides different table management services to be able to manage your tables on the data lake. One of these services is called the **Cleaner**. As you write more data to your table, for every batch of updates received, Hudi can either generate a new version of the data file with updates applied to records (COPY_ON_WRITE) or write these delta updates to a log file, avoiding rewriting newer version of an existing file (MERGE_ON_READ). In such situations, depending on the frequency of [...]
+
+### Problem Statement
+
+In a data lake architecture, it is a very common scenario to have readers and writers concurrently accessing the same table. As the Hudi cleaner service periodically reclaims older file versions, scenarios arise where a long running query might be accessing a file version that is deemed to be reclaimed by the cleaner. Here, we need to employ the correct configs to ensure readers (aka queries) don’t fail.
+
+### Deeper dive into Hudi Cleaner
+
+To deal with the mentioned scenario, let's understand the different cleaning policies that Hudi offers and the corresponding properties that need to be configured. Options are available to schedule cleaning asynchronously or synchronously. Before going into more details, we would like to explain a few underlying concepts:
+
+ - **Hudi base file**: Columnar file which consists of final data after compaction. A base file’s name follows the following naming convention: `<fileId>_<writeToken>_<instantTime>.parquet`. In subsequent writes of this file, file id remains the same and commit time gets updated to show the latest version. This also implies any particular version of a record, given its partition path, can be uniquely located using the file id and instant time. 
+ - **File slice**: A file slice consists of the base file and any log files consisting of the delta, in case of MERGE_ON_READ table type.
+ - **Hudi File Group**: Any file group in Hudi is uniquely identified by the partition path and the  file id that the files in this group have as part of their name. A file group consists of all the file slices in a particular partition path. Also any partition path can have multiple file groups.
+
+### Cleaning Policies
+
+Hudi cleaner currently supports below cleaning policies:
+
+ - **KEEP_LATEST_COMMITS**: This is the default policy. It is a temporal cleaning policy that retains the ability to look back into all the changes that happened in the last X commits. Suppose a writer is ingesting data into a Hudi dataset every 30 minutes and the longest running query can take 5 hours to finish; then the user should retain at least the last 10 commits. With such a configuration, we ensure that the oldest version of a file is kept on disk for at least 5 hours, the [...]
+ - **KEEP_LATEST_FILE_VERSIONS**: This policy has the effect of keeping N number of file versions irrespective of time. This policy is useful when it is known how many MAX versions of a file one wants to keep at any given time. To achieve the same behaviour as before of preventing long running queries from failing, one should do their calculations based on data patterns. Alternatively, this policy is also useful if a user just wants to maintain 1 latest version of the file.
+
+### Examples
+
+Suppose a user is ingesting data into a Hudi dataset of type COPY_ON_WRITE every 30 minutes as shown below:
+
+![Initial timeline](/assets/images/blog/hoodie-cleaner/Initial_timeline.png)
+_Figure1: Incoming records getting ingested into a Hudi dataset every 30 minutes_
+
+The figure shows a particular partition on DFS where commits and corresponding file versions are color coded. 4 different file groups are created in this partition as depicted by fileGroup1, fileGroup2, fileGroup3 and fileGroup4. File group corresponding to fileGroup2 has records ingested from all the 5 commits, while the group corresponding to fileGroup4 has records from the latest 2 commits only.
+
+Suppose the user uses the below configs for cleaning:
+
+```java
+hoodie.cleaner.policy=KEEP_LATEST_COMMITS
+hoodie.cleaner.commits.retained=2
+```
+
+Cleaner selects the versions of files to be cleaned by taking care of the following:
+
+ - Latest version of a file should not be cleaned.
+ - The commit times of the last 2 (configured) + 1 commits are determined. In Figure1, `commit 10:30` and `commit 10:00` correspond to the latest 2 commits in the timeline. One extra commit is included because the time window for retaining commits is essentially equal to the longest query run time. So if the longest query takes 1 hour to finish, and ingestion happens every 30 minutes, you need to retain last 2 commits since 2*30 = 60 (1 hour). At this point of time, the longest query can [...]
+ -  Now for any file group, only those file slices are scheduled for cleaning which are not savepointed (another Hudi table service) and whose commit time is less than the 3rd commit (`commit 9:30` in figure below) in reverse order.
+
+![Retain latest commits](/assets/images/blog/hoodie-cleaner/Retain_latest_commits.png)
+_Figure2: Files corresponding to latest 3 commits are retained_
+
+Now, suppose the user uses the below configs for cleaning:
+
+```java
+hoodie.cleaner.policy=KEEP_LATEST_FILE_VERSIONS
+hoodie.cleaner.fileversions.retained=1
+```
+
+Cleaner does the following:
+
+ - For any file group, the latest version of its file slices (including any pending compaction) is kept and the rest are scheduled for cleaning. Clearly, as shown in Figure3, if the clean action is triggered right after `commit 10:30`, the cleaner will simply leave the latest version in every file group and delete the rest.
+
+![Retain latest versions](/assets/images/blog/hoodie-cleaner/Retain_latest_versions.png)
+_Figure3: Latest file version in every file group is retained_
+
+### Configurations
+
+You can find the details about all the possible configurations along with the default values [here](https://hudi.apache.org/docs/configurations#compaction-configs).
+
+### Run command
+
+Hudi's cleaner table service can be run as a separate process or along with your data ingestion. As mentioned earlier, it basically cleans up any stale/old files lying around. In case you want to run it along with ingesting data, configs are available which enable you to run it [synchronously or asynchronously](https://hudi.apache.org/docs/configurations#withAsyncClean). You can use the below command for running the cleaner independently:
+
+```java
+[hoodie]$ spark-submit --class org.apache.hudi.utilities.HoodieCleaner \
+  --props s3:///temp/hudi-ingestion-config/kafka-source.properties \
+  --target-base-path s3:///temp/hudi \
+  --spark-master yarn-cluster
+```
+
+In case you wish to run the cleaner service asynchronously with writing, please configure the below:
+
+```java
+hoodie.clean.automatic=true
+hoodie.clean.async=true
+```
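+
+If you are writing through the Spark datasource instead of a standalone job, the same cleaner settings can be passed as write options. A minimal sketch (field names, table name and path are illustrative):
+
+```scala
+// Illustrative sketch: enabling automatic, asynchronous cleaning from a Spark
+// datasource writer, retaining the last 10 commits. df, field names and the
+// path are made up; the cleaner configs are the ones discussed above.
+df.write.format("org.apache.hudi").
+  option("hoodie.datasource.write.recordkey.field", "uuid").
+  option("hoodie.datasource.write.partitionpath.field", "partitionpath").
+  option("hoodie.table.name", "my_table").
+  option("hoodie.cleaner.policy", "KEEP_LATEST_COMMITS").
+  option("hoodie.cleaner.commits.retained", "10").
+  option("hoodie.clean.automatic", "true").
+  option("hoodie.clean.async", "true").
+  mode("append").
+  save("/tmp/hudi_table")
+```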
+
+Further you can use [Hudi CLI](https://hudi.apache.org/docs/deployment#cli) for managing your Hudi dataset. CLI provides the below commands for cleaner service:
+
+ - `cleans show`
+ - `clean showpartitions`
+ - `cleans run`
+
+You can find more details and the relevant code for these commands in [`org.apache.hudi.cli.commands.CleansCommand` class](https://github.com/apache/hudi/blob/master/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CleansCommand.java). 
+
+### Future Scope
+
+Work is currently going on for introducing a new cleaning policy based on time elapsed. This will help in achieving a consistent retention throughout regardless of how frequently ingestion happens. You may track the progress [here](https://issues.apache.org/jira/browse/HUDI-349).
+
+We hope this blog gives you an idea about how to configure the Hudi cleaner and the supported cleaning policies. Please visit the [blog section](https://hudi.apache.org/blog) for a deeper understanding of various Hudi concepts. Cheers!
\ No newline at end of file
diff --git a/website/blog/2021-07-21-streaming-data-lake-platform.md b/website/blog/2021-07-21-streaming-data-lake-platform.md
new file mode 100644
index 0000000..cde1b2c
--- /dev/null
+++ b/website/blog/2021-07-21-streaming-data-lake-platform.md
@@ -0,0 +1,151 @@
+---
+title: "Apache Hudi - The Streaming Data Lake Platform"
+excerpt: "It's been called many things. But, we have always been building a data lake platform"
+author: vinoth
+category: blog
+---
+
+As early as 2016, we set out a [bold, new vision](https://www.oreilly.com/content/ubers-case-for-incremental-processing-on-hadoop/) reimagining batch data processing through a new “**incremental**” data processing stack - alongside the existing batch and streaming stacks. 
+While a stream processing pipeline does row-oriented processing, delivering a few seconds of processing latency, an incremental pipeline would apply the same principles to *columnar* data in the data lake, 
+delivering orders of magnitude improvements in processing efficiency within a few minutes, on extremely scalable batch storage/compute infrastructure. This new stack would be able to effortlessly support regular batch processing for bulk reprocessing/backfilling as well.
+Hudi was built as the manifestation of this vision, rooted in real, hard problems faced at [Uber](https://eng.uber.com/uber-big-data-platform/) and later took a life of its own in the open source community. Together, we have been able to 
+usher in fully incremental data ingestion and moderately complex ETLs on data lakes already.
+
+<!--truncate-->
+![the different components that make up the stream and batch processing stack today, showing how an incremental stack blends the best of both the worlds.](/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Page_2_4.png)
+
+Today, this grand vision of being able to express almost any batch pipeline incrementally is more attainable than it ever was. Stream processing is [maturing rapidly](https://flink.apache.org/blog/) and gaining [tremendous momentum](https://www.confluent.io/blog/every-company-is-becoming-software/), 
+with [generalization](https://flink.apache.org/2021/03/11/batch-execution-mode.html) of stream processing APIs to work over a batch execution model. Hudi completes the missing pieces of the puzzle by providing streaming optimized lake storage, 
+much like how Kafka/Pulsar enable efficient storage for event streaming. [Many organizations](https://hudi.apache.org/docs/powered_by.html) have already reaped real benefits of adopting a streaming model for their data lakes, in terms of fresh data, simplified architecture and great cost reductions.
+
+But first, we needed to tackle the basics - transactions and mutability - on the data lake. In many ways, Apache Hudi pioneered the transactional data lake movement as we know it today. Specifically, during a time when more special-purpose systems were being born, Hudi introduced a server-less, transaction layer, which worked over the general-purpose Hadoop FileSystem abstraction on Cloud Stores/HDFS. This model helped Hudi to scale writers/readers to 1000s of cores on day one, compared  [...]
+
+This is going to be a rather long post, but we will do our best to make it worth your time. Let’s roll.
+
+## Data Lake Platform
+
+We have noticed that Hudi is sometimes positioned as a “[table format](https://cloud.google.com/blog/products/data-analytics/getting-started-with-new-table-formats-on-dataproc)” or “transactional layer”. While this is not incorrect, it does not do full justice to all that Hudi has to offer. 
+
+### Is Hudi a “format”?
+
+Hudi was not designed as a general purpose table format, tracking files/folders for batch processing. Rather, the functionality provided by a table format is merely one layer in the Hudi software stack. Hudi was designed to play well with the Hive format (if you will), given how popular and widespread it is. Over time, to solve scaling challenges or bring in additional functionality, we have invested in our own native table format with an eye for incremental processing vision. for e.g, w [...]
+
+### Is Hudi a transactional layer?
+
+Of course, Hudi had to provide transactions for implementing deletes/updates, but Hudi’s transactional layer is designed around an [event log](https://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying) that is also well-integrated with an entire set of built-in table/data services. For e.g compaction is aware of clustering actions already scheduled and optimizes by skipping over the files being clustered - while the u [...]
+
+Thus, the best way to describe Apache Hudi is as a **Streaming Data Lake Platform** built around a *database kernel*. The words carry significant meaning.
+
+![/assets/images/blog/datalake-platform/Screen_Shot_2021-07-20_at_5.35.47_PM.png](/assets/images/blog/datalake-platform/Screen_Shot_2021-07-20_at_5.35.47_PM.png)
+
+**Streaming**: At its core, by optimizing for fast upserts & change streams, Hudi provides the primitives to data lake workloads that are comparable to what [Apache Kafka](https://kafka.apache.org/) does for event-streaming (namely, incremental produce/consume of events and a state-store for interactive querying).
+
+**Data Lake**: Nonetheless, Hudi provides an optimized, self-managing data plane for large scale data processing on the lake (adhoc queries, ML pipelines, batch pipelines), powering arguably the [largest transactional lake](https://eng.uber.com/apache-hudi-graduation/) in the world. While Hudi can be used to build a [lakehouse](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html), given its transactional capabilities, Hudi goes beyond and unlocks an end-to-end streaming  [...]
+
+**Platform**: Oftentimes in open source, there is great tech, but there is just too many of them - all differing ever so slightly in their opinionated ways, ultimately making the integration task onerous on the end user. Lake users deserve the same great usability that cloud warehouses provide, with the additional freedom and transparency of a true open source community. Hudi’s data and table services, tightly integrated with the Hudi “kernel”, gives us the ability to deliver cross layer [...]
+
+## Hudi Stack
+
+The following stack captures layers of software components that make up Hudi, with each layer depending on and drawing strength from the layer below. Typically, data lake users write data out once using an open file format like Apache [Parquet](http://parquet.apache.org/)/[ORC](https://orc.apache.org/) stored on top of extremely scalable cloud storage or distributed file systems. Hudi provides a self-managing data plane to ingest, transform and manage this data, in a way that unlocks inc [...]
+
+![Figure showing the Hudi stack](/assets/images/blog/datalake-platform/hudi-data-lake-platform_-_Copy_of_Page_1_3.png)
+
+Furthermore, Hudi either already provides or plans to add components that make this data universally accessible to all the different query engines out there. The features annotated with `*` represent work in progress and dotted boxes represent planned future work, to complete our vision for the project. 
+While we have strawman designs outlined for the newer components in the blog, we welcome with open arms fresh perspectives from the community.
+Rest of the blog will delve into each layer in our stack - explaining what it does, how it's designed for incremental processing and how it will evolve in the future.
+
+## Lake Storage
+
+Hudi interacts with lake storage using the [Hadoop FileSystem API](https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/fs/FileSystem.html), which makes it compatible with all of its implementations ranging from HDFS to Cloud Stores to even in-memory filesystems like [Alluxio](https://www.alluxio.io/blog/building-high-performance-data-lake-using-apache-hudi-and-alluxio-at-t3go/)/Ignite. Hudi internally implements its own [wrapper filesystem](https://github.com/apache/hudi/blob/9d2 [...]
+
+## File Format
+
+Hudi is designed around the notion of base file and delta log files that store updates/deltas to a given base file (called a file slice). Their formats are pluggable, with Parquet (columnar access) and HFile (indexed access) being the supported base file formats today. The delta logs encode data in [Avro](http://avro.apache.org/) (row oriented) format for speedier logging (just like Kafka topics for e.g). Going forward, we plan to [inline any base file format](https://github.com/apache/h [...]
+
+Zooming one level up, Hudi's unique file layout scheme encodes all changes to a given base file, as a sequence of blocks (data blocks, delete blocks, rollback blocks) that are merged in order to derive newer base files. In essence, this makes up a self contained redo log that the lets us implement interesting features on top. For e.g, most of today's data privacy enforcement happens by masking data read off the lake storage on-the-fly, invoking hashing/encryption algorithms over and over [...]
+
+![Hudi base and delta logs](/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_2_1.png)
+
+## Table Format
+
+The term “table format” is new and still means many things to many people. Drawing an analogy to file formats, a table format simply consists of : the file layout of the table, table’s schema and metadata tracking changes to the table. Hudi is not a table format, it implements one internally. Hudi uses Avro schemas to store, manage and evolve a table’s schema. Currently, Hudi enforces schema-on-write, which although stricter than schema-on-read, is adopted [widely](https://docs.confluent [...]
+
+Hudi consciously lays out files within a table/partition into groups and maintains a mapping between an incoming record’s key to an existing file group. All updates are recorded into delta log files specific to a given file group and this design ensures low merge overhead compared to approaches like Hive ACID, which have to merge all delta records against all base files to satisfy queries. For e.g, with uuid keys (used very widely) all base files are very likely to overlap with all delta [...]
+
+![Shows the Hudi table format components](/assets/images/blog/datalake-platform/hudi-design-diagrams-table-format.png)
+
+The *timeline* is the source-of-truth event log for all Hudi’s table metadata, stored under the `.hoodie` folder, that provides an ordered log of all actions performed on the table. Events are retained on the timeline up to a configured interval of time/activity. Each file group is also designed as its own self-contained log, which means that even if an action that affected a file group is archived from the timeline, the right state of the records in each file group can be reconstructed [...]
+
+Lastly, new events on the timeline are then consumed and reflected onto an internal metadata table, implemented as another merge-on-read table offering low write amplification. Hudi is able to absorb quick/rapid changes to table’s metadata, unlike table formats designed for slow-moving data. Additionally, the metadata table uses the [HFile](https://hbase.apache.org/2.0/devapidocs/org/apache/hadoop/hbase/io/hfile/HFile.html) base file format, which provides indexed lookups of keys avoidin [...]
+
+A key challenge faced by all the table formats out there today, is the need for expiring snapshots/controlling retention for time travel queries such that it does not interfere with query planning/performance. In the future, we plan to build an indexed timeline in Hudi, which can span the entire history of the table, supporting a time travel look back window of several months/years.
+
+## Indexes
+
+Indexes help databases plan better queries, that reduce the overall amount of I/O and deliver faster response times. Table metadata about file listings and column statistics are often enough for lake query engines to generate optimized, engine specific query plans quickly. This is however not sufficient for Hudi to realize fast upserts. Hudi already supports different key based indexing schemes to quickly map incoming record keys into the file group they reside in. For this purpose, Hudi [...]
+
+![/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_5.png](/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_5.png)
+
+In the future, we intend to add additional forms of indexing as new partitions on the metadata table. Let’s discuss the role  each one has to play briefly. Query engines typically rely on partitioning to cut down the number of files read for a given query. In database terms, a Hive partition is nothing but a coarse range index, that maps a set of columns to a list of files. Table formats born in the cloud like Iceberg/Delta Lake, have built-in tracking of column ranges per file in a sing [...]
+
+While Hudi already supports external indexes for random write workloads, we would like to support [point-lookup-ish queries](https://github.com/apache/hudi/pull/2487) right on top of lake storage, which helps avoid the overhead of an additional database for many classes of data applications. We also anticipate that uuid/key based joins will be sped up a lot, by leveraging record level indexing schemes, we build out for fast upsert performance. We also plan to move our tracking of bloom f [...]
+
+## Concurrency Control
+
+Concurrency control defines how different writers/readers coordinate access to the table. Hudi ensures atomic writes, by way of publishing commits atomically to the timeline, stamped with an instant time that denotes the time at which the action is deemed to have occurred. Unlike general purpose file version control, Hudi draws clear distinction between writer processes (that issue user’s upserts/deletes), table services (that write data/metadata to optimize/perform bookkeeping) and read [...]
+
+Projects that solely rely on OCC deal with competing operations, by either implementing a lock or relying on atomic renames. Such approaches are optimistic that real contention never happens and resort to failing one of the writer operations if conflicts occur, which can cause significant resource wastage or operational overhead. Imagine a scenario of two writer processes : an ingest writer job producing new data every 30 minutes and a deletion writer job that is enforcing GDPR taking 2  [...]
+
+![Figure showing competing transactions leading to starvation with just OCC](/assets/images/blog/datalake-platform/Hudi_design_diagram_-_Page_2_1.png)
+
+We are hard at work, improving our OCC based implementation around early detection of conflicts for concurrent writers and terminate early without burning up CPU resources. We are also working on [adding fully log based](https://cwiki.apache.org/confluence/display/HUDI/RFC+-+22+%3A+Snapshot+Isolation+using+Optimistic+Concurrency+Control+for+multi-writers#RFC22:SnapshotIsolationusingOptimisticConcurrencyControlformultiwriters-FutureWork(LockFree-ishConcurrencyControl)), non-blocking concu [...]
+
+## Writers
+
+Hudi tables can be used as sinks for Spark/Flink pipelines and the Hudi writing path provides several enhanced capabilities over file writing done by vanilla parquet/avro sinks. Hudi classifies write operations carefully into incremental (`insert`, `upsert`, `delete`) and batch/bulk operations (`insert_overwrite`, `insert_overwrite_table`, `delete_partition`, `bulk_insert`) and provides relevant functionality for each operation in a performant and cohesive way. Both upsert and delete ope [...]
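+
+As a small, hedged illustration of how an operation is picked on the Spark datasource path (the field names, table name and path below are made up), the write operation is just another option on the writer:
+
+```scala
+// Illustrative sketch: explicitly choosing the write operation; valid values
+// include "insert", "upsert", "delete" and "bulk_insert". df and all names
+// below are made up.
+df.write.format("org.apache.hudi").
+  option("hoodie.datasource.write.operation", "upsert").
+  option("hoodie.datasource.write.recordkey.field", "uuid").
+  option("hoodie.datasource.write.partitionpath.field", "partitionpath").
+  option("hoodie.datasource.write.precombine.field", "ts").
+  option("hoodie.table.name", "my_table").
+  mode("append").
+  save("/tmp/hudi_table")
+```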
+
+Keys are first class citizens inside Hudi and the pre-combining/index lookups done before upsert/deletes ensure a key is unique across partitions or within partitions, as desired. In contrast with other approaches where this is left to data engineer to co-ordinate using `MERGE INTO` statements, this approach ensures quality data especially for critical use-cases. Hudi also ships with several [built-in key generators](http://hudi.apache.org/blog/hudi-key-generators/) that can parse all co [...]
+
+Hudi writers add metadata to each record that codifies the commit time and a sequence number for each record within that commit (comparable to a Kafka offset), which makes it possible to derive record level change streams. Hudi also provides users the ability to specify event time fields in incoming data streams and track them in the timeline. Mapping these to stream processing concepts, Hudi contains both [arrival and event time](https://www.oreilly.com/radar/the-world-beyond-batch-streami [...]
+
+## Readers
+
+Hudi provides snapshot isolation between writers and readers and allows for any table snapshot to be queried consistently from all major lake query engines (Spark, Hive, Flink, Presto, Trino, Impala) and even cloud warehouses like Redshift. In fact, we would love to bring Hudi tables as external tables with BigQuery/Snowflake as well, once they also embrace the lake table formats more natively. Our design philosophy around query performance has been to make Hudi as lightweight as possibl [...]
+
+![Log merging done for incremental queries](/assets/images/blog/datalake-platform/hudi-design-diagram_-incr-read.png)
+
+True to its design goals, Hudi provides some very powerful incremental querying capabilities that tie together the meta fields added during writing and the file group based storage layout. While table formats that merely track files are only able to provide information about files that changed during each snapshot or commit, Hudi generates the exact set of records that changed given a point in the timeline, due to tracking of record level event and arrival times. Furthermore, this de [...]
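+
+As a small sketch of what an incremental query looks like in practice through the Spark datasource (the table path and begin instant below are illustrative), only records changed after the given instant are returned:
+
+```scala
+// Illustrative sketch: incremental query pulling records that changed after a
+// given instant on the timeline. The path and instant time here are made up.
+val incrementalDF = spark.read.format("org.apache.hudi").
+  option("hoodie.datasource.query.type", "incremental").
+  option("hoodie.datasource.read.begin.instanttime", "20210701000000").
+  load("/tmp/hudi_table")
+
+incrementalDF.createOrReplaceTempView("hudi_table_incremental")
+spark.sql("select count(*) from hudi_table_incremental").show()
+```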
+
+## Table Services
+
+What defines and sustains a project’s value over years are its fundamental design principles and the subtle trade offs. Databases often consist of several internal components, working in tandem to deliver efficiency, performance and great operability to its users. True to intent to act as state store for incremental data pipelines, we designed Hudi with built-in table services and self-managing runtime that can orchestrate/trigger these services to optimize everything internally. In fact [...]
+
+![/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_4.png](/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_4.png)
+
+There are several built-in table services, all with the goal of ensuring performant table storage layout and metadata management, which are automatically invoked either synchronously after each write operation, or asynchronously as a separate background job. Furthermore, Spark (and Flink) streaming writers can run in continuous mode, and invoke table services asynchronously sharing the underlying executors intelligently with writers. Archival service ensures that the timeline holds suffi [...]
+
+We are always looking for ways to improve and enhance our table services in meaningful ways. In the coming releases, we are working towards a much more [scalable model](https://github.com/apache/hudi/pull/3233) of cleaning up partial writes, by consolidating marker file creation using our timeline metaserver, which avoids expensive full table scans to seek out and remove uncommitted files. We also have [various proposals](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=1 [...]
+
+## Data Services
+
+As noted at the start, we wanted to make Hudi immediately usable for common end-end use-cases and thus invested deeply into a set of data services, that provide functionality that is data/workload specific, sitting on top of the table services, writers/readers directly. Foremost in that list, is the Hudi DeltaStreamer utility, which has been an extremely popular choice for painlessly building a data lake out of  Kafka streams and files landing in different formats on top of lake storage. [...]
+
+![/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_8.png](/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_8.png)
+
+Going forward, we would love contributions to enhance our [multi delta streamer utility](http://hudi.apache.org/blog/ingest-multiple-tables-using-hudi/), which can ingest entire Kafka clusters in a single large Spark application, to be on par and hardened. To further our progress towards end-end complex incremental pipelines, we plan to work towards enhancing the delta streamer utility and its SQL transformers to be triggered by multiple source streams (as opposed to just the one today)  [...]
+
+## Timeline Metaserver
+
+Storing and serving table metadata right on the lake storage is scalable, but can be much less performant compared to RPCs against a scalable meta server. Most cloud warehouses internally are built on a metadata layer that leverages an external database (e.g [Snowflake uses foundationDB](https://www.snowflake.com/blog/how-foundationdb-powers-snowflake-metadata-forward/)). Hudi also provides a metadata server, called the “Timeline server”, which offers an alternative backing store for Hud [...]
+
+![/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_6.png](/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_6.png)
+
+## Lake Cache
+
+There is a fundamental tradeoff today in data lakes between faster writing and great query performance. Faster writing typically involves writing smaller files (and later clustering them) or logging deltas (and later merging on read). While this provides good performance already, the pursuit of great query performance often warrants opening fewer number of files/objects on lake storage and may be pre-materializing the merges between base and delta logs. After all, most databases employ a [...]
+
+![/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_7.png](/assets/images/blog/datalake-platform/hudi-design-diagrams_-_Page_7.png)
+
+## Onwards
+
+We hope that this blog painted a complete picture of Apache Hudi, staying true to its founding principles. Interested users and readers can expect blogs delving into each layer of the stack and an overhaul of our docs along these lines in the coming weeks/months. We view the current efforts around table formats as merely removing decade-old bottlenecks in data lake storage/query planes, problems which have been already solved very well in cloud warehouses like Big Query/Snowflake. We wou [...]
\ No newline at end of file
diff --git a/website/contribute/developer-setup.md b/website/contribute/developer-setup.md
new file mode 100644
index 0000000..a823eab
--- /dev/null
+++ b/website/contribute/developer-setup.md
@@ -0,0 +1,292 @@
+---
+title: Developer Setup
+sidebar_position: 4
+keywords: [ hudi, ide, developer, setup]
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+## Pre-requisites
+
+To contribute code, you need
+
+ - a GitHub account
+ - a Linux (or) macOS development environment with Java JDK 8, Apache Maven (3.x+) installed
+ - [Docker](https://www.docker.com/) installed for running demo, integ tests or building website
+ - for large contributions, a signed [Individual Contributor License
+   Agreement](https://www.apache.org/licenses/icla.pdf) (ICLA) to the Apache
+   Software Foundation (ASF).
+ - (Recommended) Create an account on [JIRA](https://issues.apache.org/jira/projects/HUDI/summary) to open issues/find similar issues.
+ - (Recommended) Join our dev mailing list & slack channel, listed on [community](/contribute/get-involved) page.
+
+
+## IDE Setup
+
+To contribute, you will need to do the following:
+ 
+- Fork the Hudi code on Github & then clone your own fork locally. Once cloned, we recommend building as per instructions on [spark quickstart](/docs/quick-start-guide) or [flink quickstart](/docs/flink-quick-start-guide)
+
+- \[Recommended\] We have embraced the code style largely based on [google format](https://google.github.io/styleguide/javaguide). Please setup your IDE with style files from [\<project root\>/style/](https://github.com/apache/hudi/tree/master/style). These instructions have been tested on IntelliJ.
+
+- \[Recommended\] Set up the [Save Action Plugin](https://plugins.jetbrains.com/plugin/7642-save-actions) to auto format & organize imports on save. The Maven Compilation life-cycle will fail if there are checkstyle violations.
+
+- \[Recommended\] As it is required to add [Apache License header](https://www.apache.org/legal/src-headers#headers) to all source files, configuring "Copyright" settings as shown below will come in handy.
+
+![IDE setup copyright 1](/assets/images/contributing/IDE_setup_copyright_1.png)
+![IDE setup copyright 2](/assets/images/contributing/IDE_setup_copyright_2.png)
+
+- \[Optional\] If needed, add spark jars to the classpath of your module in Intellij by following the steps from [here](https://stackoverflow.com/questions/1051640/correct-way-to-add-external-jars-lib-jar-to-an-intellij-idea-project). 
+
+- \[Optional\] You may configure IntelliJ to respect maven CLI and pom.xml settings.
+
+![IDE setup maven 1](/assets/images/contributing/IDE_setup_maven_1.png)
+![IDE setup maven 2](/assets/images/contributing/IDE_setup_maven_2.png)
+
+## Accounts and Permissions
+
+ - [Hudi issue tracker (JIRA)](https://issues.apache.org/jira/projects/HUDI/issues):
+   Anyone can access it and browse issues. Anyone can register an account and login
+   to create issues or add comments. Only contributors can be assigned issues. If
+   you want to be assigned issues, a PMC member can add you to the project contributor
+   group.  Email the dev mailing list to ask to be added as a contributor, and include your ASF Jira username.
+
+ - [Hudi Wiki Space](https://cwiki.apache.org/confluence/display/HUDI):
+   Anyone has read access. If you wish to contribute changes, please create an account and
+   request edit access on the dev@ mailing list (include your Wiki account user ID).
+
+ - Pull requests can only be merged by a HUDI committer, listed [here](https://incubator.apache.org/projects/hudi)
+
+ - [Voting on a release](https://www.apache.org/foundation/voting): Everyone can vote.
+   Only Hudi PMC members should mark their votes as binding.
+
+## Life of a Contributor
+
+This document details processes and procedures we follow to make contributions to the project and take it forward. 
+If you are looking to ramp up into the project as a contributor, we highly encourage you to read this guide in full, familiarize yourself with the workflow 
+and more importantly also try to improve the process along the way as well. 
+
+### Filing JIRAs
+
+ - Hudi uses JIRA to manage issues. First, familiarize yourself with the various [components](https://issues.apache.org/jira/projects/HUDI/components) against which issues are filed in Hudi.
+ - Make an attempt to find an existing JIRA, that may solve the same issue you are reporting. When in doubt, you can always email the mailing list so that the community can provide early feedback, 
+   point out any similar JIRAs or RFCs. 
+ - Try to gauge whether this JIRA needs an [RFC](https://cwiki.apache.org/confluence/display/HUDI/RFC+Process). As always, email the mailing list if unsure. If you need an RFC since the change is
+   large in scope, then please follow the wiki instructions to get the process rolling along.
+ - While raising a new JIRA or updating an existing one, please make sure to do the following
+      - The issue `type` and `components` (when resolving the ticket) are set correctly
+      - If you intend to target the JIRA for a specific release, please fill in the `fix version(s)` field, with the [release number](https://issues.apache.org/jira/projects/HUDI/releases).
+      - Summary should be descriptive enough to catch the essence of the problem/ feature
+      - Where necessary, capture the version of Hudi/Spark/Hive/Hadoop/Cloud environments in the ticket
+      - Whenever possible, provide steps to reproduce via sample code or on the [docker setup](https://hudi.apache.org/docker_demo)
+ - All newly filed JIRAs are placed in the `NEW` state. If you are sure this JIRA represents a valid, scoped piece of work, please click `Accept Issue` to move it to the `OPEN` state
+ - If you are not sure, please wait for a PMC/Committer to confirm/triage the issue and accept it. This process avoids contributors spending time on JIRAs with unclear scope.
+ - Whenever possible, break down large JIRAs (e.g JIRAs resulting from an [RFC](https://cwiki.apache.org/confluence/display/HUDI/RFC+Process)) into `sub tasks` by clicking `More > create sub-task` from the parent JIRA,
+   so that the community at large can contribute and help implement it much more quickly. We recommend prefixing such JIRA titles with `[UMBRELLA]`
+
+### Claiming JIRAs
+
+ - Finding a JIRA to work on 
+      - If you are new to the project, you can ramp up by picking up any issues tagged with the [newbie](https://issues.apache.org/jira/issues/?jql=project+%3D+HUDI+AND+component+%3D+newbie) component.
+      - If you want to work on some higher priority issue, then scout for Open issues against the next release on the JIRA, engage on unassigned/inactive JIRAs and offer help.
+      - Issues tagged with `Usability` , `Code Cleanup`, `Testing` components often present excellent opportunities to make a great impact.
+ - If you don't have perms to self-assign JIRAs, please email the dev mailing list with your JIRA id and a small intro for yourself. We'd be happy to add you as a contributor.
+ - As courtesy, if you are unable to continue working on a JIRA, please move it back to "OPEN" state and un-assign yourself.
+      - If a JIRA or its corresponding pull request has been inactive for a week, awaiting feedback from you, PMC/Committers could choose to re-assign them to another contributor.
+      - Such re-assignment process would be communicated over JIRA/GitHub comments, checking with the original contributor on his/her intent to continue working on the issue.
+      - You can also contribute by helping others contribute. So, if you don't have cycles to work on a JIRA and another contributor offers help, take it!
+
+### Contributing Code
+
+ - Once you finalize on a project/task, please open a new JIRA or assign an existing one to yourself. 
+      - Almost all PRs should be linked to a JIRA. It's always good to have a JIRA upfront to avoid duplicating efforts.
+      - If the changes are minor, then `[MINOR]` prefix can be added to Pull Request title without a JIRA. Below are some tips to judge **MINOR** Pull Request :
+        - trivial fixes (for example, a typo, a broken link, intellisense or an obvious error)
+        - the change does not alter functionality or performance in any way
+        - fewer than 100 changed lines
+        - it is obvious that the PR would pass without waiting for CI/CD verification
+      - That said, you may be asked to file a JIRA, if the reviewer deems it necessary
+ - Before you begin work,
+      - Claim the JIRA using the process above and assign the JIRA to yourself.
+      - Click "Start Progress" on the JIRA, which tells everyone that you are working on the issue actively.
+ - [Optional] Familiarize yourself with internals of Hudi using content on this page, as well as [wiki](https://cwiki.apache.org/confluence/display/HUDI)
+ - Make your code change
+   - Get existing tests to pass using `mvn clean install -DskipITs`
+   - Add adequate tests for your new functionality
+   - For involved changes, it's best to test the changes in real production environments and report the results in the PR. 
+   - For website changes, please build the site locally & test navigation, formatting & links thoroughly
+   - If your code change changes some aspect of documentation (e.g new config, default value change), 
+     please ensure there is another PR to [update the docs](https://github.com/apache/hudi/tree/asf-site/README.md) as well.
+ - Sending a Pull Request
+   - Format commit and the pull request title like `[HUDI-XXX] Fixes bug in Spark Datasource`, 
+     where you replace `HUDI-XXX` with the appropriate JIRA issue. 
+   - Please ensure your commit message body is descriptive of the change. Bulleted summary would be appreciated.
+   - Push your commit to your own fork/branch & create a pull request (PR) against the Hudi repo.
+   - If you don't hear back within 3 days on the PR, please send an email to the dev mailing list.
+   - Address code review comments & keep pushing changes to your fork/branch, which automatically updates the PR
+   - Before your change can be merged, it should be squashed into a single commit for cleaner commit history.
+ - Finally, once your pull request is merged, make sure to `Close` the JIRA.
+
+### Coding guidelines 
+
+Our code can benefit from contributors speaking the same "language" when authoring code. After all, it gets read a lot more than it gets
+written. So optimizing for "reads" is a good goal. The list below is a set of guidelines that contributors strive to uphold, and it reflects
+how we want to evolve our code in the future.
+
+#### Style 
+
+ - **Formatting** We should rely on checkstyle and spotless to auto fix formatting; automate this completely. Where we cannot,
+    we will err on the side of not taxing contributors with manual effort.
+ - **Refactoring**
+   - Refactor with purpose; any refactor suggested should be attributable to functionality that now becomes easy to implement.
+   - A class is asking to be refactored when it has several overloaded responsibilities, or has sets of fields/methods which are used more cohesively than others.
+   - Try to name tests using the given-when-then model, which cleanly separates preconditions (given), an action (when), and assertions (then).
+ - **Naming things**
+   - Let's name uniformly; using the same word to denote the same concept. e.g: bootstrap vs external vs source, when referring to bootstrapped tables. 
+     Maybe they all mean the same, but having one word makes the code a lot more readable.
+   - Let's name consistently with Hudi terminology. e.g dataset vs table, base file vs data file.
+   - Class names preferably are nouns (e.g Runner) which reflect their responsibility and methods are verbs (e.g run()).
+   - Avoid filler words, that don't add value e.g xxxInfo, xxxData, etc.
+   - We name classes in code starting with `Hoodie` and not `Hudi` and we want to keep it that way for consistency/historical reasons. 
+ - **Methods**
+   - Individual methods should be short (~20-30 lines) and have a single purpose; If you feel like it has a secondary purpose, then maybe it needs
+     to be broken down more.
+   - The fewer the arguments, the better.
+   - Place caller methods on top of callee methods, whenever possible.
+   - Avoid "output" arguments e.g passing in a list and filling its values within the method.
+   - Try to limit individual if/else blocks to few lines to aid readability.
+   - Separate logical blocks of code with a newline in between e.g read a file into memory, loop over the lines.
+ - **Classes**
+   - Like methods, each class should have a single purpose/responsibility.
+   - Try to keep class files to about 200 lines of length, nothing beyond 500.
+   - Avoid stating the obvious in comments; e.g each line does not deserve a comment; Document corner-cases/special perf considerations etc clearly.
+   - Try creating factory methods/builders and interfaces wherever you feel a specific implementation may be changed down the line.
+
+#### Substance
+
+- Try to avoid large PRs; if unavoidable (many times they are), please separate refactoring from the actual implementation of functionality.
+  e.g renaming/breaking up a file mixed in with code changes makes the diff very hard to review.
+- **Licensing**
+    - Every source file needs to include the Apache license header. Every new dependency needs to have 
+      an open source license [compatible](https://www.apache.org/legal/resolved#criteria) with Apache.
+    - If you are re-using code from another apache/open-source project, licensing needs to be compatible and attribution added to `LICENSE` file
+    - Please DO NOT copy paste any code from StackOverflow or other online sources, since their license attribution would be unclear. Author them yourself!
+- **Code Organization** 
+    - Anything in `hudi-common` cannot depend on a specific engine runtime like Spark. 
+    - Any changes to bundles under `packaging`, will be reviewed with additional scrutiny to avoid breakages across versions.
+- **Code reuse**
+  - Whenever you can, please use/enhance existing utility classes in the code (`CollectionUtils`, `ParquetUtils`, `HoodieAvroUtils`). Search for classes ending in `Utils`.
+  - As a complex project, that must integrate with multiple systems, we tend to avoid dependencies like `guava`, `apache commons` for the sake of easy integration. 
+     Please start a discussion on the mailing list, before attempting to reintroduce them
+  - As a data system, that takes performance seriously, we also write pieces of infrastructure (e.g `ExternalSpillableMap`) natively, that are optimized specifically for our scenarios.
+     Please start with them first, when solving problems.
+ - **Breaking changes**
+   - Any version changes for dependencies ideally need to be vetted across different user environments in the community, to get enough confidence before merging.
+   - Any changes to methods annotated with `PublicAPIMethod` or classes annotated with `PublicAPIClass` require upfront discussion and potentially an RFC.
+   - Any non-backwards compatible changes similarly need upfront discussion and the functionality needs to implement an upgrade-downgrade path.
+
+#### Tests
+
+- **Categories**
+    - unit - testing basic functionality at the class level, potentially using mocks. Expected to finish quicker
+    - functional - brings up the services needed and runs test without mocking
+    - integration - runs a subset of functional tests on a full-fledged environment with dockerized services
+- **Prepare Test Data**
+    - Many unit and functional test cases require a Hudi dataset to be prepared beforehand. `HoodieTestTable` and `HoodieWriteableTestTable` are dedicated test utility classes for this purpose. Use them whenever appropriate, and add new APIs to them when needed.
+    - When adding new APIs to the test utility classes, overload them with a variety of arguments to do more of the heavy lifting for callers.
+    - In most scenarios, you won't need to use `FileCreateUtils` directly.
+    - If test cases require interaction with actual `HoodieRecord`s, use `HoodieWriteableTestTable` (and `HoodieTestDataGenerator` probably). Otherwise, `HoodieTestTable` that manipulates empty files shall serve the purpose.
+- **Strive for Readability**
+    - Avoid writing flow controls for different assertion cases. Split to a new test case when appropriate.
+    - Use plain for-loops to avoid try-catch in lambda blocks. Declaring exceptions is okay.
+    - Use static import for constants and static helper methods to avoid lengthy code.
+    - Avoid reusing local variable names. Create new variables generously.
+    - Keep helper methods local to the test class until they become obviously generic and re-usable. When that happens, move the helper method to the right utility class. For example, `Assertions` contains common assert helpers, and `SchemaTestUtil` is for schema related helpers.
+    - Avoid putting new helpers in `HoodieTestUtils` and `HoodieClientTestUtils`, which are named too generic. Eventually, all test helpers shall be categorized properly.  
+
+### Reviewing Code/RFCs
+
+ - All pull requests would be subject to code reviews, from one or more of the PMC/Committers. 
+ - Typically, each PR will get an "Assignee" based on their area of expertise, who will work with you to land the PR.
+ - Code reviews are vital, but also often time-consuming for everyone involved. Below are some principles which could help align us better.
+   - Reviewers need to provide actionable, concrete feedback that states what needs to be done to get the PR closer to landing.
+   - Reviewers need to make it explicit, which of the requested changes would block the PR vs good-to-dos.
+   - Both contributors/reviewers need to keep an open mind and ground themselves to making the most technically sound argument.
+   - If progress is hard, please involve another PMC member/Committer to share another perspective.
+   - Staying humble and eager to learn, goes a long way in ensuring these reviews are smooth.
+ - Reviewers are expected to uphold the code quality, standards outlined above.
+ - When merging PRs, always make sure you are squashing the commits using the "Squash and Merge" feature in Github
+ - When necessary/appropriate, reviewers could make changes themselves to PR branches, with the intent to get the PR landed sooner. (see [how-to](https://cwiki.apache.org/confluence/display/HUDI/Resources#Resources-PushingChangesToPRs))
+   Reviewers should seek explicit approval from author, before making large changes to the original PR.
+
+### Suggest Changes
+
+We welcome new ideas and suggestions to improve the project, along any dimensions - management, processes, technical vision/direction. To kick-start a discussion
+and gather feedback, start a new email thread on the dev mailing list with the `[DISCUSS]` prefix and share your thoughts. If your proposal leads to a larger change, then it may be followed up
+by a [vote](https://www.apache.org/foundation/voting) by a PMC member or others (depending on the specific scenario). 
+For technical suggestions, you can also leverage [our RFC Process](https://cwiki.apache.org/confluence/display/HUDI/RFC+Process) to outline your ideas in greater detail.
+
+
+## Releases
+
+ - Apache Hudi community plans to do minor version releases every 6 weeks or so.
+ - If your contribution merged onto the `master` branch after the last release, it will become part of the next release.
+ - Website changes are regenerated on an on-demand basis (until automation is in place to reflect them immediately)
+
+## Communication
+
+All communication is expected to align with the [Code of Conduct](https://www.apache.org/foundation/policies/conduct).
+Discussion about contributing code to Hudi happens on the [dev@ mailing list](/contribute/get-involved). Introduce yourself!
+
+## Code & Project Structure
+
+  * `docker` : Docker containers used by demo and integration tests. Brings up a mini data ecosystem locally
+  * `hudi-cli` : CLI to inspect, manage and administer datasets
+  * `hudi-client` : Spark client library to take a bunch of inserts + updates and apply them to a Hoodie table
+  * `hudi-common` : Common classes used across modules
+  * `hudi-hadoop-mr` : InputFormat implementations for ReadOptimized, Incremental, Realtime views
+  * `hudi-hive` : Manage hive tables off Hudi datasets and houses the HiveSyncTool
+  * `hudi-integ-test` : Longer running integration test processes
+  * `hudi-spark` : Spark datasource for writing and reading Hudi datasets. Streaming sink.
+  * `hudi-utilities` : Houses tools like DeltaStreamer, SnapshotCopier
+  * `packaging` : Poms for building out bundles for easier drop in to Spark, Hive, Presto, Utilities
+  * `style`  : Code formatting, checkstyle files
+
+## Code WalkThrough
+
+This quick [video](https://www.youtube.com/watch?v=N2eDfU_rQ_U) gives a code walkthrough to get you started.
+
+## Docker Setup
+
+We encourage you to test your code on the docker cluster; please follow the [docker setup](https://hudi.apache.org/docs/docker_demo) instructions.
+
+## Remote Debugging 
+
+If your code fails on the docker cluster, you can debug it remotely by following the steps below.
+
+Step 1: Run your Delta Streamer job with the `--conf` shown below. This makes the driver JVM wait until you attach IntelliJ for remote debugging on port 4044.
+
+```bash
+spark-submit \
+  --conf spark.driver.extraJavaOptions="-Dconfig.resource=myapp.conf  -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=4044" \
+  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
+  --table-type COPY_ON_WRITE \
+  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
+  --source-ordering-field ts  \
+  --base-file-format parquet \
+  --target-base-path /user/hive/warehouse/stock_ticks_cow \
+  --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties \
+  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider
+```
+
+Step 2: Attach IntelliJ (tested on IntelliJ 2019+; these steps may vary slightly across IntelliJ versions).
+
+- In IntelliJ, go to Edit Configurations -> Remote -> Add Remote -> enter the configuration below -> Apply & Save -> set a breakpoint -> Start. <br/>
+- Name : Hudi Remote <br/>
+- Port : 4044 <br/>
+- Command Line Args for Remote JVM : -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=4044 <br/>
+- Use Module ClassPath : select hudi <br/>
+ 
+## Website
+
+[Apache Hudi site](https://hudi.apache.org) is hosted on a special `asf-site` branch. Please follow the `README` file under `docs` on that branch for
+instructions on making changes to the website.
diff --git a/website/contribute/get-involved.md b/website/contribute/get-involved.md
new file mode 100644
index 0000000..c455539
--- /dev/null
+++ b/website/contribute/get-involved.md
@@ -0,0 +1,24 @@
+---
+sidebar_position: 1
+title: "Get Involved"
+toc: true
+last_modified_at: 2020-09-01T15:59:57-04:00
+---
+
+## Engage with us
+
+There are several ways to get in touch with the Hudi community.
+
+| When? | Channel to use |
+|-------|--------|
+| For development discussions | Dev Mailing list ([Subscribe](mailto:dev-subscribe@hudi.apache.org), [Unsubscribe](mailto:dev-unsubscribe@hudi.apache.org), [Archives](https://lists.apache.org/list?dev@hudi.apache.org)). Empty email works for subscribe/unsubscribe. Please use [gists](https://gist.github.com) to share code/stacktraces on the email. |
+| For any general questions, user support | Users Mailing list ([Subscribe](mailto:users-subscribe@hudi.apache.org), [Unsubscribe](mailto:users-unsubscribe@hudi.apache.org), [Archives](https://lists.apache.org/list?users@hudi.apache.org)). Empty email works for subscribe/unsubscribe. Please use [gists](https://gist.github.com) to share code/stacktraces on the email. |
+| For reporting bugs or issues or discover known issues | Please use [ASF Hudi JIRA](https://issues.apache.org/jira/projects/HUDI/summary). See [#here](#accounts) for access |
+| For quick pings & 1-1 chats | Join our [slack group](https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE). In case this does not work, please leave a comment on this [github issue](https://github.com/apache/hudi/issues/143) |
+| For proposing large features, changes | Start an RFC. Instructions [here](https://cwiki.apache.org/confluence/display/HUDI/RFC+Process). See [#here](#accounts) for wiki access |
+| Join weekly sync-up meeting | Follow instructions [here](https://cwiki.apache.org/confluence/display/HUDI/Apache+Hudi+Community+Weekly+Sync). |
+| For stream of commits, pull requests etc | Commits Mailing list ([Subscribe](mailto:commits-subscribe@hudi.apache.org), [Unsubscribe](mailto:commits-unsubscribe@hudi.apache.org), [Archives](https://lists.apache.org/list?commits@hudi.apache.org)) |
+
+If you wish to report a security vulnerability, please contact [security@apache.org](mailto:security@apache.org).
+Apache Hudi follows the typical Apache vulnerability handling [process](https://apache.org/security/committers#vulnerability-handling).
diff --git a/website/contribute/how-to-contribute.md b/website/contribute/how-to-contribute.md
new file mode 100644
index 0000000..1d2ff4d
--- /dev/null
+++ b/website/contribute/how-to-contribute.md
@@ -0,0 +1,44 @@
+---
+sidebar_position: 3
+title: "How to Contribute"
+toc: true
+last_modified_at: 2020-09-01T15:59:57-04:00
+---
+
+The Apache Hudi community welcomes contributions from anyone!
+
+Here are a few ways you can get involved.
+
+ - Ask (and/or) answer questions on our support channels (see the [Get Involved](/contribute/get-involved) page).
+ - Review code or RFCs
+ - Help improve documentation
+ - Author blogs on our wiki
+ - Testing and improving the out-of-box experience by reporting bugs
+ - Share new ideas/directions to pursue or propose a new RFC
+ - Contributing code to the project ([newbie JIRAs](https://issues.apache.org/jira/issues/?jql=project+%3D+HUDI+AND+component+%3D+newbie))
+
+## Become a Committer
+
+We are always looking for strong contributors, who can become [committers](https://www.apache.org/dev/committers) on the project. 
+Committers are chosen by a majority vote of the Apache Hudi [PMC](https://www.apache.org/foundation/how-it-works#pmc-members), after a discussion on their candidacy based on the following criteria (not exclusive/comprehensive).
+
+ - Embodies the ASF model code of [conduct](https://www.apache.org/foundation/policies/conduct)
+ - Has made significant technical contributions such as submitting PRs, filing bugs, testing, benchmarking, authoring RFCs, providing feedback/code reviews (+ more).
+ - Has helped the community over a few months, by answering questions on support channels above and triaging issues/jiras.
+ - Demonstrates clear code/design ownership of a component or code area (eg: Delta Streamer, Hive/Presto Integration etc).
+ - Brought thought leadership and new ideas into the project and evangelized them with the community via conference talks, blog posts.
+ - Great citizenship in helping with all peripheral (but very critical) work like site maintenance, wiki/jira cleanups and so on.
+ - Proven commitment to the project by way of upholding all agreed upon processes, conventions and principles of the community.
+
+## Code Contributions
+
+Useful resources for contributing can be found under the "Quick Links" left menu.
+Specifically, please refer to the detailed [contribution guide](/contribute/developer-setup).
+
+## Accounts
+
+It's useful to obtain a few accounts to be able to contribute to Hudi effectively.
+ 
+ - A GitHub account is needed to send pull requests to Hudi
+ - Sign-up/in to the Apache [JIRA](https://issues.apache.org/jira). Then please email the dev mailing list with your username, asking to be added as a contributor to the project. This enables you to assign/be-assigned tickets and comment on them. 
+ - Sign-up/in to the Apache [cWiki](https://cwiki.apache.org/confluence/signup.action), to be able to contribute to the wiki pages/RFCs. 
diff --git a/website/contribute/report-security-issues.md b/website/contribute/report-security-issues.md
new file mode 100644
index 0000000..ccefc16
--- /dev/null
+++ b/website/contribute/report-security-issues.md
@@ -0,0 +1,28 @@
+---
+title: Report Security Issues
+sidebar_position: 5
+keywords: [ hudi, security]
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+## Reporting Security Issues
+
+The Apache Software Foundation takes a rigorous approach to eliminating security issues in its software projects. Apache Hudi is equally attentive and responsive to issues pertaining to its features and functionality.
+
+## Reporting Vulnerability
+
+If you have apprehensions regarding Hudi's security or you discover vulnerability or potential threat, don’t hesitate to get in touch with the [Apache Security Team](http://www.apache.org/security/) by dropping a mail at [security@apache.org](mailto:security@apache.org). In the mail, specify the description of the issue or potential threat. You are also urged to recommend the way to reproduce and replicate the issue. The Hudi community will get back to you after assessing and analysing t [...]
+
+**PLEASE NOTE**: report the security issue via the security email address above before disclosing it publicly.
+
+## Vulnerability Handling
+
+An overview of the vulnerability handling process is:
+
+* The reporter reports the vulnerability privately to Apache.
+* The appropriate project's security team works privately with the reporter to resolve the vulnerability.
+* A new release of the Apache product concerned is made that includes the fix.
+* The vulnerability is publicly announced.
+
+A more detailed description of the process can be found [here](https://www.apache.org/security/committers).
\ No newline at end of file
diff --git a/website/contribute/team.md b/website/contribute/team.md
new file mode 100644
index 0000000..f630544
--- /dev/null
+++ b/website/contribute/team.md
@@ -0,0 +1,33 @@
+---
+sidebar_position: 2
+title: "Team"
+toc: true
+last_modified_at: 2020-09-01T15:59:57-04:00
+---
+
+### Active Team
+
+| Image                                                        | Name                                                         | Role            | Apache ID    |
+| ------------------------------------------------------------ | ------------------------------------------------------------ | --------------- | ------------ |
+| <img src={"https://avatars.githubusercontent.com/alunarbeach"} className="profile-pic" alt="alunarbeach" align="middle" /> | [Anbu Cheeralan](https://github.com/alunarbeach)             | PMC, Committer | anchee       |
+| <img src={"https://avatars.githubusercontent.com/bhasudha"} className="profile-pic" alt="bhasudha" align="middle" /> | [Bhavani Sudha](https://github.com/bhasudha)                 | PMC, Committer | bhavanisudha |
+| <img src={"https://avatars.githubusercontent.com/bvaradar"} className="profile-pic" alt="bvaradar" align="middle" /> | [Balaji Varadarajan](https://github.com/bvaradar)            | PMC, Committer | vbalaji      |
+| <img src={"https://avatars.githubusercontent.com/danny0405"} className="profile-pic" alt="danny0405" align="middle" /> | [Danny Chan](https://github.com/danny0405)                      | Committer       | danny0405        |
+| <img src={"https://avatars.githubusercontent.com/garyli1019"} className="profile-pic" alt="garyli1019" align="middle" /> | [Gary Li](https://github.com/garyli1019)                      | PMC, Committer       | garyli        |
+| <img src={"https://avatars.githubusercontent.com/lresende"} className="profile-pic" alt="lresende" align="middle" /> | [Luciano Resende](https://github.com/lresende)               | PMC, Committer | lresende     |
+| <img src={"https://avatars.githubusercontent.com/lamberken"} className="profile-pic" alt="lamberken" className="profile-pic" align="middle" /> | [lamberken](https://github.com/lamberken)               | Committer | lamberken     |
+| <img src={"https://avatars.githubusercontent.com/n3nash"} className="profile-pic" alt="n3nash" align="middle" /> | [Nishith Agarwal](https://github.com/n3nash)                 | PMC, Committer | nagarwal     |
+| <img src={"https://avatars.githubusercontent.com/prasannarajaperumal"} className="profile-pic" alt="prasannarajaperumal" align="middle" /> | [Prasanna Rajaperumal](https://github.com/prasannarajaperumal) | PMC, Committer | prasanna     |
+| <img src={"https://avatars.githubusercontent.com/pratyakshsharma"} className="profile-pic" alt="pratyakshsharma" align="middle" /> | [Pratyaksh Sharma](https://github.com/pratyakshsharma)                      | Committer       | pratyakshsharma        |
+| <img src={"https://avatars.githubusercontent.com/xushiyan"} className="profile-pic" alt="xushiyan" align="middle" /> | [Raymond Xu](https://github.com/xushiyan)                      | PMC, Committer       | xushiyan        |
+| <img src={"https://avatars.githubusercontent.com/leesf"} className="profile-pic" alt="leesf" align="middle" /> | [Shaofeng Li](https://github.com/leesf)                      | PMC, Committer       | leesf        |
+| <img src={"https://avatars.githubusercontent.com/nsivabalan"} className="profile-pic" alt="nsivabalan" align="middle" /> | [Sivabalan Narayanan](https://github.com/nsivabalan)         | PMC, Committer | sivabalan      |
+| <img src={"https://avatars.githubusercontent.com/smarthi"} className="profile-pic" alt="smarthi" align="middle" /> | [Suneel Marthi](https://github.com/smarthi)                  | PMC, Committer | smarthi      |
+| <img src={"https://avatars.githubusercontent.com/tweise"} className="profile-pic" alt="tweise" align="middle" /> | [Thomas Weise](https://github.com/tweise)                    | PMC, Committer | thw          |
+| <img src={"https://avatars.githubusercontent.com/umehrot2"} className="profile-pic" alt="umehrot2" align="middle" /> | [Udit Mehrotra](https://github.com/umehrot2)                      | Committer       | uditme        |
+| <img src={"https://avatars.githubusercontent.com/vinothchandar"} className="profile-pic" alt="vinothchandar" align="middle" /> | [Vinoth Chandar](https://github.com/vinothchandar)           | PMC, Committer | vinoth       |
+| <img src={"https://avatars.githubusercontent.com/yanghua"} className="profile-pic" alt="yanghua" /> | [vinoyang](https://github.com/yanghua)                       | PMC, Committer       | vinoyang     |
+| <img src={"https://avatars.githubusercontent.com/lw309637554"} alt="liway" className="profile-pic" align="middle" /> | [Wei Li](https://github.com/lw309637554)               | Committer | liway|
+| <img src={"https://avatars.githubusercontent.com/zhedoubushishi"} className="profile-pic" alt="zhedoubushishi" /> | [Wenning Ding](https://github.com/zhedoubushishi)                       | Committer       | wenningd     |
+| <img src={"https://avatars.githubusercontent.com/wangxianghu"} alt="wangxianghu" className="profile-pic" align="middle" /> | [Xianghu Wang](https://github.com/wangxianghu)               | Committer | wangxianghu|
+| <img src={"https://avatars.githubusercontent.com/pengzhiwei2018"} className="profile-pic" alt="pengzhiwei2018" align="middle" /> | [Zhiwei Peng](https://github.com/pengzhiwei2018)                      | Committer       | zhiwei        |
diff --git a/website/docs/azure_hoodie.md b/website/docs/azure_hoodie.md
new file mode 100644
index 0000000..f28ec60
--- /dev/null
+++ b/website/docs/azure_hoodie.md
@@ -0,0 +1,50 @@
+---
+title: Microsoft Azure
+keywords: [ hudi, hive, azure, spark, presto]
+summary: In this page, we go over how to configure Hudi with Azure filesystem.
+last_modified_at: 2020-05-25T19:00:57-04:00
+---
+In this page, we explain how to use Hudi on Microsoft Azure.
+
+## Disclaimer
+
+This page is maintained by the Hudi community.
+If the information is inaccurate or you have additional information to add,
+please feel free to create a JIRA ticket. Contributions are highly appreciated.
+
+## Supported Storage Systems
+
+The following two storage systems support Hudi:
+
+- Azure Blob Storage
+- Azure Data Lake Gen 2
+
+## Verified Combinations of Spark and Storage Systems
+
+#### HDInsight Spark2.4 on Azure Data Lake Storage Gen 2
+This combination works out of the box. No extra config needed.
+
+#### Databricks Spark2.4 on Azure Data Lake Storage Gen 2
+- Import the Hudi jar into the Databricks workspace
+
+- Mount the file system to dbutils.
+  ```scala
+  dbutils.fs.mount(
+    source = "abfss://xxx@xxx.dfs.core.windows.net",
+    mountPoint = "/mountpoint",
+    extraConfigs = configs)
+  ```
+- When writing Hudi dataset, use abfss URL
+  ```scala
+  inputDF.write
+    .format("org.apache.hudi")
+    .options(opts)
+    .mode(SaveMode.Append)
+    .save("abfss://<<storage-account>>.dfs.core.windows.net/hudi-tables/customer")
+  ```
+- When reading Hudi dataset, use the mounting point
+  ```scala
+  spark.read
+    .format("org.apache.hudi")
+    .load("/mountpoint/hudi-tables/customer")
+  ```
diff --git a/website/docs/bos_hoodie.md b/website/docs/bos_hoodie.md
new file mode 100644
index 0000000..2a6cde8
--- /dev/null
+++ b/website/docs/bos_hoodie.md
@@ -0,0 +1,57 @@
+---
+title: Baidu Cloud
+keywords: [ hudi, hive, baidu, bos, spark, presto]
+summary: In this page, we go over how to configure Hudi with bos filesystem.
+last_modified_at: 2021-06-09T11:38:24-10:00
+---
+In this page, we explain how to get your Hudi job to store into Baidu BOS.
+
+## Baidu BOS configs
+
+There are two configurations required for Hudi-BOS compatibility:
+
+- Adding Baidu BOS Credentials for Hudi
+- Adding required Jars to classpath
+
+### Baidu BOS Credentials
+
+Add the required configs to your core-site.xml, from where Hudi can fetch them. Replace `fs.defaultFS` with your BOS bucket name, `fs.bos.endpoint` with your BOS endpoint, `fs.bos.access.key` with your BOS access key, and `fs.bos.secret.access.key` with your BOS secret key. With these set, Hudi should be able to read from and write to the bucket.
+
+```xml
+<property>
+  <name>fs.defaultFS</name>
+  <value>bos://bucketname/</value>
+</property>
+
+<property>
+  <name>fs.bos.endpoint</name>
+  <value>bos-endpoint-address</value>
+  <description>Baidu bos endpoint to connect to,for example : http://bj.bcebos.com</description>
+</property>
+
+<property>
+  <name>fs.bos.access.key</name>
+  <value>bos-key</value>
+  <description>Baidu access key</description>
+</property>
+
+<property>
+  <name>fs.bos.secret.access.key</name>
+  <value>bos-secret-key</value>
+  <description>Baidu secret key.</description>
+</property>
+
+<property>
+  <name>fs.bos.impl</name>
+  <value>org.apache.hadoop.fs.bos.BaiduBosFileSystem</value>
+</property>
+```
+
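+If you prefer not to edit core-site.xml, the same properties can, as a hedged alternative, be set programmatically on the Spark Hadoop configuration before reading/writing Hudi tables. The sketch below assumes a `SparkSession` named `spark`; the values are placeholders taken from the configuration above.
+
+```scala
+// Hedged sketch: set the BOS properties from the XML above at runtime instead of core-site.xml.
+val hadoopConf = spark.sparkContext.hadoopConfiguration
+hadoopConf.set("fs.defaultFS", "bos://bucketname/")
+hadoopConf.set("fs.bos.endpoint", "http://bj.bcebos.com")
+hadoopConf.set("fs.bos.access.key", "bos-key")
+hadoopConf.set("fs.bos.secret.access.key", "bos-secret-key")
+hadoopConf.set("fs.bos.impl", "org.apache.hadoop.fs.bos.BaiduBosFileSystem")
+```
+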
+### Baidu BOS Libs
+
+Add the following Baidu Hadoop library jars to the classpath:
+
+- com.baidubce:bce-java-sdk:0.10.165
+- bos-hdfs-sdk-1.0.2-community.jar 
+
+You can download the bos-hdfs-sdk jar from [here](https://sdk.bce.baidu.com/console-sdk/bos-hdfs-sdk-1.0.2-community.jar.zip), and then unzip it.
\ No newline at end of file
diff --git a/website/docs/cloud.md b/website/docs/cloud.md
new file mode 100644
index 0000000..57491ec
--- /dev/null
+++ b/website/docs/cloud.md
@@ -0,0 +1,27 @@
+---
+title: Cloud Storage
+keywords: [hudi, aws, gcp, oss, azure, cloud]
+summary: "In this page, we introduce how Hudi work with different Cloud providers."
+toc: true
+last_modified_at: 2019-06-16T21:59:57-04:00
+---
+ 
+## Talking to Cloud Storage
+
+Regardless of whether the RDD/WriteClient APIs or the Datasource is used, the following information helps configure access
+to cloud stores.
+
+ * [AWS S3](/docs/s3_hoodie) <br/>
+   Configurations required for S3 and Hudi co-operability.
+ * [Google Cloud Storage](/docs/gcs_hoodie) <br/>
+   Configurations required for GCS and Hudi co-operability.
+ * [Alibaba Cloud OSS](/docs/oss_hoodie) <br/>
+   Configurations required for OSS and Hudi co-operability.
+ * [Microsoft Azure](/docs/azure_hoodie) <br/>
+   Configurations required for Azure and Hudi co-operability.
+ * [Tencent Cloud Object Storage](/docs/cos_hoodie) <br/>
+   Configurations required for COS and Hudi co-operability.
+ * [IBM Cloud Object Storage](/docs/ibm_cos_hoodie) <br/>
+   Configurations required for IBM Cloud Object Storage and Hudi co-operability.
+ * [Baidu Cloud Object Storage](/docs/bos_hoodie) <br/>
+   Configurations required for BOS and Hudi co-operability.
diff --git a/website/docs/comparison.md b/website/docs/comparison.md
new file mode 100644
index 0000000..681b359
--- /dev/null
+++ b/website/docs/comparison.md
@@ -0,0 +1,56 @@
+---
+title: "Comparison"
+keywords: [ apache, hudi, kafka, kudu, hive, hbase, stream processing]
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+Apache Hudi fills a big void for processing data on top of DFS, and thus mostly co-exists nicely with these technologies. However,
+it would be useful to understand how Hudi fits into the current big data ecosystem, contrasting it with a few related systems
+and bring out the different tradeoffs these systems have accepted in their design.
+
+## Kudu
+
+[Apache Kudu](https://kudu.apache.org) is a storage system that has similar goals as Hudi, which is to bring real-time analytics on petabytes of data via first
+class support for `upserts`. A key differentiator is that Kudu also attempts to serve as a datastore for OLTP workloads, something that Hudi does not aspire to be.
+Consequently, Kudu does not support incremental pulling (as of early 2017), something Hudi does to enable incremental processing use cases.
+
+
+Kudu diverges from a distributed file system abstraction and HDFS altogether, with its own set of storage servers talking to each  other via RAFT.
+Hudi, on the other hand, is designed to work with an underlying Hadoop compatible filesystem (HDFS,S3 or Ceph) and does not have its own fleet of storage servers,
+instead relying on Apache Spark to do the heavy-lifting. Thus, Hudi can be scaled easily, just like other Spark jobs, while Kudu would require hardware
+& operational support, typical to datastores like HBase or Vertica. We have not, at this point, done any head-to-head benchmarks against Kudu (given RTTable is WIP).
+But, if we were to go with results shared by [CERN](https://db-blog.web.cern.ch/blog/zbigniew-baranowski/2017-01-performance-comparison-different-file-formats-and-storage-engines),
+we expect Hudi to be positioned as something that ingests parquet with superior performance.
+
+
+## Hive Transactions
+
+[Hive Transactions/ACID](https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions) is another similar effort, which tries to implement storage like
+`merge-on-read`, on top of ORC file format. Understandably, this feature is heavily tied to Hive and other efforts like [LLAP](https://cwiki.apache.org/confluence/display/Hive/LLAP).
+Hive Transactions does not offer the read-optimized storage option or the incremental pulling that Hudi does. In terms of implementation choices, Hudi leverages
+the full power of a processing framework like Spark, while Hive transactions feature is implemented underneath by Hive tasks/queries kicked off by user or the Hive metastore.
+Based on our production experience, embedding Hudi as a library into existing Spark pipelines was much easier and less operationally heavy, compared with the other approach.
+Hudi is also designed to work with non-hive engines like PrestoDB/Spark and will incorporate file formats other than parquet over time.
+
+## HBase
+
+Even though [HBase](https://hbase.apache.org) is ultimately a key-value store for OLTP workloads, users often tend to associate HBase with analytics given the proximity to Hadoop.
+Given HBase is heavily write-optimized, it supports sub-second upserts out-of-box and Hive-on-HBase lets users query that data. However, in terms of actual performance for analytical workloads,
+hybrid columnar storage formats like Parquet/ORC handily beat HBase, since these workloads are predominantly read-heavy. Hudi bridges this gap between faster data and having
+analytical storage formats. From an operational perspective, arming users with a library that provides faster data is more scalable than managing a big farm of HBase region servers
+just for analytics. Finally, HBase does not support incremental processing primitives like `commit times` and `incremental pull` as first class citizens, like Hudi does.
+
+## Stream Processing
+
+A popular question we get is: "How does Hudi relate to stream processing systems?", which we will try to answer here. Simply put, Hudi can integrate with
+batch (`copy-on-write table`) and streaming (`merge-on-read table`) jobs of today, to store the computed results in Hadoop. For Spark apps, this can happen via direct
+integration of Hudi library with Spark/Spark streaming DAGs. In case of Non-Spark processing systems (eg: Flink, Hive), the processing can be done in the respective systems
+and later sent into a Hudi table via a Kafka topic/DFS intermediate file. At a more conceptual level, data processing
+pipelines just consist of three components : `source`, `processing`, `sink`, with users ultimately running queries against the sink to use the results of the pipeline.
+Hudi can act as either a source or sink, that stores data on DFS. Applicability of Hudi to a given stream processing pipeline ultimately boils down to suitability
+of PrestoDB/SparkSQL/Hive for your queries.
+
+More advanced use cases revolve around the concepts of [incremental processing](https://www.oreilly.com/ideas/ubers-case-for-incremental-processing-on-hadoop), which effectively
+uses Hudi even inside the `processing` engine to speed up typical batch pipelines. For e.g: Hudi can be used as a state store inside a processing DAG (similar
+to how [rocksDB](https://ci.apache.org/projects/flink/flink-docs-release-1.2/ops/state_backends#the-rocksdbstatebackend) is used by Flink). This is an item on the roadmap
+and will eventually happen as a [Beam Runner](https://issues.apache.org/jira/browse/HUDI-60)
diff --git a/website/docs/concepts.md b/website/docs/concepts.md
new file mode 100644
index 0000000..277484d
--- /dev/null
+++ b/website/docs/concepts.md
@@ -0,0 +1,172 @@
+---
+version: 0.6.0
+title: "Concepts"
+keywords: [ hudi, design, table, queries, timeline]
+summary: "Here we introduce some basic concepts & give a broad technical overview of Hudi"
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+Apache Hudi (pronounced “Hoodie”) provides the following streaming primitives over Hadoop-compatible storage
+
+ * Update/Delete Records      (how do I change records in a table?)
+ * Change Streams             (how do I fetch records that changed?)
+
+In this section, we will discuss key concepts & terminologies that are important to understand, to be able to effectively use these primitives.
+
+## Timeline
+At its core, Hudi maintains a `timeline` of all actions performed on the table at different `instants` of time that helps provide instantaneous views of the table,
+while also efficiently supporting retrieval of data in the order of arrival. A Hudi instant consists of the following components 
+
+ * `Instant action` : Type of action performed on the table
+ * `Instant time` : Instant time is typically a timestamp (e.g: 20190117010349), which monotonically increases in the order of action's begin time.
+ * `state` : current state of the instant
+ 
+Hudi guarantees that the actions performed on the timeline are atomic & timeline consistent based on the instant time.
+
+Key actions performed include
+
+ * `COMMITS` - A commit denotes an **atomic write** of a batch of records into a table.
+ * `CLEANS` - Background activity that gets rid of older versions of files in the table, that are no longer needed.
+ * `DELTA_COMMIT` - A delta commit refers to an **atomic write** of a batch of records into a  MergeOnRead type table, where some/all of the data could be just written to delta logs.
+ * `COMPACTION` - Background activity to reconcile differential data structures within Hudi e.g: moving updates from row based log files to columnar formats. Internally, compaction manifests as a special commit on the timeline
+ * `ROLLBACK` - Indicates that a commit/delta commit was unsuccessful & rolled back, removing any partial files produced during such a write
+ * `SAVEPOINT` - Marks certain file groups as "saved", such that cleaner will not delete them. It helps restore the table to a point on the timeline, in case of disaster/data recovery scenarios.
+
+Any given instant can be in one of the following states
+
+ * `REQUESTED` - Denotes an action has been scheduled, but has not initiated
+ * `INFLIGHT` - Denotes that the action is currently being performed
+ * `COMPLETED` - Denotes completion of an action on the timeline
+
+<figure>
+    <img className="docimage" src="/assets/images/hudi_timeline.png" alt="hudi_timeline.png" />
+</figure>
+
+The example above shows upserts happening between 10:00 and 10:20 on a Hudi table, roughly every 5 mins, leaving commit metadata on the Hudi timeline, along
+with other background cleaning/compactions. One key observation to make is that the commit time indicates the `arrival time` of the data (10:20AM), while the actual data
+organization reflects the actual time or `event time`, the data was intended for (hourly buckets from 07:00). These are two key concepts when reasoning about tradeoffs between latency and completeness of data.
+
+When there is late arriving data (data intended for 9:00 arriving >1 hr late at 10:20), we can see the upsert producing new data into even older time buckets/folders.
+With the help of the timeline, an incremental query attempting to get all new data that was committed successfully since 10:00 hours, is able to very efficiently consume
+only the changed files without say scanning all the time buckets > 07:00.
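+
+As an illustrative sketch, such an incremental query can be issued through the Spark datasource; the option keys here are the Hudi datasource read options, and the `basePath` and begin instant time below are placeholders.
+
+```scala
+// Sketch only: assumes a SparkSession `spark` and an existing Hudi table at `basePath`.
+val basePath = "/tmp/hudi/stock_ticks_cow"
+val incrementalDF = spark.read.format("org.apache.hudi")
+  .option("hoodie.datasource.query.type", "incremental")
+  // consume only data committed after this instant time (e.g. the 10:00 commit above)
+  .option("hoodie.datasource.read.begin.instanttime", "20190117100000")
+  .load(basePath)
+```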
+
+## File management
+Hudi organizes a table into a directory structure under a `basepath` on DFS. Table is broken up into partitions, which are folders containing data files for that partition,
+very similar to Hive tables. Each partition is uniquely identified by its `partitionpath`, which is relative to the basepath.
+
+Within each partition, files are organized into `file groups`, uniquely identified by a `file id`. Each file group contains several
+`file slices`, where each slice contains a base file (`*.parquet`) produced at a certain commit/compaction instant time,
+ along with set of log files (`*.log.*`) that contain inserts/updates to the base file since the base file was produced. 
+Hudi adopts a MVCC design, where compaction action merges logs and base files to produce new file slices and cleaning action gets rid of 
+unused/older file slices to reclaim space on DFS. 
+
+## Index
+Hudi provides efficient upserts, by mapping a given hoodie key (record key + partition path) consistently to a file id, via an indexing mechanism. 
+This mapping between record key and file group/file id, never changes once the first version of a record has been written to a file. In short, the 
+mapped file group contains all versions of a group of records.
+
+## Table Types & Queries
+Hudi table types define how data is indexed & laid out on the DFS and how the above primitives and timeline activities are implemented on top of such organization (i.e how data is written). 
+In turn, `query types` define how the underlying data is exposed to the queries (i.e how data is read). 
+
+| Table Type    | Supported Query types |
+|-------------- |------------------|
+| Copy On Write | Snapshot Queries + Incremental Queries  |
+| Merge On Read | Snapshot Queries + Incremental Queries + Read Optimized Queries |
+
+### Table Types
+Hudi supports the following table types.
+
+  - [Copy On Write](#copy-on-write-table) : Stores data using exclusively columnar file formats (e.g parquet). Updates simply version & rewrite the files by performing a synchronous merge during write.
+  - [Merge On Read](#merge-on-read-table) : Stores data using a combination of columnar (e.g parquet) + row based (e.g avro) file formats. Updates are logged to delta files & later compacted to produce new versions of columnar files synchronously or asynchronously.
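+
+Both table types are written through the same writer paths. As an illustrative sketch, choosing the table type at write time via the Spark datasource looks roughly like the following (the table name, field names and path are placeholders, and the option keys are the usual Hudi datasource write options):
+
+```scala
+// Sketch only: assumes a DataFrame `inputDF` with uuid, ts and partitionpath columns.
+inputDF.write.format("org.apache.hudi")
+  .option("hoodie.table.name", "trips")
+  .option("hoodie.datasource.write.table.type", "MERGE_ON_READ") // or "COPY_ON_WRITE"
+  .option("hoodie.datasource.write.recordkey.field", "uuid")
+  .option("hoodie.datasource.write.partitionpath.field", "partitionpath")
+  .option("hoodie.datasource.write.precombine.field", "ts")
+  .mode("append")
+  .save("/tmp/hudi/trips")
+```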
+    
+Following table summarizes the trade-offs between these two table types
+
+| Trade-off     | CopyOnWrite      | MergeOnRead |
+|-------------- |------------------| ------------------|
+| Data Latency | Higher   | Lower |
+| Update cost (I/O) | Higher (rewrite entire parquet) | Lower (append to delta log) |
+| Parquet File Size | Smaller (high update(I/0) cost) | Larger (low update cost) |
+| Write Amplification | Higher | Lower (depending on compaction strategy) |
+
+
+### Query types
+Hudi supports the following query types
+
+ - **Snapshot Queries** : Queries see the latest snapshot of the table as of a given commit or compaction action. In case of merge on read table, it exposes near-real time data(few mins) by merging 
+    the base and delta files of the latest file slice on-the-fly. For copy on write table,  it provides a drop-in replacement for existing parquet tables, while providing upsert/delete and other write side features. 
+ - **Incremental Queries** : Queries only see new data written to the table, since a given commit/compaction. This effectively provides change streams to enable incremental data pipelines. 
+ - **Read Optimized Queries** : Queries see the latest snapshot of table as of a given commit/compaction action. Exposes only the base/columnar files in latest file slices and guarantees the 
+    same columnar query performance compared to a non-hudi columnar table.
+
+Following table summarizes the trade-offs between the different query types.
+
+| Trade-off     | Snapshot    | Read Optimized |
+|-------------- |-------------| ------------------|
+| Data Latency  | Lower | Higher
+| Query Latency | Higher (merge base / columnar file + row based delta / log files) | Lower (raw base / columnar file performance)
+
+
+## Copy On Write Table
+
+File slices in Copy-On-Write table only contain the base/columnar file and each commit produces new versions of base files. 
+In other words, we implicitly compact on every commit, such that only columnar data exists. As a result, the write amplification 
+(number of bytes written for 1 byte of incoming data) is much higher, whereas read amplification is zero.
+This is a much desired property for analytical workloads, which are predominantly read-heavy.
+
+The following illustrates how this works conceptually, when data is written into a copy-on-write table and two queries run on top of it.
+
+
+<figure>
+    <img className="docimage" src="/assets/images/hudi_cow.png" alt="hudi_cow.png" />
+</figure>
+
+
+As data gets written, updates to existing file groups produce a new slice for that file group stamped with the commit instant time, 
+while inserts allocate a new file group and write its first slice for that file group. These file slices and their commit instant times are color coded above.
+SQL queries running against such a table (eg: `select count(*)` counting the total records in that partition), first checks the timeline for the latest commit
+and filters all but latest file slices of each file group. As you can see, an old query does not see the current inflight commit's files color coded in pink,
+but a new query starting after the commit picks up the new data. Thus queries are immune to any write failures/partial writes and only run on committed data.
+
+The intention of copy on write table, is to fundamentally improve how tables are managed today through
+
+  - First class support for atomically updating data at file-level, instead of rewriting whole tables/partitions
+  - Ability to incrementally consume changes, as opposed to wasteful scans or fumbling with heuristics
+  - Tight control of file sizes to keep query performance excellent (small files hurt query performance considerably).
+
+
+## Merge On Read Table
+
+Merge on read table is a superset of copy on write, in the sense it still supports read optimized queries of the table by exposing only the base/columnar files in latest file slices.
+Additionally, it stores incoming upserts for each file group, onto a row based delta log, to support snapshot queries by applying the delta log, 
+onto the latest version of each file id on-the-fly during query time. Thus, this table type attempts to balance read and write amplification intelligently, to provide near real-time data.
+The most significant change here would be to the compactor, which now carefully chooses which delta log files need to be compacted onto
+their columnar base file, to keep the query performance in check (larger delta log files would incur longer merge times on the query side).
+
+The following illustrates how this table type works, and shows the two types of queries - snapshot query and read optimized query.
+
+<figure>
+    <img className="docimage" src="/assets/images/hudi_mor.png" alt="hudi_mor.png"  />
+</figure>
+
+There are a lot of interesting things happening in this example, which bring out the subtleties in the approach.
+
+ - We now have commits every 1 minute or so, something we could not do in the other table type.
+ - Within each file id group, there is now a delta log file, which holds incoming updates to records in the base columnar files. In the example, the delta log files hold
+ all the data from 10:05 to 10:10. The base columnar files are still versioned with the commit, as before.
+ Thus, if one were to simply look at the base files alone, then the table layout looks exactly like a copy on write table.
+ - A periodic compaction process reconciles these changes from the delta log and produces a new version of base file, just like what happened at 10:05 in the example.
+ - There are two ways of querying the same underlying table: Read Optimized query and Snapshot query, depending on whether we choose query performance or freshness of data.
+ - The semantics around when data from a commit is available to a query change in a subtle way for a read optimized query. Note that such a query
+ running at 10:10 won't see data after 10:05 above, while a snapshot query always sees the freshest data.
+ - When we trigger compaction and what it decides to compact hold the key to solving these hard problems. By implementing a compaction
+ strategy, where we aggressively compact the latest partitions compared to older partitions, we can ensure the read optimized queries see data
+ published within X minutes in a consistent fashion.
+
+The intention of the merge on read table is to enable near real-time processing directly on top of DFS, as opposed to copying
+data out to specialized systems, which may not be able to handle the data volume. There are also a few secondary benefits to 
+this table type, such as reduced write amplification by avoiding the synchronous merge of data, i.e., the amount of data written per byte of incoming data in a batch.
+
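+To make the two query paths concrete, here is a hedged sketch that reads the same (hypothetical) merge-on-read table in both modes via the Spark datasource.
+
+```scala
+// Snapshot query: merges base files with the row based delta log files on-the-fly,
+// returning the freshest data at the cost of higher query latency.
+val snapshotDF = spark.read.format("hudi")
+  .option("hoodie.datasource.query.type", "snapshot")
+  .load("/path/to/hudi_mor_table")
+
+// Read optimized query: reads only the compacted base/columnar files,
+// trading data freshness for raw columnar scan performance.
+val readOptimizedDF = spark.read.format("hudi")
+  .option("hoodie.datasource.query.type", "read_optimized")
+  .load("/path/to/hudi_mor_table")
+```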
+
diff --git a/website/docs/concurrency_control.md b/website/docs/concurrency_control.md
new file mode 100644
index 0000000..96ff6eb
--- /dev/null
+++ b/website/docs/concurrency_control.md
@@ -0,0 +1,149 @@
+---
+title: "Concurrency Control"
+summary: In this page, we will discuss how to perform concurrent writes to Hudi Tables.
+toc: true
+last_modified_at: 2021-03-19T15:59:57-04:00
+---
+
+In this section, we will cover Hudi's concurrency model and describe ways to ingest data into a Hudi Table from multiple writers, using the [DeltaStreamer](#deltastreamer) tool as well as 
+the [Hudi datasource](#datasource-writer).
+
+## Supported Concurrency Controls
+
+- **MVCC** : Hudi table services such as compaction, cleaning, clustering leverage Multi Version Concurrency Control to provide snapshot isolation
+between multiple table service writers and readers. Additionally, using MVCC, Hudi provides snapshot isolation between an ingestion writer and multiple concurrent readers. 
+  With this model, Hudi supports running any number of table service jobs concurrently, without any concurrency conflict. 
+  This is made possible by ensuring that scheduling plans for such table services always happen in single writer mode, which avoids conflicts and race conditions.
+
+- **[NEW] OPTIMISTIC CONCURRENCY** : Write operations such as UPSERT and INSERT leverage optimistic concurrency control to enable multiple ingestion writers to write to
+the same Hudi Table. Hudi supports `file level OCC`, i.e., for any 2 commits (or writers) happening to the same table, if they do not write to overlapping files, both writers are allowed to succeed. 
+  This feature is currently *experimental* and requires either Zookeeper or HiveMetastore to acquire locks.
+
+It may be helpful to understand the different guarantees provided by [write operations](/docs/writing_data#write-operations) via Hudi datasource or the delta streamer.
+
+## Single Writer Guarantees
+
+ - *UPSERT Guarantee*: The target table will NEVER show duplicates.
+ - *INSERT Guarantee*: The target table will NEVER have duplicates if [dedup](/docs/configurations#INSERT_DROP_DUPS_OPT_KEY) is enabled.
+ - *BULK_INSERT Guarantee*: The target table will NEVER have duplicates if [dedup](/docs/configurations#INSERT_DROP_DUPS_OPT_KEY) is enabled.
+ - *INCREMENTAL PULL Guarantee*: Data consumption and checkpoints are NEVER out of order.
+
+## Multi Writer Guarantees
+
+With multiple writers using OCC, some of the above guarantees change as follows
+
+- *UPSERT Guarantee*: The target table will NEVER show duplicates.
+- *INSERT Guarantee*: The target table MIGHT have duplicates even if [dedup](/docs/configurations#INSERT_DROP_DUPS_OPT_KEY) is enabled.
+- *BULK_INSERT Guarantee*: The target table MIGHT have duplicates even if [dedup](/docs/configurations#INSERT_DROP_DUPS_OPT_KEY) is enabled.
+- *INCREMENTAL PULL Guarantee*: Data consumption and checkpoints MIGHT be out of order due to multiple writer jobs finishing at different times.
+
+## Enabling Multi Writing
+
+The following properties need to be set to turn on optimistic concurrency control.
+
+```
+hoodie.write.concurrency.mode=optimistic_concurrency_control
+hoodie.cleaner.policy.failed.writes=LAZY
+hoodie.write.lock.provider=<lock-provider-classname>
+```
+
+There are 2 different server-based lock providers, each requiring different configurations to be set.
+
+**`Zookeeper`** based lock provider
+
+```
+hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider
+hoodie.write.lock.zookeeper.url
+hoodie.write.lock.zookeeper.port
+hoodie.write.lock.zookeeper.lock_key
+hoodie.write.lock.zookeeper.base_path
+```
+
+**`HiveMetastore`** based lock provider
+
+```
+hoodie.write.lock.provider=org.apache.hudi.hive.HiveMetastoreBasedLockProvider
+hoodie.write.lock.hivemetastore.database
+hoodie.write.lock.hivemetastore.table
+```
+
+The HiveMetastore URIs are picked up from the Hadoop configuration file loaded at runtime.
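+Putting these together, here is a hedged example of the properties one might set when using the HiveMetastore based lock provider; the database and table names are placeholders.
+
+```
+hoodie.write.concurrency.mode=optimistic_concurrency_control
+hoodie.cleaner.policy.failed.writes=LAZY
+hoodie.write.lock.provider=org.apache.hudi.hive.HiveMetastoreBasedLockProvider
+hoodie.write.lock.hivemetastore.database=my_db
+hoodie.write.lock.hivemetastore.table=my_table
+```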
+
+## Datasource Writer
+
+The `hudi-spark` module offers the DataSource API to write (and read) a Spark DataFrame into a Hudi table.
+
+The following is an example of how to use optimistic_concurrency_control via the spark datasource:
+
+```java
+inputDF.write.format("hudi")
+       .options(getQuickstartWriteConfigs)
+       .option(PRECOMBINE_FIELD_OPT_KEY, "ts")
+       .option("hoodie.cleaner.policy.failed.writes", "LAZY")
+       .option("hoodie.write.concurrency.mode", "optimistic_concurrency_control")
+       .option("hoodie.write.lock.zookeeper.url", "zookeeper")
+       .option("hoodie.write.lock.zookeeper.port", "2181")
+       .option("hoodie.write.lock.zookeeper.lock_key", "test_table")
+       .option("hoodie.write.lock.zookeeper.base_path", "/test")
+       .option(RECORDKEY_FIELD_OPT_KEY, "uuid")
+       .option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath")
+       .option(TABLE_NAME, tableName)
+       .mode(Overwrite)
+       .save(basePath)
+```
+
+## DeltaStreamer
+
+The `HoodieDeltaStreamer` utility (part of hudi-utilities-bundle) provides ways to ingest from different sources such as DFS or Kafka.
+
+Using optimistic_concurrency_control via the delta streamer requires adding the above configs to the properties file that is passed to the
+job. For example, adding the configs to a kafka-source.properties file and passing them to the deltastreamer will enable optimistic concurrency.
+A deltastreamer job can then be triggered as follows:
+
+```java
+[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \
+  --props file://${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \
+  --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \
+  --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \
+  --source-ordering-field impressiontime \
+  --target-base-path file:\/\/\/tmp/hudi-deltastreamer-op \ 
+  --target-table uber.impressions \
+  --op BULK_INSERT
+```
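+For reference, the lock related entries added to kafka-source.properties might look like the following (a hedged sketch; the Zookeeper host, port, lock key and base path mirror the datasource example above and are placeholders):
+
+```
+hoodie.write.concurrency.mode=optimistic_concurrency_control
+hoodie.cleaner.policy.failed.writes=LAZY
+hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider
+hoodie.write.lock.zookeeper.url=zookeeper
+hoodie.write.lock.zookeeper.port=2181
+hoodie.write.lock.zookeeper.lock_key=test_table
+hoodie.write.lock.zookeeper.base_path=/test
+```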
+
+## Best Practices when using Optimistic Concurrency Control
+
+Concurrent Writing to Hudi tables requires acquiring a lock with either Zookeeper or HiveMetastore. For several reasons you might want to configure retries to allow your application to acquire the lock:
+1. Network connectivity or excessive load on servers increasing the time for lock acquisition, resulting in timeouts
+2. Running a large number of concurrent jobs that write to the same hudi table, which can result in contention during lock acquisition and cause timeouts
+3. In some scenarios of conflict resolution, Hudi commit operations might take up to tens of seconds while the lock is held. This can result in timeouts for other jobs waiting to acquire a lock.
+
+Set the correct native lock provider client retries. NOTE that sometimes these settings are set on the server once and all clients inherit the same configs. Please check your settings before enabling optimistic concurrency.
+   
+```
+hoodie.write.lock.wait_time_ms
+hoodie.write.lock.num_retries
+```
+
+Set the correct hudi client retries for Zookeeper & HiveMetastore. This is useful in cases when native client retry settings cannot be changed. Please note that these retries will happen in addition to any native client retries that you may have set. 
+
+```
+hoodie.write.lock.client.wait_time_ms
+hoodie.write.lock.client.num_retries
+```
+
+*Setting the right values for these depends on the use case; some defaults have been provided for general cases.*
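+Purely as an illustration (the values below are hypothetical and should be tuned per workload), these settings might look like:
+
+```
+hoodie.write.lock.wait_time_ms=60000
+hoodie.write.lock.num_retries=15
+hoodie.write.lock.client.wait_time_ms=10000
+hoodie.write.lock.client.num_retries=10
+```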
+
+## Disabling Multi Writing
+
+Remove the settings that were used to enable multi-writer, or override them with the following default values.
+
+```
+hoodie.write.concurrency.mode=single_writer
+hoodie.cleaner.policy.failed.writes=EAGER
+```
+
+## Caveats
+
+If you are using the `WriteClient` API, please note that multiple writes to the table need to be initiated from 2 different instances of the write client. 
+It is NOT recommended to use the same instance of the write client to perform multi writing. 
\ No newline at end of file
diff --git a/website/docs/configurations.md b/website/docs/configurations.md
new file mode 100644
index 0000000..f235f14
--- /dev/null
+++ b/website/docs/configurations.md
@@ -0,0 +1,433 @@
+---
+title: Configurations
+keywords: [garbage collection, hudi, jvm, configs, tuning]
+summary: This page covers the different ways of configuring your job to write/read Hudi tables. At a high level, you can control behaviour at few levels.
+toc: true
+last_modified_at: 2021-07-24T00:48:18.710466
+---
+
+This page covers the different ways of configuring your job to write/read Hudi tables. At a high level, you can control behaviour at few levels.
+
+- [**Spark Datasource Configs**](#SPARK_DATASOURCE): These configs control the Hudi Spark Datasource, providing the ability to define keys/partitioning, pick the write operation, specify how to merge records or choose the query type to read.
+- [**Flink Sql Configs**](#FLINK_SQL): These configs control the Hudi Flink SQL source/sink connectors, providing the ability to define record keys, pick the write operation, specify how to merge records, enable/disable asynchronous compaction or choose the query type to read.
+- [**Write Client Configs**](#WRITE_CLIENT): Internally, the Hudi datasource uses an RDD based HoodieWriteClient API to actually perform writes to storage. These configs provide deep control over lower level aspects like file sizing, compression, parallelism, compaction, write schema, cleaning etc. Although Hudi provides sane defaults, from time to time these configs may need to be tweaked to optimize for specific workloads.
+- [**Metrics Configs**](#METRICS): This set of configs is used to enable monitoring and reporting of key Hudi stats and metrics.
+- [**Record Payload Config**](#RECORD_PAYLOAD): This is the lowest level of customization offered by Hudi. Record payloads define how to produce new values to upsert based on an incoming new record and the stored old record. Hudi provides default implementations such as OverwriteWithLatestAvroPayload, which simply updates the table with the latest/last-written record. This can be overridden with a custom class extending the HoodieRecordPayload class, on both the datasource and WriteClient levels.
+
+## Metrics Configs {#METRICS}
+This set of configs is used to enable monitoring and reporting of key Hudi stats and metrics.
+
+### Metrics Configurations for Datadog reporter {#Metrics-Configurations-for-Datadog-reporter}
+
+Enables reporting on Hudi metrics using the Datadog reporter type. Hudi publishes metrics on every commit, clean, rollback etc.
+
+`Config Class`: org.apache.hudi.config.HoodieMetricsDatadogConfig<br/>
+> #### hoodie.metrics.datadog.metric.tags
+> Datadog metric tags (comma-delimited) to be sent along with metrics data.<br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: DATADOG_METRIC_TAGS`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.datadog.api.key.supplier
+> Datadog API key supplier to supply the API key at runtime. This will take effect if hoodie.metrics.datadog.api.key is not set.<br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: DATADOG_API_KEY_SUPPLIER`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.datadog.metric.prefix
+> Datadog metric prefix to be prepended to each metric name with a dot as delimiter. For example, if it is set to foo, foo. will be prepended.<br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: DATADOG_METRIC_PREFIX`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.datadog.api.timeout.seconds
+> Datadog API timeout in seconds. Default to 3.<br/>
+> **Default Value**: 3 (Optional)<br/>
+> `Config Param: DATADOG_API_TIMEOUT_SECONDS`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.datadog.report.period.seconds
+> Datadog reporting period in seconds. Default to 30.<br/>
+> **Default Value**: 30 (Optional)<br/>
+> `Config Param: DATADOG_REPORT_PERIOD_SECONDS`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.datadog.metric.host
+> Datadog metric host to be sent along with metrics data.<br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: DATADOG_METRIC_HOST`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.datadog.api.key.skip.validation
+> Whether to skip validating the Datadog API key before sending metrics via the Datadog API. Defaults to false.<br/>
+> **Default Value**: false (Optional)<br/>
+> `Config Param: DATADOG_API_KEY_SKIP_VALIDATION`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.datadog.api.site
+> Datadog API site: EU or US<br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: DATADOG_API_SITE`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.datadog.api.key
+> Datadog API key<br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: DATADOG_API_KEY`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
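+For example, a hedged sketch of the properties one might set to turn on Datadog reporting (the API key, prefix and tags below are placeholders):
+
+```
+hoodie.metrics.on=true
+hoodie.metrics.reporter.type=DATADOG
+hoodie.metrics.datadog.api.site=US
+hoodie.metrics.datadog.api.key=<your-datadog-api-key>
+hoodie.metrics.datadog.metric.prefix=hudi
+hoodie.metrics.datadog.metric.tags=env:prod,team:data
+```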
+
+### Metrics Configurations for Prometheus {#Metrics-Configurations-for-Prometheus}
+
+Enables reporting on Hudi metrics using Prometheus.  Hudi publishes metrics on every commit, clean, rollback etc.
+
+`Config Class`: org.apache.hudi.config.HoodieMetricsPrometheusConfig<br/>
+> #### hoodie.metrics.pushgateway.host
+> Hostname of the prometheus push gateway<br/>
+> **Default Value**: localhost (Optional)<br/>
+> `Config Param: PUSHGATEWAY_HOST`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.pushgateway.delete.on.shutdown
+> <br/>
+> **Default Value**: true (Optional)<br/>
+> `Config Param: PUSHGATEWAY_DELETE_ON_SHUTDOWN`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.prometheus.port
+> Port for prometheus server.<br/>
+> **Default Value**: 9090 (Optional)<br/>
+> `Config Param: PROMETHEUS_PORT`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.pushgateway.random.job.name.suffix
+> <br/>
+> **Default Value**: true (Optional)<br/>
+> `Config Param: PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.pushgateway.report.period.seconds
+> Reporting interval in seconds.<br/>
+> **Default Value**: 30 (Optional)<br/>
+> `Config Param: PUSHGATEWAY_REPORT_PERIOD_SECONDS`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.pushgateway.job.name
+> Name of the push gateway job.<br/>
+> **Default Value**:  (Optional)<br/>
+> `Config Param: PUSHGATEWAY_JOB_NAME`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.metrics.pushgateway.port
+> Port for the push gateway.<br/>
+> **Default Value**: 9091 (Optional)<br/>
+> `Config Param: PUSHGATEWAY_PORT`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+### Metrics Configurations {#Metrics-Configurations}
+
+Enables reporting on Hudi metrics. Hudi publishes metrics on every commit, clean, rollback etc. The following sections list the supported reporters.
+
+`Config Class`: org.apache.hudi.config.HoodieMetricsConfig<br/>
+> #### hoodie.metrics.jmx.host
+> Jmx host to connect to<br/>
+> **Default Value**: localhost (Optional)<br/>
+> `Config Param: JMX_HOST`<br/>
+> `Since Version: 0.5.1`<br/>
+
+---
+
+> #### hoodie.metrics.executor.enable
+> <br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: ENABLE_EXECUTOR_METRICS`<br/>
+> `Since Version: 0.7.0`<br/>
+
+---
+
+> #### hoodie.metrics.jmx.port
+> Jmx port to connect to<br/>
+> **Default Value**: 9889 (Optional)<br/>
+> `Config Param: JMX_PORT`<br/>
+> `Since Version: 0.5.1`<br/>
+
+---
+
+> #### hoodie.metrics.graphite.host
+> Graphite host to connect to<br/>
+> **Default Value**: localhost (Optional)<br/>
+> `Config Param: GRAPHITE_SERVER_HOST`<br/>
+> `Since Version: 0.5.0`<br/>
+
+---
+
+> #### hoodie.metrics.on
+> Turn on/off metrics reporting. off by default.<br/>
+> **Default Value**: false (Optional)<br/>
+> `Config Param: METRICS_ON`<br/>
+> `Since Version: 0.5.0`<br/>
+
+---
+
+> #### hoodie.metrics.graphite.metric.prefix
+> Standard prefix applied to all metrics. This helps to add e.g. datacenter or environment information.<br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: GRAPHITE_METRIC_PREFIX`<br/>
+> `Since Version: 0.5.1`<br/>
+
+---
+
+> #### hoodie.metrics.graphite.port
+> Graphite port to connect to<br/>
+> **Default Value**: 4756 (Optional)<br/>
+> `Config Param: GRAPHITE_SERVER_PORT`<br/>
+> `Since Version: 0.5.0`<br/>
+
+---
+
+> #### hoodie.metrics.reporter.type
+> Type of metrics reporter.<br/>
+> **Default Value**: GRAPHITE (Optional)<br/>
+> `Config Param: METRICS_REPORTER_TYPE`<br/>
+> `Since Version: 0.5.0`<br/>
+
+---
+
+> #### hoodie.metrics.reporter.class
+> <br/>
+> **Default Value**:  (Optional)<br/>
+> `Config Param: METRICS_REPORTER_CLASS`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+## Record Payload Config {#RECORD_PAYLOAD}
+This is the lowest level of customization offered by Hudi. Record payloads define how to produce new values to upsert based on an incoming new record and the stored old record. Hudi provides default implementations such as OverwriteWithLatestAvroPayload, which simply updates the table with the latest/last-written record. This can be overridden with a custom class extending the HoodieRecordPayload class, on both the datasource and WriteClient levels.
+
+### Payload Configurations {#Payload-Configurations}
+
+Payload related configs, that can be leveraged to control merges based on specific business fields in the data.
+
+`Config Class`: org.apache.hudi.config.HoodiePayloadConfig<br/>
+> #### hoodie.payload.event.time.field
+> Table column/field name to derive the timestamp associated with the records. This can be useful, e.g., for determining the freshness of the table.<br/>
+> **Default Value**: ts (Optional)<br/>
+> `Config Param: PAYLOAD_EVENT_TIME_FIELD_PROP`<br/>
+
+---
+
+> #### hoodie.payload.ordering.field
+> Table column/field name to order records that have the same key, before merging and writing to storage.<br/>
+> **Default Value**: ts (Optional)<br/>
+> `Config Param: PAYLOAD_ORDERING_FIELD_PROP`<br/>
+
+---
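+As a hedged example of wiring these up, the payload class and ordering field might be passed through the Spark datasource as follows; the key `hoodie.datasource.write.payload.class`, the field names and the paths are illustrative assumptions rather than values taken from the tables above.
+
+```scala
+// Use the default OverwriteWithLatestAvroPayload and order records that share
+// a key by the "ts" column before merging them into storage.
+inputDF.write.format("hudi")
+  .option("hoodie.datasource.write.payload.class",
+    "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload")
+  .option("hoodie.payload.ordering.field", "ts")
+  .option("hoodie.datasource.write.precombine.field", "ts")
+  .option("hoodie.datasource.write.recordkey.field", "uuid")
+  .option("hoodie.datasource.write.partitionpath.field", "partitionpath")
+  .option("hoodie.table.name", "my_table")
+  .mode("append")
+  .save("/path/to/hudi_table")
+```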
+
+## Spark Datasource Configs {#SPARK_DATASOURCE}
+These configs control the Hudi Spark Datasource, providing ability to define keys/partitioning, pick out the write operation, specify how to merge records or choosing query type to read.
+
+### Read Options {#Read-Options}
+
+Options useful for reading tables via `read.format.option(...)`
+
+
+`Config Class`: org.apache.hudi.DataSourceOptions.scala<br/>
+> #### hoodie.file.index.enable
+> Enables use of the spark file index implementation for Hudi, that speeds up listing of large tables.<br/>
+> **Default Value**: true (Optional)<br/>
+> `Config Param: ENABLE_HOODIE_FILE_INDEX`<br/>
+
+---
+
+> #### hoodie.datasource.merge.type
+> For Snapshot queries on merge on read tables, controls whether we invoke the record payload implementation to merge (payload_combine) or skip merging altogether (skip_merge).<br/>
+> **Default Value**: payload_combine (Optional)<br/>
+> `Config Param: REALTIME_MERGE_OPT_KEY`<br/>
+
+---
+
+> #### hoodie.datasource.read.incr.path.glob
+> For the use-cases like users only want to incremental pull from certain partitions instead of the full table. This option allows using glob pattern to directly filter on path.<br/>
+> **Default Value**:  (Optional)<br/>
+> `Config Param: INCR_PATH_GLOB_OPT_KEY`<br/>
+
+---
+
+> #### hoodie.datasource.query.type
+> Whether data needs to be read, in incremental mode (new data since an instantTime) (or) Read Optimized mode (obtain latest view, based on base files) (or) Snapshot mode (obtain latest view, by merging base and (if any) log files)<br/>
+> **Default Value**: snapshot (Optional)<br/>
+> `Config Param: QUERY_TYPE_OPT_KEY`<br/>
+
+---
+
+> #### hoodie.datasource.write.precombine.field
+> Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)<br/>
+> **Default Value**: ts (Optional)<br/>
+> `Config Param: READ_PRE_COMBINE_FIELD`<br/>
+
+---
+
+> #### hoodie.datasource.read.end.instanttime
+> Instant time to limit incrementally fetched data to. New data written with an instant_time &lt; 0.3.1. Will be removed eventually<br/>
+> **Default Value**: false (Optional)<br/>
+> `Config Param: HOODIE_ASSUME_DATE_PARTITIONING_PROP`<br/>
+> `Since Version: 0.3.0`<br/>
+
+---
+
+> #### hoodie.metadata.keep.max.commits
+> Controls the archival of the metadata table’s timeline.<br/>
+> **Default Value**: 30 (Optional)<br/>
+> `Config Param: MAX_COMMITS_TO_KEEP_PROP`<br/>
+> `Since Version: 0.7.0`<br/>
+
+---
+
+> #### hoodie.metadata.dir.filter.regex
+> Directories matching this regex, will be filtered out when initializing metadata table from lake storage for the first time.<br/>
+> **Default Value**:  (Optional)<br/>
+> `Config Param: DIRECTORY_FILTER_REGEX`<br/>
+> `Since Version: 0.7.0`<br/>
+
+---
+
+> #### hoodie.metadata.validate
+> Validate contents of metadata table on each access; e.g against the actual listings from lake storage<br/>
+> **Default Value**: false (Optional)<br/>
+> `Config Param: METADATA_VALIDATE_PROP`<br/>
+> `Since Version: 0.7.0`<br/>
+
+---
+
+> #### hoodie.metadata.clean.async
+> Enable asynchronous cleaning for metadata table<br/>
+> **Default Value**: false (Optional)<br/>
+> `Config Param: METADATA_ASYNC_CLEAN_PROP`<br/>
+> `Since Version: 0.7.0`<br/>
+
+---
+
+> #### hoodie.file.listing.parallelism
+> Parallelism to use, when listing the table on lake storage.<br/>
+> **Default Value**: 1500 (Optional)<br/>
+> `Config Param: FILE_LISTING_PARALLELISM_PROP`<br/>
+> `Since Version: 0.7.0`<br/>
+
+---
+
+### Bootstrap Configs {#Bootstrap-Configs}
+
+Configurations that control how you want to bootstrap your existing tables for the first time into hudi. The bootstrap operation can flexibly avoid copying data over before you can use Hudi, and supports running the existing writers and new hudi writers in parallel, to validate the migration.
+
+`Config Class`: org.apache.hudi.config.HoodieBootstrapConfig<br/>
+> #### hoodie.bootstrap.partitionpath.translator.class
+> Translates the partition paths from the bootstrapped data into how they are laid out as a Hudi table.<br/>
+> **Default Value**: org.apache.hudi.client.bootstrap.translator.IdentityBootstrapPartitionPathTranslator (Optional)<br/>
+> `Config Param: BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.bootstrap.keygen.class
+> Key generator implementation to be used for generating keys from the bootstrapped dataset<br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: BOOTSTRAP_KEYGEN_CLASS`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.bootstrap.mode.selector
+> Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped<br/>
+> **Default Value**: org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector (Optional)<br/>
+> `Config Param: BOOTSTRAP_MODE_SELECTOR`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.bootstrap.keygen.type
+> Type of built-in key generator, currently supporting SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE<br/>
+> **Default Value**: SIMPLE (Optional)<br/>
+> `Config Param: BOOTSTRAP_KEYGEN_TYPE`<br/>
+> `Since Version: 0.9.0`<br/>
+
+---
+
+> #### hoodie.bootstrap.mode.selector.regex.mode
+> Bootstrap mode to apply for partition paths that match the regex above. METADATA_ONLY will generate just skeleton base files with keys/footers, avoiding the full cost of rewriting the dataset. FULL_RECORD will perform a full copy/rewrite of the data as a Hudi table.<br/>
+> **Default Value**: METADATA_ONLY (Optional)<br/>
+> `Config Param: BOOTSTRAP_MODE_SELECTOR_REGEX_MODE`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.bootstrap.index.class
+> Implementation to use, for mapping a skeleton base file to a bootstrap base file.<br/>
+> **Default Value**: org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex (Optional)<br/>
+> `Config Param: BOOTSTRAP_INDEX_CLASS_PROP`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.bootstrap.full.input.provider
+> Class to use for reading the bootstrap dataset partitions/files, for Bootstrap mode FULL_RECORD<br/>
+> **Default Value**: org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider (Optional)<br/>
+> `Config Param: FULL_BOOTSTRAP_INPUT_PROVIDER`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.bootstrap.parallelism
+> Parallelism value to be used to bootstrap data into hudi<br/>
+> **Default Value**: 1500 (Optional)<br/>
+> `Config Param: BOOTSTRAP_PARALLELISM`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.bootstrap.mode.selector.regex
+> Matches each bootstrap dataset partition against this regex and applies the mode below to it.<br/>
+> **Default Value**: .* (Optional)<br/>
+> `Config Param: BOOTSTRAP_MODE_SELECTOR_REGEX`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
+
+> #### hoodie.bootstrap.base.path
+> Base path of the dataset that needs to be bootstrapped as a Hudi table<br/>
+> **Default Value**: N/A (Required)<br/>
+> `Config Param: BOOTSTRAP_BASE_PATH_PROP`<br/>
+> `Since Version: 0.6.0`<br/>
+
+---
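+Pulling a few of these together, here is a hedged sketch of a bootstrap configuration; all values below are placeholders chosen purely for illustration.
+
+```
+hoodie.bootstrap.base.path=/path/to/existing/parquet_table
+hoodie.bootstrap.keygen.class=org.apache.hudi.keygen.SimpleKeyGenerator
+hoodie.bootstrap.mode.selector=org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector
+hoodie.bootstrap.parallelism=1500
+```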
+
diff --git a/website/docs/cos_hoodie.md b/website/docs/cos_hoodie.md
new file mode 100644
index 0000000..dfde6e8
--- /dev/null
+++ b/website/docs/cos_hoodie.md
@@ -0,0 +1,71 @@
+---
+title: Tencent Cloud
+keywords: [ hudi, hive, tencent, cos, spark, presto]
+summary: In this page, we go over how to configure Hudi with COS filesystem.
+last_modified_at: 2020-04-21T11:38:24-10:00
+---
+In this page, we explain how to get your Hudi spark job to store data in Tencent Cloud COS.
+
+## Tencent Cloud COS configs
+
+There are two configurations required for Hudi-COS compatibility:
+
+- Adding Tencent Cloud COS Credentials for Hudi
+- Adding required Jars to classpath
+
+### Tencent Cloud COS Credentials
+
+Add the required configs to your core-site.xml, from where Hudi can fetch them. Replace `fs.defaultFS` with your COS bucket name, `fs.cosn.userinfo.secretId` with your COS secret Id, and `fs.cosn.userinfo.secretKey` with your COS secret key. Hudi should then be able to read/write from the bucket.
+
+```xml
+    <property>
+        <name>fs.defaultFS</name>
+        <value>cosn://bucketname</value>
+        <description>COS bucket name</description>
+    </property>
+
+    <property>
+        <name>fs.cosn.userinfo.secretId</name>
+        <value>cos-secretId</value>
+        <description>Tencent Cloud Secret Id</description>
+    </property>
+
+    <property>
+        <name>fs.cosn.userinfo.secretKey</name>
+        <value>cos-secretkey</value>
+        <description>Tencent Cloud Secret Key</description>
+    </property>
+
+    <property>
+        <name>fs.cosn.bucket.region</name>
+        <value>ap-region</value>
+        <description>The region where the bucket is located.</description>
+    </property>
+
+    <property>
+        <name>fs.cosn.bucket.endpoint_suffix</name>
+        <value>cos.endpoint.suffix</value>
+        <description>
+          COS endpoint to connect to. 
+          For public cloud users, it is recommended not to set this option, and only the correct area field is required.
+        </description>
+    </property>
+
+    <property>
+        <name>fs.cosn.impl</name>
+        <value>org.apache.hadoop.fs.CosFileSystem</value>
+        <description>The implementation class of the CosN Filesystem.</description>
+    </property>
+
+    <property>
+        <name>fs.AbstractFileSystem.cosn.impl</name>
+        <value>org.apache.hadoop.fs.CosN</value>
+        <description>The implementation class of the CosN AbstractFileSystem.</description>
+    </property>
+
+```
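+Alternatively, the same properties can be set programmatically on the Hadoop configuration of an existing Spark session. The sketch below is a hedged illustration; the credentials and region are placeholders, and fully qualified `cosn://` paths are assumed when reading/writing.
+
+```scala
+// Equivalent of the core-site.xml entries above, set at runtime.
+val hadoopConf = spark.sparkContext.hadoopConfiguration
+hadoopConf.set("fs.cosn.userinfo.secretId", "cos-secretId")
+hadoopConf.set("fs.cosn.userinfo.secretKey", "cos-secretkey")
+hadoopConf.set("fs.cosn.bucket.region", "ap-region")
+hadoopConf.set("fs.cosn.impl", "org.apache.hadoop.fs.CosFileSystem")
+hadoopConf.set("fs.AbstractFileSystem.cosn.impl", "org.apache.hadoop.fs.CosN")
+```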
+
+### Tencent Cloud COS Libs
+COS Hadoop libraries to add to the classpath:
+
+- org.apache.hadoop:hadoop-cos:2.8.5
diff --git a/website/docs/deployment.md b/website/docs/deployment.md
new file mode 100644
index 0000000..3b2366a
--- /dev/null
+++ b/website/docs/deployment.md
@@ -0,0 +1,578 @@
+---
+title: Deployment
+keywords: [ hudi, administration, operation, devops, deployment]
+summary: This section offers an overview of tools available to operate an ecosystem of Hudi
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+This section provides all the help you need to deploy and operate Hudi tables at scale. 
+Specifically, we will cover the following aspects.
+
+ - [Deployment Model](#deploying) : How various Hudi components are deployed and managed.
+ - [Upgrading Versions](#upgrading) : Picking up new releases of Hudi, guidelines and general best-practices.
+ - [Migrating to Hudi](#migrating) : How to migrate your existing tables to Apache Hudi.
+ - [Interacting via CLI](#cli) : Using the CLI to perform maintenance or deeper introspection.
+ - [Monitoring](#monitoring) : Tracking metrics from your hudi tables using popular tools.
+ - [Troubleshooting](#troubleshooting) : Uncovering, triaging and resolving issues in production.
+ 
+## Deploying
+
+All in all, Hudi deploys with no long-running servers or additional infrastructure cost to your data lake. In fact, Hudi pioneered this model of building a transactional distributed storage layer
+using existing infrastructure and it's heartening to see other systems adopting similar approaches as well. Hudi writing is done via Spark jobs (DeltaStreamer or custom Spark datasource jobs), deployed per standard Apache Spark [recommendations](https://spark.apache.org/docs/latest/cluster-overview).
+Querying Hudi tables happens via libraries installed into Apache Hive, Apache Spark or PrestoDB and hence no additional infrastructure is necessary. 
+
+A typical Hudi data ingestion can be achieved in 2 modes. In single run mode, Hudi ingestion reads the next batch of data, ingests it into the Hudi table and exits. In continuous mode, Hudi ingestion runs as a long-running service, executing ingestion in a loop.
+
+With Merge_On_Read tables, Hudi ingestion also needs to take care of compacting delta files. Compaction can be performed asynchronously, letting it run concurrently with ingestion, or serially, one compaction after another.
+
+### DeltaStreamer
+
+[DeltaStreamer](/docs/writing_data#deltastreamer) is the standalone utility to incrementally pull upstream changes from varied sources such as DFS, Kafka and DB Changelogs and ingest them to hudi tables. It runs as a spark application in 2 modes.
+
+ - **Run Once Mode** : In this mode, Deltastreamer performs one ingestion round which includes incrementally pulling events from upstream sources and ingesting them to hudi table. Background operations like cleaning old file versions and archiving hoodie timeline are automatically executed as part of the run. For Merge-On-Read tables, Compaction is also run inline as part of ingestion unless disabled by passing the flag "--disable-compaction". By default, Compaction is run inline for eve [...]
+
+Here is an example invocation for reading from kafka topic in a single-run mode and writing to Merge On Read table type in a yarn cluster.
+
+```java
+[hoodie]$ spark-submit --packages org.apache.hudi:hudi-utilities-bundle_2.11:0.5.3,org.apache.spark:spark-avro_2.11:2.4.4 \
+ --master yarn \
+ --deploy-mode cluster \
+ --num-executors 10 \
+ --executor-memory 3g \
+ --driver-memory 6g \
+ --conf spark.driver.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_driver.hprof" \
+ --conf spark.executor.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_executor.hprof" \
+ --queue hadoop-platform-queue \
+ --conf spark.scheduler.mode=FAIR \
+ --conf spark.yarn.executor.memoryOverhead=1072 \
+ --conf spark.yarn.driver.memoryOverhead=2048 \
+ --conf spark.task.cpus=1 \
+ --conf spark.executor.cores=1 \
+ --conf spark.task.maxFailures=10 \
+ --conf spark.memory.fraction=0.4 \
+ --conf spark.rdd.compress=true \
+ --conf spark.kryoserializer.buffer.max=200m \
+ --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+ --conf spark.memory.storageFraction=0.1 \
+ --conf spark.shuffle.service.enabled=true \
+ --conf spark.sql.hive.convertMetastoreParquet=false \
+ --conf spark.ui.port=5555 \
+ --conf spark.driver.maxResultSize=3g \
+ --conf spark.executor.heartbeatInterval=120s \
+ --conf spark.network.timeout=600s \
+ --conf spark.eventLog.overwrite=true \
+ --conf spark.eventLog.enabled=true \
+ --conf spark.eventLog.dir=hdfs:///user/spark/applicationHistory \
+ --conf spark.yarn.max.executor.failures=10 \
+ --conf spark.sql.catalogImplementation=hive \
+ --conf spark.sql.shuffle.partitions=100 \
+ --driver-class-path $HADOOP_CONF_DIR \
+ --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
+ --table-type MERGE_ON_READ \
+ --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
+ --source-ordering-field ts  \
+ --target-base-path /user/hive/warehouse/stock_ticks_mor \
+ --target-table stock_ticks_mor \
+ --props /var/demo/config/kafka-source.properties \
+ --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider
+```
+
+ - **Continuous Mode** :  Here, deltastreamer runs an infinite loop with each round performing one ingestion round as described in **Run Once Mode**. The frequency of data ingestion can be controlled by the configuration "--min-sync-interval-seconds". For Merge-On-Read tables, Compaction is run in asynchronous fashion concurrently with ingestion unless disabled by passing the flag "--disable-compaction". Every ingestion run triggers a compaction request asynchronously and this frequency  [...]
+
+Here is an example invocation for reading from kafka topic in a continuous mode and writing to Merge On Read table type in a yarn cluster.
+
+```java
+[hoodie]$ spark-submit --packages org.apache.hudi:hudi-utilities-bundle_2.11:0.5.3,org.apache.spark:spark-avro_2.11:2.4.4 \
+ --master yarn \
+ --deploy-mode cluster \
+ --num-executors 10 \
+ --executor-memory 3g \
+ --driver-memory 6g \
+ --conf spark.driver.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_driver.hprof" \
+ --conf spark.executor.extraJavaOptions="-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_executor.hprof" \
+ --queue hadoop-platform-queue \
+ --conf spark.scheduler.mode=FAIR \
+ --conf spark.yarn.executor.memoryOverhead=1072 \
+ --conf spark.yarn.driver.memoryOverhead=2048 \
+ --conf spark.task.cpus=1 \
+ --conf spark.executor.cores=1 \
+ --conf spark.task.maxFailures=10 \
+ --conf spark.memory.fraction=0.4 \
+ --conf spark.rdd.compress=true \
+ --conf spark.kryoserializer.buffer.max=200m \
+ --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+ --conf spark.memory.storageFraction=0.1 \
+ --conf spark.shuffle.service.enabled=true \
+ --conf spark.sql.hive.convertMetastoreParquet=false \
+ --conf spark.ui.port=5555 \
+ --conf spark.driver.maxResultSize=3g \
+ --conf spark.executor.heartbeatInterval=120s \
+ --conf spark.network.timeout=600s \
+ --conf spark.eventLog.overwrite=true \
+ --conf spark.eventLog.enabled=true \
+ --conf spark.eventLog.dir=hdfs:///user/spark/applicationHistory \
+ --conf spark.yarn.max.executor.failures=10 \
+ --conf spark.sql.catalogImplementation=hive \
+ --conf spark.sql.shuffle.partitions=100 \
+ --driver-class-path $HADOOP_CONF_DIR \
+ --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
+ --table-type MERGE_ON_READ \
+ --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
+ --source-ordering-field ts  \
+ --target-base-path /user/hive/warehouse/stock_ticks_mor \
+ --target-table stock_ticks_mor \
+ --props /var/demo/config/kafka-source.properties \
+ --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+ --continuous
+```
+
+### Spark Datasource Writer Jobs
+
+As described in [Writing Data](/docs/writing_data#datasource-writer), you can use spark datasource to ingest to hudi table. This mechanism allows you to ingest any spark dataframe in Hudi format. Hudi Spark DataSource also supports spark streaming to ingest a streaming source to Hudi table. For Merge On Read table types, inline compaction is turned on by default which runs after every ingestion run. The compaction frequency can be changed by setting the property "hoodie.compact.inline.ma [...]
+
+Here is an example invocation using spark datasource
+
+```java
+inputDF.write()
+       .format("org.apache.hudi")
+       .options(clientOpts) // any of the Hudi client opts can be passed in as well
+       .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
+       .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
+       .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
+       .option(HoodieWriteConfig.TABLE_NAME, tableName)
+       .mode(SaveMode.Append)
+       .save(basePath);
+```
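+Since the datasource also supports Spark streaming as noted above, here is a hedged sketch of a structured streaming write into a (hypothetical) merge on read table; `streamingDF` is assumed to be a streaming DataFrame, and the paths, field names and checkpoint location are placeholders.
+
+```scala
+// Continuously upsert a streaming DataFrame into a Hudi table, compacting
+// after every 4 delta commits (illustrative value).
+streamingDF.writeStream
+  .format("org.apache.hudi")
+  .option("hoodie.datasource.write.table.type", "MERGE_ON_READ")
+  .option("hoodie.datasource.write.recordkey.field", "_row_key")
+  .option("hoodie.datasource.write.partitionpath.field", "partition")
+  .option("hoodie.datasource.write.precombine.field", "timestamp")
+  .option("hoodie.table.name", "my_streaming_table")
+  .option("hoodie.compact.inline.max.delta.commits", "4")
+  .option("checkpointLocation", "/path/to/checkpoints")
+  .outputMode("append")
+  .start("/path/to/hudi_table")
+```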
+ 
+## Upgrading 
+
+New Hudi releases are listed on the [releases page](/releases), with detailed notes which list all the changes, with highlights in each release. 
+At the end of the day, Hudi is a storage system and with that comes a lot of responsibilities, which we take seriously. 
+
+As general guidelines, 
+
+ - We strive to keep all changes backwards compatible (i.e new code can read old data/timeline files) and when we cannot, we will provide upgrade/downgrade tools via the CLI
+ - We cannot always guarantee forward compatibility (i.e old code being able to read data/timeline files written by a greater version). This is generally the norm, since no new features can be built otherwise.
+   However, any such large changes will be turned off by default, for a smooth transition to the newer release. After a few releases and once enough users deem the feature stable in production, we will flip the defaults in a subsequent release.
+ - Always upgrade the query bundles (mr-bundle, presto-bundle, spark-bundle) first and then upgrade the writers (deltastreamer, spark jobs using datasource). This often provides the best experience and it's easy to fix 
+   any issues by rolling forward/back the writer code (which you typically have more control over).
+ - With large, feature rich releases we recommend migrating slowly, by first testing in staging environments and running your own tests. Upgrading Hudi is no different than upgrading any database system.
+
+Note that release notes can override this information with specific instructions, applicable on case-by-case basis.
+
+## Migrating
+
+Currently migrating to Hudi can be done using two approaches 
+
+- **Convert newer partitions to Hudi** : This model is suitable for large event tables (e.g: click streams, ad impressions), which also typically receive writes for the last few days alone. You can convert the last 
+   N partitions to Hudi and proceed writing as if it were a Hudi table to begin with. The Hudi query side code is able to correctly handle both hudi and non-hudi data partitions.
+- **Full conversion to Hudi** : This model is suitable if you are currently bulk/full loading the table a few times a day (e.g database ingestion). The full conversion to Hudi is simply a one-time step (akin to 1 run of your existing job),
+   which moves all of the data into the Hudi format and provides the ability to incrementally update for future writes.
+
+For more details, refer to the detailed [migration guide](/docs/migration_guide). In the future, we will be supporting seamless zero-copy bootstrap of existing tables with all the upsert/incremental query capabilities fully supported.
+
+## CLI
+
+Once hudi has been built, the shell can be fired up via `cd hudi-cli && ./hudi-cli.sh`. A hudi table resides on DFS, in a location referred to as the `basePath`, and 
+we need this location in order to connect to a Hudi table. The Hudi library effectively manages this table internally, using a `.hoodie` subfolder to track all metadata.
+
+To initialize a hudi table, use the following command.
+
+```java
+===================================================================
+*         ___                          ___                        *
+*        /\__\          ___           /\  \           ___         *
+*       / /  /         /\__\         /  \  \         /\  \        *
+*      / /__/         / /  /        / /\ \  \        \ \  \       *
+*     /  \  \ ___    / /  /        / /  \ \__\       /  \__\      *
+*    / /\ \  /\__\  / /__/  ___   / /__/ \ |__|     / /\/__/      *
+*    \/  \ \/ /  /  \ \  \ /\__\  \ \  \ / /  /  /\/ /  /         *
+*         \  /  /    \ \  / /  /   \ \  / /  /   \  /__/          *
+*         / /  /      \ \/ /  /     \ \/ /  /     \ \__\          *
+*        / /  /        \  /  /       \  /  /       \/__/          *
+*        \/__/          \/__/         \/__/    Apache Hudi CLI    *
+*                                                                 *
+===================================================================
+
+hudi->create --path /user/hive/warehouse/table1 --tableName hoodie_table_1 --tableType COPY_ON_WRITE
+.....
+```
+
+To see the description of a hudi table, use the command:
+
+```java
+hudi:hoodie_table_1->desc
+18/09/06 15:57:19 INFO timeline.HoodieActiveTimeline: Loaded instants []
+    _________________________________________________________
+    | Property                | Value                        |
+    |========================================================|
+    | basePath                | ...                          |
+    | metaPath                | ...                          |
+    | fileSystem              | hdfs                         |
+    | hoodie.table.name       | hoodie_table_1               |
+    | hoodie.table.type       | COPY_ON_WRITE                |
+    | hoodie.archivelog.folder|                              |
+```
+
+The following is a sample command to connect to a Hudi table that contains uber trips.
+
+```java
+hudi:trips->connect --path /app/uber/trips
+
+16/10/05 23:20:37 INFO model.HoodieTableMetadata: All commits :HoodieCommits{commitList=[20161002045850, 20161002052915, 20161002055918, 20161002065317, 20161002075932, 20161002082904, 20161002085949, 20161002092936, 20161002105903, 20161002112938, 20161002123005, 20161002133002, 20161002155940, 20161002165924, 20161002172907, 20161002175905, 20161002190016, 20161002192954, 20161002195925, 20161002205935, 20161002215928, 20161002222938, 20161002225915, 20161002232906, 20161003003028, 201 [...]
+Metadata for table trips loaded
+```
+
+Once connected to the table, a lot of other commands become available. The shell has contextual autocomplete help (press TAB). Below is a list of all commands,
+a few of which are reviewed in this section.
+
+```java
+hudi:trips->help
+* ! - Allows execution of operating system (OS) commands
+* // - Inline comment markers (start of line only)
+* ; - Inline comment markers (start of line only)
+* addpartitionmeta - Add partition metadata to a table, if not present
+* clear - Clears the console
+* cls - Clears the console
+* commit rollback - Rollback a commit
+* commits compare - Compare commits with another Hoodie table
+* commit showfiles - Show file level details of a commit
+* commit showpartitions - Show partition level details of a commit
+* commits refresh - Refresh the commits
+* commits show - Show the commits
+* commits sync - Compare commits with another Hoodie table
+* connect - Connect to a hoodie table
+* date - Displays the local date and time
+* exit - Exits the shell
+* help - List all commands usage
+* quit - Exits the shell
+* records deduplicate - De-duplicate a partition path contains duplicates & produce repaired files to replace with
+* script - Parses the specified resource file and executes its commands
+* stats filesizes - File Sizes. Display summary stats on sizes of files
+* stats wa - Write Amplification. Ratio of how many records were upserted to how many records were actually written
+* sync validate - Validate the sync by counting the number of records
+* system properties - Shows the shell's properties
+* utils loadClass - Load a class
+* version - Displays shell version
+
+hudi:trips->
+```
+
+
+### Inspecting Commits
+
+The task of upserting or inserting a batch of incoming records is known as a **commit** in Hudi. A commit provides basic atomicity guarantees such that only committed data is available for querying.
+Each commit has a monotonically increasing string/number called the **commit number**. Typically, this is the time at which we started the commit.
+
+To view some basic information about the last 10 commits,
+
+
+```java
+hudi:trips->commits show --sortBy "Total Bytes Written" --desc true --limit 10
+    ________________________________________________________________________________________________________________________________________________________________________
+    | CommitTime    | Total Bytes Written| Total Files Added| Total Files Updated| Total Partitions Written| Total Records Written| Total Update Records Written| Total Errors|
+    |=======================================================================================================================================================================|
+    ....
+    ....
+    ....
+```
+
+At the start of each write, Hudi also writes a .inflight commit to the .hoodie folder. You can use the timestamp there to estimate how long the commit has been inflight.
+
+
+```java
+$ hdfs dfs -ls /app/uber/trips/.hoodie/*.inflight
+-rw-r--r--   3 vinoth supergroup     321984 2016-10-05 23:18 /app/uber/trips/.hoodie/20161005225920.inflight
+```
+
+
+### Drilling Down to a specific Commit
+
+To understand how the writes spread across specific partitions,
+
+
+```java
+hudi:trips->commit showpartitions --commit 20161005165855 --sortBy "Total Bytes Written" --desc true --limit 10
+    __________________________________________________________________________________________________________________________________________
+    | Partition Path| Total Files Added| Total Files Updated| Total Records Inserted| Total Records Updated| Total Bytes Written| Total Errors|
+    |=========================================================================================================================================|
+     ....
+     ....
+```
+
+If you need file level granularity, use the following:
+
+
+```java
+hudi:trips->commit showfiles --commit 20161005165855 --sortBy "Partition Path"
+    ________________________________________________________________________________________________________________________________________________________
+    | Partition Path| File ID                             | Previous Commit| Total Records Updated| Total Records Written| Total Bytes Written| Total Errors|
+    |=======================================================================================================================================================|
+    ....
+    ....
+```
+
+
+### FileSystem View
+
+Hudi views each partition as a collection of file-groups with each file-group containing a list of file-slices in commit order (See concepts). 
+The below commands allow users to view the file-slices for a data-set.
+
+```java
+hudi:stock_ticks_mor->show fsview all
+ ....
+  _______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________
+ | Partition | FileId | Base-Instant | Data-File | Data-File Size| Num Delta Files| Total Delta File Size| Delta Files |
+ |==============================================================================================================================================================================================================================================================================================================================================================================================================|
+ | 2018/08/31| 111415c3-f26d-4639-86c8-f9956f245ac3| 20181002180759| hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/111415c3-f26d-4639-86c8-f9956f245ac3_0_20181002180759.parquet| 432.5 KB | 1 | 20.8 KB | [HoodieLogFile {hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/.111415c3-f26d-4639-86c8-f9956f245ac3_20181002180759.log.1}]|
+
+
+
+hudi:stock_ticks_mor->show fsview latest --partitionPath "2018/08/31"
+ ......
+ ___________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________ [...]
+ | Partition | FileId | Base-Instant | Data-File | Data-File Size| Num Delta Files| Total Delta Size| Delta Size - compaction scheduled| Delta Size - compaction unscheduled| Delta To Base Ratio - compaction scheduled| Delta To Base Ratio - compaction unscheduled| Delta Files - compaction scheduled | Delta Files - compaction unscheduled|
+ |========================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================== [...]
+ | 2018/08/31| 111415c3-f26d-4639-86c8-f9956f245ac3| 20181002180759| hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/111415c3-f26d-4639-86c8-f9956f245ac3_0_20181002180759.parquet| 432.5 KB | 1 | 20.8 KB | 20.8 KB | 0.0 B | 0.0 B | 0.0 B | [HoodieLogFile {hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/.111415c3-f26d-4639-86c8-f9956f245ac3_20181002180759.log.1}]| [] |
+
+```
+
+
+### Statistics
+
+Since Hudi directly manages file sizes for DFS tables, it might be good to get an overall picture:
+
+
+```java
+hudi:trips->stats filesizes --partitionPath 2016/09/01 --sortBy "95th" --desc true --limit 10
+    ________________________________________________________________________________________________
+    | CommitTime    | Min     | 10th    | 50th    | avg     | 95th    | Max     | NumFiles| StdDev  |
+    |===============================================================================================|
+    | <COMMIT_ID>   | 93.9 MB | 93.9 MB | 93.9 MB | 93.9 MB | 93.9 MB | 93.9 MB | 2       | 2.3 KB  |
+    ....
+    ....
+```
+
+If a Hudi write is taking much longer than usual, it might be good to check the write amplification for any sudden increases:
+
+
+```java
+hudi:trips->stats wa
+    __________________________________________________________________________
+    | CommitTime    | Total Upserted| Total Written| Write Amplifiation Factor|
+    |=========================================================================|
+    ....
+    ....
+```
+
+
+### Archived Commits
+
+In order to limit the growth of .commit files on DFS, Hudi archives older .commit files (with due respect to the cleaner policy) into a commits.archived file.
+This is a sequence file that contains a mapping from commitNumber => json with raw information about the commit (the same information that is nicely rolled up above).
+
+
+### Compactions
+
+To get an idea of the lag between compaction and writer applications, use the below command to list all
+pending compactions.
+
+```java
+hudi:trips->compactions show all
+     ___________________________________________________________________
+    | Compaction Instant Time| State    | Total FileIds to be Compacted|
+    |==================================================================|
+    | <INSTANT_1>            | REQUESTED| 35                           |
+    | <INSTANT_2>            | INFLIGHT | 27                           |
+```
+
+To inspect a specific compaction plan, use
+
+```java
+hudi:trips->compaction show --instant <INSTANT_1>
+    _________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________
+    | Partition Path| File Id | Base Instant  | Data File Path                                    | Total Delta Files| getMetrics                                                                                                                    |
+    |================================================================================================================================================================================================================================================
+    | 2018/07/17    | <UUID>  | <INSTANT_1>   | viewfs://ns-default/.../../UUID_<INSTANT>.parquet | 1                | {TOTAL_LOG_FILES=1.0, TOTAL_IO_READ_MB=1230.0, TOTAL_LOG_FILES_SIZE=2.51255751E8, TOTAL_IO_WRITE_MB=991.0, TOTAL_IO_MB=2221.0}|
+
+```
+
+To manually schedule or run a compaction, use the below command. This command uses the spark launcher to perform compaction
+operations. 
+
+**NOTE:** Make sure no other application is scheduling compaction for this table concurrently
+{: .notice--info}
+
+```java
+hudi:trips->help compaction schedule
+Keyword:                   compaction schedule
+Description:               Schedule Compaction
+ Keyword:                  sparkMemory
+   Help:                   Spark executor memory
+   Mandatory:              false
+   Default if specified:   '__NULL__'
+   Default if unspecified: '1G'
+
+* compaction schedule - Schedule Compaction
+```
+
+```java
+hudi:trips->help compaction run
+Keyword:                   compaction run
+Description:               Run Compaction for given instant time
+ Keyword:                  tableName
+   Help:                   Table name
+   Mandatory:              true
+   Default if specified:   '__NULL__'
+   Default if unspecified: '__NULL__'
+
+ Keyword:                  parallelism
+   Help:                   Parallelism for hoodie compaction
+   Mandatory:              true
+   Default if specified:   '__NULL__'
+   Default if unspecified: '__NULL__'
+
+ Keyword:                  schemaFilePath
+   Help:                   Path for Avro schema file
+   Mandatory:              true
+   Default if specified:   '__NULL__'
+   Default if unspecified: '__NULL__'
+
+ Keyword:                  sparkMemory
+   Help:                   Spark executor memory
+   Mandatory:              true
+   Default if specified:   '__NULL__'
+   Default if unspecified: '__NULL__'
+
+ Keyword:                  retry
+   Help:                   Number of retries
+   Mandatory:              true
+   Default if specified:   '__NULL__'
+   Default if unspecified: '__NULL__'
+
+ Keyword:                  compactionInstant
+   Help:                   Base path for the target hoodie table
+   Mandatory:              true
+   Default if specified:   '__NULL__'
+   Default if unspecified: '__NULL__'
+
+* compaction run - Run Compaction for given instant time
+```
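+
+Putting the above help output together, a typical session could look like the following sketch (the compaction instant, schema path and resource settings are placeholders to substitute for your table):
+
+```java
+# Illustrative only -- substitute your own compaction instant and schema path
+hudi:trips->compaction schedule --sparkMemory 1G
+...
+hudi:trips->compaction run --compactionInstant <INSTANT_1> --parallelism 2 --sparkMemory 1G --schemaFilePath /path/to/schema.avsc --retry 1
+...
+```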
+
+### Validate Compaction
+
+Validating a compaction plan checks whether all the files necessary for the compaction are present and valid
+
+```java
+hudi:stock_ticks_mor->compaction validate --instant 20181005222611
+...
+
+   COMPACTION PLAN VALID
+
+    ___________________________________________________________________________________________________________________________________________________________________________________________________________________________
+    | File Id                             | Base Instant Time| Base Data File                                                                                                                   | Num Delta Files| Valid| Error|
+    |==========================================================================================================================================================================================================================|
+    | 05320e98-9a57-4c38-b809-a6beaaeb36bd| 20181005222445   | hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/05320e98-9a57-4c38-b809-a6beaaeb36bd_0_20181005222445.parquet| 1              | true |      |
+
+
+
+hudi:stock_ticks_mor->compaction validate --instant 20181005222601
+
+   COMPACTION PLAN INVALID
+
+    _______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________
+    | File Id                             | Base Instant Time| Base Data File                                                                                                                   | Num Delta Files| Valid| Error                                                                           |
+    |=====================================================================================================================================================================================================================================================================================================|
+    | 05320e98-9a57-4c38-b809-a6beaaeb36bd| 20181005222445   | hdfs://namenode:8020/user/hive/warehouse/stock_ticks_mor/2018/08/31/05320e98-9a57-4c38-b809-a6beaaeb36bd_0_20181005222445.parquet| 1              | false| All log files specified in compaction operation is not present. Missing ....    |
+```
+
+**NOTE:** The following commands must be executed without any other writer/ingestion application running.
+{: .notice--warning}
+
+Sometimes, it becomes necessary to remove a fileId from a compaction plan in order to speed up or unblock the compaction
+operation. Any new log files written to this file after the compaction got scheduled will be safely renamed
+so that they are preserved. Hudi provides the following CLI command to support this:
+
+
+### Unscheduling Compaction
+
+```java
+hudi:trips->compaction unscheduleFileId --fileId <FileUUID>
+....
+No File renames needed to unschedule file from pending compaction. Operation successful.
+```
+
+In other cases, an entire compaction plan needs to be reverted. This is supported by the following CLI command:
+
+```java
+hudi:trips->compaction unschedule --compactionInstant <compactionInstant>
+.....
+No File renames needed to unschedule pending compaction. Operation successful.
+```
+
+### Repair Compaction
+
+The above compaction unscheduling operations can sometimes fail partially (e.g. DFS temporarily unavailable). With
+partial failures, the compaction operation could become inconsistent with the state of file-slices. Running
+`compaction validate` will surface any such invalid compaction operations. In these cases, the repair
+command comes to the rescue: it rearranges the file-slices so that there is no data loss and the file-slices are
+consistent with the compaction plan.
+
+```java
+hudi:stock_ticks_mor->compaction repair --instant 20181005222611
+......
+Compaction successfully repaired
+.....
+```
+
+## Troubleshooting
+
+The section below aids in debugging Hudi failures. Out of the box, the following metadata fields are added to every record to help triage issues easily using standard Hadoop SQL engines (Hive/PrestoDB/Spark); a sample query using these fields is shown after the list.
+
+ - **_hoodie_record_key** - Treated as a primary key within each DFS partition, basis of all updates/inserts
+ - **_hoodie_commit_time** - Last commit that touched this record
+ - **_hoodie_file_name** - Actual file name containing the record (super useful to triage duplicates)
+ - **_hoodie_partition_path** - Path from basePath that identifies the partition containing this record
+ 
+ For performance-related issues, please refer to the [tuning guide](https://cwiki.apache.org/confluence/display/HUDI/Tuning+Guide)
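+
+For example, to see which file and commit last touched a given record, a query along these lines can be issued from Hive or Spark SQL (the table name and key value below are placeholders):
+
+```java
+select `_hoodie_commit_time`, `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`
+from <your_hudi_table>
+where `_hoodie_record_key` = '<record_key>';
+```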
+
+
+### Missing records
+
+Please check if there were any write errors using the admin commands above, during the window in which the record could have been written.
+If you do find errors, then the record was not actually written by Hudi, but handed back to the application to decide what to do with it.
+
+### Duplicates
+
+First of all, please confirm that you do indeed have duplicates **AFTER** ensuring the query is accessing the Hudi table [properly](/docs/querying_data).
+
+ - If confirmed, please use the metadata fields above to identify the physical files & partitions containing the records (the example query below can help).
+ - If duplicates span files across partition paths, then your application is generating different partitionPaths for the same recordKey; please fix your application.
+ - If duplicates span multiple files within the same partition path, please engage with the mailing list. This should not happen. You can use the `records deduplicate` command to fix your data.
+
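+As a sketch, a query like the following (table name is a placeholder) surfaces duplicate keys along with the files and commits that wrote them, from Hive or Spark SQL:
+
+```java
+select `_hoodie_record_key`, `_hoodie_partition_path`,
+       collect_set(`_hoodie_file_name`) as files, collect_set(`_hoodie_commit_time`) as commits, count(*) as dup_count
+from <your_hudi_table>
+group by `_hoodie_record_key`, `_hoodie_partition_path`
+having count(*) > 1;
+```
+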
+### Spark failures {#spark-ui}
+
+A typical upsert() DAG looks like the one below. Note that the Hudi client also caches intermediate RDDs to intelligently profile the workload and size files and Spark parallelism.
+Also, the Spark UI shows sortByKey twice due to the probe job also being shown; nonetheless it is just a single sort.
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_upsert_dag.png").default} alt="hudi_upsert_dag.png"  />
+</figure>
+
+At a high level, there are two steps
+
+**Index Lookup to identify files to be changed**
+
+ - Job 1 : Triggers the input data read, converts to HoodieRecord objects and then stops at obtaining a spread of input records to target partition paths
+ - Job 2 : Loads the set of file names which we need to check against
+ - Job 3 & 4 : Actual lookup after smart sizing of Spark join parallelism, by joining the RDDs in 1 & 2 above
+ - Job 5 : Produces a tagged RDD of recordKeys with locations
+
+**Performing the actual writing of data**
+
+ - Job 6 : Lazy join of incoming records against the (recordKey, location) pairs to provide a final set of HoodieRecords which now contain the information about which file/partition path they are found at (or null if they are inserts). The workload is also profiled again to determine file sizing
+ - Job 7 : Actual writing of data (updates + inserts + inserts turned into updates to maintain file size)
+
+Depending on the exception source (Hudi/Spark), the above knowledge of the DAG can be used to pinpoint the actual issue. The most often encountered failures result from YARN/DFS temporary outages.
+In the future, a more sophisticated debug/management UI will be added to the project to help automate some of this debugging.
diff --git a/website/docs/docker_demo.md b/website/docs/docker_demo.md
new file mode 100644
index 0000000..98b0417
--- /dev/null
+++ b/website/docs/docker_demo.md
@@ -0,0 +1,1235 @@
+---
+title: Docker Demo
+keywords: [ hudi, docker, demo]
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+## A Demo using docker containers
+
+Let's use a real-world example to see how Hudi works end to end. For this purpose, a self-contained
+data infrastructure is brought up in a local Docker cluster within your computer.
+
+The steps have been tested on a Mac laptop.
+
+### Prerequisites
+
+  * Docker Setup : For Mac, please follow the steps as defined in [https://docs.docker.com/v17.12/docker-for-mac/install/]. For running Spark-SQL queries, please ensure at least 6 GB and 4 CPUs are allocated to Docker (see Docker -> Preferences -> Advanced). Otherwise, Spark-SQL queries could be killed because of memory issues.
+  * kafkacat : A command-line utility to publish/consume from Kafka topics. Use `brew install kafkacat` to install kafkacat.
+  * /etc/hosts : The demo references many services running in container by the hostname. Add the following settings to /etc/hosts
+
+    ```java
+    127.0.0.1 adhoc-1
+    127.0.0.1 adhoc-2
+    127.0.0.1 namenode
+    127.0.0.1 datanode1
+    127.0.0.1 hiveserver
+    127.0.0.1 hivemetastore
+    127.0.0.1 kafkabroker
+    127.0.0.1 sparkmaster
+    127.0.0.1 zookeeper
+    ```
+  * Java : Java SE Development Kit 8.
+  * Maven : A build automation tool for Java projects.
+  * jq : A lightweight and flexible command-line JSON processor. Use `brew install jq` to install jq.
+  
+Also, this has not been tested on some environments like Docker on Windows.
+
+
+## Setting up Docker Cluster
+
+
+### Build Hudi
+
+The first step is to build Hudi. **Note:** This step builds Hudi against the default supported Scala version - 2.11.
+```java
+cd <HUDI_WORKSPACE>
+mvn package -DskipTests
+```
+
+### Bringing up Demo Cluster
+
+The next step is to run the docker-compose script and set up configs for bringing up the cluster.
+This should pull the Docker images from Docker Hub and set up the Docker cluster.
+
+```java
+cd docker
+./setup_demo.sh
+....
+....
+....
+Stopping spark-worker-1            ... done
+Stopping hiveserver                ... done
+Stopping hivemetastore             ... done
+Stopping historyserver             ... done
+.......
+......
+Creating network "compose_default" with the default driver
+Creating volume "compose_namenode" with default driver
+Creating volume "compose_historyserver" with default driver
+Creating volume "compose_hive-metastore-postgresql" with default driver
+Creating hive-metastore-postgresql ... done
+Creating namenode                  ... done
+Creating zookeeper                 ... done
+Creating kafkabroker               ... done
+Creating hivemetastore             ... done
+Creating historyserver             ... done
+Creating hiveserver                ... done
+Creating datanode1                 ... done
+Creating presto-coordinator-1      ... done
+Creating sparkmaster               ... done
+Creating presto-worker-1           ... done
+Creating adhoc-1                   ... done
+Creating adhoc-2                   ... done
+Creating spark-worker-1            ... done
+Copying spark default config and setting up configs
+Copying spark default config and setting up configs
+$ docker ps
+```
+
+At this point, the Docker cluster will be up and running. The demo cluster brings up the following services:
+
+   * HDFS Services (NameNode, DataNode)
+   * Spark Master and Worker
+   * Hive Services (Metastore, HiveServer2 along with PostgresDB)
+   * Kafka Broker and a Zookeeper Node (Kafka will be used as upstream source for the demo)
+   * Adhoc containers to run Hudi/Hive CLI commands
+
+## Demo
+
+Stock Tracker data will be used to showcase different Hudi query types and the effects of Compaction.
+
+Take a look at the directory `docker/demo/data`. There are 2 batches of stock data - each at 1 minute granularity.
+The first batch contains stock tracker data for some stock symbols during the first hour of the trading window
+(9:30 a.m. to 10:30 a.m.). The second batch contains tracker data for the next 30 minutes (10:30 - 11 a.m.). Hudi will
+be used to ingest these batches to a table which will contain the latest stock tracker data at hour-level granularity.
+The batches are windowed intentionally so that the second batch contains updates to some of the rows in the first batch.
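+
+To get a quick feel for the input, you can pretty-print a couple of records from the first batch (a sketch; this assumes each line of the batch file is a standalone JSON record, which is how kafkacat publishes them below):
+
+```java
+head -n 2 docker/demo/data/batch_1.json | jq .
+```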
+
+### Step 1 : Publish the first batch to Kafka
+
+Upload the first batch to the Kafka topic 'stock_ticks': `cat docker/demo/data/batch_1.json | kafkacat -b kafkabroker -t stock_ticks -P`
+
+To check if the new topic shows up, use
+```java
+kafkacat -b kafkabroker -L -J | jq .
+{
+  "originating_broker": {
+    "id": 1001,
+    "name": "kafkabroker:9092/1001"
+  },
+  "query": {
+    "topic": "*"
+  },
+  "brokers": [
+    {
+      "id": 1001,
+      "name": "kafkabroker:9092"
+    }
+  ],
+  "topics": [
+    {
+      "topic": "stock_ticks",
+      "partitions": [
+        {
+          "partition": 0,
+          "leader": 1001,
+          "replicas": [
+            {
+              "id": 1001
+            }
+          ],
+          "isrs": [
+            {
+              "id": 1001
+            }
+          ]
+        }
+      ]
+    }
+  ]
+}
+```
+
+### Step 2: Incrementally ingest data from Kafka topic
+
+Hudi comes with a tool named DeltaStreamer. This tool can connect to a variety of data sources (including Kafka) to
+pull changes and apply them to a Hudi table using upsert/insert primitives. Here, we will use the tool to download
+JSON data from the Kafka topic and ingest it into both COW and MOR tables. The tool
+automatically initializes the tables in the file system if they do not exist yet.
+
+```java
+docker exec -it adhoc-2 /bin/bash
+
+# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow table in HDFS
+spark-submit \
+  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
+  --table-type COPY_ON_WRITE \
+  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
+  --source-ordering-field ts  \
+  --target-base-path /user/hive/warehouse/stock_ticks_cow \
+  --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties \
+  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider
+
+# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor table in HDFS
+spark-submit \
+  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
+  --table-type MERGE_ON_READ \
+  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
+  --source-ordering-field ts \
+  --target-base-path /user/hive/warehouse/stock_ticks_mor \
+  --target-table stock_ticks_mor \
+  --props /var/demo/config/kafka-source.properties \
+  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+  --disable-compaction
+
+# As part of the setup (look at setup_demo.sh), the configs needed for DeltaStreamer are uploaded to HDFS. The configs
+# contain mostly Kafka connectivity settings, the avro-schema to be used for ingesting along with key and partitioning fields.
+
+exit
+```
+
+You can use the HDFS web UI to look at the tables
+`http://namenode:50070/explorer#/user/hive/warehouse/stock_ticks_cow`.
+
+You can explore the new partition folder created in the table, along with a "commit" / "deltacommit"
+file under .hoodie which signals a successful commit.
+
+There will be a similar setup when you browse the MOR table
+`http://namenode:50070/explorer#/user/hive/warehouse/stock_ticks_mor`.
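+
+If you prefer the command line over the web UI, the same layout can be inspected with the HDFS client (a sketch, assuming the Hadoop client is available inside the adhoc containers as in this demo setup; the exact file names will depend on your commit times):
+
+```java
+docker exec -it adhoc-1 hdfs dfs -ls /user/hive/warehouse/stock_ticks_cow/.hoodie
+docker exec -it adhoc-1 hdfs dfs -ls /user/hive/warehouse/stock_ticks_mor/.hoodie
+```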
+
+
+### Step 3: Sync with Hive
+
+At this step, the tables are available in HDFS. We need to sync with Hive to create new Hive tables and add partitions
+in order to run Hive queries against those tables.
+
+```java
+docker exec -it adhoc-2 /bin/bash
+
+# This command takes in the HiveServer URL and the COW Hudi table location in HDFS and syncs the HDFS state to Hive
+/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
+  --jdbc-url jdbc:hive2://hiveserver:10000 \
+  --user hive \
+  --pass hive \
+  --partitioned-by dt \
+  --base-path /user/hive/warehouse/stock_ticks_cow \
+  --database default \
+  --table stock_ticks_cow
+.....
+2020-01-25 19:51:28,953 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_cow
+.....
+
+# Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR table type)
+/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
+  --jdbc-url jdbc:hive2://hiveserver:10000 \
+  --user hive \
+  --pass hive \
+  --partitioned-by dt \
+  --base-path /user/hive/warehouse/stock_ticks_mor \
+  --database default \
+  --table stock_ticks_mor
+...
+2020-01-25 19:51:51,066 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_ro
+...
+2020-01-25 19:51:51,569 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_rt
+....
+
+exit
+```
+After executing the above commands, you will notice:
+
+1. A Hive table named `stock_ticks_cow` created, which supports Snapshot and Incremental queries on the Copy On Write table.
+2. Two new tables `stock_ticks_mor_rt` and `stock_ticks_mor_ro` created for the Merge On Read table. The former
+supports Snapshot and Incremental queries (providing near-real time data) while the latter supports ReadOptimized queries.
+
+
+### Step 4 (a): Run Hive Queries
+
+Run a Hive query to find the latest timestamp ingested for stock symbol 'GOOG'. You will notice that both snapshot
+(for both COW and MOR _rt table) and read-optimized queries (for MOR _ro table) give the same value "10:29 a.m." as Hudi created a
+parquet file for the first batch of data.
+
+```java
+docker exec -it adhoc-2 /bin/bash
+beeline -u jdbc:hive2://hiveserver:10000 \
+  --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
+  --hiveconf hive.stats.autogather=false
+
+# List Tables
+0: jdbc:hive2://hiveserver:10000> show tables;
++---------------------+--+
+|      tab_name       |
++---------------------+--+
+| stock_ticks_cow     |
+| stock_ticks_mor_ro  |
+| stock_ticks_mor_rt  |
++---------------------+--+
+3 rows selected (1.199 seconds)
+0: jdbc:hive2://hiveserver:10000>
+
+
+# Look at partitions that were added
+0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt;
++----------------+--+
+|   partition    |
++----------------+--+
+| dt=2018-08-31  |
++----------------+--+
+1 row selected (0.24 seconds)
+
+
+# COPY-ON-WRITE Queries:
+=========================
+
+
+0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
++---------+----------------------+--+
+| symbol  |         _c1          |
++---------+----------------------+--+
+| GOOG    | 2018-08-31 10:29:00  |
++---------+----------------------+--+
+
+Now, run a projection query:
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924221953       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924221953       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+
+# Merge-On-Read Queries:
+==========================
+
+Lets run similar queries against M-O-R table. Lets look at both 
+ReadOptimized and Snapshot(realtime data) queries supported by M-O-R table
+
+# Run ReadOptimized Query. Notice that the latest timestamp is 10:29
+0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
+WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
++---------+----------------------+--+
+| symbol  |         _c1          |
++---------+----------------------+--+
+| GOOG    | 2018-08-31 10:29:00  |
++---------+----------------------+--+
+1 row selected (6.326 seconds)
+
+
+# Run Snapshot Query. Notice that the latest timestamp is again 10:29
+
+0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
+WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
++---------+----------------------+--+
+| symbol  |         _c1          |
++---------+----------------------+--+
+| GOOG    | 2018-08-31 10:29:00  |
++---------+----------------------+--+
+1 row selected (1.606 seconds)
+
+
+# Run Read Optimized and Snapshot project queries
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+exit
+```
+
+### Step 4 (b): Run Spark-SQL Queries
+Hudi supports Spark as a query engine, just like Hive. Here are the same Hive queries
+running in Spark-SQL:
+
+```java
+docker exec -it adhoc-1 /bin/bash
+$SPARK_INSTALL/bin/spark-shell \
+  --jars $HUDI_SPARK_BUNDLE \
+  --master local[2] \
+  --driver-class-path $HADOOP_CONF_DIR \
+  --conf spark.sql.hive.convertMetastoreParquet=false \
+  --deploy-mode client \
+  --driver-memory 1G \
+  --executor-memory 3G \
+  --num-executors 1 \
+  --packages org.apache.spark:spark-avro_2.11:2.4.4
+...
+
+Welcome to
+      ____              __
+     / __/__  ___ _____/ /__
+    _\ \/ _ \/ _ `/ __/  '_/
+   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
+      /_/
+
+Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_212)
+Type in expressions to have them evaluated.
+Type :help for more information.
+
+scala> spark.sql("show tables").show(100, false)
++--------+------------------+-----------+
+|database|tableName         |isTemporary|
++--------+------------------+-----------+
+|default |stock_ticks_cow   |false      |
+|default |stock_ticks_mor_ro|false      |
+|default |stock_ticks_mor_rt|false      |
++--------+------------------+-----------+
+
+# Copy-On-Write Table
+
+## Run max timestamp query against COW table
+
+scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)
+[Stage 0:>                                                          (0 + 1) / 1]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
+SLF4J: Defaulting to no-operation (NOP) logger implementation
+SLF4J: See http://www.slf4j.org/codes#StaticLoggerBinder for further details.
++------+-------------------+
+|symbol|max(ts)            |
++------+-------------------+
+|GOOG  |2018-08-31 10:29:00|
++------+-------------------+
+
+## Projection Query
+
+scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG'").show(100, false)
++-------------------+------+-------------------+------+---------+--------+
+|_hoodie_commit_time|symbol|ts                 |volume|open     |close   |
++-------------------+------+-------------------+------+---------+--------+
+|20180924221953     |GOOG  |2018-08-31 09:59:00|6330  |1230.5   |1230.02 |
+|20180924221953     |GOOG  |2018-08-31 10:29:00|3391  |1230.1899|1230.085|
++-------------------+------+-------------------+------+---------+--------+
+
+# Merge-On-Read Queries:
+==========================
+
+Lets run similar queries against M-O-R table. Lets look at both
+ReadOptimized and Snapshot queries supported by M-O-R table
+
+# Run ReadOptimized Query. Notice that the latest timestamp is 10:29
+scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'").show(100, false)
++------+-------------------+
+|symbol|max(ts)            |
++------+-------------------+
+|GOOG  |2018-08-31 10:29:00|
++------+-------------------+
+
+
+# Run Snapshot Query. Notice that the latest timestamp is again 10:29
+
+scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
++------+-------------------+
+|symbol|max(ts)            |
++------+-------------------+
+|GOOG  |2018-08-31 10:29:00|
++------+-------------------+
+
+# Run Read Optimized and Snapshot project queries
+
+scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG'").show(100, false)
++-------------------+------+-------------------+------+---------+--------+
+|_hoodie_commit_time|symbol|ts                 |volume|open     |close   |
++-------------------+------+-------------------+------+---------+--------+
+|20180924222155     |GOOG  |2018-08-31 09:59:00|6330  |1230.5   |1230.02 |
+|20180924222155     |GOOG  |2018-08-31 10:29:00|3391  |1230.1899|1230.085|
++-------------------+------+-------------------+------+---------+--------+
+
+scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG'").show(100, false)
++-------------------+------+-------------------+------+---------+--------+
+|_hoodie_commit_time|symbol|ts                 |volume|open     |close   |
++-------------------+------+-------------------+------+---------+--------+
+|20180924222155     |GOOG  |2018-08-31 09:59:00|6330  |1230.5   |1230.02 |
+|20180924222155     |GOOG  |2018-08-31 10:29:00|3391  |1230.1899|1230.085|
++-------------------+------+-------------------+------+---------+--------+
+```
+
+### Step 4 (c): Run Presto Queries
+
+Here are Presto queries equivalent to the Hive and Spark queries above. Currently, Presto does not support snapshot or incremental queries on Hudi tables.
+
+```java
+docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090
+presto> show catalogs;
+  Catalog
+-----------
+ hive
+ jmx
+ localfile
+ system
+(4 rows)
+
+Query 20190817_134851_00000_j8rcz, FINISHED, 1 node
+Splits: 19 total, 19 done (100.00%)
+0:04 [0 rows, 0B] [0 rows/s, 0B/s]
+
+presto> use hive.default;
+USE
+presto:default> show tables;
+       Table
+--------------------
+ stock_ticks_cow
+ stock_ticks_mor_ro
+ stock_ticks_mor_rt
+(3 rows)
+
+Query 20190822_181000_00001_segyw, FINISHED, 2 nodes
+Splits: 19 total, 19 done (100.00%)
+0:05 [3 rows, 99B] [0 rows/s, 18B/s]
+
+
+# COPY-ON-WRITE Queries:
+=========================
+
+
+presto:default> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+ symbol |        _col1
+--------+---------------------
+ GOOG   | 2018-08-31 10:29:00
+(1 row)
+
+Query 20190822_181011_00002_segyw, FINISHED, 1 node
+Splits: 49 total, 49 done (100.00%)
+0:12 [197 rows, 613B] [16 rows/s, 50B/s]
+
+presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
+ _hoodie_commit_time | symbol |         ts          | volume |   open    |  close
+---------------------+--------+---------------------+--------+-----------+----------
+ 20190822180221      | GOOG   | 2018-08-31 09:59:00 |   6330 |    1230.5 |  1230.02
+ 20190822180221      | GOOG   | 2018-08-31 10:29:00 |   3391 | 1230.1899 | 1230.085
+(2 rows)
+
+Query 20190822_181141_00003_segyw, FINISHED, 1 node
+Splits: 17 total, 17 done (100.00%)
+0:02 [197 rows, 613B] [109 rows/s, 341B/s]
+
+
+# Merge-On-Read Queries:
+==========================
+
+Lets run similar queries against M-O-R table. 
+
+# Run ReadOptimized Query. Notice that the latest timestamp is 10:29
+presto:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
+ symbol |        _col1
+--------+---------------------
+ GOOG   | 2018-08-31 10:29:00
+(1 row)
+
+Query 20190822_181158_00004_segyw, FINISHED, 1 node
+Splits: 49 total, 49 done (100.00%)
+0:02 [197 rows, 613B] [110 rows/s, 343B/s]
+
+
+presto:default>  select "_hoodie_commit_time", symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG';
+ _hoodie_commit_time | symbol |         ts          | volume |   open    |  close
+---------------------+--------+---------------------+--------+-----------+----------
+ 20190822180250      | GOOG   | 2018-08-31 09:59:00 |   6330 |    1230.5 |  1230.02
+ 20190822180250      | GOOG   | 2018-08-31 10:29:00 |   3391 | 1230.1899 | 1230.085
+(2 rows)
+
+Query 20190822_181256_00006_segyw, FINISHED, 1 node
+Splits: 17 total, 17 done (100.00%)
+0:02 [197 rows, 613B] [92 rows/s, 286B/s]
+
+presto:default> exit
+```
+
+### Step 5: Upload second batch to Kafka and run DeltaStreamer to ingest
+
+Upload the second batch of data and ingest this batch using DeltaStreamer. As this batch does not bring in any new
+partitions, there is no need to run hive-sync.
+
+```java
+cat docker/demo/data/batch_2.json | kafkacat -b kafkabroker -t stock_ticks -P
+
+# Within Docker container, run the ingestion command
+docker exec -it adhoc-2 /bin/bash
+
+# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow table in HDFS
+spark-submit \
+  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
+  --table-type COPY_ON_WRITE \
+  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
+  --source-ordering-field ts \
+  --target-base-path /user/hive/warehouse/stock_ticks_cow \
+  --target-table stock_ticks_cow \
+  --props /var/demo/config/kafka-source.properties \
+  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider
+
+# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor table in HDFS
+spark-submit \
+  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
+  --table-type MERGE_ON_READ \
+  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
+  --source-ordering-field ts \
+  --target-base-path /user/hive/warehouse/stock_ticks_mor \
+  --target-table stock_ticks_mor \
+  --props /var/demo/config/kafka-source.properties \
+  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+  --disable-compaction
+
+exit
+```
+
+With the Copy-On-Write table, the second ingestion by DeltaStreamer results in a new version of the Parquet file getting created.
+See `http://namenode:50070/explorer#/user/hive/warehouse/stock_ticks_cow/2018/08/31`
+
+With the Merge-On-Read table, the second ingestion merely appends the batch to an unmerged delta (log) file.
+Take a look at the HDFS filesystem to get an idea: `http://namenode:50070/explorer#/user/hive/warehouse/stock_ticks_mor/2018/08/31`
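+
+The same check can be done from the command line (a sketch, assuming the Hadoop client inside the adhoc containers as before): the COW partition should now list two parquet file versions, while the MOR partition should list a `.log` file alongside the original parquet file.
+
+```java
+docker exec -it adhoc-1 hdfs dfs -ls /user/hive/warehouse/stock_ticks_cow/2018/08/31
+docker exec -it adhoc-1 hdfs dfs -ls /user/hive/warehouse/stock_ticks_mor/2018/08/31
+```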
+
+### Step 6 (a): Run Hive Queries
+
+With the Copy-On-Write table, the Snapshot query immediately sees the changes from the second batch once the batch
+is committed, as each ingestion creates newer versions of parquet files.
+
+With the Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file.
+This is when ReadOptimized and Snapshot queries start providing different results. The ReadOptimized query will still
+return "10:29 a.m." as it only reads from the Parquet file. The Snapshot query will do an on-the-fly merge and return
+the latest committed data, which is "10:59 a.m.".
+
+```java
+docker exec -it adhoc-2 /bin/bash
+beeline -u jdbc:hive2://hiveserver:10000 \
+  --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
+  --hiveconf hive.stats.autogather=false
+
+# Copy On Write Table:
+
+0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
++---------+----------------------+--+
+| symbol  |         _c1          |
++---------+----------------------+--+
+| GOOG    | 2018-08-31 10:59:00  |
++---------+----------------------+--+
+1 row selected (1.932 seconds)
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924221953       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924224524       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.
+
+
+# Merge On Read Table:
+
+# Read Optimized Query
+0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
+WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
++---------+----------------------+--+
+| symbol  |         _c1          |
++---------+----------------------+--+
+| GOOG    | 2018-08-31 10:29:00  |
++---------+----------------------+--+
+1 row selected (1.6 seconds)
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+# Snapshot Query
+0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
+WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
++---------+----------------------+--+
+| symbol  |         _c1          |
++---------+----------------------+--+
+| GOOG    | 2018-08-31 10:59:00  |
++---------+----------------------+--+
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924224537       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+exit
+```
+
+### Step 6 (b): Run Spark SQL Queries
+
+Running the same queries in Spark-SQL:
+
+```java
+docker exec -it adhoc-1 /bin/bash
+$SPARK_INSTALL/bin/spark-shell \
+  --jars $HUDI_SPARK_BUNDLE \
+  --driver-class-path $HADOOP_CONF_DIR \
+  --conf spark.sql.hive.convertMetastoreParquet=false \
+  --deploy-mode client \
+  --driver-memory 1G \
+  --master local[2] \
+  --executor-memory 3G \
+  --num-executors 1 \
+  --packages org.apache.spark:spark-avro_2.11:2.4.4
+
+# Copy On Write Table:
+
+scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)
++------+-------------------+
+|symbol|max(ts)            |
++------+-------------------+
+|GOOG  |2018-08-31 10:59:00|
++------+-------------------+
+
+scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG'").show(100, false)
+
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924221953       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924224524       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.
+
+
+# Merge On Read Table:
+
+# Read Optimized Query
+scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'").show(100, false)
++---------+----------------------+
+| symbol  |         _c1          |
++---------+----------------------+
+| GOOG    | 2018-08-31 10:29:00  |
++---------+----------------------+
+1 row selected (1.6 seconds)
+
+scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG'").show(100, false)
++----------------------+---------+----------------------+---------+------------+-----------+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+
+| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
++----------------------+---------+----------------------+---------+------------+-----------+
+
+# Snapshot Query
+scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
++---------+----------------------+
+| symbol  |         _c1          |
++---------+----------------------+
+| GOOG    | 2018-08-31 10:59:00  |
++---------+----------------------+
+
+scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG'").show(100, false)
++----------------------+---------+----------------------+---------+------------+-----------+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+
+| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924224537       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+
+
+exit
+```
+
+### Step 6 (c): Run Presto Queries
+
+Running the same ReadOptimized queries on Presto:
+
+```java
+docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090
+presto> use hive.default;
+USE
+
+# Copy On Write Table:
+
+presto:default>select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+ symbol |        _col1
+--------+---------------------
+ GOOG   | 2018-08-31 10:59:00
+(1 row)
+
+Query 20190822_181530_00007_segyw, FINISHED, 1 node
+Splits: 49 total, 49 done (100.00%)
+0:02 [197 rows, 613B] [125 rows/s, 389B/s]
+
+presto:default>select "_hoodie_commit_time", symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG';
+ _hoodie_commit_time | symbol |         ts          | volume |   open    |  close
+---------------------+--------+---------------------+--------+-----------+----------
+ 20190822180221      | GOOG   | 2018-08-31 09:59:00 |   6330 |    1230.5 |  1230.02
+ 20190822181433      | GOOG   | 2018-08-31 10:59:00 |   9021 | 1227.1993 | 1227.215
+(2 rows)
+
+Query 20190822_181545_00008_segyw, FINISHED, 1 node
+Splits: 17 total, 17 done (100.00%)
+0:02 [197 rows, 613B] [106 rows/s, 332B/s]
+
+As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.
+
+
+# Merge On Read Table:
+
+# Read Optimized Query
+presto:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
+ symbol |        _col1
+--------+---------------------
+ GOOG   | 2018-08-31 10:29:00
+(1 row)
+
+Query 20190822_181602_00009_segyw, FINISHED, 1 node
+Splits: 49 total, 49 done (100.00%)
+0:01 [197 rows, 613B] [139 rows/s, 435B/s]
+
+presto:default>select "_hoodie_commit_time", symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG';
+ _hoodie_commit_time | symbol |         ts          | volume |   open    |  close
+---------------------+--------+---------------------+--------+-----------+----------
+ 20190822180250      | GOOG   | 2018-08-31 09:59:00 |   6330 |    1230.5 |  1230.02
+ 20190822180250      | GOOG   | 2018-08-31 10:29:00 |   3391 | 1230.1899 | 1230.085
+(2 rows)
+
+Query 20190822_181615_00010_segyw, FINISHED, 1 node
+Splits: 17 total, 17 done (100.00%)
+0:01 [197 rows, 613B] [154 rows/s, 480B/s]
+
+presto:default> exit
+```
+
+### Step 7 (a): Incremental Query for COPY-ON-WRITE Table
+
+With 2 batches of data ingested, let's showcase the support for incremental queries in Hudi Copy-On-Write tables.
+
+Let's take the same projection query example:
+
+```java
+docker exec -it adhoc-2 /bin/bash
+beeline -u jdbc:hive2://hiveserver:10000 \
+  --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
+  --hiveconf hive.stats.autogather=false
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924064621       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924065039       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+```
+
+As you can notice from the above queries, there are 2 commits - 20180924064621 and 20180924065039 in timeline order.
+When you follow the steps, you will get different timestamps for your commits. Substitute them
+in place of the above timestamps.
+
+To show the effects of the incremental query, let us assume that a reader has already seen the changes as part of
+ingesting the first batch. Now, for the reader to see the effect of the second batch, they have to set the start timestamp to
+the commit time of the first batch (20180924064621) and run an incremental query.
+
+Hudi's incremental mode provides efficient scanning for incremental queries by filtering out files that do not have any
+candidate rows, using Hudi-managed metadata.
+
+```java
+docker exec -it adhoc-2 /bin/bash
+beeline -u jdbc:hive2://hiveserver:10000 \
+  --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
+  --hiveconf hive.stats.autogather=false
+
+0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.mode=INCREMENTAL;
+No rows affected (0.009 seconds)
+0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.max.commits=3;
+No rows affected (0.009 seconds)
+0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.start.timestamp=20180924064621;
+```
+
+With the above settings, file-ids that do not have any updates from the commit 20180924065039 are filtered out without scanning.
+Here is the incremental query:
+
+```java
+0: jdbc:hive2://hiveserver:10000>
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064621';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924065039       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+1 row selected (0.83 seconds)
+0: jdbc:hive2://hiveserver:10000>
+```
+
+### Step 7 (b): Incremental Query with Spark SQL:
+
+```java
+docker exec -it adhoc-1 /bin/bash
+$SPARK_INSTALL/bin/spark-shell \
+  --jars $HUDI_SPARK_BUNDLE \
+  --driver-class-path $HADOOP_CONF_DIR \
+  --conf spark.sql.hive.convertMetastoreParquet=false \
+  --deploy-mode client \
+  --driver-memory 1G \
+  --master local[2] \
+  --executor-memory 3G \
+  --num-executors 1 \
+  --packages org.apache.spark:spark-avro_2.11:2.4.4
+
+Welcome to
+      ____              __
+     / __/__  ___ _____/ /__
+    _\ \/ _ \/ _ `/ __/  '_/
+   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
+      /_/
+
+Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_212)
+Type in expressions to have them evaluated.
+Type :help for more information.
+
+scala> import org.apache.hudi.DataSourceReadOptions
+import org.apache.hudi.DataSourceReadOptions
+
+# In the below query, 20180924064621 is the first commit's timestamp
+scala> val hoodieIncViewDF =  spark.read.format("org.apache.hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL).option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20180924064621").load("/user/hive/warehouse/stock_ticks_cow")
+SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
+SLF4J: Defaulting to no-operation (NOP) logger implementation
+SLF4J: See http://www.slf4j.org/codes#StaticLoggerBinder for further details.
+hoodieIncViewDF: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 15 more fields]
+
+scala> hoodieIncViewDF.registerTempTable("stock_ticks_cow_incr_tmp1")
+warning: there was one deprecation warning; re-run with -deprecation for details
+
+scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow_incr_tmp1 where  symbol = 'GOOG'").show(100, false);
++----------------------+---------+----------------------+---------+------------+-----------+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+
+| 20180924065039       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+
+```
+
+### Step 8: Schedule and Run Compaction for Merge-On-Read table
+
+Let's schedule and run a compaction to create a new version of the columnar file so that read-optimized readers will see fresher data.
+Again, you can use the Hudi CLI to manually schedule and run the compaction:
+
+```java
+docker exec -it adhoc-1 /bin/bash
+root@adhoc-1:/opt# /var/hoodie/ws/hudi-cli/hudi-cli.sh
+...
+Table command getting loaded
+HoodieSplashScreen loaded
+===================================================================
+*         ___                          ___                        *
+*        /\__\          ___           /\  \           ___         *
+*       / /  /         /\__\         /  \  \         /\  \        *
+*      / /__/         / /  /        / /\ \  \        \ \  \       *
+*     /  \  \ ___    / /  /        / /  \ \__\       /  \__\      *
+*    / /\ \  /\__\  / /__/  ___   / /__/ \ |__|     / /\/__/      *
+*    \/  \ \/ /  /  \ \  \ /\__\  \ \  \ / /  /  /\/ /  /         *
+*         \  /  /    \ \  / /  /   \ \  / /  /   \  /__/          *
+*         / /  /      \ \/ /  /     \ \/ /  /     \ \__\          *
+*        / /  /        \  /  /       \  /  /       \/__/          *
+*        \/__/          \/__/         \/__/    Apache Hudi CLI    *
+*                                                                 *
+===================================================================
+
+Welcome to Apache Hudi CLI. Please type help if you are looking for help.
+hudi->connect --path /user/hive/warehouse/stock_ticks_mor
+18/09/24 06:59:34 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+18/09/24 06:59:35 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
+18/09/24 06:59:35 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
+18/09/24 06:59:35 INFO table.HoodieTableConfig: Loading table properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
+18/09/24 06:59:36 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1) from /user/hive/warehouse/stock_ticks_mor
+Metadata for table stock_ticks_mor loaded
+hoodie:stock_ticks_mor->compactions show all
+20/02/10 03:41:32 INFO timeline.HoodieActiveTimeline: Loaded instants [[20200210015059__clean__COMPLETED], [20200210015059__deltacommit__COMPLETED], [20200210022758__clean__COMPLETED], [20200210022758__deltacommit__COMPLETED], [==>20200210023843__compaction__REQUESTED]]
+___________________________________________________________________
+| Compaction Instant Time| State    | Total FileIds to be Compacted|
+|==================================================================|
+
+# Schedule a compaction. This will use Spark Launcher to schedule compaction
+hoodie:stock_ticks_mor->compaction schedule
+....
+Compaction successfully completed for 20180924070031
+
+# Now refresh and check again. You will see that there is a new compaction requested
+
+hoodie:stock_ticks->connect --path /user/hive/warehouse/stock_ticks_mor
+18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
+18/09/24 07:01:16 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
+18/09/24 07:01:16 INFO table.HoodieTableConfig: Loading table properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
+18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1) from /user/hive/warehouse/stock_ticks_mor
+Metadata for table stock_ticks_mor loaded
+
+hoodie:stock_ticks_mor->compactions show all
+18/09/24 06:34:12 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924041125__clean__COMPLETED], [20180924041125__deltacommit__COMPLETED], [20180924042735__clean__COMPLETED], [20180924042735__deltacommit__COMPLETED], [==>20180924063245__compaction__REQUESTED]]
+___________________________________________________________________
+| Compaction Instant Time| State    | Total FileIds to be Compacted|
+|==================================================================|
+| 20180924070031         | REQUESTED| 1                            |
+
+# Execute the compaction. The compaction instant value passed below must be the one displayed in the above "compactions show all" query
+hoodie:stock_ticks_mor->compaction run --compactionInstant  20180924070031 --parallelism 2 --sparkMemory 1G  --schemaFilePath /var/demo/config/schema.avsc --retry 1  
+....
+Compaction successfully completed for 20180924070031
+
+## Now check if compaction is completed
+
+hoodie:stock_ticks_mor->connect --path /user/hive/warehouse/stock_ticks_mor
+18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
+18/09/24 07:03:00 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
+18/09/24 07:03:00 INFO table.HoodieTableConfig: Loading table properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
+18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ(version=1) from /user/hive/warehouse/stock_ticks_mor
+Metadata for table stock_ticks_mor loaded
+
+hoodie:stock_ticks->compactions show all
+18/09/24 07:03:15 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED], [20180924070031__commit__COMPLETED]]
+___________________________________________________________________
+| Compaction Instant Time| State    | Total FileIds to be Compacted|
+|==================================================================|
+| 20180924070031         | COMPLETED| 1                            |
+
+```
+
+### Step 9: Run Hive Queries including incremental queries
+
+You will see that both Read Optimized and Snapshot queries show the latest committed data.
+Let's also run the incremental query for the MOR table.
+From the query output below, it will be clear that the first commit time for the MOR table is 20180924064636
+and the second commit time is 20180924070031
+
+```java
+docker exec -it adhoc-2 /bin/bash
+beeline -u jdbc:hive2://hiveserver:10000 \
+  --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
+  --hiveconf hive.stats.autogather=false
+
+# Read Optimized Query
+0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
+WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
++---------+----------------------+--+
+| symbol  |         _c1          |
++---------+----------------------+--+
+| GOOG    | 2018-08-31 10:59:00  |
++---------+----------------------+--+
+1 row selected (1.6 seconds)
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924064636       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+# Snapshot Query
+0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
+WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
++---------+----------------------+--+
+| symbol  |         _c1          |
++---------+----------------------+--+
+| GOOG    | 2018-08-31 10:59:00  |
++---------+----------------------+--+
+
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924064636       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+# Incremental Query:
+
+0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.mode=INCREMENTAL;
+No rows affected (0.008 seconds)
+# Max-Commits covers both second batch and compaction commit
+0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.max.commits=3;
+No rows affected (0.007 seconds)
+0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.start.timestamp=20180924064636;
+No rows affected (0.013 seconds)
+# Query:
+0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064636';
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+| 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+--+
+
+exit
+```
+
+### Step 10: Read Optimized and Snapshot queries for MOR with Spark-SQL after compaction
+
+```java
+docker exec -it adhoc-1 /bin/bash
+$SPARK_INSTALL/bin/spark-shell \
+  --jars $HUDI_SPARK_BUNDLE \
+  --driver-class-path $HADOOP_CONF_DIR \
+  --conf spark.sql.hive.convertMetastoreParquet=false \
+  --deploy-mode client \
+  --driver-memory 1G \
+  --master local[2] \
+  --executor-memory 3G \
+  --num-executors 1 \
+  --packages org.apache.spark:spark-avro_2.11:2.4.4
+
+# Read Optimized Query
+scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG'").show(100, false)
++---------+----------------------+
+| symbol  |        max(ts)       |
++---------+----------------------+
+| GOOG    | 2018-08-31 10:59:00  |
++---------+----------------------+
+1 row selected (1.6 seconds)
+
+scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG'").show(100, false)
++----------------------+---------+----------------------+---------+------------+-----------+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+
+| 20180924064636       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+
+
+# Snapshot Query
+scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
++---------+----------------------+
+| symbol  |     max(ts)          |
++---------+----------------------+
+| GOOG    | 2018-08-31 10:59:00  |
++---------+----------------------+
+
+scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG'").show(100, false)
++----------------------+---------+----------------------+---------+------------+-----------+
+| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
++----------------------+---------+----------------------+---------+------------+-----------+
+| 20180924064636       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
+| 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
++----------------------+---------+----------------------+---------+------------+-----------+
+```
+
+### Step 11:  Presto Read Optimized queries on MOR table after compaction
+
+```java
+docker exec -it presto-worker-1 presto --server presto-coordinator-1:8090
+presto> use hive.default;
+USE
+
+# Read Optimized Query
+presto:default> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
+  symbol |        _col1
+--------+---------------------
+ GOOG   | 2018-08-31 10:59:00
+(1 row)
+
+Query 20190822_182319_00011_segyw, FINISHED, 1 node
+Splits: 49 total, 49 done (100.00%)
+0:01 [197 rows, 613B] [133 rows/s, 414B/s]
+
+presto:default> select "_hoodie_commit_time", symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG';
+ _hoodie_commit_time | symbol |         ts          | volume |   open    |  close
+---------------------+--------+---------------------+--------+-----------+----------
+ 20190822180250      | GOOG   | 2018-08-31 09:59:00 |   6330 |    1230.5 |  1230.02
+ 20190822181944      | GOOG   | 2018-08-31 10:59:00 |   9021 | 1227.1993 | 1227.215
+(2 rows)
+
+Query 20190822_182333_00012_segyw, FINISHED, 1 node
+Splits: 17 total, 17 done (100.00%)
+0:02 [197 rows, 613B] [98 rows/s, 307B/s]
+
+presto:default>
+```
+
+
+This brings the demo to an end.
+
+## Testing Hudi in Local Docker environment
+
+You can bring up a hadoop docker environment containing Hadoop, Hive and Spark services with support for hudi.
+```java
+$ mvn pre-integration-test -DskipTests
+```
+The above command builds docker images for all the services with the
+current Hudi source installed at /var/hoodie/ws, and also brings up the services using a compose file. We
+currently use Hadoop (v2.8.4), Hive (v2.3.3) and Spark (v2.4.4) in the docker images.
+
+To bring down the containers
+```java
+$ cd hudi-integ-test
+$ mvn docker-compose:down
+```
+
+If you want to bring up the docker containers, use
+```java
+$ cd hudi-integ-test
+$ mvn docker-compose:up -DdetachedMode=true
+```
+
+Hudi is a library that operates in a broader data analytics/ingestion environment
+involving Hadoop, Hive and Spark. Interoperability with all these systems is a key objective for us. We are
+actively adding integration tests under __hudi-integ-test/src/test/java__ that make use of this
+docker environment (see __hudi-integ-test/src/test/java/org/apache/hudi/integ/ITTestHoodieSanity.java__).
+
+
+### Building Local Docker Containers:
+
+The docker images required for the demo and for running integration tests are already available on Docker Hub. The docker images
+and compose scripts are carefully implemented so that they serve a dual purpose:
+
+1. The docker images have inbuilt hudi jar files with environment variable pointing to those jars (HUDI_HADOOP_BUNDLE, ...)
+2. For running integration-tests, we need the jars generated locally to be used for running services within docker. The
+   docker-compose scripts (see `docker/compose/docker-compose_hadoop284_hive233_spark231.yml`) ensure local jars override
+   inbuilt jars by mounting the local HUDI workspace over the docker location
+3. As these docker containers have mounted local HUDI workspace, any changes that happen in the workspace would automatically 
+   reflect in the containers. This is a convenient way for developing and verifying Hudi for
+   developers who do not own a distributed environment. Note that this is how integration tests are run.
+
+This helps avoid maintaining separate docker images and avoids the costly step of building HUDI docker images locally.
+However, if users want to test hudi from locations with lower network bandwidth, they can still build local images by running the script
+`docker/build_local_docker_images.sh` before running `docker/setup_demo.sh`.
+
+Here are the commands:
+
+```java
+cd docker
+./build_local_docker_images.sh
+.....
+
+[INFO] Reactor Summary:
+[INFO]
+[INFO] hoodie ............................................. SUCCESS [  1.709 s]
+[INFO] hudi-common ...................................... SUCCESS [  9.015 s]
+[INFO] hudi-hadoop-mr ................................... SUCCESS [  1.108 s]
+[INFO] hudi-client ...................................... SUCCESS [  4.409 s]
+[INFO] hudi-hive ........................................ SUCCESS [  0.976 s]
+[INFO] hudi-spark ....................................... SUCCESS [ 26.522 s]
+[INFO] hudi-utilities ................................... SUCCESS [ 16.256 s]
+[INFO] hudi-cli ......................................... SUCCESS [ 11.341 s]
+[INFO] hudi-hadoop-mr-bundle ............................ SUCCESS [  1.893 s]
+[INFO] hudi-hive-bundle ................................. SUCCESS [ 14.099 s]
+[INFO] hudi-spark-bundle ................................ SUCCESS [ 58.252 s]
+[INFO] hudi-hadoop-docker ............................... SUCCESS [  0.612 s]
+[INFO] hudi-hadoop-base-docker .......................... SUCCESS [04:04 min]
+[INFO] hudi-hadoop-namenode-docker ...................... SUCCESS [  6.142 s]
+[INFO] hudi-hadoop-datanode-docker ...................... SUCCESS [  7.763 s]
+[INFO] hudi-hadoop-history-docker ....................... SUCCESS [  5.922 s]
+[INFO] hudi-hadoop-hive-docker .......................... SUCCESS [ 56.152 s]
+[INFO] hudi-hadoop-sparkbase-docker ..................... SUCCESS [01:18 min]
+[INFO] hudi-hadoop-sparkmaster-docker ................... SUCCESS [  2.964 s]
+[INFO] hudi-hadoop-sparkworker-docker ................... SUCCESS [  3.032 s]
+[INFO] hudi-hadoop-sparkadhoc-docker .................... SUCCESS [  2.764 s]
+[INFO] hudi-integ-test .................................. SUCCESS [  1.785 s]
+[INFO] ------------------------------------------------------------------------
+[INFO] BUILD SUCCESS
+[INFO] ------------------------------------------------------------------------
+[INFO] Total time: 09:15 min
+[INFO] Finished at: 2018-09-10T17:47:37-07:00
+[INFO] Final Memory: 236M/1848M
+[INFO] ------------------------------------------------------------------------
+```
diff --git a/website/docs/flink-quick-start-guide.md b/website/docs/flink-quick-start-guide.md
new file mode 100644
index 0000000..c02f0a8
--- /dev/null
+++ b/website/docs/flink-quick-start-guide.md
@@ -0,0 +1,167 @@
+---
+title: "Flink Guide"
+toc: true
+last_modified_at: 2020-03-16T11:40:57+08:00
+---
+
+This guide provides a quick peek at Hudi's capabilities using the flink SQL client. Using flink SQL, we will walk through 
+code snippets that allow you to insert and update a Hudi table of either table type: 
+[Copy on Write](/docs/concepts#copy-on-write-table) and [Merge On Read](/docs/concepts#merge-on-read-table). 
+After each write operation we will also show how to read the data snapshot (incremental read is already on the roadmap).
+
+## Setup
+
+We use the [Flink Sql Client](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/sqlClient) because it's a good
+quick start tool for SQL users.
+
+### Step.1 download flink jar
+Hudi works with Flink 1.12.x. You can follow the instructions [here](https://flink.apache.org/downloads) for setting up flink.
+The hudi-flink-bundle jar is archived with scala 2.11, so it’s recommended to use flink 1.12.x bundled with scala 2.11.
+
+### Step.2 start flink cluster
+Start a standalone flink cluster within your hadoop environment.
+
+Now start the cluster:
+
+```bash
+# HADOOP_HOME is your hadoop root directory after unpack the binary package.
+export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath`
+
+# Start the flink standalone cluster
+./bin/start-cluster.sh
+```
+### Step.3 start flink SQL client
+
+Hudi has a prepared bundle jar for flink, which should be loaded in the flink SQL Client when it starts up.
+You can build the jar manually under path `hudi-source-dir/packaging/hudi-flink-bundle`, or download it from the
+[Apache Official Repository](https://repo.maven.apache.org/maven2/org/apache/hudi/hudi-flink-bundle_2.11/).
+
+Now start the SQL CLI:
+
+```bash
+# HADOOP_HOME is your hadoop root directory after unpack the binary package.
+export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath`
+
+./bin/sql-client.sh embedded -j .../hudi-flink-bundle_2.1?-*.*.*.jar shell
+```
+
+<div className="notice--info">
+  <h4>Please note the following: </h4>
+<ul>
+  <li>We suggest hadoop 2.9.x+ because some object stores only have a filesystem implementation in later hadoop versions</li>
+  <li>The flink-parquet and flink-avro formats are already packaged into the hudi-flink-bundle jar</li>
+</ul>
+</div>
+
+Set up the table name and base path, and operate using SQL for this guide.
+Note that the SQL CLI executes SQL statements line by line.
+
+## Insert data
+
+First create a flink hudi table, then insert data into it using SQL `VALUES` as below.
+
+```sql
+-- sets up the result mode to tableau to show the results directly in the CLI
+set execution.result-mode=tableau;
+
+CREATE TABLE t1(
+  uuid VARCHAR(20), -- you can use 'PRIMARY KEY NOT ENFORCED' syntax to mark the field as record key
+  name VARCHAR(10),
+  age INT,
+  ts TIMESTAMP(3),
+  `partition` VARCHAR(20)
+)
+PARTITIONED BY (`partition`)
+WITH (
+  'connector' = 'hudi',
+  'path' = 'table_base_path',
+  'write.tasks' = '1', -- default is 4, which requires more resources
+  'compaction.tasks' = '1', -- default is 10, which requires more resources
+  'table.type' = 'MERGE_ON_READ' -- this creates a MERGE_ON_READ table; the default is COPY_ON_WRITE
+);
+
+-- insert data using values
+INSERT INTO t1 VALUES
+  ('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),
+  ('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),
+  ('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),
+  ('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),
+  ('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),
+  ('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),
+  ('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),
+  ('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4');
+```
+
+## Query data
+
+```sql
+-- query from the hudi table
+select * from t1;
+```
+
+This query provides snapshot querying of the ingested data. 
+Refer to [Table types and queries](/docs/concepts#table-types--queries) for more info on all table types and query types supported.
+{: .notice--info}
+
+## Update data
+
+This is similar to inserting new data.
+
+```sql
+-- this would update the record with key 'id1'
+insert into t1 values
+  ('id1','Danny',27,TIMESTAMP '1970-01-01 00:00:01','par1');
+```
+
+Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
+[Querying](#query-data) the data again will now show updated records. Each write operation generates a new [commit](/docs/concepts) 
+denoted by the timestamp. Look for changes in `_hoodie_commit_time`, `age` fields for the same `_hoodie_record_key`s in previous commit. 
+{: .notice--info}
+
+## Streaming query
+
+Hudi flink also provides the capability to obtain a stream of records that changed since a given commit timestamp. 
+This can be achieved using Hudi's streaming query, providing a start time from which changes need to be streamed. 
+We do not need to specify an end time if we want all changes after the given commit (as is the common case). 
+
+```sql
+CREATE TABLE t1(
+  uuid VARCHAR(20), -- you can use 'PRIMARY KEY NOT ENFORCED' syntax to mark the field as record key
+  name VARCHAR(10),
+  age INT,
+  ts TIMESTAMP(3),
+  `partition` VARCHAR(20)
+)
+PARTITIONED BY (`partition`)
+WITH (
+  'connector' = 'hudi',
+  'path' = 'table_base_path',
+  'table.type' = 'MERGE_ON_READ',
+  'read.tasks' = '1', -- default is 4, which requires more resources
+  'read.streaming.enabled' = 'true',  -- this option enables the streaming read
+  'read.streaming.start-commit' = '20210316134557', -- specifies the start commit instant time
+  'read.streaming.check-interval' = '4' -- specifies the check interval for finding new source commits, default is 60s
+);
+
+-- Then query the table in stream mode
+select * from t1;
+``` 
+
+This will give all changes that happened after the `read.streaming.start-commit` commit. The unique thing about this
+feature is that it lets you author streaming pipelines on streaming or batch data sources.
+{: .notice--info}
+
+## Delete data {#deletes}
+
+When consuming data in a streaming query, the hudi flink source can also accept the change logs from the underlying data source
+and apply the UPDATE and DELETE operations at a per-row level. You can then sync a near-real-time snapshot on hudi for all kinds
+of RDBMS sources.
+
+## Where to go from here?
+
+We used flink here to showcase the capabilities of Hudi. However, Hudi supports multiple table types/query types and 
+Hudi tables can be queried from query engines like Hive, Spark, Flink, Presto and much more. We have put together a 
+[demo video](https://www.youtube.com/watch?v=VhNgUsxdrD0) that showcases all of this on a docker based setup with all 
+dependent systems running locally. We recommend you replicate the same setup and run the demo yourself, by following the 
+steps [here](/docs/docker_demo) to get a taste for it. Also, if you are looking for ways to migrate your existing data 
+to Hudi, refer to the [migration guide](/docs/migration_guide). 
diff --git a/website/docs/gcs_hoodie.md b/website/docs/gcs_hoodie.md
new file mode 100644
index 0000000..f0171af
--- /dev/null
+++ b/website/docs/gcs_hoodie.md
@@ -0,0 +1,60 @@
+---
+title: Google Cloud
+keywords: [ hudi, hive, google cloud, storage, spark, presto]
+summary: In this page, we go over how to configure hudi with Google Cloud Storage.
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+For Hudi storage on GCS, **regional** buckets provide a DFS API with strong consistency.
+
+## GCS Configs
+
+There are two configurations required for Hudi GCS compatibility:
+
+- Adding GCS Credentials for Hudi
+- Adding required jars to classpath
+
+### GCS Credentials
+
+Add the required configs in your core-site.xml from where Hudi can fetch them. Replace the `fs.defaultFS` with your GCS bucket name and Hudi should be able to read/write from the bucket.
+
+```xml
+  <property>
+    <name>fs.defaultFS</name>
+    <value>gs://hudi-bucket</value>
+  </property>
+
+  <property>
+    <name>fs.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
+    <description>The FileSystem for gs: (GCS) uris.</description>
+  </property>
+
+  <property>
+    <name>fs.AbstractFileSystem.gs.impl</name>
+    <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
+    <description>The AbstractFileSystem for gs: (GCS) uris.</description>
+  </property>
+
+  <property>
+    <name>fs.gs.project.id</name>
+    <value>GCS_PROJECT_ID</value>
+  </property>
+  <property>
+    <name>google.cloud.auth.service.account.enable</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>google.cloud.auth.service.account.email</name>
+    <value>GCS_SERVICE_ACCOUNT_EMAIL</value>
+  </property>
+  <property>
+    <name>google.cloud.auth.service.account.keyfile</name>
+    <value>GCS_SERVICE_ACCOUNT_KEYFILE</value>
+  </property>
+```
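+
+Alternatively (a minimal sketch, not part of the original guide), the same properties can be set programmatically on the Hadoop configuration used by your Spark job, for example from a spark-shell:
+
+```scala
+// Sketch only: mirrors the core-site.xml properties above; replace the placeholder values.
+val hadoopConf = spark.sparkContext.hadoopConfiguration
+hadoopConf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
+hadoopConf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
+hadoopConf.set("fs.gs.project.id", "GCS_PROJECT_ID")
+hadoopConf.set("google.cloud.auth.service.account.enable", "true")
+hadoopConf.set("google.cloud.auth.service.account.email", "GCS_SERVICE_ACCOUNT_EMAIL")
+hadoopConf.set("google.cloud.auth.service.account.keyfile", "GCS_SERVICE_ACCOUNT_KEYFILE")
+```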
+
+### GCS Libs
+
+Add the following GCS hadoop library to your classpath:
+
+- com.google.cloud.bigdataoss:gcs-connector:1.6.0-hadoop2
diff --git a/website/docs/ibm_cos_hoodie.md b/website/docs/ibm_cos_hoodie.md
new file mode 100644
index 0000000..5ac7433
--- /dev/null
+++ b/website/docs/ibm_cos_hoodie.md
@@ -0,0 +1,77 @@
+---
+title: IBM Cloud
+keywords: [ hudi, hive, ibm, cos, spark, presto]
+summary: In this page, we go over how to configure Hudi with IBM Cloud Object Storage filesystem.
+last_modified_at: 2020-10-01T11:38:24-10:00
+---
+In this page, we explain how to get your Hudi spark job to store into IBM Cloud Object Storage.
+
+## IBM COS configs
+
+There are two configurations required for Hudi-IBM Cloud Object Storage compatibility:
+
+- Adding IBM COS Credentials for Hudi
+- Adding required Jars to classpath
+
+### IBM Cloud Object Storage Credentials
+
+The simplest way to use Hudi with IBM Cloud Object Storage is to configure your `SparkSession` or `SparkContext` with IBM Cloud Object Storage credentials using the [Stocator](https://github.com/CODAIT/stocator) storage connector for Spark. Hudi will automatically pick this up and talk to IBM Cloud Object Storage.
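+
+For example (a minimal sketch, assuming HMAC keys and the `myCos` service name used in the `core-site.xml` below), the Stocator properties can be set on the Hadoop configuration of a running `SparkSession`:
+
+```scala
+// Sketch only: sets the same Stocator/COS properties shown in the XML below; replace the placeholders.
+val hadoopConf = spark.sparkContext.hadoopConfiguration
+hadoopConf.set("fs.stocator.scheme.list", "cos")
+hadoopConf.set("fs.cos.impl", "com.ibm.stocator.fs.ObjectStoreFileSystem")
+hadoopConf.set("fs.stocator.cos.impl", "com.ibm.stocator.fs.cos.COSAPIClient")
+hadoopConf.set("fs.stocator.cos.scheme", "cos")
+hadoopConf.set("fs.cos.flat.list", "true")
+hadoopConf.set("fs.cos.myCos.endpoint", "http://s3-api.us-geo.objectstorage.softlayer.net")
+hadoopConf.set("fs.cos.myCos.access.key", "ACCESS KEY")
+hadoopConf.set("fs.cos.myCos.secret.key", "SECRET KEY")
+```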
+
+Alternatively, add the required configs in your `core-site.xml` from where Hudi can fetch them. Replace the `fs.defaultFS` with your IBM Cloud Object Storage bucket name and Hudi should be able to read/write from the bucket.
+
+For example, using HMAC keys and service name `myCOS`:
+```xml
+  <property>
+      <name>fs.defaultFS</name>
+      <value>cos://myBucket.myCOS</value>
+  </property>
+
+  <property>
+      <name>fs.cos.flat.list</name>
+      <value>true</value>
+  </property>
+
+  <property>
+	  <name>fs.stocator.scheme.list</name>
+	  <value>cos</value>
+  </property>
+
+  <property>
+	  <name>fs.cos.impl</name>
+	  <value>com.ibm.stocator.fs.ObjectStoreFileSystem</value>
+  </property>
+
+  <property>
+	  <name>fs.stocator.cos.impl</name>
+	  <value>com.ibm.stocator.fs.cos.COSAPIClient</value>
+  </property>
+
+  <property>
+	  <name>fs.stocator.cos.scheme</name>
+	  <value>cos</value>
+  </property>
+
+  <property>
+	  <name>fs.cos.myCos.access.key</name>
+	  <value>ACCESS KEY</value>
+  </property>
+
+  <property>
+	  <name>fs.cos.myCos.endpoint</name>
+	  <value>http://s3-api.us-geo.objectstorage.softlayer.net</value>
+  </property>
+
+  <property>
+	  <name>fs.cos.myCos.secret.key</name>
+	  <value>SECRET KEY</value>
+  </property>
+
+```
+
+For more options see Stocator [documentation](https://github.com/CODAIT/stocator/blob/master/README.md).
+
+### IBM Cloud Object Storage Libs
+
+Add the following IBM Cloud Object Storage hadoop library to your classpath:
+
+ - com.ibm.stocator:stocator:1.1.3
diff --git a/website/docs/metrics.md b/website/docs/metrics.md
new file mode 100644
index 0000000..8a4766a
--- /dev/null
+++ b/website/docs/metrics.md
@@ -0,0 +1,160 @@
+---
+title: Metrics
+keywords: [ hudi, administration, operation, devops, metrics]
+summary: This section offers an overview of metrics in Hudi
+toc: true
+last_modified_at: 2020-06-20T15:59:57-04:00
+---
+
+In this section, we will introduce the `MetricsReporter` and `HoodieMetrics` in Hudi. You can view the metrics-related configurations [here](configurations#metrics-configs).
+
+## MetricsReporter
+
+MetricsReporter provides APIs for reporting `HoodieMetrics` to user-specified backends. Currently, the implementations include InMemoryMetricsReporter, JmxMetricsReporter, MetricsGraphiteReporter and DatadogMetricsReporter. Since InMemoryMetricsReporter is only used for testing, we will introduce the other three implementations.
+
+### JmxMetricsReporter
+
+JmxMetricsReporter is an implementation of a JMX reporter, which is used to report metrics to JMX.
+
+#### Configurations
+The following is an example of `JmxMetricsReporter`. More detailed configurations can be referenced [here](configurations#jmx).
+
+  ```properties
+  hoodie.metrics.on=true
+  hoodie.metrics.reporter.type=JMX
+  hoodie.metrics.jmx.host=192.168.0.106
+  hoodie.metrics.jmx.port=4001
+  ```
+
+#### Demo
+As configured above, JmxMetricsReporter will start a JMX server on port 4001. We can start jconsole and connect to 192.168.0.106:4001. Below is an illustration of monitoring Hudi JMX metrics through jconsole.
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_jxm_metrics.png").default} alt="hudi_jxm_metrics.png"  />
+</figure>
+
+### MetricsGraphiteReporter
+
+MetricsGraphiteReporter is an implementation of a Graphite reporter, which connects to a Graphite server and sends `HoodieMetrics` to it.
+
+#### Configurations
+The following is an example of `MetricsGraphiteReporter`. More detailed configurations can be referenced [here](configurations#graphite).
+
+  ```properties
+  hoodie.metrics.on=true
+  hoodie.metrics.reporter.type=GRAPHITE
+  hoodie.metrics.graphite.host=192.168.0.106
+  hoodie.metrics.graphite.port=2003
+  hoodie.metrics.graphite.metric.prefix=<your metrics prefix>
+  ```
+#### Demo
+As configured above, assuming a Graphite server is running on host 192.168.0.106 and port 2003, a running Hudi job will connect and report metrics data to it. Below is an illustration of monitoring hudi metrics through Graphite.
+  <figure>
+      <img className="docimage" src={require("/assets/images/hudi_graphite_metrics.png").default} alt="hudi_graphite_metrics.png"  />
+  </figure>
+
+### DatadogMetricsReporter
+
+DatadogMetricsReporter is an implementation of a Datadog reporter,
+which publishes metric values to the Datadog monitoring service via the Datadog HTTP API.
+
+#### Configurations
+The following is an example of `DatadogMetricsReporter`. More detailed configurations can be referenced [here](configurations#datadog).
+
+```properties
+hoodie.metrics.on=true
+hoodie.metrics.reporter.type=DATADOG
+hoodie.metrics.datadog.api.site=EU # or US
+hoodie.metrics.datadog.api.key=<your api key>
+hoodie.metrics.datadog.metric.prefix=<your metrics prefix>
+```
+
+ * `hoodie.metrics.datadog.api.site` will set the Datadog API site, which determines whether the requests will be sent to api.datadoghq.eu (EU) or api.datadoghq.com (US). Set this according to your Datadog account settings.
+ * `hoodie.metrics.datadog.api.key` will set the api key.
+ * `hoodie.metrics.datadog.metric.prefix` will help segregate metrics by setting different prefixes for different jobs. Note that it will use `.` to delimit the prefix and the metric name. For example, if the prefix is set to `foo`, then `foo.` will be prepended to the metric name.
+
+#### Demo
+In this demo, we ran a `HoodieDeltaStreamer` job with `HoodieMetrics` turned on and other configurations set properly.
+
+<figure>
+    <img className="docimage" src={require("/assets/images/blog/2020-05-28-datadog-metrics-demo.png").default} alt="hudi_datadog_metrics.png"  />
+</figure>
+
+ As shown above, we were able to collect Hudi's action-related metrics like
+
+ * `<prefix>.<table name>.commit.totalScanTime`
+ * `<prefix>.<table name>.clean.duration`
+ * `<prefix>.<table name>.index.lookup.duration`
+
+ as well as `HoodieDeltaStreamer`-specific metrics
+
+ * `<prefix>.<table name>.deltastreamer.duration`
+ * `<prefix>.<table name>.deltastreamer.hiveSyncDuration`
+ 
+### UserDefinedMetricsReporter
+
+Allows users to define a custom metrics reporter.
+
+#### Configurations
+The following is an example of `UserDefinedMetricsReporter`. More detailed configurations can be referenced [here](configurations#user-defined-reporter).
+
+```properties
+hoodie.metrics.on=true
+hoodie.metrics.reporter.class=test.TestUserDefinedMetricsReporter
+```
+
+#### Demo
+In this simple demo, TestUserDefinedMetricsReporter prints all gauges every 10 seconds:
+
+```java
+public static class TestUserDefinedMetricsReporter 
+    extends AbstractUserDefinedMetricsReporter {
+  private static final Logger log = LogManager.getLogger(TestUserDefinedMetricsReporter.class);
+
+  private ScheduledExecutorService exec = Executors.newScheduledThreadPool(1, r -> {
+      Thread t = Executors.defaultThreadFactory().newThread(r);
+      t.setDaemon(true);
+      return t;
+  });
+
+  public TestUserDefinedMetricsReporter(Properties props, MetricRegistry registry) {
+    super(props, registry);
+  }
+
+  @Override
+  public void start() {
+    // report every 10 seconds, after an initial 10 second delay
+    exec.scheduleAtFixedRate(this::report, 10, 10, TimeUnit.SECONDS);
+  }
+
+  @Override
+  public void report() {
+    this.getRegistry().getGauges().forEach((key, value) -> 
+      log.info("key: " + key + " value: " + value.getValue().toString()));
+  }
+
+  @Override
+  public Closeable getReporter() {
+    return null;
+  }
+
+  @Override
+  public void stop() {
+    exec.shutdown();
+  }
+}
+```
+
+## HoodieMetrics
+
+Once the Hudi writer is configured with the right table and environment for `HoodieMetrics`, it produces the following `HoodieMetrics` that aid in debugging hudi tables:
+
+ - **Commit Duration** - The amount of time it took to successfully commit a batch of records
+ - **Rollback Duration** - Similarly, the amount of time taken to undo partial data left over by a failed commit (rollback happens automatically after a failing write)
+ - **File Level metrics** - Shows the amount of new files added, versions, deleted (cleaned) in each commit
+ - **Record Level Metrics** - Total records inserted/updated etc per commit
+ - **Partition Level metrics** - number of partitions upserted (super useful to understand sudden spikes in commit duration)
+
+These `HoodieMetrics` can then be plotted on a standard tool like grafana. Below is a sample commit duration chart.
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_commit_duration.png").default} alt="hudi_commit_duration.png"  />
+</figure>
diff --git a/website/docs/migration_guide.md b/website/docs/migration_guide.md
new file mode 100644
index 0000000..a18a7f8
--- /dev/null
+++ b/website/docs/migration_guide.md
@@ -0,0 +1,70 @@
+---
+title: Migration
+keywords: [ hudi, migration, use case]
+summary: In this page, we will discuss some available tools for migrating your existing table into a Hudi table
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+Hudi maintains metadata such as commit timeline and indexes to manage a table. The commit timeline helps to understand the actions happening on a table as well as the current state of a table. Indexes are used by Hudi to maintain a record key to file id mapping to efficiently locate a record. At the moment, Hudi supports writing only parquet columnar formats.
+To be able to start using Hudi for your existing table, you will need to migrate your existing table into a Hudi managed table. There are a couple of ways to achieve this.
+
+
+## Approaches
+
+
+### Use Hudi for new partitions alone
+
+Hudi can be used to manage an existing table without affecting/altering the historical data already present in the
+table. Hudi has been implemented to be compatible with such a mixed table, with the caveat that a given Hive
+partition is either completely Hudi managed or not. Thus the lowest granularity at which Hudi manages a table is a Hive
+partition. Start using the datasource API or the WriteClient to write to the table and make sure you start writing
+to a new partition, or convert your last N partitions into Hudi instead of the entire table. Note that since the historical
+partitions are not managed by HUDI, none of the primitives provided by HUDI work on the data in those partitions. More concretely, one cannot perform upserts or incremental pull on such older partitions not managed by the HUDI table.
+Take this approach if your table is an append-only type of table and you do not expect to perform any updates to existing (or non Hudi managed) partitions.
+
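+As a rough sketch (the paths, field names and table name here are illustrative placeholders, not part of the original guide), writing one new partition through the Spark datasource could look like:
+
+```scala
+// Minimal sketch, assuming a spark-shell with the hudi-spark-bundle on the classpath.
+// Only the new partition's data is read from the source and written into the Hudi table.
+val newPartitionDF = spark.read.parquet("/data/source_table/partition=2021-08-01")
+
+newPartitionDF.write.format("org.apache.hudi")
+  .option("hoodie.table.name", "hudi_managed_table")
+  .option("hoodie.datasource.write.recordkey.field", "uuid")           // record key field (placeholder)
+  .option("hoodie.datasource.write.partitionpath.field", "partition")  // partition path field (placeholder)
+  .option("hoodie.datasource.write.precombine.field", "ts")            // picks the latest record per key
+  .mode("append")
+  .save("/data/hudi_managed_table")
+```
+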
+
+### Convert existing table to Hudi
+
+Import your existing table into a Hudi managed table. Since all the data is Hudi managed, none of the limitations
+of the previous approach apply here. Updates spanning any partitions can be applied to this table and Hudi will efficiently
+make the updates available to queries. Note that not only do you get to use all Hudi primitives on this table,
+there are other additional advantages of doing this. Hudi automatically manages file sizes of a Hudi managed
+table. You can define the desired file size when converting this table and Hudi will ensure it writes out files
+adhering to the config. It will also ensure that smaller files later get corrected by routing some new inserts into
+small files rather than writing new small ones, thus maintaining the health of your cluster.
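+
+Regardless of the option you pick below, the desired file sizes can be passed as write configs during the conversion. A minimal sketch (the sizes and field names are illustrative, and `df` is assumed to hold the existing table's data):
+
+```scala
+// Sketch: file sizing knobs applied while re-writing the existing data into Hudi.
+df.write.format("org.apache.hudi")
+  .option("hoodie.table.name", "hudi_managed_table")
+  .option("hoodie.datasource.write.recordkey.field", "uuid")
+  .option("hoodie.datasource.write.partitionpath.field", "partition")
+  .option("hoodie.parquet.max.file.size", (128 * 1024 * 1024).toString)    // target base file size
+  .option("hoodie.parquet.small.file.limit", (100 * 1024 * 1024).toString) // files below this attract new inserts
+  .mode("overwrite")
+  .save("/data/hudi_managed_table")
+```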
+
+There are a few options when choosing this approach.
+
+**Option 1**
+Use the HDFSParquetImporter tool. As the name suggests, this only works if your existing table is in parquet file format.
+This tool essentially starts a Spark Job to read the existing parquet table and converts it into a HUDI managed table by re-writing all the data.
+
+**Option 2**
+For huge tables, this could be as simple as:
+```java
+// Pseudocode: loop over the source table's partitions and write each one into Hudi
+for (partition <- listOfPartitionsInSourceTable) {
+  val inputDF = spark.read.format("any_input_format").load("partition_path")
+  inputDF.write.format("org.apache.hudi").option()....save("basePath")
+}
+```
+
+**Option 3**
+Write your own custom logic of how to load an existing table into a Hudi managed one. Please read about the RDD API
+[here](/docs/quick-start-guide). To use the HDFSParquetImporter tool, first build hudi via `mvn clean install -DskipTests`; the hudi-cli shell can then be
+fired up via `cd hudi-cli && ./hudi-cli.sh`.
+
+```java
+hudi->hdfsparquetimport
+        --upsert false
+        --srcPath /user/parquet/table/basepath
+        --targetPath /user/hoodie/table/basepath
+        --tableName hoodie_table
+        --tableType COPY_ON_WRITE
+        --rowKeyField _row_key
+        --partitionPathField partitionStr
+        --parallelism 1500
+        --schemaFilePath /user/table/schema
+        --format parquet
+        --sparkMemory 6g
+        --retry 2
+```
diff --git a/website/docs/oss_hoodie.md b/website/docs/oss_hoodie.md
new file mode 100644
index 0000000..894bbcf
--- /dev/null
+++ b/website/docs/oss_hoodie.md
@@ -0,0 +1,70 @@
+---
+title: Alibaba Cloud
+keywords: [ hudi, hive, aliyun, oss, spark, presto]
+summary: In this page, we go over how to configure Hudi with OSS filesystem.
+last_modified_at: 2020-04-21T11:38:24-10:00
+---
+In this page, we explain how to get your Hudi spark job to store into Aliyun OSS.
+
+## Aliyun OSS configs
+
+There are two configurations required for Hudi-OSS compatibility:
+
+- Adding Aliyun OSS Credentials for Hudi
+- Adding required Jars to classpath
+
+### Aliyun OSS Credentials
+
+Add the required configs in your core-site.xml from where Hudi can fetch them. Replace `fs.defaultFS` with your OSS bucket name, `fs.oss.endpoint` with your OSS endpoint, `fs.oss.accessKeyId` with your OSS access key, and `fs.oss.accessKeySecret` with your OSS secret. Hudi should then be able to read/write from the bucket.
+
+```xml
+<property>
+  <name>fs.defaultFS</name>
+  <value>oss://bucketname/</value>
+</property>
+
+<property>
+  <name>fs.oss.endpoint</name>
+  <value>oss-endpoint-address</value>
+  <description>Aliyun OSS endpoint to connect to.</description>
+</property>
+
+<property>
+  <name>fs.oss.accessKeyId</name>
+  <value>oss_key</value>
+  <description>Aliyun access key ID</description>
+</property>
+
+<property>
+  <name>fs.oss.accessKeySecret</name>
+  <value>oss-secret</value>
+  <description>Aliyun access key secret</description>
+</property>
+
+<property>
+  <name>fs.oss.impl</name>
+  <value>org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem</value>
+</property>
+```
+
+### Aliyun OSS Libs
+
+Add the following Aliyun hadoop library jars to your pom.xml. Since hadoop-aliyun depends on hadoop 2.9.1+, you need to use hadoop version 2.9.1 or later.
+
+```xml
+<dependency>
+  <groupId>org.apache.hadoop</groupId>
+  <artifactId>hadoop-aliyun</artifactId>
+  <version>3.2.1</version>
+</dependency>
+<dependency>
+  <groupId>com.aliyun.oss</groupId>
+  <artifactId>aliyun-sdk-oss</artifactId>
+  <version>3.8.1</version>
+</dependency>
+<dependency>
+  <groupId>org.jdom</groupId>
+  <artifactId>jdom</artifactId>
+  <version>1.1</version>
+</dependency>
+```
diff --git a/website/docs/overview.md b/website/docs/overview.md
new file mode 100644
index 0000000..665f7dc
--- /dev/null
+++ b/website/docs/overview.md
@@ -0,0 +1,172 @@
+---
+title: "Overview"
+keywords: [ hudi, design, table, queries, timeline]
+summary: "Here we introduce some basic concepts & give a broad technical overview of Hudi"
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+Apache Hudi (pronounced “hoodie”) provides streaming primitives over hadoop compatible storages
+
+ * Update/Delete Records      (how do I change records in a table?)
+ * Change Streams             (how do I fetch records that changed?)
+
+In this section, we will discuss key concepts & terminologies that are important to understand, to be able to effectively use these primitives.
+
+## Timeline
+At its core, Hudi maintains a `timeline` of all actions performed on the table at different `instants` of time that helps provide instantaneous views of the table,
+while also efficiently supporting retrieval of data in the order of arrival. A Hudi instant consists of the following components 
+
+ * `Instant action` : Type of action performed on the table
+ * `Instant time` : Instant time is typically a timestamp (e.g: 20190117010349), which monotonically increases in the order of action's begin time.
+ * `state` : current state of the instant
+ 
+Hudi guarantees that the actions performed on the timeline are atomic & timeline consistent based on the instant time.
+
+Key actions performed include
+
+ * `COMMITS` - A commit denotes an **atomic write** of a batch of records into a table.
+ * `CLEANS` - Background activity that gets rid of older versions of files in the table, that are no longer needed.
+ * `DELTA_COMMIT` - A delta commit refers to an **atomic write** of a batch of records into a  MergeOnRead type table, where some/all of the data could be just written to delta logs.
+ * `COMPACTION` - Background activity to reconcile differential data structures within Hudi e.g: moving updates from row based log files to columnar formats. Internally, compaction manifests as a special commit on the timeline
+ * `ROLLBACK` - Indicates that a commit/delta commit was unsuccessful & rolled back, removing any partial files produced during such a write
+ * `SAVEPOINT` - Marks certain file groups as "saved", such that cleaner will not delete them. It helps restore the table to a point on the timeline, in case of disaster/data recovery scenarios.
+
+Any given instant can be 
+in one of the following states
+
+ * `REQUESTED` - Denotes an action has been scheduled, but has not yet been initiated
+ * `INFLIGHT` - Denotes that the action is currently being performed
+ * `COMPLETED` - Denotes completion of an action on the timeline
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_timeline.png").default} alt="hudi_timeline.png" />
+</figure>
+
+The example above shows upserts happening between 10:00 and 10:20 on a Hudi table, roughly every 5 mins, leaving commit metadata on the Hudi timeline, along
+with other background cleaning/compactions. One key observation to make is that the commit time indicates the `arrival time` of the data (10:20AM), while the actual data
+organization reflects the actual time or `event time` the data was intended for (hourly buckets from 07:00). These are two key concepts when reasoning about tradeoffs between latency and completeness of data.
+
+When there is late arriving data (data intended for 9:00 arriving >1 hr late at 10:20), we can see the upsert producing new data into even older time buckets/folders.
+With the help of the timeline, an incremental query attempting to get all new data that was committed successfully since 10:00 hours is able to very efficiently consume
+only the changed files, without, say, scanning all the time buckets > 07:00.
+
+## File Layout
+Hudi organizes a table into a directory structure under a `basepath` on DFS. The table is broken up into partitions, which are folders containing data files for that partition,
+very similar to Hive tables. Each partition is uniquely identified by its `partitionpath`, which is relative to the basepath.
+
+Within each partition, files are organized into `file groups`, uniquely identified by a `file id`. Each file group contains several
+`file slices`, where each slice contains a base file (`*.parquet`) produced at a certain commit/compaction instant time,
+ along with set of log files (`*.log.*`) that contain inserts/updates to the base file since the base file was produced. 
+Hudi adopts an MVCC design, where the compaction action merges logs and base files to produce new file slices and the cleaning action gets rid of 
+unused/older file slices to reclaim space on DFS. 
+
+## Index
+Hudi provides efficient upserts, by mapping a given hoodie key (record key + partition path) consistently to a file id, via an indexing mechanism. 
+This mapping between record key and file group/file id, never changes once the first version of a record has been written to a file. In short, the 
+mapped file group contains all versions of a group of records.
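+
+As a small illustration (a sketch; the field names and configs are just an example of how keys are typically specified via the Spark datasource, not part of this page), the record key and partition path that make up the hoodie key are chosen at write time:
+
+```scala
+// Sketch: the record key + partition path fields form the hoodie key that the index
+// maps to a file group; this mapping stays fixed for the life of the record.
+inputDF.write.format("hudi")
+  .option("hoodie.table.name", "my_hudi_table")
+  .option("hoodie.datasource.write.recordkey.field", "uuid")
+  .option("hoodie.datasource.write.partitionpath.field", "region")
+  .mode("append")
+  .save("/path/to/my_hudi_table")
+```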
+
+## Table Types & Queries
+Hudi table types define how data is indexed & laid out on the DFS and how the above primitives and timeline activities are implemented on top of such organization (i.e. how data is written). 
+In turn, `query types` define how the underlying data is exposed to the queries (i.e. how data is read). 
+
+| Table Type    | Supported Query types |
+|-------------- |------------------|
+| Copy On Write | Snapshot Queries + Incremental Queries  |
+| Merge On Read | Snapshot Queries + Incremental Queries + Read Optimized Queries |
+
+### Table Types
+Hudi supports the following table types.
+
+  - [Copy On Write](#copy-on-write-table) : Stores data using exclusively columnar file formats (e.g parquet). Updates simply version & rewrite the files by performing a synchronous merge during write.
+  - [Merge On Read](#merge-on-read-table) : Stores data using a combination of columnar (e.g parquet) + row based (e.g avro) file formats. Updates are logged to delta files & later compacted to produce new versions of columnar files synchronously or asynchronously.
+    
+Following table summarizes the trade-offs between these two table types
+
+| Trade-off     | CopyOnWrite      | MergeOnRead |
+|-------------- |------------------| ------------------|
+| Data Latency | Higher   | Lower |
+| Query Latency | Lower   | Higher |
+| Update cost (I/O) | Higher (rewrite entire parquet) | Lower (append to delta log) |
+| Parquet File Size | Smaller (high update(I/0) cost) | Larger (low update cost) |
+| Write Amplification | Higher | Lower (depending on compaction strategy) |
+
+
+### Query types
+Hudi supports the following query types
+
+ - **Snapshot Queries** : Queries see the latest snapshot of the table as of a given commit or compaction action. In case of a merge on read table, it exposes near-real time data (few mins) by merging 
+    the base and delta files of the latest file slice on-the-fly. For a copy on write table, it provides a drop-in replacement for existing parquet tables, while providing upsert/delete and other write side features. 
+ - **Incremental Queries** : Queries only see new data written to the table, since a given commit/compaction. This effectively provides change streams to enable incremental data pipelines. 
+ - **Read Optimized Queries** : Queries see the latest snapshot of table as of a given commit/compaction action. Exposes only the base/columnar files in latest file slices and guarantees the 
+    same columnar query performance compared to a non-hudi columnar table.
+
+Following table summarizes the trade-offs between the different query types.
+
+| Trade-off     | Snapshot    | Read Optimized |
+|-------------- |-------------| ------------------|
+| Data Latency  | Lower | Higher
+| Query Latency | Higher (merge base / columnar file + row based delta / log files) | Lower (raw base / columnar file performance)
+
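+As a quick illustration (a sketch using the Spark datasource; the option key and values follow Hudi's datasource read options, and the path is a placeholder), the query type can be picked at read time:
+
+```scala
+// Sketch: read optimized vs snapshot reads on a MERGE_ON_READ table.
+// "read_optimized" reads only base/columnar files; "snapshot" merges base and log files on the fly.
+val readOptimizedDF = spark.read.format("hudi")
+  .option("hoodie.datasource.query.type", "read_optimized")
+  .load("/path/to/hudi_table")
+
+val snapshotDF = spark.read.format("hudi")
+  .option("hoodie.datasource.query.type", "snapshot")
+  .load("/path/to/hudi_table")
+```
+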
+
+## Copy On Write Table
+
+File slices in Copy-On-Write table only contain the base/columnar file and each commit produces new versions of base files. 
+In other words, we implicitly compact on every commit, such that only columnar data exists. As a result, the write amplification 
+(number of bytes written for 1 byte of incoming data) is much higher, where read amplification is zero. 
+This is a much desired property for analytical workloads, which are predominantly read-heavy.
+
+The following illustrates how this works conceptually, when data is written into a copy-on-write table and two queries are running on top of it.
+
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_cow.png").default} alt="hudi_cow.png" />
+</figure>
+
+
+As data gets written, updates to existing file groups produce a new slice for that file group stamped with the commit instant time, 
+while inserts allocate a new file group and write its first slice for that file group. These file slices and their commit instant times are color coded above.
+SQL queries running against such a table (eg: `select count(*)` counting the total records in that partition), first checks the timeline for the latest commit
+and filters all but latest file slices of each file group. As you can see, an old query does not see the current inflight commit's files color coded in pink,
+but a new query starting after the commit picks up the new data. Thus queries are immune to any write failures/partial writes and only run on committed data.
+
+The intention of the copy on write table is to fundamentally improve how tables are managed today through
+
+  - First class support for atomically updating data at file-level, instead of rewriting whole tables/partitions
+  - Ability to incrementally consume changes, as opposed to wasteful scans or fumbling with heuristics
+  - Tight control of file sizes to keep query performance excellent (small files hurt query performance considerably).
+
+
+## Merge On Read Table
+
+Merge on read table is a superset of copy on write, in the sense that it still supports read optimized queries of the table by exposing only the base/columnar files in the latest file slices.
+Additionally, it stores incoming upserts for each file group onto a row based delta log, to support snapshot queries by applying the delta log 
+onto the latest version of each file id on-the-fly during query time. Thus, this table type attempts to balance read and write amplification intelligently, to provide near real-time data.
+The most significant change here would be to the compactor, which now carefully chooses which delta log files need to be compacted onto
+their columnar base file, to keep the query performance in check (larger delta log files would incur longer merge times on the query side).
+
+Following illustrates how the table works, and shows two types of queries - snapshot query and read optimized query.
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_mor.png").default} alt="hudi_mor.png"  />
+</figure>
+
+There are a lot of interesting things happening in this example, which bring out the subtleties in the approach.
+
+ - We now have commits every 1 minute or so, something we could not do in the other table type.
+ - Within each file id group, there is now a delta log file, which holds incoming updates to records in the base columnar files. In the example, the delta log files hold
+ all the data from 10:05 to 10:10. The base columnar files are still versioned with the commit, as before.
+ Thus, if one were to simply look at base files alone, then the table layout looks exactly like a copy on write table.
+ - A periodic compaction process reconciles these changes from the delta log and produces a new version of the base file, just like what happened at 10:05 in the example.
+ - There are two ways of querying the same underlying table: Read Optimized query and Snapshot query, depending on whether we choose query performance or freshness of data.
+ - The semantics around when data from a commit is available to a query change in a subtle way for a read optimized query. Note that such a query
+ running at 10:10 won't see data after 10:05 above, while a snapshot query always sees the freshest data.
+ - When we trigger compaction & what it decides to compact holds the key to solving these hard problems. By implementing a compaction
+ strategy where we aggressively compact the latest partitions compared to older partitions, we could ensure the read optimized queries see data
+ published within X minutes in a consistent fashion.
+
+The intention of the merge on read table is to enable near real-time processing directly on top of DFS, as opposed to copying
+data out to specialized systems, which may not be able to handle the data volume. There are also a few secondary side benefits to 
+this table type, such as reduced write amplification by avoiding the synchronous merge of data, i.e., the amount of data written per 1 byte of incoming data in a batch.
+
+
diff --git a/website/docs/performance.md b/website/docs/performance.md
new file mode 100644
index 0000000..2977228
--- /dev/null
+++ b/website/docs/performance.md
@@ -0,0 +1,64 @@
+---
+title: Performance
+keywords: [ hudi, index, storage, compaction, cleaning, implementation]
+toc: false
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+In this section, we go over some real world performance numbers for Hudi upserts, incremental pull and compare them against
+the conventional alternatives for achieving these tasks. 
+
+## Upserts
+
+The following shows the speed up obtained for NoSQL database ingestion, from incrementally upserting on a Hudi table on copy-on-write storage,
+across 5 tables ranging from small to huge (as opposed to bulk loading the tables).
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_upsert_perf1.png").default} alt="hudi_upsert_perf1.png"  />
+</figure>
+
+Given Hudi can build the table incrementally, it also opens the door to scheduling ingestion more frequently, thus reducing latency, with
+significant savings on the overall compute cost.
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_upsert_perf2.png").default} alt="hudi_upsert_perf2.png"  />
+</figure>
+
+Hudi upserts have been stress tested up to 4TB in a single commit across the t1 table. 
+See [here](https://cwiki.apache.org/confluence/display/HUDI/Tuning+Guide) for some tuning tips.
+
+## Indexing
+
+In order to efficiently upsert data, Hudi needs to classify records in a write batch into inserts & updates (tagged with the file group 
+they belong to). In order to speed up this operation, Hudi employs a pluggable index mechanism that stores a mapping between the recordKey and 
+the file group id it belongs to. By default, Hudi uses a built-in index that uses file ranges and bloom filters to accomplish this, with
+up to 10x speed up over a spark join to do the same. 
+
+Hudi provides the best indexing performance when you model the recordKey to be monotonically increasing (e.g. timestamp prefix), leading to range pruning filtering
+out a lot of files for comparison. Even for UUID based keys, there are [known techniques](https://www.percona.com/blog/2014/12/19/store-uuid-optimized-way/) to achieve this.
+For e.g., with 100M timestamp prefixed keys (5% updates, 95% inserts) on an event table with 80B keys/3 partitions/11416 files/10TB data, the Hudi index achieves a 
+**~7X (2880 secs vs 440 secs) speed up** over a vanilla spark join. Even for a challenging workload like a '100% update' database ingestion workload spanning 
+3.25B UUID keys/30 partitions/6180 files using 300 cores, Hudi indexing offers an **80-100% speedup**.
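+
+For reference, a hedged sketch of how the index behaviour above is typically configured on the write path (the keys are standard Hudi index configs, the values shown are the defaults, and `upsertDF`, the table name, the key field and the path are placeholders):
+
+```scala
+// Sketch: the default bloom index with range pruning, which benefits from
+// monotonically increasing record keys such as timestamp-prefixed keys.
+upsertDF.write.format("hudi")
+  .option("hoodie.table.name", "events")
+  .option("hoodie.datasource.write.recordkey.field", "event_ts_uuid") // timestamp-prefixed key (placeholder)
+  .option("hoodie.index.type", "BLOOM")                               // built-in bloom filter based index
+  .option("hoodie.bloom.index.prune.by.ranges", "true")               // prune candidate files using key ranges
+  .mode("append")
+  .save("/path/to/events")
+```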
+
+## Snapshot Queries
+
+The major design goal for snapshot queries is to achieve the latency reduction & efficiency gains of the previous section,
+with no impact on queries. The following charts compare Hudi vs non-Hudi tables across Hive/Presto/Spark queries and demonstrate this.
+
+**Hive**
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_query_perf_hive.png").default} alt="hudi_query_perf_hive.png"  />
+</figure>
+
+**Spark**
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_query_perf_spark.png").default} alt="hudi_query_perf_spark.png"  />
+</figure>
+
+**Presto**
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_query_perf_presto.png").default} alt="hudi_query_perf_presto.png"  />
+</figure>
diff --git a/website/docs/privacy.md b/website/docs/privacy.md
new file mode 100644
index 0000000..dd8f78c
--- /dev/null
+++ b/website/docs/privacy.md
@@ -0,0 +1,22 @@
+---
+title: Privacy Policy
+keywords: [ hudi, privacy]
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+Information about your use of this website is collected using server access logs and a tracking cookie.
+The collected information consists of the following:
+
+* The IP address from which you access the website;
+* The type of browser and operating system you use to access our site;
+* The date and time you access our site;
+* The pages you visit;
+* The addresses of pages from where you followed a link to our site.
+
+Part of this information is gathered using a tracking cookie set by the [Google Analytics](http://www.google.com/analytics) service and handled by Google as described in their [privacy policy](http://www.google.com/privacy). See your browser documentation for instructions on how to disable the cookie if you prefer not to share this data with Google.
+
+We use the gathered information to help us make our site more useful to visitors and to better understand how and when our site is used. We do not track or collect personally identifiable information or associate gathered data with any personally identifying information from other sources.
+
+By using this website, you consent to the collection of this data in the manner and for the purpose described above.
+
+The Hudi development community welcomes your questions or comments regarding this Privacy Policy. Send them to dev@hudi.apache.org
diff --git a/website/docs/querying_data.md b/website/docs/querying_data.md
new file mode 100644
index 0000000..a07d4a6
--- /dev/null
+++ b/website/docs/querying_data.md
@@ -0,0 +1,273 @@
+---
+title: Querying Data
+keywords: [ hudi, hive, spark, sql, presto]
+summary: In this page, we go over how to enable SQL queries on Hudi built tables.
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+Conceptually, Hudi stores data physically once on DFS, while providing 3 different ways of querying, as explained [before](/docs/concepts#query-types). 
+Once the table is synced to the Hive metastore, it provides external Hive tables backed by Hudi's custom input formats. Once the proper Hudi
+bundle has been installed, the table can be queried by popular query engines like Hive, Spark SQL, the Spark Datasource API and PrestoDB.
+
+Specifically, the following Hive tables are registered based on the [table name](/docs/configurations#TABLE_NAME_OPT_KEY)
+and [table type](/docs/configurations#TABLE_TYPE_OPT_KEY) configs passed during the write.
+
+If `table name = hudi_trips` and `table type = COPY_ON_WRITE`, then we get: 
+ - `hudi_trips` supports snapshot query and incremental query on the table backed by `HoodieParquetInputFormat`, exposing purely columnar data.
+
+
+If `table name = hudi_trips` and `table type = MERGE_ON_READ`, then we get:
+ - `hudi_trips_rt` supports snapshot query and incremental query (providing near-real time data) on the table  backed by `HoodieParquetRealtimeInputFormat`, exposing merged view of base and log data.
+ - `hudi_trips_ro` supports read optimized query on the table backed by `HoodieParquetInputFormat`, exposing purely columnar data stored in base files.
+
+As discussed in the concepts section, the one key capability needed for [incremental processing](https://www.oreilly.com/ideas/ubers-case-for-incremental-processing-on-hadoop)
+is obtaining a change stream/log from a table. Hudi tables can be queried incrementally, which means you can get ALL and ONLY the updated & new rows 
+since a specified instant time. This, together with upserts, is particularly useful for building data pipelines where 1 or more source Hudi tables are incrementally queried (streams/facts),
+joined with other tables (tables/dimensions), to [write out deltas](/docs/writing_data) to a target Hudi table. Incremental queries are realized by querying one of the tables above, 
+with special configurations that indicate to query planning that only incremental data needs to be fetched out of the table. 
+
+
+## Support Matrix
+
+The following tables show whether a given query type is supported on a specific query engine.
+
+### Copy-On-Write tables
+  
+|Query Engine|Snapshot Queries|Incremental Queries|
+|------------|--------|-----------|
+|**Hive**|Y|Y|
+|**Spark SQL**|Y|Y|
+|**Spark Datasource**|Y|Y|
+|**Flink SQL**|Y|N|
+|**PrestoDB**|Y|N|
+|**Trino**|Y|N|
+|**Impala**|Y|N|
+
+
+Note that `Read Optimized` queries are not applicable for COPY_ON_WRITE tables.
+
+### Merge-On-Read tables
+
+|Query Engine|Snapshot Queries|Incremental Queries|Read Optimized Queries|
+|------------|--------|-----------|--------------|
+|**Hive**|Y|Y|Y|
+|**Spark SQL**|Y|Y|Y|
+|**Spark Datasource**|Y|Y|Y|
+|**Flink SQL**|Y|Y|Y|
+|**PrestoDB**|Y|N|Y|
+|**Trino**|N|N|Y|
+|**Impala**|N|N|Y|
+
+
+In the sections below, we discuss the specific setup needed to access different query types from different query engines.
+
+## Hive
+
+In order for Hive to recognize Hudi tables and query them correctly,
+ - the HiveServer2 needs to be provided with the `hudi-hadoop-mr-bundle-x.y.z-SNAPSHOT.jar` in its [aux jars path](https://www.cloudera.com/documentation/enterprise/5-6-x/topics/cm_mc_hive_udf#concept_nc3_mms_lr). This will ensure the input format 
+classes and their dependencies are available for query planning & execution. 
+ - For MERGE_ON_READ tables, the bundle additionally needs to be put on the Hadoop/Hive installation across the cluster, so that queries can pick up the custom RecordReader as well.
+
+In addition to the setup above, for Beeline CLI access, the `hive.input.format` variable needs to be set to the fully qualified name of the 
+input format `org.apache.hudi.hadoop.HoodieParquetInputFormat`. For Tez, additionally `hive.tez.input.format` needs to be set 
+to `org.apache.hadoop.hive.ql.io.HiveInputFormat`. Then proceed to query the table like any other Hive table.
+
+### Incremental query
+`HiveIncrementalPuller` allows incrementally extracting changes from large fact/dimension tables via HiveQL, combining the benefits of Hive (reliably processing complex SQL queries) and 
+incremental primitives (speeding up queries by scanning tables incrementally instead of fully). The tool uses Hive JDBC to run the Hive query and saves its results in a temp table
+that can later be upserted. The upsert utility (`HoodieDeltaStreamer`) has all the state it needs from the directory structure to know what the commit time on the target table should be,
+e.g: `/app/incremental-hql/intermediate/{source_table_name}_temp/{last_commit_included}`. The delta Hive table registered will be of the form `{tmpdb}.{source_table}_{last_commit_included}`.
+
+The following are the configuration options for HiveIncrementalPuller
+
+| **Config** | **Description** | **Default** |
+|-------|--------|--------|
+|hiveUrl| Hive Server 2 URL to connect to |  |
+|hiveUser| Hive Server 2 Username |  |
+|hivePass| Hive Server 2 Password |  |
+|queue| YARN Queue name |  |
+|tmp| Directory where the temporary delta data is stored in DFS. The directory structure will follow conventions. Please see the below section.  |  |
+|extractSQLFile| The SQL to execute on the source table to extract the data. The data extracted will be all the rows that changed since a particular point in time. |  |
+|sourceTable| Source Table Name. Needed to set hive environment properties. |  |
+|sourceDb| Source DB name. Needed to set hive environment properties.| |
+|targetTable| Target Table Name. Needed for the intermediate storage directory structure.  |  |
+|targetDb| Target table's DB name.| |
+|tmpdb| The database to which the intermediate temp delta table will be created | hoodie_temp |
+|fromCommitTime| This is the most important parameter. This is the point in time from which the changed records are queried from.  |  |
+|maxCommits| Number of commits to include in the query. Setting this to -1 will include all the commits from fromCommitTime. Setting this to a value > 0, will include records that ONLY changed in the specified number of commits after fromCommitTime. This may be needed if you need to catch up say 2 commits at a time. | 3 |
+|help| Utility Help |  |
+
+
+Setting fromCommitTime=0 and maxCommits=-1 will fetch the entire source table and can be used to initiate backfills. If the target table is a Hudi table,
+the utility can determine whether the target table has no commits or is behind by more than 24 hours (this is configurable);
+in that case it will automatically use the backfill configuration, since applying the last 24 hours of changes incrementally could take more time than doing a backfill. The current limitation of the tool
+is the lack of support for self-joining the same table in mixed mode (snapshot and incremental modes).
+
+**NOTE on Hive incremental queries that are executed using Fetch task:**
+Since Fetch tasks invoke InputFormat.listStatus() per partition, Hoodie metadata can be listed in
+every such listStatus() call. In order to avoid this, it might be useful to disable fetch tasks
+using the hive session property for incremental queries: `set hive.fetch.task.conversion=none;` This
+would ensure Map Reduce execution is chosen for a Hive query, which combines partitions (comma
+separated) and calls InputFormat.listStatus() only once with all those partitions.
+
+## Spark SQL
+Once the Hudi tables have been registered to the Hive metastore, they can be queried using the Spark-Hive integration. It supports all query types across both Hudi table types, 
+relying on the custom Hudi input formats again, like Hive. Typically notebook users and spark-shell users leverage Spark SQL for querying Hudi tables. Please add hudi-spark-bundle as described above via --jars or --packages.
+
+By default, Spark SQL will try to use its own parquet reader instead of the Hive SerDe when reading from Hive metastore parquet tables. However, for MERGE_ON_READ tables, which have 
+both parquet and avro data, this default setting needs to be turned off by setting `spark.sql.hive.convertMetastoreParquet=false`. 
+This will force Spark to fall back to using the Hive SerDe to read the data (planning/execution is still done by Spark). 
+
+```java
+$ spark-shell --driver-class-path /etc/hive/conf  --packages org.apache.hudi:hudi-spark-bundle_2.11:0.5.3,org.apache.spark:spark-avro_2.11:2.4.4 --conf spark.sql.hive.convertMetastoreParquet=false --num-executors 10 --driver-memory 7g --executor-memory 2g  --master yarn-client
+
+scala> sqlContext.sql("select count(*) from hudi_trips_mor_rt where datestr = '2016-10-02'").show()
+scala> sqlContext.sql("select count(*) from hudi_trips_mor_rt where datestr = '2016-10-02'").show()
+```
+
+For COPY_ON_WRITE tables, either the Hive SerDe can be used by setting `spark.sql.hive.convertMetastoreParquet=false` as described above, or Spark's built-in support can be leveraged. 
+If using Spark's built-in support, additionally a path filter needs to be pushed into the sparkContext as follows. This method retains Spark's built-in optimizations for reading parquet files, like vectorized reading, on Hudi Hive tables.
+
+```scala
+spark.sparkContext.hadoopConfiguration.setClass("mapreduce.input.pathFilter.class", classOf[org.apache.hudi.hadoop.HoodieROTablePathFilter], classOf[org.apache.hadoop.fs.PathFilter]);
+```
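+
+As an illustration, here is a minimal, hedged sketch of what a query looks like once the path filter is registered (the table path and partition depth below are hypothetical; adjust to your layout). With the filter in place, Spark's native parquet reader surfaces only the latest base file of each file group, i.e. the current snapshot of the COPY_ON_WRITE table.
+
+```scala
+// Hypothetical COPY_ON_WRITE table path and partition depth; adjust the glob to your layout.
+val cowTablePath = "/data/hudi_trips_cow"
+// With HoodieROTablePathFilter set above, Spark's parquet reader only sees the latest
+// file slice per file group, i.e. a consistent snapshot of the table.
+val snapshotDF = spark.read.parquet(cowTablePath + "/*/*/*")
+snapshotDF.createOrReplaceTempView("hudi_trips_cow_snapshot")
+spark.sql("select count(*) from hudi_trips_cow_snapshot").show()
+```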
+
+## Spark Datasource
+
+The Spark Datasource API is a popular way of authoring Spark ETL pipelines. Hudi COPY_ON_WRITE and MERGE_ON_READ tables can be queried via the Spark datasource similar to how standard 
+datasources work (e.g: `spark.read.parquet`). MERGE_ON_READ tables support snapshot querying and COPY_ON_WRITE tables support both snapshot and incremental querying via the Spark datasource. Typically Spark jobs require adding `--jars <path to jar>/hudi-spark-bundle_2.11-<hudi version>.jar` to the classpath of drivers 
+and executors. Alternatively, hudi-spark-bundle can also be fetched via the `--packages` option (e.g: `--packages org.apache.hudi:hudi-spark-bundle_2.11:0.5.3`).
+
+### Snapshot query {#spark-snap-query}
+This method can be used to retrieve the data table at the present point in time.
+Note: The file path must be suffixed with a number of wildcard asterisks (`/*`) one greater than the number of partition levels. E.g: with table file path "tablePath" partitioned by columns "a", "b", and "c", the load path must be `tablePath + "/*/*/*/*"`.
+
+```scala
+val hudiSnapshotQueryDF = spark
+     .read
+     .format("org.apache.hudi")
+     .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
+     .load(tablePath + "/*") // The number of wildcard asterisks here must be one greater than the number of partition levels
+```
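+
+As a small follow-up sketch (assuming the quickstart trips schema with fields like `fare`, `begin_lon`, `begin_lat` and `ts`), the snapshot DataFrame can then be registered as a temporary view and queried with Spark SQL:
+
+```scala
+// Register the snapshot DataFrame loaded above and run SQL on it
+hudiSnapshotQueryDF.createOrReplaceTempView("hudi_trips_snapshot")
+spark.sql("select _hoodie_commit_time, fare, begin_lon, begin_lat, ts from hudi_trips_snapshot where fare > 20.0").show()
+```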
+
+### Incremental query {#spark-incr-query}
+Of special interest to Spark pipelines is Hudi's ability to support incremental queries.
+Thanks to Hudi's support for record-level change streams, these incremental pipelines often offer 10x efficiency over their batch counterparts, by processing only the changed records.
+The following snippet shows how to obtain all records changed after `beginInstantTime` and run some SQL on them.
+
+```java
+ Dataset<Row> hudiIncQueryDF = spark.read()
+     .format("org.apache.hudi")
+     .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
+     .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), <beginInstantTime>)
+     .option(DataSourceReadOptions.INCR_PATH_GLOB_OPT_KEY(), "/year=2020/month=*/day=*") // Optional, use glob pattern if querying certain partitions
+     .load(tablePath); // For incremental query, pass in the root/base path of table
+     
+hudiIncQueryDF.createOrReplaceTempView("hudi_trips_incremental");
+spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_incremental where fare > 20.0").show();
+```
+
+For examples, refer to [Setup spark-shell in quickstart](/docs/quick-start-guide#setup-spark-shell). 
+Please refer to [configurations](/docs/configurations#spark-datasource) section, to view all datasource options.
+
+Additionally, `HoodieReadClient` offers the following functionality using Hudi's implicit indexing.
+
+| **API** | **Description** |
+|-------|--------|
+| read(keys) | Read out the data corresponding to the keys as a DataFrame, using Hudi's own index for faster lookup |
+| filterExists() | Filter out already existing records from the provided `RDD[HoodieRecord]`. Useful for de-duplication |
+| checkExists(keys) | Check if the provided keys exist in a Hudi table |
+
+## Flink SQL
+Once the Flink Hudi tables have been registered to the Flink catalog, they can be queried using Flink SQL. It supports all query types across both Hudi table types,
+relying again on Hudi's custom input formats, as with Hive. Typically notebook users and Flink SQL CLI users leverage Flink SQL for querying Hudi tables. Please add hudi-flink-bundle as described above via --jars.
+
+By default, Flink SQL will try to use its own parquet reader instead of Hive SerDe when reading from Hive metastore parquet tables.
+
+```bash
+# HADOOP_HOME is your hadoop root directory after unpacking the binary package.
+export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath`
+
+./bin/sql-client.sh embedded -j .../hudi-flink-bundle_2.1?-*.*.*.jar shell
+```
+
+```sql
+-- this defines a COPY_ON_WRITE table named 't1'
+CREATE TABLE t1(
+  uuid VARCHAR(20), -- you can use 'PRIMARY KEY NOT ENFORCED' syntax to specify the field as record key
+  name VARCHAR(10),
+  age INT,
+  ts TIMESTAMP(3),
+  `partition` VARCHAR(20)
+)
+PARTITIONED BY (`partition`)
+WITH (
+  'connector' = 'hudi',
+  'path' = 'table_base+path'
+);
+
+-- query the data
+select * from t1 where `partition` = 'par1';
+```
+
+Flink's built-in parquet support is used for both COPY_ON_WRITE and MERGE_ON_READ tables.
+Additionally, partition pruning is applied by the Flink engine internally if a partition path is specified
+in the filter. Filter push-down is not supported yet (it is already on the roadmap).
+
+For MERGE_ON_READ tables, in order to query the Hudi table as a stream, you need to add the option `'read.streaming.enabled' = 'true'`.
+When querying the table, a Flink streaming pipeline starts and never ends until the user cancels the job manually.
+You can specify the start commit with the option `read.streaming.start-commit` and the source monitoring interval with the option
+`read.streaming.check-interval`.
+
+## PrestoDB
+
+PrestoDB is a popular query engine, providing interactive query performance. PrestoDB currently supports snapshot querying on COPY_ON_WRITE tables. 
+Both snapshot and read optimized queries are supported on MERGE_ON_READ Hudi tables. Since the PrestoDB-Hudi integration has evolved over time, the installation
+instructions for PrestoDB vary based on the version. Please check the table below for the query types supported and installation instructions 
+for different versions of PrestoDB.
+
+
+| **PrestoDB Version** | **Installation description** | **Query types supported** |
+|----------------------|------------------------------|---------------------------|
+| < 0.233              | Requires the `hudi-presto-bundle` jar to be placed into `<presto_install>/plugin/hive-hadoop2/`, across the installation. | Snapshot querying on COW tables. Read optimized querying on MOR tables. |
+| >= 0.233             | No action needed. Hudi (0.5.1-incubating) is a compile time dependency. | Snapshot querying on COW tables. Read optimized querying on MOR tables. |
+| >= 0.240             | No action needed. Hudi 0.5.3 version is a compile time dependency. | Snapshot querying on both COW and MOR tables |
+
+## Trino
+
+[Trino](https://trino.io/) (formerly PrestoSQL) was forked off PrestoDB a few years ago. At the moment, Hudi supports snapshot queries on Copy-On-Write tables & Read Optimized queries
+on Merge-On-Read tables, through the initial input format based integration in PrestoDB (pre-forking). This approach has
+known performance limitations with very large tables, which have since been fixed on PrestoDB. We are working on replicating the same fixes on Trino as well.
+
+To query Hudi tables on Trino, please place the `hudi-presto-bundle` jar into the Hive connector installation `<trino_install>/plugin/hive-hadoop2`.
+
+## Impala (3.4 or later)
+
+### Snapshot Query
+
+Impala is able to query a Hudi Copy-on-Write table as an [EXTERNAL TABLE](https://docs.cloudera.com/documentation/enterprise/6/6.3/topics/impala_tables#external_tables) on HDFS.
+
+To create a Hudi read optimized table on Impala:
+```
+CREATE EXTERNAL TABLE database.table_name
+LIKE PARQUET '/path/to/load/xxx.parquet'
+STORED AS HUDIPARQUET
+LOCATION '/path/to/load';
+```
+Impala is able to take advantage of the physical partition structure to improve query performance.
+To create a partitioned table, the folder should follow a naming convention like `year=2020/month=1`.
+Impala uses `=` to separate the partition name and partition value.
+To create a partitioned Hudi read optimized table on Impala:
+```
+CREATE EXTERNAL TABLE database.table_name
+LIKE PARQUET '/path/to/load/xxx.parquet'
+PARTITION BY (year int, month int, day int)
+STORED AS HUDIPARQUET
+LOCATION '/path/to/load';
+ALTER TABLE database.table_name RECOVER PARTITIONS;
+```
+After Hudi makes a new commit, refresh the Impala table to get the latest results.
+```
+REFRESH database.table_name
+```
diff --git a/website/docs/quick-start-guide.md b/website/docs/quick-start-guide.md
new file mode 100644
index 0000000..5549be4
--- /dev/null
+++ b/website/docs/quick-start-guide.md
@@ -0,0 +1,594 @@
+---
+title: "Quick Start Guide - Spark"
+sidebar_label: "Spark Guide"
+sidebar_position: 2
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+This guide provides a quick peek at Hudi's capabilities using spark-shell. Using Spark datasources, we will walk through 
+code snippets that allow you to insert and update a Hudi table of the default table type: 
+[Copy on Write](/docs/concepts#copy-on-write-table). 
+After each write operation we will also show how to read the data both as a snapshot and incrementally.
+
+## Setup
+
+Hudi works with Spark-2.4.3+ & Spark 3.x versions. You can follow instructions [here](https://spark.apache.org/downloads) for setting up spark. 
+From the extracted directory run spark-shell with Hudi as:
+
+<Tabs
+defaultValue="scala"
+values={[
+{ label: 'Scala', value: 'scala', },
+{ label: 'Python', value: 'python', },
+]}>
+<TabItem value="scala">
+
+```scala
+// spark-shell for spark 3
+spark-shell \
+  --packages org.apache.hudi:hudi-spark3-bundle_2.12:0.8.0,org.apache.spark:spark-avro_2.12:3.0.1 \
+  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+  
+// spark-shell for spark 2 with scala 2.12
+spark-shell \
+  --packages org.apache.hudi:hudi-spark-bundle_2.12:0.8.0,org.apache.spark:spark-avro_2.12:2.4.4 \
+  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+  
+// spark-shell for spark 2 with scala 2.11
+spark-shell \
+  --packages org.apache.hudi:hudi-spark-bundle_2.11:0.8.0,org.apache.spark:spark-avro_2.11:2.4.4 \
+  --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+```
+
+</TabItem>
+<TabItem value="python">
+
+```python
+# pyspark
+export PYSPARK_PYTHON=$(which python3)
+
+# for spark3
+pyspark
+--packages org.apache.hudi:hudi-spark3-bundle_2.12:0.8.0,org.apache.spark:spark-avro_2.12:3.0.1
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+
+# for spark2 with scala 2.12
+pyspark
+--packages org.apache.hudi:hudi-spark-bundle_2.12:0.8.0,org.apache.spark:spark-avro_2.12:2.4.4
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+
+# for spark2 with scala 2.11
+pyspark
+--packages org.apache.hudi:hudi-spark-bundle_2.11:0.8.0,org.apache.spark:spark-avro_2.11:2.4.4
+--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
+```
+
+</TabItem>
+</Tabs>
+
+:::note Please note the following
+<ul>
+  <li>spark-avro module needs to be specified in --packages as it is not included with spark-shell by default</li>
+  <li>spark-avro and spark versions must match (we have used 3.0.1 for both above)</li>
+  <li>we have used hudi-spark-bundle built for scala 2.12 since the spark-avro module used also depends on 2.12. 
+         If spark-avro_2.11 is used, correspondingly hudi-spark-bundle_2.11 needs to be used. </li>
+</ul>
+:::
+
+Setup table name, base path and a data generator to generate records for this guide.
+
+<Tabs
+defaultValue="scala"
+values={[
+{ label: 'Scala', value: 'scala', },
+{ label: 'Python', value: 'python', },
+]}>
+<TabItem value="scala">
+
+```scala
+// spark-shell
+import org.apache.hudi.QuickstartUtils._
+import scala.collection.JavaConversions._
+import org.apache.spark.sql.SaveMode._
+import org.apache.hudi.DataSourceReadOptions._
+import org.apache.hudi.DataSourceWriteOptions._
+import org.apache.hudi.config.HoodieWriteConfig._
+
+val tableName = "hudi_trips_cow"
+val basePath = "file:///tmp/hudi_trips_cow"
+val dataGen = new DataGenerator
+```
+
+</TabItem>
+<TabItem value="python">
+
+```python
+# pyspark
+tableName = "hudi_trips_cow"
+basePath = "file:///tmp/hudi_trips_cow"
+dataGen = sc._jvm.org.apache.hudi.QuickstartUtils.DataGenerator()
+```
+
+</TabItem>
+</Tabs>
+
+:::tip
+The [DataGenerator](https://github.com/apache/hudi/blob/master/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java#L50) 
+can generate sample inserts and updates based on the sample trip schema [here](https://github.com/apache/hudi/blob/master/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java#L57).
+:::
+
+
+## Insert data
+
+Generate some new trips, load them into a DataFrame and write the DataFrame into the Hudi table as below.
+
+<Tabs
+defaultValue="scala"
+values={[
+{ label: 'Scala', value: 'scala', },
+{ label: 'Python', value: 'python', },
+]}>
+<TabItem value="scala">
+
+```scala
+// spark-shell
+val inserts = convertToStringList(dataGen.generateInserts(10))
+val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
+df.write.format("hudi").
+  options(getQuickstartWriteConfigs).
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  mode(Overwrite).
+  save(basePath)
+``` 
+
+</TabItem>
+<TabItem value="python">
+
+```python
+# pyspark
+inserts = sc._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(dataGen.generateInserts(10))
+df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
+
+hudi_options = {
+    'hoodie.table.name': tableName,
+    'hoodie.datasource.write.recordkey.field': 'uuid',
+    'hoodie.datasource.write.partitionpath.field': 'partitionpath',
+    'hoodie.datasource.write.table.name': tableName,
+    'hoodie.datasource.write.operation': 'upsert',
+    'hoodie.datasource.write.precombine.field': 'ts',
+    'hoodie.upsert.shuffle.parallelism': 2,
+    'hoodie.insert.shuffle.parallelism': 2
+}
+
+df.write.format("hudi").
+    options(**hudi_options).
+    mode("overwrite").
+    save(basePath)
+```
+
+</TabItem>
+</Tabs>
+
+:::info
+`mode(Overwrite)` overwrites and recreates the table if it already exists.
+You can check the data generated under `/tmp/hudi_trips_cow/<region>/<country>/<city>/`. We provided a record key 
+(`uuid` in [schema](https://github.com/apache/hudi/blob/master/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java#L58)), partition field (`region/country/city`) and combine logic (`ts` in 
+[schema](https://github.com/apache/hudi/blob/master/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java#L58)) to ensure trip records are unique within each partition. For more info, refer to 
+[Modeling data stored in Hudi](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=113709185#FAQ-HowdoImodelthedatastoredinHudi)
+and for info on ways to ingest data into Hudi, refer to [Writing Hudi Tables](/docs/writing_data).
+Here we are using the default write operation: `upsert`. If you have a workload without updates, you can also issue 
+`insert` or `bulk_insert` operations which could be faster. To know more, refer to [Write operations](/docs/writing_data#write-operations)
+:::
+
+## Query data 
+
+Load the data files into a DataFrame.
+
+<Tabs
+defaultValue="scala"
+values={[
+{ label: 'Scala', value: 'scala', },
+{ label: 'Python', value: 'python', },
+]}>
+<TabItem value="scala">
+
+```scala
+// spark-shell
+val tripsSnapshotDF = spark.
+  read.
+  format("hudi").
+  load(basePath + "/*/*/*/*")
+//load(basePath) use "/partitionKey=partitionValue" folder structure for Spark auto partition discovery
+tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot")
+
+spark.sql("select fare, begin_lon, begin_lat, ts from  hudi_trips_snapshot where fare > 20.0").show()
+spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from  hudi_trips_snapshot").show()
+```
+
+</TabItem>
+<TabItem value="python">
+
+```python
+# pyspark
+tripsSnapshotDF = spark. \
+  read. \
+  format("hudi"). \
+  load(basePath + "/*/*/*/*")
+# load(basePath) use "/partitionKey=partitionValue" folder structure for Spark auto partition discovery
+
+tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot")
+
+spark.sql("select fare, begin_lon, begin_lat, ts from  hudi_trips_snapshot where fare > 20.0").show()
+spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from  hudi_trips_snapshot").show()
+```
+
+</TabItem>
+</Tabs>
+
+:::info
+This query provides snapshot querying of the ingested data. Since our partition path (`region/country/city`) is 3 levels nested 
+from the base path, we've used `load(basePath + "/*/*/*/*")`. 
+Refer to [Table types and queries](/docs/concepts#table-types--queries) for more info on all table types and query types supported.
+:::
+
+## Update data
+
+This is similar to inserting new data. Generate updates to existing trips using the data generator, load into a DataFrame 
+and write DataFrame into the hudi table.
+
+<Tabs
+defaultValue="scala"
+values={[
+{ label: 'Scala', value: 'scala', },
+{ label: 'Python', value: 'python', },
+]}>
+<TabItem value="scala">
+
+```scala
+// spark-shell
+val updates = convertToStringList(dataGen.generateUpdates(10))
+val df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
+df.write.format("hudi").
+  options(getQuickstartWriteConfigs).
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  mode(Append).
+  save(basePath)
+```
+
+</TabItem>
+<TabItem value="python">
+
+```python
+# pyspark
+updates = sc._jvm.org.apache.hudi.QuickstartUtils.convertToStringList(dataGen.generateUpdates(10))
+df = spark.read.json(spark.sparkContext.parallelize(updates, 2))
+df.write.format("hudi"). \
+  options(**hudi_options). \
+  mode("append"). \
+  save(basePath)
+```
+
+</TabItem>
+</Tabs>
+
+:::note
+Notice that the save mode is now `Append`. In general, always use append mode unless you are trying to create the table for the first time.
+[Querying](#query-data) the data again will now show updated trips. Each write operation generates a new [commit](/docs/concepts) 
+denoted by the timestamp. Look for changes in the `_hoodie_commit_time`, `rider`, and `driver` fields for the same `_hoodie_record_key`s as in the previous commit. 
+:::
+
+## Incremental query
+
+Hudi also provides the capability to obtain a stream of records that changed since a given commit timestamp. 
+This can be achieved using Hudi's incremental querying and providing a begin time from which changes need to be streamed. 
+We do not need to specify endTime if we want all changes after the given commit (as is the common case). 
+
+<Tabs
+defaultValue="scala"
+values={[
+{ label: 'Scala', value: 'scala', },
+{ label: 'Python', value: 'python', },
+]}>
+<TabItem value="scala">
+
+```scala
+// spark-shell
+// reload data
+spark.
+  read.
+  format("hudi").
+  load(basePath + "/*/*/*/*").
+  createOrReplaceTempView("hudi_trips_snapshot")
+
+val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from  hudi_trips_snapshot order by commitTime").map(k => k.getString(0)).take(50)
+val beginTime = commits(commits.length - 2) // commit time we are interested in
+
+// incrementally query data
+val tripsIncrementalDF = spark.read.format("hudi").
+  option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL).
+  option(BEGIN_INSTANTTIME_OPT_KEY, beginTime).
+  load(basePath)
+tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental")
+
+spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from  hudi_trips_incremental where fare > 20.0").show()
+``` 
+
+</TabItem>
+<TabItem value="python">
+
+```python
+# pyspark
+# reload data
+spark. \
+  read. \
+  format("hudi"). \
+  load(basePath + "/*/*/*/*"). \
+  createOrReplaceTempView("hudi_trips_snapshot")
+
+commits = list(map(lambda row: row[0], spark.sql("select distinct(_hoodie_commit_time) as commitTime from  hudi_trips_snapshot order by commitTime").limit(50).collect()))
+beginTime = commits[len(commits) - 2] # commit time we are interested in
+
+# incrementally query data
+incremental_read_options = {
+  'hoodie.datasource.query.type': 'incremental',
+  'hoodie.datasource.read.begin.instanttime': beginTime,
+}
+
+tripsIncrementalDF = spark.read.format("hudi"). \
+  options(**incremental_read_options). \
+  load(basePath)
+tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental")
+
+spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from  hudi_trips_incremental where fare > 20.0").show()
+```
+
+</TabItem>
+</Tabs>
+
+:::info
+This will give all changes that happened after the beginTime commit with the filter of fare > 20.0. The unique thing about this
+feature is that it now lets you author streaming pipelines on batch data.
+:::
+
+## Point in time query
+
+Let's look at how to query data as of a specific time. The specific time can be represented by pointing endTime to a 
+specific commit time and beginTime to "000" (denoting the earliest possible commit time). 
+
+<Tabs
+defaultValue="scala"
+values={[
+{ label: 'Scala', value: 'scala', },
+{ label: 'Python', value: 'python', },
+]}>
+<TabItem value="scala">
+
+```scala
+// spark-shell
+val beginTime = "000" // Represents all commits > this time.
+val endTime = commits(commits.length - 2) // commit time we are interested in
+
+//incrementally query data
+val tripsPointInTimeDF = spark.read.format("hudi").
+  option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_INCREMENTAL_OPT_VAL).
+  option(BEGIN_INSTANTTIME_OPT_KEY, beginTime).
+  option(END_INSTANTTIME_OPT_KEY, endTime).
+  load(basePath)
+tripsPointInTimeDF.createOrReplaceTempView("hudi_trips_point_in_time")
+spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_point_in_time where fare > 20.0").show()
+```
+
+</TabItem>
+<TabItem value="python">
+
+```python
+# pyspark
+beginTime = "000" # Represents all commits > this time.
+endTime = commits[len(commits) - 2]
+
+# query point in time data
+point_in_time_read_options = {
+  'hoodie.datasource.query.type': 'incremental',
+  'hoodie.datasource.read.end.instanttime': endTime,
+  'hoodie.datasource.read.begin.instanttime': beginTime
+}
+
+tripsPointInTimeDF = spark.read.format("hudi"). \
+  options(**point_in_time_read_options). \
+  load(basePath)
+
+tripsPointInTimeDF.createOrReplaceTempView("hudi_trips_point_in_time")
+spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_point_in_time where fare > 20.0").show()
+```
+
+</TabItem>
+</Tabs>
+
+## Delete data {#deletes}
+Delete records for the HoodieKeys passed in.
+
+<Tabs
+defaultValue="scala"
+values={[
+{ label: 'Scala', value: 'scala', },
+{ label: 'Python', value: 'python', },
+]}>
+<TabItem value="scala">
+
+```scala
+// spark-shell
+// fetch total records count
+spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
+// fetch two records to be deleted
+val ds = spark.sql("select uuid, partitionpath from hudi_trips_snapshot").limit(2)
+
+// issue deletes
+val deletes = dataGen.generateDeletes(ds.collectAsList())
+val df = spark.read.json(spark.sparkContext.parallelize(deletes, 2))
+
+df.write.format("hudi").
+  options(getQuickstartWriteConfigs).
+  option(OPERATION_OPT_KEY,"delete").
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  mode(Append).
+  save(basePath)
+
+// run the same read query as above.
+val roAfterDeleteViewDF = spark.
+  read.
+  format("hudi").
+  load(basePath + "/*/*/*/*")
+
+roAfterDeleteViewDF.registerTempTable("hudi_trips_snapshot")
+// fetch should return (total - 2) records
+spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
+```
+
+</TabItem>
+<TabItem value="python">
+
+```python
+# pyspark
+# fetch total records count
+spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
+# fetch two records to be deleted
+ds = spark.sql("select uuid, partitionpath from hudi_trips_snapshot").limit(2)
+
+# issue deletes
+hudi_delete_options = {
+  'hoodie.table.name': tableName,
+  'hoodie.datasource.write.recordkey.field': 'uuid',
+  'hoodie.datasource.write.partitionpath.field': 'partitionpath',
+  'hoodie.datasource.write.table.name': tableName,
+  'hoodie.datasource.write.operation': 'delete',
+  'hoodie.datasource.write.precombine.field': 'ts',
+  'hoodie.upsert.shuffle.parallelism': 2, 
+  'hoodie.insert.shuffle.parallelism': 2
+}
+
+from pyspark.sql.functions import lit
+deletes = list(map(lambda row: (row[0], row[1]), ds.collect()))
+df = spark.sparkContext.parallelize(deletes).toDF(['uuid', 'partitionpath']).withColumn('ts', lit(0.0))
+df.write.format("hudi"). \
+  options(**hudi_delete_options). \
+  mode("append"). \
+  save(basePath)
+
+# run the same read query as above.
+roAfterDeleteViewDF = spark. \
+  read. \
+  format("hudi"). \
+  load(basePath + "/*/*/*/*") 
+roAfterDeleteViewDF.registerTempTable("hudi_trips_snapshot")
+# fetch should return (total - 2) records
+spark.sql("select uuid, partitionpath from hudi_trips_snapshot").count()
+```
+
+</TabItem>
+</Tabs>
+
+:::note
+Only `Append` mode is supported for the delete operation.
+:::
+
+See the [deletion section](/docs/writing_data#deletes) of the writing data page for more details.
+
+## Insert Overwrite Table
+
+Generate some new trips and overwrite the table logically at the Hudi metadata level. The Hudi cleaner will eventually
+clean up the previous table snapshot's file groups. This can be faster than deleting the older table and recreating it 
+in `Overwrite` mode.
+
+```scala
+// spark-shell
+spark.
+  read.format("hudi").
+  load(basePath + "/*/*/*/*").
+  select("uuid","partitionpath").
+  show(10, false)
+
+val inserts = convertToStringList(dataGen.generateInserts(10))
+val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
+df.write.format("hudi").
+  options(getQuickstartWriteConfigs).
+  option(OPERATION_OPT_KEY,"insert_overwrite_table").
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  mode(Append).
+  save(basePath)
+
+// Should have different keys now, from query before.
+spark.
+  read.format("hudi").
+  load(basePath + "/*/*/*/*").
+  select("uuid","partitionpath").
+  show(10, false)
+
+``` 
+
+## Insert Overwrite 
+
+Generate some new trips and overwrite all the partitions that are present in the input. This operation can be faster
+than `upsert` for batch ETL jobs that recompute entire target partitions at once (as opposed to incrementally
+updating the target tables). This is because we are able to completely bypass indexing, precombining and other repartitioning 
+steps in the upsert write path.
+
+```scala
+// spark-shell
+spark.
+  read.format("hudi").
+  load(basePath + "/*/*/*/*").
+  select("uuid","partitionpath").
+  sort("partitionpath","uuid").
+  show(100, false)
+
+val inserts = convertToStringList(dataGen.generateInserts(10))
+val df = spark.
+  read.json(spark.sparkContext.parallelize(inserts, 2)).
+  filter("partitionpath = 'americas/united_states/san_francisco'")
+df.write.format("hudi").
+  options(getQuickstartWriteConfigs).
+  option(OPERATION_OPT_KEY,"insert_overwrite").
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  mode(Append).
+  save(basePath)
+
+// Should have different keys now for San Francisco alone, from query before.
+spark.
+  read.format("hudi").
+  load(basePath + "/*/*/*/*").
+  select("uuid","partitionpath").
+  sort("partitionpath","uuid").
+  show(100, false)
+```
+
+## Where to go from here?
+
+You can also do the quickstart by [building hudi yourself](https://github.com/apache/hudi#building-apache-hudi-from-source), 
+and using `--jars <path to hudi_code>/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.1?-*.*.*-SNAPSHOT.jar` in the spark-shell command above
+instead of `--packages org.apache.hudi:hudi-spark3-bundle_2.12:0.8.0`. Hudi also supports scala 2.12. Refer to [build with scala 2.12](https://github.com/apache/hudi#build-with-scala-212)
+for more info.
+
+Also, we used Spark here to showcase the capabilities of Hudi. However, Hudi can support multiple table types/query types and 
+Hudi tables can be queried from query engines like Hive, Spark, Presto and much more. We have put together a 
+[demo video](https://www.youtube.com/watch?v=VhNgUsxdrD0) that showcases all of this on a Docker-based setup with all 
+dependent systems running locally. We recommend you replicate the same setup and run the demo yourself, by following the 
+steps [here](/docs/docker_demo) to get a taste for it. Also, if you are looking for ways to migrate your existing data 
+to Hudi, refer to the [migration guide](/docs/migration_guide). 
diff --git a/website/docs/s3_hoodie.md b/website/docs/s3_hoodie.md
new file mode 100644
index 0000000..c39c73c
--- /dev/null
+++ b/website/docs/s3_hoodie.md
@@ -0,0 +1,80 @@
+---
+title: AWS S3 
+keywords: [ hudi, hive, aws, s3, spark, presto]
+summary: In this page, we go over how to configure Hudi with S3 filesystem.
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+In this page, we explain how to get your Hudi Spark job to store data into AWS S3.
+
+## AWS configs
+
+There are two configurations required for Hudi-S3 compatibility:
+
+- Adding AWS Credentials for Hudi
+- Adding required Jars to classpath
+
+### AWS Credentials
+
+The simplest way to use Hudi with S3 is to configure your `SparkSession` or `SparkContext` with S3 credentials. Hudi will automatically pick these up and talk to S3.
+
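+For example, here is a minimal sketch of doing this on the `SparkSession` (the use of the `s3a` scheme and the environment variables below are assumptions; adjust them to your deployment):
+
+```scala
+// Hypothetical sketch: wire S3 credentials into the active SparkSession's Hadoop configuration.
+// Hudi reuses this configuration when reading/writing files under an s3a:// base path.
+val hadoopConf = spark.sparkContext.hadoopConfiguration
+hadoopConf.set("fs.s3a.access.key", sys.env("AWS_ACCESS_KEY_ID"))
+hadoopConf.set("fs.s3a.secret.key", sys.env("AWS_SECRET_ACCESS_KEY"))
+hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
+
+// Then use an S3 base path when writing/reading Hudi tables, e.g. "s3a://my-bucket/hudi_trips_cow"
+```
+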
+Alternatively, add the required configs in your core-site.xml from where Hudi can fetch them. Replace the `fs.defaultFS` with your S3 bucket name and Hudi should be able to read/write from the bucket.
+
+```xml
+  <property>
+      <name>fs.defaultFS</name>
+      <value>s3://ysharma</value>
+  </property>
+
+  <property>
+      <name>fs.s3.impl</name>
+      <value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value>
+  </property>
+
+  <property>
+      <name>fs.s3.awsAccessKeyId</name>
+      <value>AWS_KEY</value>
+  </property>
+
+  <property>
+       <name>fs.s3.awsSecretAccessKey</name>
+       <value>AWS_SECRET</value>
+  </property>
+
+  <property>
+       <name>fs.s3n.awsAccessKeyId</name>
+       <value>AWS_KEY</value>
+  </property>
+
+  <property>
+       <name>fs.s3n.awsSecretAccessKey</name>
+       <value>AWS_SECRET</value>
+  </property>
+```
+
+
+Utilities such as the hudi-cli or the DeltaStreamer tool can pick up S3 credentials via environment variables prefixed with `HOODIE_ENV_`. For example, below is a bash snippet to set up
+such variables and then have the CLI be able to work on datasets stored in S3.
+
+```bash
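+# Note: Hudi maps the "_DOT_" token in these variable names back to "." when building the
+# corresponding Hadoop config key, e.g. HOODIE_ENV_fs_DOT_s3a_DOT_access_DOT_key -> fs.s3a.access.key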
+export HOODIE_ENV_fs_DOT_s3a_DOT_access_DOT_key=$accessKey
+export HOODIE_ENV_fs_DOT_s3a_DOT_secret_DOT_key=$secretKey
+export HOODIE_ENV_fs_DOT_s3_DOT_awsAccessKeyId=$accessKey
+export HOODIE_ENV_fs_DOT_s3_DOT_awsSecretAccessKey=$secretKey
+export HOODIE_ENV_fs_DOT_s3n_DOT_awsAccessKeyId=$accessKey
+export HOODIE_ENV_fs_DOT_s3n_DOT_awsSecretAccessKey=$secretKey
+export HOODIE_ENV_fs_DOT_s3n_DOT_impl=org.apache.hadoop.fs.s3a.S3AFileSystem
+```
+
+
+
+### AWS Libs
+
+AWS Hadoop libraries to add to the classpath:
+
+ - com.amazonaws:aws-java-sdk:1.10.34
+ - org.apache.hadoop:hadoop-aws:2.7.3
+
+AWS Glue Data Catalog libraries are needed if the AWS Glue Data Catalog is used:
+
+ - com.amazonaws.glue:aws-glue-datacatalog-hive2-client:1.11.0
+ - com.amazonaws:aws-java-sdk-glue:1.11.475
\ No newline at end of file
diff --git a/website/docs/structure.md b/website/docs/structure.md
new file mode 100644
index 0000000..137520d
--- /dev/null
+++ b/website/docs/structure.md
@@ -0,0 +1,20 @@
+---
+title: Structure
+keywords: [ big data, stream processing, cloud, hdfs, storage, upserts, change capture]
+summary: "Hudi brings stream processing to big data, providing fresh data while being an order of magnitude efficient over traditional batch processing."
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+Hudi (pronounced “Hoodie”) ingests & manages storage of large analytical tables over DFS ([HDFS](http://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign) or cloud stores) and provides three types of queries.
+
+ * **Read Optimized query** - Provides excellent query performance on pure columnar storage, much like plain [Parquet](https://parquet.apache.org/) tables.
+ * **Incremental query** - Provides a change stream out of the dataset to feed downstream jobs/ETLs.
+ * **Snapshot query** - Provides queries on real-time data, using a combination of columnar & row based storage (e.g Parquet + [Avro](http://avro.apache.org/docs/current/mr))
+
+<figure>
+    <img className="docimage" src={require("/assets/images/hudi_intro_1.png").default} alt="hudi_intro_1.png" />
+</figure>
+
+By carefully managing how data is laid out in storage & how it’s exposed to queries, Hudi is able to power a rich data ecosystem where external sources can be ingested in near real-time and made available for interactive SQL Engines like [PrestoDB](https://prestodb.io) & [Spark](https://spark.apache.org/sql/), while at the same time capable of being consumed incrementally from processing/ETL frameworks like [Hive](https://hive.apache.org/) & [Spark](https://spark.apache.org/docs/latest/) [...]
+
+Hudi broadly consists of a self contained Spark library to build tables and integrations with existing query engines for data access. See [quickstart](/docs/quick-start-guide) for a demo.
diff --git a/website/docs/use_cases.md b/website/docs/use_cases.md
new file mode 100644
index 0000000..124cabe
--- /dev/null
+++ b/website/docs/use_cases.md
@@ -0,0 +1,81 @@
+---
+title: "Use Cases"
+keywords: [ hudi, data ingestion, etl, real time, use cases]
+summary: "Following are some sample use-cases for Hudi, which illustrate the benefits in terms of faster processing & increased efficiency"
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+## Near Real-Time Ingestion
+
+Hudi offers some great benefits across ingestion of all kinds. Hudi helps __enforce a minimum file size on DFS__. This helps
+solve the ["small files problem"](https://blog.cloudera.com/blog/2009/02/the-small-files-problem/) for HDFS and Cloud Stores alike,
+significantly improving query performance. Hudi adds the much needed ability to atomically commit new data, shielding queries from
+ever seeing partial writes and helping ingestion recover gracefully from failures.
+
+Ingesting data from OLTP sources (event logs, databases, external sources) into a [Data Lake](http://martinfowler.com/bliki/DataLake) is a common problem
+that is unfortunately solved in a piecemeal fashion, using a medley of ingestion tools. This "raw data" layer of the data lake often forms the bedrock on which
+more value is created.
+
+For RDBMS ingestion, Hudi provides __faster loads via Upserts__, as opposed to costly & inefficient bulk loads. It's very common to use a change capture solution like
+[Debezium](http://debezium.io/) or [Kafka Connect](https://docs.confluent.io/platform/current/connect/index) or 
+[Sqoop Incremental Import](https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide#_incremental_imports) and apply them to an
+equivalent Hudi table on DFS. For NoSQL datastores like [Cassandra](http://cassandra.apache.org/) / [Voldemort](http://www.project-voldemort.com/voldemort/) / [HBase](https://hbase.apache.org/), 
+even moderately big installations store billions of rows. It goes without saying that __full bulk loads are simply infeasible__ and more efficient approaches 
+are needed if ingestion is to keep up with the typically high update volumes.
+
+Even for immutable data sources like [Kafka](https://kafka.apache.org), there is often a need to de-duplicate the incoming events against what's stored on DFS.
+Hudi achieves this by [employing indexes](http://hudi.apache.org/blog/hudi-indexing-mechanisms/) of different kinds, quickly and efficiently.
+
+All of this is seamlessly achieved by the Hudi DeltaStreamer tool, which is maintained in tight integration with the rest of the codebase, 
+and we are always trying to add more capture sources to make this easier for users. The tool also has a continuous mode, where it
+can self-manage clustering/compaction asynchronously, without blocking ingestion, significantly improving data freshness.
+
+## Data Deletion
+
+Hudi also offers the ability to delete data stored in the data lake, and moreover provides efficient ways of dealing with 
+the large write amplification resulting from random deletes based on user_id (or any secondary key), by way of the `Merge On Read` table type.
+Hudi's elegant log-based concurrency control ensures that ingestion/writing can continue happening, as a background compaction job
+amortizes the cost of rewriting data/enforcing deletes.
+
+Hudi also unlocks special capabilities like data clustering, which allows users to optimize the data layout for deletions. Specifically,
+users can cluster older event log data based on user_id, such that queries that evaluate candidates for data deletion can do so efficiently, while
+more recent partitions are optimized for query performance and clustered on, say, timestamp.
+
+## Unified Storage For Analytics
+
+The world we live in is polarized - even on data analytics storage - into real-time and offline/batch storage. Typically, real-time [datamarts](https://en.wikipedia.org/wiki/Data_mart) 
+are powered by specialized analytical stores such as [Druid](http://druid.io/) or [Memsql](http://www.memsql.com/) or [Clickhouse](https://clickhouse.tech/), fed by event buses like
+[Kafka](https://kafka.apache.org) or [Pulsar](https://pulsar.apache.org). This model is prohibitively expensive, unless only a small fraction of your data lake data 
+needs sub-second query responses, such as for system monitoring or interactive real-time analysis.
+
+The same data gets ingested into data lake storage much later (say every few hours or so) and then runs through batch ETL pipelines, with data freshness too poor 
+to do any kind of near-real-time analytics. On the other hand, data lakes provide access to interactive SQL engines like Presto/SparkSQL, which can scale horizontally 
+easily and return even more complex queries within a few seconds. 
+
+By bringing streaming primitives to data lake storage, Hudi opens up new possibilities by being able to ingest data within a few minutes and also author incremental data
+pipelines that are orders of magnitude faster than traditional batch processing. By bringing __data freshness to a few minutes__, Hudi can provide a much more efficient alternative 
+for a large class of data applications, compared to real-time datamarts. Also, Hudi requires no upfront server infrastructure investments
+and thus enables faster analytics on much fresher data, without increasing the operational overhead. This external [article](https://www.analyticsinsight.net/can-big-data-solutions-be-affordable/) 
+further validates this newer model.
+
+## Incremental Processing Pipelines
+
+Data Lake ETL typically involves building a chain of tables derived from each other via DAGs expressed as workflows. Workflows often depend on new data being output by 
+multiple upstream workflows and traditionally, availability of new data is indicated by a new DFS Folder/Hive Partition.
+Let's take a concrete example to illustrate this. An upstream workflow `U` can create a Hive partition for every hour, with data for that hour (event_time) at the end of each hour (processing_time), providing effective freshness of 1 hour.
+Then, a downstream workflow `D` kicks off immediately after `U` finishes, and does its own processing for the next hour, increasing the effective latency to 2 hours.
+
+The above paradigm simply ignores late-arriving data, i.e. when `processing_time` and `event_time` drift apart.
+Unfortunately, in today's post-mobile & pre-IoT world, __late data from intermittently connected mobile devices & sensors is the norm, not an anomaly__.
+In such cases, the only remedy to guarantee correctness is to reprocess the last few hours' worth of data, over and over again each hour, 
+which can significantly hurt the efficiency across the entire ecosystem. For example, imagine reprocessing TBs worth of data every hour across hundreds of workflows.
+
+Hudi comes to the rescue again, by providing a way to consume new data (including late data) from an upstream Hudi table `HU` at a record granularity (not folders/partitions),
+apply the processing logic, and efficiently update/reconcile late data with a downstream Hudi table `HD`. Here, `HU` and `HD` can be scheduled continuously at a much more frequent cadence,
+such as every 15 mins, providing an end-to-end latency of 30 mins at `HD`.
+
+To achieve this, Hudi has embraced similar concepts from stream processing frameworks like [Spark Streaming](https://spark.apache.org/docs/latest/streaming-programming-guide#join-operations) and [Flink](https://flink.apache.org), Pub/Sub systems like [Kafka](http://kafka.apache.org/documentation/#theconsumer),
+and database replication technologies like [Oracle XStream](https://docs.oracle.com/cd/E11882_01/server.112/e16545/xstrm_cncpt.htm#XSTRM187).
+For the more curious, a more detailed explanation of the benefits of Incremental Processing can be found [here](https://www.oreilly.com/ideas/ubers-case-for-incremental-processing-on-hadoop).
+
diff --git a/website/docs/writing_data.md b/website/docs/writing_data.md
new file mode 100644
index 0000000..93b5233
--- /dev/null
+++ b/website/docs/writing_data.md
@@ -0,0 +1,614 @@
+---
+title: Writing Data
+keywords: [hudi, incremental, batch, stream, processing, Hive, ETL, Spark SQL]
+summary: In this page, we will discuss some available tools for incrementally ingesting & storing data.
+toc: true
+last_modified_at: 2019-12-30T15:59:57-04:00
+---
+
+In this section, we will cover ways to ingest new changes from external sources or even other Hudi tables using the [DeltaStreamer](#deltastreamer) tool, as well as 
+speeding up large Spark jobs via upserts using the [Hudi datasource](#datasource-writer). Such tables can then be [queried](/docs/querying_data) using various query engines.
+
+
+## Write Operations
+
+Before that, it may be helpful to understand the 3 different write operations provided by the Hudi datasource or the DeltaStreamer tool and how best to leverage them. These operations
+can be chosen/changed for each commit/deltacommit issued against the table; a short sketch of switching the operation via the Spark datasource follows the list below.
+
+
+ - **UPSERT** : This is the default operation where the input records are first tagged as inserts or updates by looking up the index. 
+ The records are ultimately written after heuristics are run to determine how best to pack them on storage to optimize for things like file sizing. 
+ This operation is recommended for use-cases like database change capture where the input almost certainly contains updates. The target table will never show duplicates.
+ - **INSERT** : This operation is very similar to upsert in terms of heuristics/file sizing but completely skips the index lookup step. Thus, it can be a lot faster than upserts 
+ for use-cases like log de-duplication (in conjunction with options to filter duplicates mentioned below). This is also suitable for use-cases where the table can tolerate duplicates, but just 
+ needs the transactional writes/incremental pull/storage management capabilities of Hudi.
+ - **BULK_INSERT** : Both upsert and insert operations keep input records in memory to speed up storage heuristics computations (among other things) and thus can be cumbersome for 
+ the initial loading/bootstrapping of a Hudi table. Bulk insert provides the same semantics as insert, while implementing a sort-based data writing algorithm, which can scale very well to several hundred TBs 
+ of initial load. However, it just does a best-effort job at sizing files, versus guaranteeing file sizes like inserts/upserts do. 
+
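+As a quick, hedged sketch (reusing the DataFrame, imports and write options from the [Spark quickstart](/docs/quick-start-guide); the values below are the ones accepted by `hoodie.datasource.write.operation`), the operation can be switched per write via the datasource:
+
+```scala
+// Minimal sketch: choose the write operation per commit via the Spark datasource.
+// Assumes `df`, `tableName`, `basePath` and the quickstart imports/configs are in scope.
+df.write.format("hudi").
+  options(getQuickstartWriteConfigs).
+  option(OPERATION_OPT_KEY, "bulk_insert"). // or "insert" / "upsert" (the default)
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  mode(Append).
+  save(basePath)
+```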
+
+## DeltaStreamer
+
+The `HoodieDeltaStreamer` utility (part of hudi-utilities-bundle) provides a way to ingest from different sources such as DFS or Kafka, with the following capabilities.
+
+ - Exactly once ingestion of new events from Kafka, [incremental imports](https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide#_incremental_imports) from Sqoop or output of `HiveIncrementalPuller` or files under a DFS folder
+ - Support for json, avro or custom record types for the incoming data
+ - Manage checkpoints, rollback & recovery 
+ - Leverage Avro schemas from DFS or Confluent [schema registry](https://github.com/confluentinc/schema-registry).
+ - Support for plugging in transformations
+
+The command line options describe these capabilities in more detail:
+
+```java
+[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` --help
+Usage: <main class> [options]
+Options:
+    --checkpoint
+      Resume Delta Streamer from this checkpoint.
+    --commit-on-errors
+      Commit even when some records failed to be written
+      Default: false
+    --compact-scheduling-minshare
+      Minshare for compaction as defined in
+      https://spark.apache.org/docs/latest/job-scheduling
+      Default: 0
+    --compact-scheduling-weight
+      Scheduling weight for compaction as defined in
+      https://spark.apache.org/docs/latest/job-scheduling
+      Default: 1
+    --continuous
+      Delta Streamer runs in continuous mode running source-fetch -> Transform
+      -> Hudi Write in loop
+      Default: false
+    --delta-sync-scheduling-minshare
+      Minshare for delta sync as defined in
+      https://spark.apache.org/docs/latest/job-scheduling
+      Default: 0
+    --delta-sync-scheduling-weight
+      Scheduling weight for delta sync as defined in
+      https://spark.apache.org/docs/latest/job-scheduling
+      Default: 1
+    --disable-compaction
+      Compaction is enabled for MoR table by default. This flag disables it
+      Default: false
+    --enable-hive-sync
+      Enable syncing to hive
+      Default: false
+    --filter-dupes
+      Should duplicate records from source be dropped/filtered out before
+      insert/bulk-insert
+      Default: false
+    --help, -h
+
+    --hoodie-conf
+      Any configuration that can be set in the properties file (using the CLI
+      parameter "--propsFilePath") can also be passed command line using this
+      parameter
+      Default: []
+    --max-pending-compactions
+      Maximum number of outstanding inflight/requested compactions. Delta Sync
+      will not happen unlessoutstanding compactions is less than this number
+      Default: 5
+    --min-sync-interval-seconds
+      the min sync interval of each sync in continuous mode
+      Default: 0
+    --op
+      Takes one of these values : UPSERT (default), INSERT (use when input is
+      purely new data/inserts to gain speed)
+      Default: UPSERT
+      Possible Values: [UPSERT, INSERT, BULK_INSERT]
+    --payload-class
+      subclass of HoodieRecordPayload, that works off a GenericRecord.
+      Implement your own, if you want to do something other than overwriting
+      existing value
+      Default: org.apache.hudi.common.model.OverwriteWithLatestAvroPayload
+    --props
+      path to properties file on localfs or dfs, with configurations for
+      hoodie client, schema provider, key generator and data source. For
+      hoodie client props, sane defaults are used, but recommend use to
+      provide basic things like metrics endpoints, hive configs etc. For
+      sources, referto individual classes, for supported properties.
+      Default: file:///Users/vinoth/bin/hoodie/src/test/resources/delta-streamer-config/dfs-source.properties
+    --schemaprovider-class
+      subclass of org.apache.hudi.utilities.schema.SchemaProvider to attach
+      schemas to input & target table data, built in options:
+      org.apache.hudi.utilities.schema.FilebasedSchemaProvider.Source (See
+      org.apache.hudi.utilities.sources.Source) implementation can implement
+      their own SchemaProvider. For Sources that return Dataset<Row>, the
+      schema is obtained implicitly. However, this CLI option allows
+      overriding the schemaprovider returned by Source.
+    --source-class
+      Subclass of org.apache.hudi.utilities.sources to read data. Built-in
+      options: org.apache.hudi.utilities.sources.{JsonDFSSource (default),
+      AvroDFSSource, JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}
+      Default: org.apache.hudi.utilities.sources.JsonDFSSource
+    --source-limit
+      Maximum amount of data to read from source. Default: No limit For e.g:
+      DFS-Source => max bytes to read, Kafka-Source => max events to read
+      Default: 9223372036854775807
+    --source-ordering-field
+      Field within source record to decide how to break ties between records
+      with same key in input data. Default: 'ts' holding unix timestamp of
+      record
+      Default: ts
+    --spark-master
+      spark master to use.
+      Default: local[2]
+  * --table-type
+      Type of table. COPY_ON_WRITE (or) MERGE_ON_READ
+  * --target-base-path
+      base path for the target hoodie table. (Will be created if did not exist
+      first time around. If exists, expected to be a hoodie table)
+  * --target-table
+      name of the target table in Hive
+    --transformer-class
+      subclass of org.apache.hudi.utilities.transform.Transformer. Allows
+      transforming raw source Dataset to a target Dataset (conforming to
+      target schema) before writing. Default : Not set. E:g -
+      org.apache.hudi.utilities.transform.SqlQueryBasedTransformer (which
+      allows a SQL query templated to be passed as a transformation function)
+```
+
+The tool takes a hierarchically composed property file and has pluggable interfaces for extracting data, key generation and providing schema. Sample configs for ingesting from Kafka and DFS are
+provided under `hudi-utilities/src/test/resources/delta-streamer-config`.
+
+For example, once you have Confluent Kafka and the Schema Registry up and running, you can produce some test data using [impressions.avro](https://docs.confluent.io/current/ksql/docs/tutorials/generate-custom-test-data) (provided by the schema-registry repo)
+
+```java
+[confluent-5.0.0]$ bin/ksql-datagen schema=../impressions.avro format=avro topic=impressions key=impressionid
+```
+
+and then ingest it as follows.
+
+```java
+[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \
+  --props file://${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \
+  --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \
+  --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \
+  --source-ordering-field impressiontime \
+  --target-base-path file:///tmp/hudi-deltastreamer-op \
+  --target-table uber.impressions \
+  --op BULK_INSERT
+```
+
+In some cases, you may want to migrate your existing table to Hudi beforehand. Please refer to the [migration guide](/docs/migration_guide).
+
+## MultiTableDeltaStreamer
+
+`HoodieMultiTableDeltaStreamer`, a wrapper on top of `HoodieDeltaStreamer`, enables ingesting multiple tables into Hudi datasets in a single run. Currently it supports only sequential processing of the tables to be ingested and the COPY_ON_WRITE storage type. The command line options for `HoodieMultiTableDeltaStreamer` are largely similar to those for `HoodieDeltaStreamer`, with the only exception that you are required to provide table-wise configs in separate files in a dedicated config folder. The [...]
+
+```java
+  * --config-folder
+    the path to the folder which contains all the table wise config files
+    --base-path-prefix
+    this is added to enable users to create all the hudi datasets for related tables under one path in FS. The datasets are then created under the path - <base_path_prefix>/<database>/<table_to_be_ingested>. However you can override the paths for every table by setting the property hoodie.deltastreamer.ingestion.targetBasePath
+```
+
+The following properties need to be set properly to ingest data using `HoodieMultiTableDeltaStreamer`:
+
+```java
+hoodie.deltastreamer.ingestion.tablesToBeIngested
+  comma separated names of tables to be ingested in the format <database>.<table>, for example db1.table1,db1.table2
+hoodie.deltastreamer.ingestion.targetBasePath
+  if you wish to ingest a particular table in a separate path, you can mention that path here
+hoodie.deltastreamer.ingestion.<database>.<table>.configFile
+  path to the config file in dedicated config folder which contains table overridden properties for the particular table to be ingested.
+```
+
+Sample config files for table-wise overridden properties can be found under `hudi-utilities/src/test/resources/delta-streamer-config`. The command to run `HoodieMultiTableDeltaStreamer` is also similar to how you run `HoodieDeltaStreamer`:
+
+```java
+[hoodie]$ spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer `ls packaging/hudi-utilities-bundle/target/hudi-utilities-bundle-*.jar` \
+  --props file://${PWD}/hudi-utilities/src/test/resources/delta-streamer-config/kafka-source.properties \
+  --config-folder file:///tmp/hudi-ingestion-config \
+  --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider \
+  --source-class org.apache.hudi.utilities.sources.AvroKafkaSource \
+  --source-ordering-field impressiontime \
+  --base-path-prefix file:///tmp/hudi-deltastreamer-op \
+  --target-table uber.impressions \
+  --op BULK_INSERT
+```
+
+For detailed information on how to configure and use `HoodieMultiTableDeltaStreamer`, please refer to this [blog post](/blog/2020/08/22/ingest-multiple-tables-using-hudi).
+
+## Datasource Writer
+
+The `hudi-spark` module offers the DataSource API to write (and read) a Spark DataFrame into a Hudi table. There are a number of options available:
+
+**`HoodieWriteConfig`**:
+
+**TABLE_NAME** (Required)<br/>
+
+
+**`DataSourceWriteOptions`**:
+
+**RECORDKEY_FIELD_OPT_KEY** (Required): Primary key field(s). Record keys uniquely identify a record/row within each partition. If you want global uniqueness, there are two options: you can either make the dataset non-partitioned, or you can leverage Global indexes to ensure record keys are unique irrespective of the partition path. Record keys can either be a single column or refer to multiple columns. `KEYGENERATOR_CLASS_OPT_KEY` property should be set accordingly based o [...]
+Default value: `"uuid"`<br/>
+
+**PARTITIONPATH_FIELD_OPT_KEY** (Required): Columns to be used for partitioning the table. To prevent partitioning, provide an empty string as the value, e.g. `""`. Specify partitioning/no partitioning using `KEYGENERATOR_CLASS_OPT_KEY`. If the partition path needs to be url encoded, you can set `URL_ENCODE_PARTITIONING_OPT_KEY`. If synchronizing to hive, also specify this using `HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY`.<br/>
+Default value: `"partitionpath"`<br/>
+
+**PRECOMBINE_FIELD_OPT_KEY** (Required): When two records within the same batch have the same key value, the record with the largest value for the specified field will be chosen. If you are using the default payload, OverwriteWithLatestAvroPayload, for HoodieRecordPayload (`WRITE_PAYLOAD_CLASS`), an incoming record will always take precedence over the one in storage, ignoring this `PRECOMBINE_FIELD_OPT_KEY`. <br/>
+Default value: `"ts"`<br/>
+
+**OPERATION_OPT_KEY**: The [write operations](#write-operations) to use.<br/>
+Available values:<br/>
+`UPSERT_OPERATION_OPT_VAL` (default), `BULK_INSERT_OPERATION_OPT_VAL`, `INSERT_OPERATION_OPT_VAL`, `DELETE_OPERATION_OPT_VAL`
+
+**TABLE_TYPE_OPT_KEY**: The [type of table](/docs/concepts#table-types) to write to. Note: After the initial creation of a table, this value must stay consistent when writing to (updating) the table using the Spark `SaveMode.Append` mode.<br/>
+Available values:<br/>
+[`COW_TABLE_TYPE_OPT_VAL`](/docs/concepts#copy-on-write-table) (default), [`MOR_TABLE_TYPE_OPT_VAL`](/docs/concepts#merge-on-read-table)
+
+**KEYGENERATOR_CLASS_OPT_KEY**: Refer to [Key Generation](#key-generation) section below.
+
+**HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY**: If using hive, specify whether the table should or should not be partitioned.<br/>
+Available values:<br/>
+`classOf[SlashEncodedDayPartitionValueExtractor].getCanonicalName` (default), `classOf[MultiPartKeysValueExtractor].getCanonicalName`, `classOf[TimestampBasedKeyGenerator].getCanonicalName`, `classOf[NonPartitionedExtractor].getCanonicalName`, `classOf[GlobalDeleteKeyGenerator].getCanonicalName` (to be used when `OPERATION_OPT_KEY` is set to `DELETE_OPERATION_OPT_VAL`)
+
+
+Example:
+Upsert a DataFrame, specifying the necessary field names for `recordKey => _row_key`, `partitionPath => partition`, and `precombineKey => timestamp`
+
+```java
+inputDF.write()
+       .format("org.apache.hudi")
+       .options(clientOpts) //Where clientOpts is of type Map[String, String]. clientOpts can include any other options necessary.
+       .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
+       .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
+       .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
+       .option(HoodieWriteConfig.TABLE_NAME, tableName)
+       .mode(SaveMode.Append)
+       .save(basePath);
+```
+
+## Flink SQL Writer
+The hudi-flink module defines the Flink SQL connector for both hudi source and sink.
+There are a number of options available for the sink table:
+
+|  Option Name  | Required | Default | Remarks |
+|  -----------  | -------  | ------- | ------- |
+| path | Y | N/A | Base path for the target hoodie table. The path will be created if it does not exist; otherwise it is expected to contain a successfully initialized hudi table |
+| table.type  | N | COPY_ON_WRITE | Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ |
+| write.operation | N | upsert | The write operation that this write should perform (insert and upsert are supported) |
+| write.precombine.field | N | ts | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) |
+| write.payload.class | N | OverwriteWithLatestAvroPayload.class | Payload class to use. Override this if you want to roll your own merge logic when upserting/inserting; doing so renders any value set for `write.precombine.field` ineffective |
+| write.insert.drop.duplicates | N | false | Flag to indicate whether to drop duplicates upon insert. By default insert will accept duplicates, to gain extra performance |
+| write.ignore.failed | N | true | Flag to indicate whether to ignore any non exception error (e.g. writestatus error) within a checkpoint batch. Defaults to true (in favor of streaming progressing over data integrity) |
+| hoodie.datasource.write.recordkey.field | N | uuid | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c` |
+| hoodie.datasource.write.keygenerator.class | N | SimpleAvroKeyGenerator.class | Key generator class, implementing the logic to extract the key out of an incoming record |
+| write.tasks | N | 4 | Parallelism of tasks that do actual write, default is 4 |
+| write.batch.size.MB | N | 128 | Batch buffer size in MB to flush data into the underneath filesystem |
+
+If the table type is MERGE_ON_READ, you can also specify the asynchronous compaction strategy through options:
+
+|  Option Name  | Required | Default | Remarks |
+|  -----------  | -------  | ------- | ------- |
+| compaction.async.enabled | N | true | Async Compaction, enabled by default for MOR |
+| compaction.trigger.strategy | N | num_commits | Strategy to trigger compaction, options are 'num_commits': trigger compaction when reach N delta commits; 'time_elapsed': trigger compaction when time elapsed > N seconds since last compaction; 'num_and_time': trigger compaction when both NUM_COMMITS and TIME_ELAPSED are satisfied; 'num_or_time': trigger compaction when NUM_COMMITS or TIME_ELAPSED is satisfied. Default is 'num_commits' |
+| compaction.delta_commits | N | 5 | Max delta commits needed to trigger compaction, default 5 commits |
+| compaction.delta_seconds | N | 3600 | Max delta seconds time needed to trigger compaction, default 1 hour |
+
+You can then write data into the table using the SQL `INSERT INTO` statement:
+```sql
+INSERT INTO hudi_table select ... from ...; 
+```
+
+**Note**: INSERT OVERWRITE is not supported yet, but it is on the roadmap.
+
+## Key Generation
+
+Hudi maintains hoodie keys (record key + partition path) to uniquely identify a particular record. The key generator class extracts these out of the incoming record. Both the tools above have configs to specify the
+`hoodie.datasource.write.keygenerator.class` property. For DeltaStreamer this comes from the property file specified in `--props`, while the
+DataSource writer takes this config directly via `DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY()`.
+The default value for this config is `SimpleKeyGenerator`. Note: A custom key generator class can be written/provided here as well. Primary key columns should be provided via the `RECORDKEY_FIELD_OPT_KEY` option.<br/>
+ 
+Hudi currently supports different combinations of record keys and partition paths as below - 
+
+ - Simple record key (consisting of only one field) and simple partition path (with optional hive style partitioning)
+ - Simple record key and custom timestamp based partition path (with optional hive style partitioning)
+ - Composite record keys (combination of multiple fields) and composite partition paths
+ - Composite record keys and timestamp based partition paths (composite also supported)
+ - Non partitioned table
+
+The `CustomKeyGenerator.java` class (part of the hudi-spark module) supports generating hoodie keys of all the types listed above. All you need to do is supply values for the following properties to create your desired keys:
+
+```java
+hoodie.datasource.write.recordkey.field
+hoodie.datasource.write.partitionpath.field
+hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.CustomKeyGenerator
+```
+
+For composite record keys, you need to provide comma separated fields like
+```java
+hoodie.datasource.write.recordkey.field=field1,field2
+```
+
+This will create your record key in the format `field1:value1,field2:value2` and so on; for simple record keys, specify only one field. The `CustomKeyGenerator` class defines an enum `PartitionKeyType` for configuring partition paths. It can take two possible values - SIMPLE and TIMESTAMP.
+For partitioned tables, the value for the `hoodie.datasource.write.partitionpath.field` property needs to be provided in the format `field1:PartitionKeyType1,field2:PartitionKeyType2` and so on. For example, if you want to create the partition path using two fields `country` and `date`, where the latter has timestamp based values and needs to be customised in a given format, you can specify the following
+
+```java
+hoodie.datasource.write.partitionpath.field=country:SIMPLE,date:TIMESTAMP
+``` 
+This will create the partition path in the format `<country_name>/<date>` or `country=<country_name>/date=<date>` depending on whether you want hive style partitioning or not.
+
+The `TimestampBasedKeyGenerator` class defines the following properties which can be used to customize timestamp based partition paths
+
+```java
+hoodie.deltastreamer.keygen.timebased.timestamp.type
+  This defines the type of the value that your field contains. It can be in string format or epoch format.
+hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit
+  This defines the granularity of your field, whether it contains the values in seconds or milliseconds
+hoodie.deltastreamer.keygen.timebased.input.dateformat
+  This defines the custom format in which the values are present in your field, for example yyyy/MM/dd
+hoodie.deltastreamer.keygen.timebased.output.dateformat
+  This defines the custom format in which you want the partition paths to be created, for example dt=yyyyMMdd
+hoodie.deltastreamer.keygen.timebased.timezone
+  This defines the timezone which the timestamp based values belong to
+```
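+
+Putting these properties together, a minimal Spark DataSource sketch using `CustomKeyGenerator` could look like the following. The DataFrame, field names, table name, target path and date formats here are assumptions for illustration, not prescribed values.
+
+```scala
+// Sketch only: assumes a DataFrame `inputDF` with columns field1, field2,
+// country, date (string dates like "2021-08-02") and a precombine column ts.
+// Record key = (field1, field2); partition path = country (SIMPLE) plus
+// date (TIMESTAMP, reformatted from yyyy-MM-dd to yyyy/MM/dd).
+import org.apache.spark.sql.SaveMode
+
+inputDF.write.format("hudi").
+  option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.CustomKeyGenerator").
+  option("hoodie.datasource.write.recordkey.field", "field1,field2").
+  option("hoodie.datasource.write.partitionpath.field", "country:SIMPLE,date:TIMESTAMP").
+  option("hoodie.datasource.write.precombine.field", "ts").
+  option("hoodie.deltastreamer.keygen.timebased.timestamp.type", "DATE_STRING").
+  option("hoodie.deltastreamer.keygen.timebased.input.dateformat", "yyyy-MM-dd").
+  option("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd").
+  option("hoodie.table.name", "my_table").
+  mode(SaveMode.Append).
+  save("file:///tmp/hudi_custom_keygen_table")
+```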
+
+When the key generator class is `CustomKeyGenerator`, a non-partitioned table can be handled by simply leaving the property blank, like
+```java
+hoodie.datasource.write.partitionpath.field=
+```
+
+For those on Hudi versions < 0.6.0, you can use the following key generator classes to fulfill these use cases -
+
+ - Simple record key (consisting of only one field) and simple partition path (with optional hive style partitioning) - `SimpleKeyGenerator.java`
+ - Simple record key and custom timestamp based partition path (with optional hive style partitioning) - `TimestampBasedKeyGenerator.java`
+ - Composite record keys (combination of multiple fields) and composite partition paths - `ComplexKeyGenerator.java`
+ - Composite record keys and timestamp based partition paths (composite also supported) - You might need to move to 0.6.0 and use `CustomKeyGenerator.java` class
+ - Non partitioned table - `NonpartitionedKeyGenerator.java`. Non-partitioned tables can currently only have a single key column, [HUDI-1053](https://issues.apache.org/jira/browse/HUDI-1053)
+ 
+ 
+## Syncing to Hive
+
+Both tools above support syncing the table's latest schema to the Hive metastore, so that queries can pick up new columns and partitions.
+In case it is preferable to run this from the command line or in an independent JVM, Hudi provides a `HiveSyncTool`, which can be invoked as below
+once you have built the hudi-hive module. Following is how to sync the table written above via the Datasource Writer to the Hive metastore.
+
+```java
+cd hudi-hive
+./run_sync_tool.sh  --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by partition --base-path <basePath> --database default --table <tableName>
+```
+
+Starting with Hudi 0.5.1, the read optimized view of merge-on-read tables is suffixed with '_ro' by default. For backwards compatibility with older Hudi versions, an optional HiveSyncConfig - `--skip-ro-suffix` - has been provided to turn off the '_ro' suffixing if desired. Explore the other hive sync options using the following command:
+
+```java
+cd hudi-hive
+[hudi-hive]$ ./run_sync_tool.sh --help
+```
+
+## Deletes 
+
+Hudi supports two types of deletes on data stored in Hudi tables, by allowing the user to specify a different record payload implementation.
+For more info refer to [Delete support in Hudi](https://cwiki.apache.org/confluence/x/6IqvC).
+
+ - **Soft Deletes** : Retain the record key and just null out the values for all the other fields. 
+ This can be achieved by ensuring the appropriate fields are nullable in the table schema and simply upserting the table after setting these fields to null.
+ 
+ - **Hard Deletes** : A stronger form of deletion is to physically remove any trace of the record from the table. This can be achieved in 3 different ways.
+
+   1) Using DataSource, set `OPERATION_OPT_KEY` to `DELETE_OPERATION_OPT_VAL`. This will remove all the records in the DataSet being submitted.
+   
+   2) Using DataSource, set `PAYLOAD_CLASS_OPT_KEY` to `"org.apache.hudi.EmptyHoodieRecordPayload"`. This will remove all the records in the DataSet being submitted. 
+   
+   3) Using DataSource or DeltaStreamer, add a column named `_hoodie_is_deleted` to DataSet. The value of this column must be set to `true` for all the records to be deleted and either `false` or left null for any records which are to be upserted.
+    
+Example using hard delete method 2: remove from the table all the records that exist in the DataSet `deleteDF`:
+```java
+ deleteDF // dataframe containing just records to be deleted
+   .write().format("org.apache.hudi")
+   .option(...) // Add HUDI options like record-key, partition-path and others as needed for your setup
+   // specify record_key, partition_key, precombine_fieldkey & usual params
+   .option(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY, "org.apache.hudi.EmptyHoodieRecordPayload")
+   .mode(SaveMode.Append) // the delete is issued as a regular write with the empty payload
+   .save(basePath);
+```
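+
+For reference, here is a sketch of hard delete method 3 using the `_hoodie_is_deleted` column. The DataFrame, table name and the record key/partition path/precombine field names are illustrative assumptions, not prescribed values.
+
+```scala
+import org.apache.spark.sql.SaveMode
+import org.apache.spark.sql.functions.lit
+
+// Assume recordsToDelete already matches the table schema; flag these rows for deletion.
+val deleteBatch = recordsToDelete.withColumn("_hoodie_is_deleted", lit(true))
+
+// Rows flagged true are deleted; rows with false/null would be upserted as usual.
+deleteBatch.write.format("hudi").
+  option("hoodie.datasource.write.recordkey.field", "record_key").
+  option("hoodie.datasource.write.partitionpath.field", "partition_path").
+  option("hoodie.datasource.write.precombine.field", "ts").
+  option("hoodie.table.name", "my_table").
+  mode(SaveMode.Append).
+  save(basePath)
+```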
+
+
+## Optimized DFS Access
+
+Hudi also performs several key storage management functions on the data stored in a Hudi table. A key aspect of storing data on DFS is managing file sizes and counts
+and reclaiming storage space. For example, HDFS is infamous for its handling of small files, which exerts memory/RPC pressure on the Name Node and can potentially destabilize
+the entire cluster. In general, query engines provide much better performance on adequately sized columnar files, since they can effectively amortize the cost of obtaining
+column statistics etc. Even on some cloud data stores, there is often a cost to listing directories with a large number of small files.
+
+Here are some ways to efficiently manage the storage of your Hudi tables; a small configuration sketch follows the list.
+
+ - The [small file handling feature](/docs/configurations#compactionSmallFileSize) in Hudi, profiles incoming workload 
+   and distributes inserts to existing file groups instead of creating new file groups, which can lead to small files. 
+ - The cleaner can be [configured](/docs/configurations#retainCommits) to clean up older file slices more or less aggressively, depending on the maximum time for queries to run & the lookback needed for incremental pull
+ - User can also tune the size of the [base/parquet file](/docs/configurations#limitFileSize), [log files](/docs/configurations#logFileMaxSize) & expected [compression ratio](/docs/configurations#parquetCompressionRatio), 
+   such that sufficient number of inserts are grouped into the same file group, resulting in well sized base files ultimately.
+ - Intelligently tuning the [bulk insert parallelism](/docs/configurations#withBulkInsertParallelism) can again result in nicely sized initial file groups. It is in fact critical to get this right, since the file groups
+   once created cannot be deleted, but simply expanded as explained before.
+ - For workloads with heavy updates, the [merge-on-read table](/docs/concepts#merge-on-read-table) provides a nice mechanism for ingesting quickly into smaller files and then later merging them into larger base files via compaction.
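+
+The knobs referenced above map to concrete write configs. A minimal sketch follows; the values shown are illustrative assumptions, not recommendations, and should be tuned for your file system and workload.
+
+```scala
+// Sketch: file sizing / cleaning knobs passed on a Spark DataSource write.
+// Assumes an input DataFrame `inputDF` and a target `basePath`,
+// as in the Datasource Writer example above.
+import org.apache.spark.sql.SaveMode
+
+inputDF.write.format("hudi").
+  option("hoodie.parquet.small.file.limit", 100 * 1024 * 1024L).
+  option("hoodie.parquet.max.file.size", 120 * 1024 * 1024L).
+  option("hoodie.parquet.compression.ratio", 0.1).
+  option("hoodie.logfile.max.size", 1024 * 1024 * 1024L).
+  option("hoodie.cleaner.commits.retained", 10L).
+  option("hoodie.bulkinsert.shuffle.parallelism", 200L).
+  option("hoodie.table.name", "my_table").
+  mode(SaveMode.Append).
+  save(basePath)
+```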
+
+
+## Schema Evolution
+
+Schema evolution is a very important aspect of data management. 
+Hudi supports common schema evolution scenarios, such as adding a nullable field or promoting the datatype of a field, out-of-the-box.
+Furthermore, the evolved schema is queryable across engines, such as Presto, Hive and Spark SQL.
+The following table presents a summary of the types of schema changes compatible with different Hudi table types.
+
+|  Schema Change  | COW | MOR | Remarks |
+|  -----------  | -------  | ------- | ------- |
+| Add a new nullable column at root level at the end | Yes | Yes | `Yes` means that a write with evolved schema succeeds and a read following the write succeeds to read entire dataset. |
+| Add a new nullable column to inner struct (at the end) | Yes | Yes |
+| Add a new complex type field with default (map and array) | Yes | Yes |  |
+| Add a new nullable column and change the ordering of fields | No | No | Write succeeds but read fails if the write with evolved schema updated only some of the base files but not all. Currently, Hudi does not maintain a schema registry with history of changes across base files. Nevertheless, if the upsert touched all base files then the read will succeed. |
+| Add a custom nullable Hudi meta column, e.g. `_hoodie_meta_col` | Yes | Yes |  |
+| Promote datatype from `int` to `long` for a field at root level | Yes | Yes | For other types, Hudi supports promotion as specified in [Avro schema resolution](http://avro.apache.org/docs/current/spec#Schema+Resolution). |
+| Promote datatype from `int` to `long` for a nested field | Yes | Yes |
+| Promote datatype from `int` to `long` for a complex type (value of map or array) | Yes | Yes |  |
+| Add a new non-nullable column at root level at the end | No | No | In case of MOR table with Spark data source, write succeeds but read fails. As a **workaround**, you can make the field nullable. |
+| Add a new non-nullable column to inner struct (at the end) | No | No |  |
+| Change datatype from `long` to `int` for a nested field | No | No |  |
+| Change datatype from `long` to `int` for a complex type (value of map or array) | No | No |  |
+
+Let us walk through an example to demonstrate the schema evolution support in Hudi. 
+In the below example, we are going to add a new string field and change the datatype of a field from int to long.
+
+```java
+Welcome to
+    ____              __
+    / __/__  ___ _____/ /__
+    _\ \/ _ \/ _ `/ __/  '_/
+    /___/ .__/\_,_/_/ /_/\_\   version 3.1.2
+    /_/
+
+    Using Scala version 2.12.10 (OpenJDK 64-Bit Server VM, Java 1.8.0_292)
+    Type in expressions to have them evaluated.
+    Type :help for more information.
+
+scala> import org.apache.hudi.QuickstartUtils._
+import org.apache.hudi.QuickstartUtils._
+
+scala> import scala.collection.JavaConversions._
+import scala.collection.JavaConversions._
+
+scala> import org.apache.spark.sql.SaveMode._
+import org.apache.spark.sql.SaveMode._
+
+scala> import org.apache.hudi.DataSourceReadOptions._
+import org.apache.hudi.DataSourceReadOptions._
+
+scala> import org.apache.hudi.DataSourceWriteOptions._
+import org.apache.hudi.DataSourceWriteOptions._
+
+scala> import org.apache.hudi.config.HoodieWriteConfig._
+import org.apache.hudi.config.HoodieWriteConfig._
+
+scala> import org.apache.spark.sql.types._
+import org.apache.spark.sql.types._
+
+scala> import org.apache.spark.sql.Row
+import org.apache.spark.sql.Row
+
+scala> val tableName = "hudi_trips_cow"
+    tableName: String = hudi_trips_cow
+scala> val basePath = "file:///tmp/hudi_trips_cow"
+    basePath: String = file:///tmp/hudi_trips_cow
+scala> val schema = StructType( Array(
+    | StructField("rowId", StringType,true),
+    | StructField("partitionId", StringType,true),
+    | StructField("preComb", LongType,true),
+    | StructField("name", StringType,true),
+    | StructField("versionId", StringType,true),
+    | StructField("intToLong", IntegerType,true)
+    | ))
+    schema: org.apache.spark.sql.types.StructType = StructType(StructField(rowId,StringType,true), StructField(partitionId,StringType,true), StructField(preComb,LongType,true), StructField(name,StringType,true), StructField(versionId,StringType,true), StructField(intToLong,IntegerType,true))
+    
+scala> val data1 = Seq(Row("row_1", "part_0", 0L, "bob", "v_0", 0),
+    |                Row("row_2", "part_0", 0L, "john", "v_0", 0),
+    |                Row("row_3", "part_0", 0L, "tom", "v_0", 0))
+    data1: Seq[org.apache.spark.sql.Row] = List([row_1,part_0,0,bob,v_0,0], [row_2,part_0,0,john,v_0,0], [row_3,part_0,0,tom,v_0,0])
+
+scala> var dfFromData1 = spark.createDataFrame(data1, schema)
+scala> dfFromData1.write.format("hudi").
+    |   options(getQuickstartWriteConfigs).
+    |   option(PRECOMBINE_FIELD_OPT_KEY.key, "preComb").
+    |   option(RECORDKEY_FIELD_OPT_KEY.key, "rowId").
+    |   option(PARTITIONPATH_FIELD_OPT_KEY.key, "partitionId").
+    |   option("hoodie.index.type","SIMPLE").
+    |   option(TABLE_NAME.key, tableName).
+    |   mode(Overwrite).
+    |   save(basePath)
+
+scala> var tripsSnapshotDF1 = spark.read.format("hudi").load(basePath + "/*/*")
+    tripsSnapshotDF1: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 9 more fields]
+
+scala> tripsSnapshotDF1.createOrReplaceTempView("hudi_trips_snapshot")
+
+scala> spark.sql("desc hudi_trips_snapshot").show()
+    +--------------------+---------+-------+
+    |            col_name|data_type|comment|
+    +--------------------+---------+-------+
+    | _hoodie_commit_time|   string|   null|
+    |_hoodie_commit_seqno|   string|   null|
+    |  _hoodie_record_key|   string|   null|
+    |_hoodie_partition...|   string|   null|
+    |   _hoodie_file_name|   string|   null|
+    |               rowId|   string|   null|
+    |         partitionId|   string|   null|
+    |             preComb|   bigint|   null|
+    |                name|   string|   null|
+    |           versionId|   string|   null|
+    |           intToLong|      int|   null|
+    +--------------------+---------+-------+
+    
+scala> spark.sql("select rowId, partitionId, preComb, name, versionId, intToLong from hudi_trips_snapshot").show()
+    +-----+-----------+-------+----+---------+---------+
+    |rowId|partitionId|preComb|name|versionId|intToLong|
+    +-----+-----------+-------+----+---------+---------+
+    |row_3|     part_0|      0| tom|      v_0|        0|
+    |row_2|     part_0|      0|john|      v_0|        0|
+    |row_1|     part_0|      0| bob|      v_0|        0|
+    +-----+-----------+-------+----+---------+---------+
+
+// In the new schema, we are going to add a String field and 
+// change the datatype `intToLong` field from  int to long.
+scala> val newSchema = StructType( Array(
+    | StructField("rowId", StringType,true),
+    | StructField("partitionId", StringType,true),
+    | StructField("preComb", LongType,true),
+    | StructField("name", StringType,true),
+    | StructField("versionId", StringType,true),
+    | StructField("intToLong", LongType,true),
+    | StructField("newField", StringType,true)
+    | ))
+    newSchema: org.apache.spark.sql.types.StructType = StructType(StructField(rowId,StringType,true), StructField(partitionId,StringType,true), StructField(preComb,LongType,true), StructField(name,StringType,true), StructField(versionId,StringType,true), StructField(intToLong,LongType,true), StructField(newField,StringType,true))
+
+scala> val data2 = Seq(Row("row_2", "part_0", 5L, "john", "v_3", 3L, "newField_1"),
+    |                Row("row_5", "part_0", 5L, "maroon", "v_2", 2L, "newField_1"),
+    |                Row("row_9", "part_0", 5L, "michael", "v_2", 2L, "newField_1"))
+    data2: Seq[org.apache.spark.sql.Row] = List([row_2,part_0,5,john,v_3,3,newField_1], [row_5,part_0,5,maroon,v_2,2,newField_1], [row_9,part_0,5,michael,v_2,2,newField_1])
+
+scala> var dfFromData2 = spark.createDataFrame(data2, newSchema)
+scala> dfFromData2.write.format("hudi").
+    |   options(getQuickstartWriteConfigs).
+    |   option(PRECOMBINE_FIELD_OPT_KEY.key, "preComb").
+    |   option(RECORDKEY_FIELD_OPT_KEY.key, "rowId").
+    |   option(PARTITIONPATH_FIELD_OPT_KEY.key, "partitionId").
+    |   option("hoodie.index.type","SIMPLE").
+    |   option(TABLE_NAME.key, tableName).
+    |   mode(Append).
+    |   save(basePath)
+
+scala> var tripsSnapshotDF2 = spark.read.format("hudi").load(basePath + "/*/*")
+    tripsSnapshotDF2: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 10 more fields]
+
+scala> tripsSnapshotDF2.createOrReplaceTempView("hudi_trips_snapshot")
+
+scala> spark.sql("desc hudi_trips_snapshot").show()
+    +--------------------+---------+-------+
+    |            col_name|data_type|comment|
+    +--------------------+---------+-------+
+    | _hoodie_commit_time|   string|   null|
+    |_hoodie_commit_seqno|   string|   null|
+    |  _hoodie_record_key|   string|   null|
+    |_hoodie_partition...|   string|   null|
+    |   _hoodie_file_name|   string|   null|
+    |               rowId|   string|   null|
+    |         partitionId|   string|   null|
+    |             preComb|   bigint|   null|
+    |                name|   string|   null|
+    |           versionId|   string|   null|
+    |           intToLong|   bigint|   null|
+    |            newField|   string|   null|
+    +--------------------+---------+-------+
+
+
+scala> spark.sql("select rowId, partitionId, preComb, name, versionId, intToLong, newField from hudi_trips_snapshot").show()
+    +-----+-----------+-------+-------+---------+---------+----------+
+    |rowId|partitionId|preComb|   name|versionId|intToLong|  newField|
+    +-----+-----------+-------+-------+---------+---------+----------+
+    |row_3|     part_0|      0|    tom|      v_0|        0|      null|
+    |row_2|     part_0|      5|   john|      v_3|        3|newField_1|
+    |row_1|     part_0|      0|    bob|      v_0|        0|      null|
+    |row_5|     part_0|      5| maroon|      v_2|        2|newField_1|
+    |row_9|     part_0|      5|michael|      v_2|        2|newField_1|
+    +-----+-----------+-------+-------+---------+---------+----------+
+
+```
diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js
new file mode 100644
index 0000000..8716e24
--- /dev/null
+++ b/website/docusaurus.config.js
@@ -0,0 +1,415 @@
+const darkCodeTheme = require('prism-react-renderer/themes/dracula');
+const versions = require('./versions.json');
+const VersionsArchived = require('./versionsArchived.json');
+const allDocHomesPaths = [
+  '/docs/',
+  '/docs/next/',
+  ...versions.slice(1).map((version) => `/docs/${version}/`),
+];
+
+/** @type {import('@docusaurus/types').DocusaurusConfig} */
+module.exports = {
+  title: 'Welcome to Apache Hudi!',
+  tagline: 'Apache Hudi ingests & manages storage of large analytical datasets over DFS (hdfs or cloud stores).',
+  url: 'https://hudi.apache.org',
+  baseUrl: '/',
+  onBrokenLinks: 'throw',
+  onBrokenMarkdownLinks: 'warn',
+  favicon: 'assets/image/favicon.ico',
+  organizationName: 'apache', // Usually your GitHub org/user name.
+  projectName: 'hudi', // Usually your repo name.
+  i18n: {
+    defaultLocale: 'en',
+    locales: ['en', 'cn'],
+    localeConfigs: {
+      en: {
+        label: 'English',
+        direction: 'ltr',
+      },
+      cn: {
+        label: 'Chinese',
+        direction: 'ltr',
+      },
+    },
+  },
+  plugins: [
+    [
+      '@docusaurus/plugin-content-docs',
+      {
+        id: 'contribute',
+        path: 'contribute',
+        routeBasePath: 'contribute',
+        sidebarPath: require.resolve('./sidebarsContribute.js'),
+        showLastUpdateAuthor: true,
+        showLastUpdateTime: true,
+      },
+    ],
+    [
+      '@docusaurus/plugin-content-docs',
+      {
+        id: 'releases',
+        path: 'releases',
+        routeBasePath: 'releases',
+        sidebarPath: require.resolve('./sidebarsReleases.js'),
+        showLastUpdateAuthor: true,
+        showLastUpdateTime: true,
+      },
+    ],
+    [
+      '@docusaurus/plugin-client-redirects',
+      {
+        fromExtensions: ['html'],
+        createRedirects: function (path) {
+          // register redirects from the old quick-start-guide URLs to the doc home paths,
+          // since quick-start-guide has been made the home doc
+          if (allDocHomesPaths.includes(path)) {
+            return [`${path}/quick-start-guide`];
+          }
+        },
+        redirects: [
+          {
+            from: ['/docs/contribute', '/docs/next/contribute'],
+            to: '/contribute/get-involved',
+          },
+          {
+            from: ['/docs/releases', '/docs/next/releases'],
+            to: '/releases/release-0.8.0',
+          },
+          {
+            from: ['/releases'],
+            to: '/releases/release-0.8.0',
+          },
+        ],
+      },
+    ],
+  ],
+  themeConfig: {
+    navbar: {
+      logo: {
+        alt: 'Apache Hudi',
+        src: 'assets/images/hudi.png',
+      },
+      items: [
+        {
+          label: 'Docs',
+          to: '/docs/quick-start-guide',
+        },
+        {
+          label: 'Learn',
+          position: 'left',
+          items: [
+            {
+              label: 'Blog',
+              to: '/blog',
+            },
+            {
+              label: 'Talks & Articles',
+              to: 'talks-articles',
+            },
+            {
+              label: 'FAQ',
+              href: 'https://cwiki.apache.org/confluence/display/HUDI/FAQ',
+            },
+            {
+              label: 'Technical Wiki',
+              href: 'https://cwiki.apache.org/confluence/display/HUDI',
+            }
+          ],
+        },
+        {
+          label: 'Contribute',
+          position: 'left',
+          items: [
+            {
+              label: 'Get Involved',
+              to: '/contribute/get-involved',
+            },
+            {
+              label: 'Team',
+              to: '/contribute/team',
+            },
+            {
+              label: 'How to Contribute',
+              to: '/contribute/how-to-contribute',
+            },
+            {
+              label: 'Developer Setup',
+              to: '/contribute/developer-setup',
+            },
+            {
+              label: 'Report Security Issues',
+              to: '/contribute/report-security-issues',
+            },
+            {
+              label: 'Report Issues',
+              href: 'https://issues.apache.org/jira/projects/HUDI/summary',
+            }
+          ],
+        },
+        {to: '/powered-by', label: "Who's Using", position: 'left'},
+        {to: '/releases/download', label: 'Download', position: 'left'},
+        // right
+        {
+          type: 'docsVersionDropdown',
+          position: 'right',
+          dropdownActiveClassDisabled: true,
+          dropdownItemsAfter: [
+            ...Object.entries(VersionsArchived).map(
+                ([versionName, versionUrl]) => ({
+                  label: versionName,
+                  href: versionUrl,
+                }),
+            )
+          ],
+        },
+        {
+          type: 'localeDropdown',
+          position: 'right',
+        },
+        {
+          href: 'https://github.com/apache/hudi',
+          position: 'right',
+          className: 'header-github-link',
+          'aria-label': 'GitHub repository',
+        },
+        {
+          href: 'https://twitter.com/ApacheHudi',
+          position: 'right',
+          className: 'header-twitter-link',
+          'aria-label': 'Hudi Twitter Handle',
+        },
+        {
+          href: 'https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE',
+          position: 'right',
+          className: 'header-slack-link',
+          'aria-label': 'Hudi Slack Channel',
+        },
+      ],
+    },
+    footer: {
+      style: 'light',
+      links: [
+        {
+          title: 'About',
+          items: [
+            {
+              label: 'Concepts',
+              to: '/docs/concepts',
+            },
+            {
+              label: 'Our Vision',
+              to: '/blog/2021/07/21/streaming-data-lake-platform',
+            },
+            {
+              label: 'Team',
+              to: '/contribute/team',
+            },
+            {
+              label: 'Releases',
+              to: '/releases/release-0.8.0',
+            },
+            {
+              label: 'Download',
+              to: '/releases/download',
+            },
+            {
+              label: 'Who\'s Using',
+              to: 'powered-by',
+            },
+          ],
+        },
+        {
+          title: 'Learn',
+          items: [
+            {
+              label: 'Quick Start',
+              to: '/docs/quick-start-guide',
+            },
+            {
+              label: 'Docker Demo',
+              to: '/docs/docker_demo',
+            },
+            {
+              label: 'Blog',
+              to: '/blog',
+            },
+            {
+              label: 'Talks & Articles',
+              to: 'talks-articles',
+            },
+            {
+              label: 'FAQ',
+              href: 'https://cwiki.apache.org/confluence/display/HUDI/FAQ',
+            },
+            {
+              label: 'Technical Wiki',
+              href: 'https://cwiki.apache.org/confluence/display/HUDI',
+            }
+          ],
+        },
+        {
+          title: 'Hudi On Cloud',
+          items: [
+            {
+              label: 'AWS',
+              to: '/docs/s3_hoodie',
+            },
+            {
+              label: 'Google Cloud',
+              to: '/docs/gcs_hoodie',
+            },
+            {
+              label: 'Alibaba Cloud',
+              to: '/docs/oss_hoodie',
+            },
+            {
+              label: 'Microsoft Azure',
+              to: '/docs/azure_hoodie',
+            },
+            {
+              label: 'Tencent Cloud',
+              to: '/docs/cos_hoodie',
+            },
+            {
+              label: 'IBM Cloud',
+              to: '/docs/ibm_cos_hoodie',
+            },
+          ],
+        },
+        {
+          title: 'Community',
+          items: [
+            {
+              label: 'Get Involved',
+              to: '/contribute/get-involved'
+            },
+            {
+              label: 'Slack',
+              href: 'https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE',
+            },
+            {
+              label: 'GitHub',
+              href: 'https://github.com/apache/hudi',
+            },
+            {
+              label: 'Twitter',
+              href: 'https://twitter.com/ApacheHudi',
+            },
+            {
+              label: 'Mailing List',
+              to: 'mailto:dev-subscribe@hudi.apache.org?Subject=SubscribeToHudi',
+            },
+          ],
+        },
+        {
+          title: 'Apache',
+          items: [
+            {
+              label: 'Events',
+              to: 'https://www.apache.org/events/current-event',
+            },
+            {
+              label: 'Thanks',
+              to: 'https://www.apache.org/foundation/thanks.html',
+            },
+            {
+              label: 'License',
+              to: 'https://www.apache.org/licenses',
+            },
+            {
+              label: 'Security',
+              to: 'https://www.apache.org/security',
+            },
+            {
+              label: 'Sponsorship',
+              to: 'https://www.apache.org/foundation/sponsorship.html',
+            },
+            {
+              label: 'Foundation',
+              to: 'https://www.apache.org',
+            },
+          ],
+        },
+      ],
+      logo: {
+        alt: 'Apache Hudi™',
+        src: '/assets/images/logo-big.png',
+        href: 'https://hudi.apache.org/',
+      },
+      copyright: 'Copyright © 2021 <a href="https://apache.org">The Apache Software Foundation</a>, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0"> Apache License, Version 2.0</a>. <br />Hudi, Apache and the Apache feather logo are trademarks of The Apache Software Foundation.',
+    },
+    prism: {
+      theme: darkCodeTheme,
+      additionalLanguages: ['java', 'scala'],
+      prismPath: require.resolve('./src/theme/prism-include-languages.js'),
+    },
+    announcementBar: {
+      id: 'announcementBar-1', // Increment on change
+      content:
+          '⭐️ If you like Apache Hudi, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/apache/hudi">GitHub</a>! ⭐',
+    },
+    colorMode: {
+      defaultMode: 'light',
+      disableSwitch: true,
+    },
+  },
+  presets: [
+    [
+      '@docusaurus/preset-classic',
+      {
+        docs: {
+          sidebarPath: require.resolve('./sidebars.js'),
+          // Please change this to your repo.
+          editUrl:
+            'https://github.com/apache/hudi/edit/asf-site/website/docs/',
+          includeCurrentVersion: true,
+          versions: {
+            current: {
+              label: 'Current',
+              path: 'next',
+              banner: 'unreleased',
+            },
+            '0.8.0': {
+              label: '0.8.0',
+              path: '',
+            }
+          },
+        },
+        blog: {
+          editUrl:
+            'https://github.com/apache/hudi/edit/asf-site/website/blog/',
+          blogTitle: 'Blog - Apache Hudi: User-Facing Analytics',
+          blogSidebarCount: 10,
+          blogSidebarTitle: 'Recent posts',
+          /**
+           * URL route for the blog section of your site.
+           * *DO NOT* include a trailing slash.
+           */
+          routeBasePath: 'blog',
+          include: ['*.md', '*.mdx'],
+          postsPerPage: 10,
+          /**
+           * Theme components used by the blog pages.
+           */
+          blogListComponent: '@theme/BlogListPage',
+          blogPostComponent: '@theme/BlogPostPage',
+          blogTagsListComponent: '@theme/BlogTagsListPage',
+          blogTagsPostsComponent: '@theme/BlogTagsPostsPage',
+          feedOptions: {
+            type: "all",
+            title: 'Apache Hudi: User-Facing Analytics',
+          },
+          showReadingTime: true,
+        },
+        theme: {
+          customCss: require.resolve('./src/css/custom.css'),
+        },
+      },
+    ],
+  ],
+  scripts: [],
+  stylesheets: [
+    'https://fonts.googleapis.com/css?family=Comfortaa|Ubuntu|Roboto|Source+Code+Pro',
+    'https://at-ui.github.io/feather-font/css/iconfont.css',
+  ],
+};
diff --git a/website/i18n/cn/code.json b/website/i18n/cn/code.json
new file mode 100644
index 0000000..aa6eda1
--- /dev/null
+++ b/website/i18n/cn/code.json
@@ -0,0 +1,166 @@
+{
+  "theme.NotFound.title": {
+    "message": "Page Not Found",
+    "description": "The title of the 404 page"
+  },
+  "theme.NotFound.p1": {
+    "message": "We could not find what you were looking for.",
+    "description": "The first paragraph of the 404 page"
+  },
+  "theme.NotFound.p2": {
+    "message": "Please contact the owner of the site that linked you to the original URL and let them know their link is broken.",
+    "description": "The 2nd paragraph of the 404 page"
+  },
+  "theme.AnnouncementBar.closeButtonAriaLabel": {
+    "message": "Close",
+    "description": "The ARIA label for close button of announcement bar"
+  },
+  "theme.blog.paginator.navAriaLabel": {
+    "message": "Blog list page navigation",
+    "description": "The ARIA label for the blog pagination"
+  },
+  "theme.blog.paginator.newerEntries": {
+    "message": "Newer Entries",
+    "description": "The label used to navigate to the newer blog posts page (previous page)"
+  },
+  "theme.blog.paginator.olderEntries": {
+    "message": "Older Entries",
+    "description": "The label used to navigate to the older blog posts page (next page)"
+  },
+  "theme.blog.post.readingTime.plurals": {
+    "message": "One min read|{readingTime} min read",
+    "description": "Pluralized label for \"{readingTime} min read\". Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)"
+  },
+  "theme.tags.tagsListLabel": {
+    "message": "Tags:",
+    "description": "The label alongside a tag list"
+  },
+  "theme.blog.post.readMore": {
+    "message": "Read More",
+    "description": "The label used in blog post item excerpts to link to full blog posts"
+  },
+  "theme.blog.post.paginator.navAriaLabel": {
+    "message": "Blog post page navigation",
+    "description": "The ARIA label for the blog posts pagination"
+  },
+  "theme.blog.post.paginator.newerPost": {
+    "message": "Newer Post",
+    "description": "The blog post button label to navigate to the newer/previous post"
+  },
+  "theme.blog.post.paginator.olderPost": {
... 65068 lines suppressed ...