Posted to commits@gobblin.apache.org by tr...@apache.org on 2021/05/29 12:06:55 UTC

[gobblin] branch website-collab updated: Initial commit for gobblin website/doc with Docusaurus

This is an automated email from the ASF dual-hosted git repository.

treff7es pushed a commit to branch website-collab
in repository https://gitbox.apache.org/repos/asf/gobblin.git


The following commit(s) were added to refs/heads/website-collab by this push:
     new 3572f9a  Initial commit for gobblin website/doc with Docusaurus
3572f9a is described below

commit 3572f9a9b8036970dcaefa9f4c04d49279d50f7e
Author: treff7es <tr...@gmail.com>
AuthorDate: Sat May 29 14:05:56 2021 +0200

    Initial commit for gobblin website/doc with Docusaurus
---
 gobblin-website/.gitignore                         |   21 +
 gobblin-website/README.md                          |   33 +
 gobblin-website/babel.config.js                    |    3 +
 gobblin-website/docs/Getting-Started.md            |  163 +
 gobblin-website/docs/Gobblin-Architecture.md       |  117 +
 gobblin-website/docs/Powered-By.md                 |   23 +
 gobblin-website/docs/adaptors/Gobblin-Distcp.md    |  282 +
 .../docs/adaptors/Hive-Avro-To-ORC-Converter.md    |  195 +
 gobblin-website/docs/case-studies/Hive-Distcp.md   |  102 +
 .../docs/case-studies/Kafka-HDFS-Ingestion.md      |  304 +
 .../docs/case-studies/Publishing-Data-to-S3.md     |  156 +
 .../docs/case-studies/Writing-ORC-Data.md          |   54 +
 gobblin-website/docs/css/extra.css                 |   40 +
 .../docs/data-management/DistcpNgEvents.md         |   33 +
 .../docs/data-management/Gobblin-Retention.md      |  365 +
 .../docs/developer-guide/CodingStyle.md            |   55 +
 .../docs/developer-guide/Contributing.md           |   23 +
 .../Customization-for-Converter-and-Operator.md    |    6 +
 .../Customization-for-New-Source.md                |    6 +
 .../developer-guide/Documentation-Architecture.md  |   24 +
 .../developer-guide/Gobblin-Compliance-Design.md   |  100 +
 .../docs/developer-guide/GobblinModules.md         |   80 +
 .../docs/developer-guide/HighLevelConsumer.md      |   72 +
 gobblin-website/docs/developer-guide/IDE-setup.md  |   26 +
 .../docs/developer-guide/Monitoring-Design.md      |    9 +
 .../developer-guide/files/codestyle-eclipse.xml    |  308 +
 .../files/codestyle-intellij-gobblin.xml           |  540 ++
 .../docs/developer-guide/files/prefs-eclipse.epf   |   23 +
 gobblin-website/docs/index.md                      |   16 +
 gobblin-website/docs/mdx.md                        |   17 +
 gobblin-website/docs/metrics/Existing-Reporters.md |   20 +
 .../docs/metrics/Gobblin-Metrics-Architecture.md   |   74 +
 .../docs/metrics/Gobblin-Metrics-Performance.md    |   56 +
 ...-generation-instrumentation-for-applications.md |   26 +
 gobblin-website/docs/metrics/Gobblin-Metrics.md    |  109 +
 .../docs/metrics/Implementing-New-Reporters.md     |  104 +
 .../docs/metrics/Metrics-for-Gobblin-ETL.md        |  155 +
 .../miscellaneous/Camus-to-Gobblin-Migration.md    |  106 +
 .../docs/miscellaneous/Exactly-Once-Support.md     |  173 +
 gobblin-website/docs/project/Feature-List.md       |   25 +
 gobblin-website/docs/project/Posts.md              |    5 +
 .../docs/project/Talks-and-Tech-Blogs.md           |   19 +
 gobblin-website/docs/sinks/AvroHdfsDataWriter.md   |   28 +
 gobblin-website/docs/sinks/ConsoleWriter.md        |   15 +
 gobblin-website/docs/sinks/CouchbaseWriter.md      |  150 +
 gobblin-website/docs/sinks/Gobblin-JDBC-Writer.md  |  194 +
 gobblin-website/docs/sinks/Http.md                 |  109 +
 gobblin-website/docs/sinks/Kafka.md                |   79 +
 .../docs/sinks/ParquetHdfsDataWriter.md            |   48 +
 gobblin-website/docs/sinks/SimpleBytesWriter.md    |   23 +
 gobblin-website/docs/sources/AvroFileSource.md     |   17 +
 gobblin-website/docs/sources/CopySource.md         |   17 +
 .../docs/sources/GoogleAnalyticsSource.md          |   18 +
 gobblin-website/docs/sources/GoogleDriveSource.md  |   17 +
 gobblin-website/docs/sources/GoogleWebmaster.md    |   84 +
 .../docs/sources/HadoopTextInputSource.md          |   17 +
 gobblin-website/docs/sources/HelloWorldSource.md   |   16 +
 .../docs/sources/HiveAvroToOrcSource.md            |   16 +
 gobblin-website/docs/sources/HivePurgerSource.md   |   16 +
 gobblin-website/docs/sources/HiveSource.md         |   16 +
 gobblin-website/docs/sources/KafkaSource.md        |   16 +
 gobblin-website/docs/sources/MySQLSource.md        |   16 +
 gobblin-website/docs/sources/OracleSource.md       |   16 +
 gobblin-website/docs/sources/QueryBasedSource.md   |  107 +
 gobblin-website/docs/sources/RestApiSource.md      |   50 +
 gobblin-website/docs/sources/SalesforceSource.md   |   16 +
 gobblin-website/docs/sources/SftpSource.md         |   16 +
 gobblin-website/docs/sources/SimpleJsonSource.md   |   16 +
 gobblin-website/docs/sources/SqlServerSource.md    |   16 +
 gobblin-website/docs/sources/TeradataSource.md     |   16 +
 gobblin-website/docs/sources/WikipediaSource.md    |   16 +
 .../docs/user-guide/Azure-Kubernetes-Deployment.md |   88 +
 .../user-guide/Building-Gobblin-as-a-Service.md    |   54 +
 .../docs/user-guide/Building-Gobblin.md            |   62 +
 gobblin-website/docs/user-guide/Compaction.md      |  299 +
 .../docs/user-guide/Config-Management.md           |  140 +
 .../Configuration-Properties-Glossary.md           |  476 +
 .../docs/user-guide/Docker-Integration.md          |  115 +
 gobblin-website/docs/user-guide/FAQs.md            |   81 +
 gobblin-website/docs/user-guide/Gobblin-CLI.md     |  261 +
 .../docs/user-guide/Gobblin-Compliance.md          |   57 +
 .../docs/user-guide/Gobblin-Deployment.md          |   88 +
 .../docs/user-guide/Gobblin-Schedulers.md          |   81 +
 .../docs/user-guide/Gobblin-as-a-Library.md        |   75 +
 .../docs/user-guide/Gobblin-genericLoad.md         |   19 +
 gobblin-website/docs/user-guide/Gobblin-on-Yarn.md |  309 +
 .../docs/user-guide/Gobblin-template.md            |   73 +
 .../docs/user-guide/Hive-Registration.md           |   79 +
 .../docs/user-guide/Job-Execution-History-Store.md |  172 +
 gobblin-website/docs/user-guide/Monitoring.md      |   83 +
 .../docs/user-guide/Partitioned-Writers.md         |   77 +
 .../user-guide/Source-schema-and-Converters.md     |  372 +
 .../user-guide/State-Management-and-Watermarks.md  |   84 +
 gobblin-website/docs/user-guide/Troubleshooting.md |   92 +
 .../Working-with-Job-Configuration-Files.md        |   97 +
 .../user-guide/Working-with-the-ForkOperator.md    |  200 +
 gobblin-website/docusaurus.config.js               |  114 +
 gobblin-website/download/index.md                  |   23 +
 gobblin-website/package.json                       |   47 +
 gobblin-website/sidebars.js                        |  117 +
 gobblin-website/src/css/custom.css                 |   24 +
 gobblin-website/src/pages/downloads.js             |   68 +
 gobblin-website/src/pages/index.js                 |  129 +
 gobblin-website/src/pages/styles.module.css        |   69 +
 .../src/theme/prism-include-languages.js           |   23 +
 gobblin-website/static/.nojekyll                   |    0
 .../static/img/Avro-to-Orc-timeline.jpg            |  Bin 0 -> 4344882 bytes
 .../static/img/Converters-Explained.png            |  Bin 0 -> 170024 bytes
 .../static/img/Gobblin-Architecture-Overview.png   |  Bin 0 -> 113376 bytes
 gobblin-website/static/img/Gobblin-Constructs.png  |  Bin 0 -> 182413 bytes
 gobblin-website/static/img/Gobblin-Distcp-Flow.png |  Bin 0 -> 14885 bytes
 gobblin-website/static/img/Gobblin-Job-Flow.png    |  Bin 0 -> 158077 bytes
 gobblin-website/static/img/Gobblin-Logo.png        |  Bin 0 -> 59410 bytes
 .../static/img/Gobblin-Metrics-Architecture.png    |  Bin 0 -> 22907 bytes
 .../static/img/Gobblin-Metrics-Example.png         |  Bin 0 -> 20154 bytes
 .../static/img/Gobblin-Partitioned-Writer.png      |  Bin 0 -> 13503 bytes
 ...Gobblin-Query-Based-Extractor-Build-Queries.png |  Bin 0 -> 80769 bytes
 .../static/img/Gobblin-Query-Based-Extractors.png  |  Bin 0 -> 65509 bytes
 .../static/img/Gobblin-Query-Based-Sources.png     |  Bin 0 -> 44037 bytes
 .../static/img/Gobblin-Retention-Architecture.png  |  Bin 0 -> 1317917 bytes
 .../static/img/Gobblin-State-Hierarchy.png         |  Bin 0 -> 38288 bytes
 gobblin-website/static/img/Gobblin-Task-Flow.png   |  Bin 0 -> 220149 bytes
 .../static/img/Gobblin-on-Hadoop-MR.png            |  Bin 0 -> 113123 bytes
 .../static/img/Gobblin-on-Single-Node.png          |  Bin 0 -> 111676 bytes
 .../static/img/Gobblin-on-Yarn-with-Helix.png      |  Bin 0 -> 74429 bytes
 gobblin-website/static/img/Http-Write.png          |  Bin 0 -> 39439 bytes
 .../static/img/Rest-Api-Extractor-Flow.png         |  Bin 0 -> 87665 bytes
 .../static/img/Trie-Conversion-Ratio.png           |  Bin 0 -> 26807 bytes
 .../static/img/configStoreClientApi.png            |  Bin 0 -> 15520 bytes
 .../static/img/configStoreDataModel.png            |  Bin 0 -> 31870 bytes
 gobblin-website/static/img/favicon.ico             |  Bin 0 -> 3626 bytes
 gobblin-website/static/img/hadoop.png              |  Bin 0 -> 62772 bytes
 .../static/img/jdbc/Gobblin_JDBC_Publisher.png     |  Bin 0 -> 31743 bytes
 .../static/img/jdbc/Gobblin_JDBC_Writer.png        |  Bin 0 -> 26642 bytes
 gobblin-website/static/img/jdbc/HDFS_JDBC_Flow.png |  Bin 0 -> 42472 bytes
 .../static/img/jdbc/JdbcWriterInitializer.png      |  Bin 0 -> 55843 bytes
 .../static/img/jdbc/WriterInitializer.png          |  Bin 0 -> 10404 bytes
 gobblin-website/static/img/logo.svg                |    1 +
 gobblin-website/static/img/mesos-cluster.png       |  Bin 0 -> 25304 bytes
 gobblin-website/static/img/poweredby/apple.png     |  Bin 0 -> 41445 bytes
 gobblin-website/static/img/poweredby/applift.png   |  Bin 0 -> 6130 bytes
 gobblin-website/static/img/poweredby/bpu.png       |  Bin 0 -> 4729 bytes
 gobblin-website/static/img/poweredby/cern.png      |  Bin 0 -> 8464 bytes
 .../static/img/poweredby/cleverdata.png            |  Bin 0 -> 6074 bytes
 gobblin-website/static/img/poweredby/ebay.png      |  Bin 0 -> 9572 bytes
 gobblin-website/static/img/poweredby/ibm.png       |  Bin 0 -> 12538 bytes
 gobblin-website/static/img/poweredby/intel.png     |  Bin 0 -> 9587 bytes
 gobblin-website/static/img/poweredby/linkedin.png  |  Bin 0 -> 29921 bytes
 gobblin-website/static/img/poweredby/microsoft.png |  Bin 0 -> 45752 bytes
 .../static/img/poweredby/nerdwallet.png            |  Bin 0 -> 4621 bytes
 gobblin-website/static/img/poweredby/paypal.png    |  Bin 0 -> 59719 bytes
 gobblin-website/static/img/poweredby/prezi.png     |  Bin 0 -> 13830 bytes
 gobblin-website/static/img/poweredby/sandia.png    |  Bin 0 -> 46333 bytes
 .../static/img/poweredby/stunlock-studios.png      |  Bin 0 -> 13704 bytes
 gobblin-website/static/img/poweredby/swisscom.png  |  Bin 0 -> 17898 bytes
 gobblin-website/static/img/threads.gif             |  Bin 0 -> 3762 bytes
 .../static/img/undraw_docusaurus_mountain.svg      |  170 +
 .../static/img/undraw_docusaurus_react.svg         |  169 +
 .../static/img/undraw_docusaurus_tree.svg          |    1 +
 gobblin-website/static/img/yarn_architecture.gif   |  Bin 0 -> 33031 bytes
 gobblin-website/yarn.lock                          | 9505 ++++++++++++++++++++
 161 files changed, 19243 insertions(+)

diff --git a/gobblin-website/.gitignore b/gobblin-website/.gitignore
new file mode 100644
index 0000000..deb68ee
--- /dev/null
+++ b/gobblin-website/.gitignore
@@ -0,0 +1,21 @@
+# Dependencies
+/node_modules
+
+# Production
+/build
+
+# Generated files
+.docusaurus
+.cache-loader
+
+# Misc
+.DS_Store
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+/.idea/
+
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
diff --git a/gobblin-website/README.md b/gobblin-website/README.md
new file mode 100644
index 0000000..8960fa2
--- /dev/null
+++ b/gobblin-website/README.md
@@ -0,0 +1,33 @@
+# Website
+
+This website is built using [Docusaurus 2](https://v2.docusaurus.io/), a modern static website generator.
+
+## Installation
+
+```console
+yarn install
+```
+
+## Local Development
+
+```console
+yarn start
+```
+
+This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server.
+
+## Build
+
+```console
+yarn build
+```
+
+This command generates static content into the `build` directory, which can be served by any static content hosting service.
+
+## Deployment
+
+```console
+GIT_USER=<Your GitHub username> USE_SSH=true yarn deploy
+```
+
+If you are using GitHub pages for hosting, this command is a convenient way to build the website and push to the `gh-pages` branch.
diff --git a/gobblin-website/babel.config.js b/gobblin-website/babel.config.js
new file mode 100644
index 0000000..e00595d
--- /dev/null
+++ b/gobblin-website/babel.config.js
@@ -0,0 +1,3 @@
+module.exports = {
+  presets: [require.resolve('@docusaurus/core/lib/babel/preset')],
+};
diff --git a/gobblin-website/docs/Getting-Started.md b/gobblin-website/docs/Getting-Started.md
new file mode 100644
index 0000000..62c4bef
--- /dev/null
+++ b/gobblin-website/docs/Getting-Started.md
@@ -0,0 +1,163 @@
+---
+title: Getting Started
+sidebar_label: Getting Started
+---
+
+
+# Introduction
+
+This guide will help you set up Gobblin and run your first job. Currently, Gobblin requires JDK 7 or later to run.
+
+# Getting a Gobblin Release
+
+All steps on this page assume you are using an Apache Gobblin source distribution.
+
+Download the source distribution from [here](/downloads).
+
+## Building a Distribution
+
+Build a distribution:
+
+```bash
+cd /path/to/gobblin/source
+./gradlew :gobblin-distribution:buildDistributionTar
+```
+
+Note: A full build takes longer because it also runs additional tasks such as tests, javadoc, and findbugs checks.
+For quick usage, building the distribution is good enough. However, a full build can easily be run with:
+```bash
+./gradlew build
+```
+
+The generated distribution packages the binaries in a specific directory structure, which is different from the source tree.
+
+After the build is done, there should be a tarball (if there are multiple, use the newest one) at 
+
+`build/gobblin-distribution/distributions/`
+
+Distributions built from source are generated as `*.tar.gz` files. After getting the tarball, unpack it locally:
+
+`tar -xvf gobblin-distribution-[VERSION].tar.gz`. 
+
+# Run Your First Job
+
+Note: the following two sections are only applicable to newer versions of Gobblin. If you are running version 0.8.0 or earlier, skip to [Gobblin daemon](#running-gobblin-as-a-daemon).
+
+Here we illustrate how to run a simple job. This job will pull revisions from the last ten days for each of two Wikipedia pages: LinkedIn and Wikipedia:Sandbox (a page with frequent edits). The records will be written to stdout.
+
+Gobblin can run either in standalone mode or on MapReduce. In this example we will run Gobblin in standalone mode.
+
+This page explains how to run the job from the terminal. You may also run this job from your favorite IDE (IntelliJ is recommended).
+
+## Steps
+
+* cd to the unpacked Gobblin distribution and run `bin/gobblin cli run` to get usage.
+* Running `bin/gobblin cli run listQuickApps` will list the available easy-to-configure apps. Note the line with the wikipedia example:
+```bash
+wikipedia	-	Gobblin example that downloads revisions from Wikipedia.
+```
+* Running `bin/gobblin cli run wikipedia` will show the usage of this application. Notice the usage and one of the options listed for this job:
+```bash
+usage: gobblin cli run wikipedia [OPTIONS] <article-title> [<article-title>...]
+ -lookback <arg>             Sets the period for which articles should be
+                             pulled in ISO time format (e.g. P2D, PT1H)
+```
+* Run `bin/gobblin cli run wikipedia -lookback P10D LinkedIn Wikipedia:Sandbox`. This will print a lot of logs, but somewhere in there you will see a few json entries with the revisions for those articles. For example:
+```bash
+{"revid":746260034,"parentid":745444076,"user":"2605:8D80:580:5824:B108:82BD:693D:CFA1","anon":"","userid":0,"timestamp":"2016-10-26T08:12:09Z","size":69527,"pageid":970755,"title":"LinkedIn"}
+```
+* In the usage, there is also an option to instead write the output to an avro file:
+```bash
+ -avroOutput <arg>           Write output to Avro files. Specify the
+                             output directory as argument.
+```
+Running `bin/gobblin cli run wikipedia -lookback P10D -avroOutput /tmp/wikiSample LinkedIn Wikipedia:Sandbox` will create a directory `/tmp/wikiSample` with two subdirectories `LinkedIn` and `Wikipedia_Sandbox`, each with one Avro file.
+
+# Running Gobblin as a Daemon
+
+Here we show how to run a Gobblin daemon. A Gobblin daemon tracks a directory and finds job configuration files in it (files with the extension `*.pull`). Job files can describe either run-once or scheduled jobs. Gobblin will automatically execute these jobs as they are received, following the schedule.
+
+For this example, we will once again run the Wikipedia example. The records will be stored as Avro files.
+
+## Preliminary
+
+Each Gobblin job minimally involves several constructs, e.g. [Source](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/Source.java), [Extractor](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/extractor/Extractor.java), [DataWriter](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/writer/DataWriter.java) and [DataPublisher](https://github.com/apache [...]
+
+Some of the classes relevant to this example include [WikipediaSource](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/wikipedia/WikipediaSource.java), [WikipediaExtractor](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/wikipedia/WikipediaExtractor.java), [WikipediaConverter](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/w [...]
+
+To run Gobblin in standalone daemon mode we need a Gobblin configuration file (such as [application.conf](https://github.com/apache/gobblin/blob/master/conf/standalone/application.conf)). For each job we wish to run, we also need a job configuration file (such as [wikipedia.pull](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull)). The Gobblin configuration file, which is passed to Gobblin as a command line argument, should contain a [...]
+
+A list of commonly used configuration properties can be found here: [Configuration Properties Glossary](user-guide/Configuration-Properties-Glossary).
+
+## Steps
+
+* Create a folder to store the job configuration file. Put [wikipedia.pull](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull) in this folder, and set environment variable `GOBBLIN_JOB_CONFIG_DIR` to point to this folder. Also, make sure that the environment variable `JAVA_HOME` is set correctly.
+
+* Create a folder as Gobblin's working directory. Gobblin will write job output as well as other information there, such as locks and state-store (for more information, see the [Standalone Deployment](user-guide/Gobblin-Deployment#Standalone-Deployment) page). Set environment variable `GOBBLIN_WORK_DIR` to point to that folder.
+
+* Unpack the Gobblin distribution as described above.
+
+* Launch Gobblin in one of the execution modes (for more information, see [Gobblin-CLI](user-guide/Gobblin-CLI.md)):
+
+```bash
+gobblin service standalone start
+```
+
+Stdout and the job log, which contain the progress and status of the job, will be written to `logs/<execution-mode>.out` and `logs/<execution-mode>.err` (to change where the logs are written, modify the Log4j configuration file `conf/log4j.xml`).
+
+Among the job logs there should be the following information:
+
+```
+INFO JobScheduler - Loaded 1 job configuration
+INFO  AbstractJobLauncher - Starting job job_PullFromWikipedia_1422040355678
+INFO  TaskExecutor - Starting the task executor
+INFO  LocalTaskStateTracker2 - Starting the local task state tracker
+INFO  AbstractJobLauncher - Submitting task task_PullFromWikipedia_1422040355678_0 to run
+INFO  TaskExecutor - Submitting task task_PullFromWikipedia_1422040355678_0
+INFO  AbstractJobLauncher - Waiting for submitted tasks of job job_PullFromWikipedia_1422040355678 to complete...
+INFO  AbstractJobLauncher - 1 out of 1 tasks of job job_PullFromWikipedia_1422040355678 are running
+INFO  WikipediaExtractor - 5 record(s) retrieved for title NASA
+INFO  WikipediaExtractor - 5 record(s) retrieved for title LinkedIn
+INFO  WikipediaExtractor - 5 record(s) retrieved for title Parris_Cues
+INFO  WikipediaExtractor - 5 record(s) retrieved for title Barbara_Corcoran
+INFO  Task - Extracted 20 data records
+INFO  Fork-0 - Committing data of branch 0 of task task_PullFromWikipedia_1422040355678_0
+INFO  LocalTaskStateTracker2 - Task task_PullFromWikipedia_1422040355678_0 completed in 2334ms with state SUCCESSFUL
+INFO  AbstractJobLauncher - All tasks of job job_PullFromWikipedia_1422040355678 have completed
+INFO  TaskExecutor - Stopping the task executor
+INFO  LocalTaskStateTracker2 - Stopping the local task state tracker
+INFO  AbstractJobLauncher - Publishing job data of job job_PullFromWikipedia_1422040355678 with commit policy COMMIT_ON_FULL_SUCCESS
+INFO  AbstractJobLauncher - Persisting job/task states of job job_PullFromWikipedia_1422040355678
+```
+
+* After the job is done, stop Gobblin by running
+
+```bash
+gobblin service standalone stop
+```
+
+The job output is written to the `GOBBLIN_WORK_DIR/job-output` folder as an Avro file.
+
+To see the content of the job output, use the Avro tools to convert Avro to JSON. Download the latest version of Avro tools (e.g. avro-tools-1.8.1.jar):
+
+```bash
+curl -O http://central.maven.org/maven2/org/apache/avro/avro-tools/1.8.1/avro-tools-1.8.1.jar
+```
+
+and run
+
+```bash
+java -jar avro-tools-1.8.1.jar tojson --pretty [job_output].avro > output.json
+```
+
+`output.json` will contain all retrieved records in JSON format.
+
+Note that since the job configuration file we used ([wikipedia.pull](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull)) doesn't specify a job schedule, the job will run immediately and will run only once. To schedule a job to run at a certain time and/or repeatedly, set the `job.schedule` property with a cron-based syntax. For example, `job.schedule=0 0/2 * * * ?` will run the job every two minutes. See [this link](http://www.quartz-schedul [...]
+
+# Other Example Jobs
+
+Besides the Wikipedia example, we have another example job [SimpleJson](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/simplejson.pull), which extracts records from JSON files and stores them in Avro files.
+
+To create your own jobs, simply implement the relevant interfaces such as [Source](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/Source.java), [Extractor](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/extractor/Extractor.java), [Converter](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/converter/Converter.java) and [DataWriter](https://github [...]
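+
+As a rough illustration, a minimal source and extractor pair might look like the sketch below. This is only a sketch, not a recommended implementation: the method names follow the `Source` and `Extractor` interfaces linked above, but exact signatures (and helpers such as `WorkUnit.createEmpty()`) may differ between Gobblin versions, and watermarks, configuration, and error handling are omitted.
+
+```java
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.gobblin.configuration.SourceState;
+import org.apache.gobblin.configuration.WorkUnitState;
+import org.apache.gobblin.source.Source;
+import org.apache.gobblin.source.extractor.Extractor;
+import org.apache.gobblin.source.workunit.WorkUnit;
+
+/** Hypothetical source that emits a fixed list of strings from a single work unit. */
+public class GreetingSource implements Source<String, String> {
+
+  @Override
+  public List<WorkUnit> getWorkunits(SourceState state) {
+    // A real source partitions the work here; this tiny "dataset" needs only one work unit.
+    return Collections.singletonList(WorkUnit.createEmpty());
+  }
+
+  @Override
+  public Extractor<String, String> getExtractor(WorkUnitState state) throws IOException {
+    return new GreetingExtractor();
+  }
+
+  @Override
+  public void shutdown(SourceState state) {
+    // Nothing to clean up in this sketch.
+  }
+
+  private static class GreetingExtractor implements Extractor<String, String> {
+    private final Iterator<String> records = Arrays.asList("hello", "gobblin").iterator();
+
+    @Override
+    public String getSchema() {
+      return "string"; // a trivial "schema", good enough for illustration
+    }
+
+    @Override
+    public String readRecord(String reuse) throws IOException {
+      // Returning null signals that there are no more records to extract.
+      return this.records.hasNext() ? this.records.next() : null;
+    }
+
+    @Override
+    public long getExpectedRecordCount() { return 2; }
+
+    @Override
+    public long getHighWatermark() { return -1; }
+
+    @Override
+    public void close() throws IOException {
+      // No resources to release.
+    }
+  }
+}
+```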
+
+On a side note: while users are free to directly implement the Extractor interface (e.g., WikipediaExtractor), Gobblin also provides several extractor implementations based on commonly used protocols, e.g., [KafkaExtractor](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/source/extractor/extract/kafka/KafkaExtractor.java), [RestApiExtractor](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apa [...]
diff --git a/gobblin-website/docs/Gobblin-Architecture.md b/gobblin-website/docs/Gobblin-Architecture.md
new file mode 100644
index 0000000..da8feef
--- /dev/null
+++ b/gobblin-website/docs/Gobblin-Architecture.md
@@ -0,0 +1,117 @@
+---
+title: Gobblin Architecture
+sidebar_label: Gobblin Architecture
+---
+
+Gobblin Architecture Overview
+--------------------
+Gobblin is built around the idea of extensibility, i.e., it should be easy for users to add new adapters or extend existing adapters to work with new sources and start extracting data from the new sources in any deployment settings. The architecture of Gobblin reflects this idea, as shown in Fig. 1 below:
+
+![Gobblin Architecture Overview](../static/img/Gobblin-Architecture-Overview.png)
+
+A Gobblin job is built on a set of constructs (illustrated by the light green boxes in the diagram above) that work together in a certain way and get the data extraction work done. All the constructs are pluggable through the job configuration and extensible by adding new or extending existing implementations. The constructs will be discussed in [Gobblin Constructs](Gobblin-Architecture#gobblin-constructs).
+
+A Gobblin job consists of a set of tasks, each of which corresponds to a unit of work to be done and is responsible for extracting a portion of the data. The tasks of a Gobblin job are executed by the Gobblin runtime (illustrated by the orange boxes in the diagram above) on the deployment setting of choice (illustrated by the red boxes in the diagram above). 
+
+The Gobblin runtime is responsible for running user-defined Gobblin jobs on the deployment setting of choice. It handles the common tasks including job and task scheduling, error handling and task retries, resource negotiation and management, state management, data quality checking, data publishing, etc.
+
+Gobblin currently supports two deployment modes: the standalone mode on a single node and the Hadoop MapReduce mode on a Hadoop cluster. We are also working on adding support for deploying and running Gobblin as a native application on [YARN](http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html). Details on deployment of Gobblin can be found in [Gobblin Deployment](user-guide/Gobblin-Deployment).
+
+The running and operation of Gobblin are supported by a few components and utilities (illustrated by the blue boxes in the diagram above) that handle important things such as metadata management, state management, metric collection and reporting, and monitoring. 
+
+Gobblin Job Flow
+----------------
+A Gobblin job is responsible for extracting data in a defined scope/range from a data source and writing data to a sink such as HDFS. It manages the entire lifecycle of data ingestion in a certain flow as illustrated by Fig. 2 below.
+
+![Gobblin Job Flow](../static/img/Gobblin-Job-Flow.png)
+
+1. A Gobblin job starts with an optional phase of acquiring a job lock. The purpose of doing this is to prevent the next scheduled run of the same job from starting until the current run finishes. This phase is optional because some job schedulers, such as [Azkaban](http://azkaban.github.io/), already do this.
+
+2. The next thing the job does is to create an instance of the `Source` class specified in the job configuration. A `Source` is responsible for partitioning the data ingestion work into a set of `WorkUnit`s, each of which represents a logical unit of work for extracting a portion of the data from a data source. A `Source` is also responsible for creating an `Extractor` for each `WorkUnit`. An `Extractor`, as the name suggests, actually talks to the data source and extracts data from it. The [...]
+
+3. From the set of `WorkUnit`s given by the `Source`, the job creates a set of tasks. A task is a runtime counterpart of a `WorkUnit`, which represents a logical unit of work. Normally, a task is created per `WorkUnit`. However, there is a special type of `WorkUnit` called `MultiWorkUnit` that wraps multiple `WorkUnit`s, for which multiple tasks may be created, one per wrapped `WorkUnit`.
+
+4. The next phase is to launch and run the tasks. How tasks are executed and where they run depend on the deployment setting. In the standalone mode on a single node, tasks run in a thread pool dedicated to that job, the size of which is configurable on a per-job basis. In the Hadoop MapReduce mode on a Hadoop cluster, tasks run in the mappers (used purely as containers to run tasks).
+
+5. After all tasks of the job finish (either successfully or unsuccessfully), the job publishes the data if it is OK to do so. Whether extracted data should be published is determined by the task states and the `JobCommitPolicy` used (configurable). More specifically, extracted data should be published if and only if any one of the following two conditions holds:
+
+  * `JobCommitPolicy.COMMIT_ON_PARTIAL_SUCCESS` is specified in the job configuration.
+  * `JobCommitPolicy.COMMIT_ON_FULL_SUCCESS` is specified in the job configuration and all tasks were successful.
+
+6. After the data extracted is published, the job persists the job/task states into the state store. When the next scheduled run of the job starts, it will load the job/task states of the previous run to get things like watermarks so it knows where to start.
+
+7. During its execution, the job may create some temporary working data that is no longer needed after the job is done. So the job cleans up such temporary working data before exiting.
+
+8. Finally, an optional phase of the job is to release the job lock if it was acquired at the beginning. This gives the green light to the next scheduled run of the same job to proceed.
+
+If a Gobblin job is cancelled before it finishes, the job will not persist any job/task state nor commit and publish any data (as the dotted line shows in the diagram).
+
+Gobblin Constructs
+--------------------------------
+As described above, a Gobblin job creates and runs tasks, each of which is responsible for extracting a portion of the data to be pulled by the job. A Gobblin task is created from a `WorkUnit` that represents a unit of work and serves as a container of job configuration at runtime. A task composes the Gobblin constructs into a flow that extracts, transforms, quality-checks, and finally writes each extracted data record to the specified sink. Fig. 3 below gives an overview of the Gobb [...]
+
+
+![Gobblin Constructs](../static/img/Gobblin-Constructs.png)
+
+#### Source and Extractor
+
+A `Source` represents an adapter between a data source and Gobblin and is used by a Gobblin job at the beginning of the job flow. A `Source` is responsible for partitioning the data ingestion work into a set of `WorkUnit`s, each of which represents a logical unit of work for extracting a portion of the data from a data source.
+
+A `Source` is also responsible for creating an `Extractor` for each `WorkUnit`. An `Extractor`, as the name suggests, actually talks to the data source and extracts data from it. The reason for this design is that Gobblin's `Source` is modeled after Hadoop's `InputFormat`, which is responsible for partitioning the input into `Split`s as well as creating a `RecordReader` for each `Split`. 
+
+Gobblin out-of-the-box provides some built-in `Source` and `Extractor` implementations that work with various types of data sources, e.g., web services offering REST APIs, databases supporting JDBC, FTP/SFTP servers, etc. Currently, `Extractor`s are record-oriented, i.e., an `Extractor` reads one data record at a time, although internally it may choose to pull and cache a batch of data records. We are planning to add options for `Extractor`s to support byte-oriented and file-orie [...]
+
+#### Converter
+
+A `Converter` is responsible for converting both schema and data records and is the core construct for data transformation. `Converter`s are composable and can be chained together as long as each adjacent pair of `Converter`s is compatible in the input and output schema and data record types. This allows building complex data transformations from simple `Converter`s. Note that a `Converter` converts an input schema to one output schema. It may, however, convert an input data record to ze [...]
+
+![Gobblin Converter](../static/img/Converters-Explained.png)
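+
+As a rough sketch of what a simple `Converter` in such a chain might look like, the example below passes the schema through unchanged and upper-cases each string record. It assumes the `convertSchema`/`convertRecord` contract described above; exact signatures may vary between Gobblin versions.
+
+```java
+import java.util.Collections;
+
+import org.apache.gobblin.configuration.WorkUnitState;
+import org.apache.gobblin.converter.Converter;
+import org.apache.gobblin.converter.DataConversionException;
+import org.apache.gobblin.converter.SchemaConversionException;
+
+/** Hypothetical converter: identity schema conversion, upper-cases each data record. */
+public class UpperCaseConverter extends Converter<String, String, String, String> {
+
+  @Override
+  public String convertSchema(String inputSchema, WorkUnitState workUnit)
+      throws SchemaConversionException {
+    return inputSchema; // one input schema maps to exactly one output schema
+  }
+
+  @Override
+  public Iterable<String> convertRecord(String outputSchema, String inputRecord, WorkUnitState workUnit)
+      throws DataConversionException {
+    // A converter may emit zero, one, or many records per input record; here exactly one.
+    return Collections.singletonList(inputRecord.toUpperCase());
+  }
+}
+```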
+
+#### Quality Checker
+
+A `QualityChecker`, as the name suggests, is responsible for data quality checking. There are two types of `QualityChecker`s: one that checks individual data records and decides if each record should proceed to the next phase in the task flow and the other one that checks the entire task output and decides if data can be committed. We call the two types row-level `QualityChecker`s and task-level `QualityChecker`s, respectively. A `QualityChecker` can be `MANDATORY` or `OPTIONAL` and will [...]
+
+#### Fork Operator
+
+A `ForkOperator` is a type of control operator that allows a task flow to branch into multiple streams, each of which goes to a separately configured sink. This is useful in situations where, e.g., data records need to be written to multiple different storage systems, or to the same storage (say, HDFS) but in different forms for different downstream consumers.
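+
+As a hedged sketch (the method names follow the `ForkOperator` interface as commonly documented and may differ slightly by version), the fork operator below declares two branches and routes the schema and every record to both of them:
+
+```java
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.gobblin.configuration.WorkUnitState;
+import org.apache.gobblin.fork.ForkOperator;
+
+/** Hypothetical fork operator that duplicates the stream into two separately configured sinks. */
+public class DualSinkForkOperator implements ForkOperator<String, String> {
+
+  @Override
+  public void init(WorkUnitState workUnitState) {
+    // No state to initialize in this sketch.
+  }
+
+  @Override
+  public int getBranches(WorkUnitState workUnitState) {
+    return 2; // two forked branches, each with its own sink configuration
+  }
+
+  @Override
+  public List<Boolean> forkSchema(WorkUnitState workUnitState, String schema) {
+    return Arrays.asList(true, true); // both branches see the schema
+  }
+
+  @Override
+  public List<Boolean> forkDataRecord(WorkUnitState workUnitState, String record) {
+    return Arrays.asList(true, true); // both branches receive every record
+  }
+
+  @Override
+  public void close() throws IOException {
+    // Nothing to release.
+  }
+}
+```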
+
+#### Data Writer
+
+A `DataWriter` is responsible for writing data records to the sink it is associated with. Gobblin out-of-the-box provides an `AvroHdfsDataWriter` for writing data in [Avro](http://avro.apache.org/) format onto HDFS. Users can plug in their own `DataWriter`s by specifying a `DataWriterBuilder` class in the job configuration that Gobblin uses to build `DataWriter`s.
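+
+For illustration, a trivial custom writer might look like the sketch below. It assumes the `DataWriter` contract of `write`/`commit`/`cleanup`/`recordsWritten`/`bytesWritten` described in these docs; exact signatures may differ between Gobblin versions, and the corresponding `DataWriterBuilder` is omitted.
+
+```java
+import java.io.IOException;
+
+import org.apache.gobblin.writer.DataWriter;
+
+/** Hypothetical writer that prints each record to stdout; a stand-in for a real sink. */
+public class StdoutDataWriter implements DataWriter<String> {
+
+  private long count = 0;
+
+  @Override
+  public void write(String record) throws IOException {
+    System.out.println(record);
+    this.count++;
+  }
+
+  @Override
+  public void commit() throws IOException {
+    // Nothing to publish atomically for stdout; file-based writers move staged output here.
+  }
+
+  @Override
+  public void cleanup() throws IOException {
+    // No temporary data to remove.
+  }
+
+  @Override
+  public long recordsWritten() {
+    return this.count;
+  }
+
+  @Override
+  public long bytesWritten() {
+    return -1; // unknown for this sketch
+  }
+
+  @Override
+  public void close() throws IOException {
+    // No resources to release.
+  }
+}
+```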
+
+#### Data Publisher
+A `DataPublisher` is responsible for publishing extracted data of a Gobblin job. Gobblin ships with a default `DataPublisher` that works with file-based `DataWriter`s such as the `AvroHdfsDataWriter` and moves data from the output directory of each task to a final job output directory. 
+
+Gobblin Task Flow
+--------------------------------
+
+Fig. 5 below zooms in further and shows the details on how different constructs are connected and composed to form a task flow. The same task flow is employed regardless of the deployment setting and where tasks are running.
+
+![Gobblin Task Flow](../static/img/Gobblin-Task-Flow.png)
+
+A Gobblin task flow consists of a main branch and a number of forked branches coming out of a `ForkOperator`. It is optional to specify a `ForkOperator` in the job configuration. When no `ForkOperator` is specified in the job configuration, a Gobblin task flow uses an `IdentityForkOperator` by default with a single forked branch. The `IdentityForkOperator` simply connects the master branch and the _single_ forked branch and passes schema and data records between them. The reason behind th [...]
+     
+The master branch of a Gobblin task starts with schema extraction from the source. The extracted schema will go through a schema transformation phase if at least one `Converter` class is specified in the job configuration. The next phase is to repeatedly extract data records one at a time. Each extracted data record will also go through a transformation phase if at least one `Converter` class is specified. Each extracted (or converted if applicable) data record is fed into an optional li [...]
+
+Data records that pass the row-level `QualityChecker`s will go through the `ForkOperator` and be further processed in the forked branches. The `ForkOperator` allows users to specify if the input schema or data record should go to a specific forked branch. If the input schema is specified _not_ to go into a particular branch, that branch will be ignored. If the input schema or data record is specified to go into _more than one_ forked branch, Gobblin assumes that the schema or data record [...]
+
+Similarly to the master branch, a forked branch also processes the input schema and each input data record (one at a time) through an optional transformation phase and a row-level quality checking phase. Data records that pass the branch's row-level `QualityChecker`s will be written out to a sink by a `DataWriter`. Each forked branch has its own sink configuration and a separate `DataWriter`. 
+
+Upon successful processing of the last record, a forked branch applies an optional list of task-level `QualityChecker`s to the data processed by the branch in its entirety. If this quality checking passes, the branch commits the data and exits. 
+
+A task flow completes its execution once every forked branch commits and exits. During the execution of a task, a `TaskStateTracker` keeps track of the task's state and a core set of task metrics, e.g., total records extracted, records extracted per second, total bytes extracted, bytes extracted per second, etc.
+
+Job State Management
+--------------------------------
+Typically a Gobblin job runs periodically on some schedule and each run extracts data incrementally, i.e., new data or changes to existing data within a specific range since the last run of the job. To make incremental extraction possible, Gobblin must persist the state of the job upon the completion of each run and load the state of the previous run so the next run knows where to start extracting. Gobblin maintains a state store that is responsible for job sta [...]
+
+Out-of-the-box, Gobblin uses an implementation of the state store that serializes job and task states into Hadoop `SequenceFile`s, one per job run. Each job has a separate directory where its job and task state `SequenceFile`s are stored. The file system on which the `SequenceFile`-based state store resides is configurable.   
+
+Handling of Failures
+--------------------------------
+As a fault-tolerant data ingestion framework, Gobblin employs multiple levels of defense against job and task failures. For job failures, Gobblin keeps track of the number of times a job fails consecutively and optionally sends out an alert email if the number exceeds a defined threshold so the owner of the job can jump in and investigate the failures. For task failures, Gobblin retries failed tasks in a job run up to a configurable maximum number of times. In addition to that, Gobblin [...]
+
+Job Scheduling
+--------------------------------
+As mentioned above, a Gobblin job typically runs periodically on some schedule. Gobblin can be integrated with job schedulers such as [Azkaban](http://azkaban.github.io/), [Oozie](http://oozie.apache.org/), or crontab. Out-of-the-box, Gobblin also ships with a built-in job scheduler backed by [Quartz](http://quartz-scheduler.org/), which is used as the default job scheduler in the standalone deployment and supports cron-based triggers using the configuration property `job [...]
+  
diff --git a/gobblin-website/docs/Powered-By.md b/gobblin-website/docs/Powered-By.md
new file mode 100644
index 0000000..2240b8e
--- /dev/null
+++ b/gobblin-website/docs/Powered-By.md
@@ -0,0 +1,23 @@
+
+![Gobblin Logo](../static/img/Gobblin-Logo.png)
+
+<p>
+  A few companies known to be powered by Gobblin:
+  <ul>
+    <li><a href="http://www.linkedin.com" target="_blank"><b>LinkedIn</b></a></li>
+    <li><a href="http://www.intel.com" target="_blank"><b>Intel</b></a></li>
+    <li><a href="http://www.paypal.com" target="_blank"><b>Paypal</b></a></li>
+    <li><a href="http://www.microsoft.com" target="_blank"><b>Microsoft</b></a></li>
+    <li><a href="http://www.ibm.com" target="_blank"><b>IBM</b></a></li>
+    <li><a href="http://www.home.cern/" target="_blank"><b>CERN</b></a></li>
+    <li><a href="http://www.apple.com/" target="_blank"><b>Apple</b></a></li>
+    <li><a href="http://www.stunlockstudios.com" target="_blank"><b>Stunlock Studios</b></a></li>
+    <li><a href="http://www.swisscom.ch" target="_blank"><b>Swisscom</b></a></li>
+    <li><a href="http://www.prezi.com" target="_blank"><b>Prezi</b></a></li>
+    <li><a href="http://www.cleverleaf.co.uk" target="_blank"><b>Cleverdata</b></a></li>
+    <li><a href="http://www.applift.com" target="_blank"><b>AppLift</b></a></li>
+    <li><a href="http://www.nerdwallet.com" target="_blank"><b>Nerdwallet</b></a></li>
+    <li><a href="http://www.sandia.gov/" target="_blank"><b>Sandia National Laboratories</b></a></li>
+    <li><a href="http://www.bpuholdings.com/" target="_blank"><b>BPU Holdings</b></a></li>
+  </ul>
+</p>
diff --git a/gobblin-website/docs/adaptors/Gobblin-Distcp.md b/gobblin-website/docs/adaptors/Gobblin-Distcp.md
new file mode 100644
index 0000000..63fccea
--- /dev/null
+++ b/gobblin-website/docs/adaptors/Gobblin-Distcp.md
@@ -0,0 +1,282 @@
+---
+title: Gobblin Distcp
+sidebar_label: Gobblin Distcp
+---
+
+# Introduction
+
+Gobblin Distcp is a rebuild of [Distcp](https://hadoop.apache.org/docs/current/hadoop-distcp/DistCp.html) on top of Gobblin. It is currently still a work in progress, but an alpha version of the code is available. This document mainly outlines the design of Gobblin Distcp, including the high-level design goals and core APIs.
+
+Gobblin Distcp benefits from many features in Gobblin:
+
+* Dataset awareness
+    * Configurability/customization of replication flows (Planned)
+    * Isolation (Implemented)
+    * Support for flexible copy triggering semantics (data triggers, dataset descriptors, etc.) (Planned)
+    * Future support for self-serve replication (Planned)
+* Operability
+    * Metrics (Implemented)
+* Customizable publish semantics
+    * Data triggers (Implemented)
+    * Hive registration (Implemented)
+    * Auditing (Planned)
+    * Exactly-once publishing (Planned)
+* Future support for continuous execution (near-real-time replication) (Planned)
+* Inline byte stream processing
+    * Archiving/unarchiving (Implemented)
+    * Encryption/decryption (Implemented)
+
+The effort uses a regular Gobblin workflow with specific constructs that handle input streams as records. We use Gobblin data management for dataset awareness and to optimize copy listings where possible. We use Gobblin Metrics to emit data availability notifications and operational metrics.
+
+# Problem Statement
+
+We need an application for copying from a FileSystem-compatible source to another FileSystem-compatible destination. The application must be able to:
+
+1. Find files in source FileSystem A that need to be copied.
+2. Determine locations in FileSystem B where the new files will be created.
+3. Do byte level copy from file in A to file in B efficiently.
+4. Be simple enough for other users to use it instead of distcp.
+5. Set owner, group, and permissions of newly created files, as well as newly created ancestors.
+6. On user request, override default attributes of new files like block size, replication factor, etc.
+7. Allow for on-the-fly byte level transformations like un-gzipping, PGP decryption, etc.
+8. Allow for on-the-fly unpacking of byte streams, like expanding tarballs, zips, etc.
+9. Perform quality checks on the destination files if requested.
+10. Emit real-time operational metrics (transfer speed, files completed, etc.) and allow for creating post-job summaries.
+11. Emit data availability notifications.
+12. Copy listings should be pluggable and fully dataset aware. Datasets can annotate data availability notifications, or modify aspects of the copy operation (like preserve attributes).
+13. Publishing should be pluggable and allow for easy extensions. Default publishing will simply place files in correct target locations. Extensions can register new files with Hive, etc.
+14. Reuse previously copied files that didn’t get published due to errors in the previous flow.
+15. Use other Gobblin features (e.g. proxying, password management).
+
+# Existing Solutions
+
+* [Distcp](https://hadoop.apache.org/docs/current/hadoop-distcp/DistCp.html): Tool maintained by Hadoop. Allows copying files and syncing directories between FileSystem implementations (including HDFS, S3, and local file systems). Uses MapReduce to perform the copy. Has various features like preserving permissions and setting replication factors.
+    * Uses some heuristics to accelerate file listing generation (e.g. using directory mod time to determine if new files are likely to exist).
+    * Minimally dataset aware: e.g. can treat tracking data and database data differently.
+    * Can recover files that failed to publish in previous runs.
+* Gobblin: Regular Gobblin can be used to read every record and re-write it to the new location. However, this involves actually deserializing records and has significant overhead. 
+
+# Proposed Design
+
+## Design Overview
+
+The core of Gobblin Distcp is simply a traditional Gobblin flow with sources, converters, and writers that work directly with input streams. The work units are `CopyableFile`s, which contain all the metadata necessary to copy a single file, and the records are `FileAwareInputStream`s, each of which is an input stream plus its `CopyableFile`.
+
+![Gobblin Distcp Flow](../../static/img/Gobblin-Distcp-Flow.png)
+
+### Example
+
+1. `CopySource` runs a `DatasetFinder`.
+2. `DatasetFinder` searches for all `Dataset`s.
+3. It creates a `CopyableDataset` for each `Dataset`.
+4. Each `Dataset` creates a copy listing for itself.
+5. `CopySource` creates a Gobblin `WorkUnit` for each `CopyableFile`.
+6. `InputStreamExtractor` opens an `InputStream` for each `CopyableFile`.
+7. `InputStreamWriter` creates the necessary file in destination and dumps the bytes of the `InputStream`.
+8. `InputStreamWriter` sets the correct owner and permissions, and puts files in writer-output location in the same directory structure as they will be published.
+9. `DataPublisher` groups work units by partition string and, for each partition string, moves the files to the destination. If a partition of a dataset failed to copy, all other successful partitions and datasets are still published. The failed partition is staged for recovery on the next run.
+10. `DataPublisher` emits notifications, performs Hive registration, etc.
+
+## Classes
+
+### CopyableDataset
+
+* An abstraction of a `Dataset`, i.e. a set of related files (for example a database table).
+* Generates copy listings for that dataset. Example: if I want to replicate DB.Table to a new location, which files should I copy?
+* Generates partitioning of copy listing into atomic units called file sets. A file set will be published nearly atomically.
+* All files in the listing will be copied. It is the responsibility of the `CopyableDataset` to do a diff with the target (because it might have optimizations for performing the diff).
+* Implementations:
+    * `RecursiveCopyableDataset`: copies all files under a root directory.
+    * `StreamDataset`: copies date-partitioned directories for Kafka topics.
+
+```java
+/**
+* Interface representing a dataset.
+*/
+public interface Dataset {
+ 
+ /**
+  * Deepest {@link org.apache.hadoop.fs.Path} that contains all files in the dataset.
+  */
+ public Path datasetRoot();
+
+}
+
+/**
+ * {@link Dataset} that supports finding {@link CopyableFile}s.
+ */
+public interface CopyableDataset extends Dataset {
+
+ /**
+  * Find all {@link CopyableFile}s in this dataset.
+  *
+  * <p>
+  *   This method should return a collection of {@link CopyableFile}, each describing one file that should be copied
+  *   to the target. The returned collection should contain exactly one {@link CopyableFile} per file that should
+  *   be copied. Directories are created automatically, the returned collection should not include any directories.
+  *   See {@link CopyableFile} for explanation of the information contained in the {@link CopyableFile}s.
+  * </p>
+  *
+  * @param targetFs target {@link FileSystem} where copied files will be placed.
+  * @param configuration {@link CopyConfiguration} for this job. See {@link CopyConfiguration}.
+  * @return List of {@link CopyableFile}s in this dataset.
+  * @throws IOException
+  */
+ public Collection<CopyableFile> getCopyableFiles(FileSystem targetFs, CopyConfiguration configuration) throws
+     IOException;
+
+}
+```
+
+### DatasetFinder
+
+* Finds `CopyableDataset`s in the file system. 
+* Implementations:
+    * `CopyableGlobDatasetFinder`: Uses a glob and creates a `RecursiveCopyableDataset` for each matching directory.
+    * `StreamDatasetFinder`: Creates a `StreamDataset` for each directory in the input directory.
+
+```java
+/**
+ * Finds {@link Dataset}s in the file system.
+ *
+ * <p>
+ *   Concrete subclasses should have a constructor with signature
+ *   ({@link org.apache.hadoop.fs.FileSystem}, {@link java.util.Properties}).
+ * </p>
+ */
+public interface DatasetsFinder<T extends Dataset> {
+
+ /**
+  * Find all {@link Dataset}s in the file system.
+  * @return List of {@link Dataset}s in the file system.
+  * @throws IOException
+  */
+ public List<T> findDatasets() throws IOException;
+
+ /**
+  * @return The deepest common root shared by all {@link Dataset}s root paths returned by this finder.
+  */
+ public Path commonDatasetRoot();
+
+}
+```
+
+### CopyableFile
+
+* Structure containing information about a file that needs to be copied:
+    * Origin `FileStatus`.
+    * Destination path.
+    * Desired owner and permission.
+    * Attributes to be preserved (e.g. replication, block size).
+    * `FileSet` file belongs to (atomic units).
+    * Checksum.
+    * Metadata.
+* Built with a builder with sensible defaults.
+* Has a replicable guid that uniquely identifies the origin file (see the sketch after this list). The `Guid` is a SHA-1 hash of:
+    * Origin path.
+    * Origin length.
+    * Origin timestamp.
+    * Checksum if available.
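+
+As a rough illustration of the guid idea only (not the actual distcp implementation), the snippet below derives a stable SHA-1 identifier from an origin path, length, and modification timestamp; the class and method names are made up for this example.
+
+```java
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+/** Illustrative only: a stable identifier for a source file, derived from its metadata. */
+public final class FileGuidExample {
+
+  public static String guid(String originPath, long length, long modTimestamp)
+      throws NoSuchAlgorithmException {
+    MessageDigest sha1 = MessageDigest.getInstance("SHA-1");
+    String key = originPath + "|" + length + "|" + modTimestamp;
+    byte[] digest = sha1.digest(key.getBytes(StandardCharsets.UTF_8));
+    StringBuilder hex = new StringBuilder();
+    for (byte b : digest) {
+      hex.append(String.format("%02x", b));
+    }
+    return hex.toString();
+  }
+
+  public static void main(String[] args) throws NoSuchAlgorithmException {
+    // The same inputs always yield the same guid, so a later run can recognize
+    // a file it has already copied and reuse the unpublished copy.
+    System.out.println(guid("/data/tracking/event/part-00000.avro", 1024L, 1622284800000L));
+  }
+}
+```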
+
+## Distcp Constructs
+
+* Distcp runs as a Gobblin flow with special distcp constructs.
+
+### CopySource
+
+* Source for Gobblin distcp.
+* Flow:
+    1. Instantiate a `DatasetFinder`.
+    2. Use `DatasetFinder` to find `CopyableDatasets`.
+    3. For each `CopyableDataset` get file listing.
+    4. For each `CopyableFile` create a Gobblin `WorkUnit`.
+    5. Serialize the `CopyableFile` into the `WorkUnit`.
+    6. For each `WorkUnit` create a `FileAwareInputStreamExtractor`.
+
+### FileAwareInputStreamExtractor
+
+* Extractor for Gobblin distcp.
+* Opens origin file and creates `FileAwareInputStream` containing the `InputStream` and the corresponding `CopyableFile`.
+
+### DistcpConverter
+
+* Abstract class for distcp converters. Allows transformation of the `InputStream` (for example decrypting, de-archiving, etc.).
+* Alters file extensions to reflect the changes (e.g. removing .gz).
+* Implementations:
+    * `DecryptConverter`: Performs GPG decryption of the input.
+    * `UnGzipConverter`: Un-gzips the input.
+    * `EncryptConverter`: Performs GPG encryption of the input.
+
+### FileAwareInputStreamDataWriter
+
+* Gobblin writer for distcp.
+* Takes a `FileAwareInputStream` and performs the copy of the file. Currently using a single `DirectByteBuffer`.
+    * Possible optimization: use two `DirectByteBuffer`s so that one can be reading while the other is writing.
+* Sets target file attributes and permissions.
+* Performs recovery of previous unpublished work.
+
+### TarArchiveInputStreamDataWriter
+
+* Extension of `FileAwareInputStreamDataWriter`.
+* Takes a tar input stream and writes the contained sequence of files to the file system.
+* Allows for automatic untarring on write.
+* Example: a tarball containing files root/path/to/file, root/file2 will be expanded on the fly to get output/path/to/file and output/file2.
+
+### CopyPublisher
+
+* Groups work units by file set.
+* For each file set, move the output files from staging location to final location in as few operations as possible (for near-atomicity).
+* Recheck permissions of the output.
+* Emit events indicating availability of published data.
+    * One event per file.
+    * One event per file set.
+
+## Recovery of unpublished files
+
+* Copied files may fail to be published even after the copy has succeeded. Some reasons:
+    * Failed to set permissions.
+    * Other files in the same file set failed, preventing atomic publish.
+    * Wrong permissions for destination.
+    * Transient file system issues.
+* When distcp detects a failure on the write step (e.g. setting owner and permissions), it will persist the uncommitted file to a separate location (Gobblin automatically deletes staging locations on exit). On the next run, distcp can identify files that were previously copied, and re-use them instead of repeating the data copy.
+* The publish step uses "exactly once" feature:
+    * The publisher generates a set of publish steps (e.g. 1. move file to this location, 2. send event notifications, 3. commit watermark).
+    * The publish steps are written to a write-ahead log.
+    * The publish steps are executed.
+    * If the publish steps are successful, the write-ahead log is deleted.
+    * If the publish steps fail, the write-ahead log is preserved, and Gobblin will attempt to run them on the next execution. Relevant directories will not be deleted on exit.
+* Eventually, write step should also use exactly-once feature.
+
+## Splitting files into block level granularity work units
+
+Gobblin Distcp has an option to enable splitting of files into block-level-granularity work units. This involves a helper class, `DistcpFileSplitter`, which has methods for:
+* Splitting of files into block-level work units, which is done at the `CopySource`; the block-level granularity is represented by an additional `Split` construct within each work unit that contains offset and ordering information.
+* Merging of block-level work units/splits, which is done at the `CopyDataPublisher`; this uses the `FileSystem#concat` API to append the separately copied pieces of each file back together (see the sketch below).
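+
+The sketch below illustrates the merge step with Hadoop's `FileSystem#concat`: the separately written split files are appended, in split order, onto the first split, which then becomes the reassembled file. The method and paths are hypothetical; the real `DistcpFileSplitter`/`CopyDataPublisher` logic also handles ordering, renames, and error cases.
+
+```java
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Illustrative merge of block-level split outputs back into a single destination file. */
+public final class SplitMergeExample {
+
+  /**
+   * @param splitOutputs split files in ascending offset order, e.g. part.split.0, part.split.1, ...
+   * @param finalPath    destination path of the reassembled file
+   */
+  public static void mergeSplits(FileSystem fs, List<Path> splitOutputs, Path finalPath)
+      throws IOException {
+    Path first = splitOutputs.get(0);
+    if (splitOutputs.size() > 1) {
+      // concat appends the remaining splits onto the first one, in the given order.
+      Path[] rest = splitOutputs.subList(1, splitOutputs.size()).toArray(new Path[0]);
+      fs.concat(first, rest);
+    }
+    // The first split now holds the whole file; move it to its final location.
+    if (!fs.rename(first, finalPath)) {
+      throw new IOException("Failed to rename " + first + " to " + finalPath);
+    }
+  }
+}
+```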
+
+# Leverage
+
+Gobblin Distcp leverages Gobblin as its running framework, and most features available to Gobblin:
+
+* Gobblin execution implementation
+* Gobblin publishing implementation
+* Gobblin metrics
+* Gobblin on YARN
+* Exactly once semantics
+* Automatic Hive registration
+
+# Performance, Scalability and Provisioning
+
+There are two components in the flow:
+
+* File listing and work unit generation: slow if there are too many files. Dataset-aware optimizations are possible, as well as using services other than the Hadoop ls call (like lsr or the HDFS edit log), so this can be improved and should scale with the correct optimizations. Work unit generation is currently a serial process handled by Gobblin and could be a bottleneck. If we find it is a bottleneck, that process is parallelizable.
+* Actual copy tasks: massively parallel using MR or many containers in YARN. Generally, it is the most expensive part of the flow. Although inputs can be split, HDFS does not support parallel writing to the same file, so large files will be a bottleneck (but this is true with distcp2 as well). This issue will be alleviated with the YARN execution model, where WorkUnits are allocated dynamically to containers (multiple small files can be copied in one container while another container copi [...]
+
+# Monitoring and Alerting
+
+Monitoring and alerting will be done through Gobblin metrics. We will have real-time operational metrics available. Gobblin metrics automatically emits notifications for any failures as well as whenever data is available.
+
+Better SLAs can be achieved in the future through the use of continuous ingestion with priority queues.
+
+# Future Work
+
+There is currently work in progress to implement Gobblin Distcp on top of Hive. Gobblin Distcp will be capable of copying Hive tables and databases within and between Hadoop clusters.
diff --git a/gobblin-website/docs/adaptors/Hive-Avro-To-ORC-Converter.md b/gobblin-website/docs/adaptors/Hive-Avro-To-ORC-Converter.md
new file mode 100644
index 0000000..c6b0178
--- /dev/null
+++ b/gobblin-website/docs/adaptors/Hive-Avro-To-ORC-Converter.md
@@ -0,0 +1,195 @@
+---
+title: Hive Avro-To-Orc Converter
+sidebar_label: Hive Avro-To-Orc Converter
+---
+
+# Getting Started
+
+Gobblin provides ready-to-use adapters for converting data in [Avro](http://avro.apache.org/) to [ORC](https://orc.apache.org/). This page describes the steps to set up such a job.
+
+<b>Note: The job requires Avro data to be registered in Hive.</b>
+
+* The Gobblin Avro-to-ORC job leverages [Hive](http://hive.apache.org/) for the conversion. That is, Gobblin does not read the Avro data record by record and convert each record to ORC; instead, Gobblin executes Hive queries to perform the conversion. This means that the Avro data MUST be registered in Hive for the conversion to be possible. Below is a sample query.
+
+<b>Example Conversion DDL</b>
+
+```sql
+INSERT OVERWRITE TABLE db_name_orc.table_orc
+  PARTITION (year='2016')
+          SELECT
+              header.id,
+              header.time,
+              ... (more columns to select)
+              ...
+              ...
+FROM db_name_avro.table_avro WHERE year='2016';
+```
+
+* Since Hive takes care of scaling the number of mappers/reducers required to perform the conversion, Gobblin does not run this job in MR mode. It runs in standalone mode.
+* Each workunit converts a Hive partition or, for non-partitioned tables, a whole Hive table.
+* Each workunit/task executes one or more Hive DDLs.
+* A Gobblin task publishes data to a staging table first. The publisher then moves data into the final table.
+* The job supports schema evolution, meaning any compatible schema changes on the Avro table are automatically applied to the ORC table.
+* By default publishing happens per dataset (dataset = table in this context). If a dataset fails, other datasets will still be published but the job will fail. The commit policy is configurable.
+* Gobblin metrics is used to emit events when ORC data is published or when publish fails.
+
+
+# Job Constructs
+
+## Source and Extractor
+
+Gobblin provides [`HiveSource`](https://github.com/apache/gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/source/HiveSource.java) which is a generic source that connects to the hive metastore and creates `WorkUnits` for any Hive `Partitions` and `Tables` whitelisted. The [`HiveConvertExtractor`](https://github.com/apache/gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive [...]
+
+The `HiveSource` uses the `HiveDatasetFinder` to find all Hive tables and partitions that satisfy a whitelist. For each table/partition it creates a workunit if the `updateTime` is greater than the `lowWatermark`. By default a [`PartitionLevelWatermarker`](https://github.com/apache/gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/watermarker/PartitionLevelWatermarker.java) is used. This watermarker tracks watermarks for every pa [...]
+
+The `HiveConvertExtractor` builds `QueryBasedHiveConversionEntity`s. The extractor makes necessary calls to the Hive Metastore to get table/partition metadata. The metadata is then wrapped into a `QueryBasedHiveConversionEntity`.
+
+## Converter
+
+The converter builds the Hive DDLs/DMLs required to perform the Avro to ORC conversion. Gobblin supports conversion of Avro to both flattened ORC and nested ORC.
+The abstract converter [`AbstractAvroToOrcConverter`](https://github.com/apache/gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/converter/AbstractAvroToOrcConverter.java) builds DDLs/DMLs for any destination ORC format. Concrete subclass [`HiveAvroToFlattenedOrcConverter`](https://github.com/apache/gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/converter/HiveAvroToFl [...]
+
+The converter builds the following DDLs/DMLs:
+
+* Create staging table DDL - ORC data is written to a staging table first. The publisher then publishes them to the final ORC table. These DDLs are to create the staging table. A staging table looks like `<orc_db_name>.<orc_table_name>_staging_<timestamp>`
+* Create staging partition DDL - Similar to staging table but for a partition
+* Conversion staging DML - This is the DML to select rows from Avro source table and insert them into the ORC staging table
+* Create final table DDL (Optional) - This is the final ORC destination table. Creates the destination table if it does not exist
+* Evolve final table DDLs (Optional) - Populate the schema evolution queries if required
+* Drop partitions if exist in final table - DDL to drop a partition on destination if it already exists.
+* Create final partition DDL - Create the ORC partition
+* Drop staging table DDL - Cleanup the staging table after data is published from staging to final tables
+
+## Writer
+
+The writer in this context executes the Hive DDLs/DMLs generated by the converter. [`HiveQueryExecutionWriter`](https://github.com/apache/gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/writer/HiveQueryExecutionWriter.java) uses Hive JDBC connector to execute the DDLs. The DDLs write ORC data into staging tables. After the writer has completed `HiveQueryExecutionWriter#write()`, ORC data will be available in the staging tables.
+
+
+## Publisher
+
+The publisher [`HiveConvertPublisher`](https://github.com/apache/gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/publisher/HiveConvertPublisher.java) executes hive DDLs to publish staging ORC tables to final ORC tables. The publisher also cleans up staging tables.
+By default publishing happens per dataset (dataset = table in this context). If a dataset fails, other datasets will still be published but the job will fail. The commit policy is configurable.
+
+
+# Job Config Properties
+
+These are some of the job config properties used by `HiveAvroToOrcSource` and `HiveConvertExtractor`.
+
+| Configuration key | Description | Example value |
+| --- | --- | --- |
+|  hive.dataset.whitelist  | Avro hive databases, tables to be converted |   <ol> <li>db1 -> any table under db1 passes.</li><li>db1.table1 -> only db1.table1 passes.</li><li>db1.table* -> any table under db1 whose name satisfies the pattern table* passes.</li><li>db* -> all tables from all databases whose names satisfy the pattern db* pass.</li><li>db*.table* -> db and table must satisfy the patterns db* and table* respectively </li><li>db1.table1,db2.table2 -> combine expressions for d [...]
+| hive.dataset.blacklist | Avro Hive databases and tables not to be converted |  Same as hive.dataset.whitelist examples  |
+|  gobblin.runtime.root.dir  |  Root dir for gobblin state store, staging, output etc.  |  /jobs/user/avroToOrc  |
+|  hive.source.maximum.lookbackDays  |  Partitions older than this value will not be processed.<br/> The default value is set to 3. <br/> <br/>So if an Avro partition older than 3 days gets modified, the job will not convert the new changes.  | 3 |
+|  hive.source.watermarker.class  |  The type of watermark to use. Watermark can be per partition or per table. The default is `gobblin.data.management.conversion.hive.watermarker.PartitionLevelWatermarker`  |  gobblin.data.management.conversion.hive.watermarker.PartitionLevelWatermarker <br/> <br/> gobblin.data.management.conversion.hive.watermarker.TableLevelWatermarker  |
+|  taskexecutor.threadpool.size  |  Maximum number of parallel conversion hive queries to run. <br/> <br/>This is the standard gobblin property to control the number of parallel tasks (threads). This is set to a default of 50 because each task queries the hive metastore. So this property also limits the number of parallel metastore connections  | 50 |
+|  hive.conversion.avro.flattenedOrc.destination.dbName  | Name of the ORC database |  $DB is the Avro database name.<br/> E.g. If avro database name is tracking, $DB will be resolved at runtime to tracking.  <ul><li>Setting the value to "$DB_column" will result in a ORC table name of tracking_column</li></ul>   |
+|  hive.conversion.avro.flattenedOrc.destination.tableName  |  Name of the ORC table |  $TABLE is the Avro table name.<br/> E.g. If avro table name is LogEvent, $TABLE will be resolved at runtime to LogEvent.  <ul><li>Setting the value of this property to "$TABLE" will cause the ORC table name to be same as Avro table name.</li> <li>Setting the value to "$TABLE_orc" will result in a ORC table name of LogEvent_orc</li></ul>   |
+|  hive.conversion.avro.flattenedOrc.destination.dataPath  | Location on HDFS where ORC data is published | /events_orc/$DB/$TABLE |
+|  hive.conversion.avro.flattenedOrc.evolution.enabled  | Decides if schema evolution is enabled | true/false |
+|  hive.conversion.avro.flattenedOrc.hiveRuntime.*  |  Additional hive properties to be set while executing the conversion DDL. Prefix any hive standard properties with this key  |  hive.conversion.avro.flattenedOrc.hiveRuntime.mapred.map.tasks=10  |
+|  hive.conversion.avro.destinationFormats  | A comma separated list of destination formats. Currently supports nestedOrc and flattenedOrc | flattenedOrc,nestedOrc |
+
+# Metrics and Events
+
+An SLA event is published every time an Avro partition/table is converted to ORC. Each SLA event has the following metadata.
+
+```json
+{
+    ## Publish timestamp
+    "timestamp" : "1470229945441",
+    "namespace" : "gobblin.hive.conversion",
+    "name" : "gobblin.hive.conversion.ConversionSuccessful",
+    "metadata" : {
+
+        ## Azkaban metadata (If running on Azkaban)
+        "azkabanExecId": "880060",
+        "azkabanFlowId": "azkaban_flow_name",
+        "azkabanJobId": "azkaban_job_name",
+        "azkabanProjectName": "azkaban_project_name",
+        "jobId": "job_AvroToOrcConversion_1470227416023",
+        "jobName": "AvroToOrcConversion",
+
+        ## Dataset and Partition metadata
+        "datasetUrn": "events@logevent",
+        "sourceDataLocation": "hdfs://<host>:<port>/events/LogEvent/2016/08/03/04",
+        "partition": "datepartition=2016-08-03-04",
+        "schemaEvolutionDDLNum": "0",
+
+        ## Begin and End time metadata for each phase
+        "beginConversionDDLExecuteTime": "1470227453370",
+        "beginDDLBuildTime": "1470227452382",
+        "beginGetWorkunitsTime": "1470227428136",
+        "beginPublishDDLExecuteTime": "1470229944141",
+        "endConversionDDLExecuteTime": "1470227928486",
+        "endDDLBuildTime": "1470227452382",
+        "endPublishDDLExecuteTime": "1470229945440",
+        "originTimestamp": "1470227446703",
+        "previousPublishTs": "1470223843230",
+        "upstreamTimestamp": "1470226593984",
+        "workunitCreateTime": "1470227446703"
+
+        ## Gobblin metrics metadata
+        "class": "org.apache.gobblin.data.management.conversion.hive.publisher.HiveConvertPublisher",
+        "metricContextID": "20bfb2a2-0592-4f53-9259-c8ee125f90a8",
+        "metricContextName": "org.apache.gobblin.data.management.conversion.hive.publisher.HiveConvertPublisher.781426901",
+    }
+}
+```
+
+The diagram below describes timestamps captured in the SLA event.
+![Event metadata description](../../static/img/Avro-to-Orc-timeline.jpg)
+
+# Sample Job
+
+```properties
+# Avro hive databases and tables to convert
+hive.dataset.whitelist=events.LogEvent|LoginEvent
+
+data.publisher.type=org.apache.gobblin.data.management.conversion.hive.publisher.HiveConvertPublisher
+source.class=org.apache.gobblin.data.management.conversion.hive.source.HiveAvroToOrcSource
+writer.builder.class=org.apache.gobblin.data.management.conversion.hive.writer.HiveQueryWriterBuilder
+converter.classes=org.apache.gobblin.data.management.conversion.hive.converter.HiveAvroToFlattenedOrcConverter,org.apache.gobblin.data.management.conversion.hive.converter.HiveAvroToNestedOrcConverter
+
+hive.dataset.finder.class=org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetFinder
+
+# Only flattened orc is enabled
+hive.conversion.avro.destinationFormats=flattenedOrc
+hive.conversion.avro.flattenedOrc.destination.dataPath=/events_orc/
+
+# Avro table name _orc
+hive.conversion.avro.flattenedOrc.destination.tableName=$TABLE_orc
+
+# Same as Avro table name
+hive.conversion.avro.flattenedOrc.destination.dbName=$DB
+hive.conversion.avro.flattenedOrc.evolution.enabled=true
+hive.conversion.avro.flattenedOrc.source.dataPathIdentifier=daily,hourly
+
+# No host and port required. Hive starts an embedded hiveserver2
+hiveserver.connection.string=jdbc:hive2://
+
+## Maximum lookback
+hive.source.maximum.lookbackDays=3
+
+## Gobblin standard properties ##
+task.maxretries=1
+taskexecutor.threadpool.size=75
+workunit.retry.enabled=true
+
+
+# Gobblin framework locations
+mr.job.root.dir=/jobs/working
+state.store.dir=/jobs/state_store
+writer.staging.dir=/jobs/writer_staging
+writer.output.dir=/jobs/writer_output
+
+# Metrics
+metrics.enabled=true
+metrics.reporting.kafka.enabled=true
+metrics.reporting.kafka.format=avro
+metrics.reporting.kafka.avro.use.schema.registry=true
+metrics.reporting.kafka.topic.metrics=MetricReport
+
+launcher.type=LOCAL
+classpath=lib/*
+```
diff --git a/gobblin-website/docs/case-studies/Hive-Distcp.md b/gobblin-website/docs/case-studies/Hive-Distcp.md
new file mode 100644
index 0000000..1b02f70
--- /dev/null
+++ b/gobblin-website/docs/case-studies/Hive-Distcp.md
@@ -0,0 +1,102 @@
+# Introduction
+
+Gobblin Hive distcp is built on top of [Gobblin distcp](http://gobblin.readthedocs.io/en/latest/adaptors/Gobblin-Distcp/). It uses the Hive metastore to find datasets to copy, then performs regular file listings to find the actual files to copy. After finishing the copy, the Hive registrations in the source are replicated on the target.
+
+This document shows a sample job config for running Gobblin Hive distcp and explains how it works.
+
+# Configure Hive Distcp Job
+
+Below is a sample job config for running Gobblin Hive distcp. The Gobblin job constructs and data flow are the same as in [Gobblin distcp](http://gobblin.readthedocs.io/en/latest/adaptors/Gobblin-Distcp/). The only difference is the `gobblin.dataset.profile.class` and the Hive-related properties.
+
+```properties
+job.name=SampleHiveDistcp
+job.group=HiveDistcp
+job.description=Sample job config for hive distcp
+
+extract.namespace=org.apache.gobblin.copy.tracking
+gobblin.dataset.profile.class=org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder
+data.publisher.type=org.apache.gobblin.data.management.copy.publisher.CopyDataPublisher
+source.class=org.apache.gobblin.data.management.copy.CopySource
+writer.builder.class=org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriterBuilder
+converter.classes=org.apache.gobblin.converter.IdentityConverter
+
+hive.dataset.copy.target.table.prefixToBeReplaced=
+hive.dataset.copy.target.table.prefixReplacement=
+data.publisher.final.dir=${hive.dataset.copy.target.table.prefixReplacement}
+
+hive.dataset.hive.metastore.uri=
+hive.dataset.copy.target.metastore.uri=
+
+hive.dataset.whitelist=
+hive.dataset.copy.target.database=
+
+hive.dataset.existing.entity.conflict.policy=REPLACE_PARTITIONS
+hive.dataset.copy.deregister.fileDeleteMethod=NO_DELETE
+
+hive.dataset.copy.location.listing.method=RECURSIVE
+hive.dataset.copy.locations.listing.skipHiddenPaths=true
+
+gobblin.copy.preserved.attributes=rgbp
+```
+
+## Source and target metastores
+
+`hive.dataset.hive.metastore.uri` and `hive.dataset.copy.target.metastore.uri` specify the source and target metastore uri. Make sure the hive distcp job has access to both hive metastores.
+
+## Database and tables to copy
+
+Use a whitelist, and optionally a blacklist, to specify the tables to copy, using the keys `hive.dataset.whitelist` and `hive.dataset.blacklist`. Both whitelist and blacklist accept various patterns, for example:
+
+* sampleDb.sampleTable -> specific table `sampleTable` in database `sampleDb`;
+* sampleDb -> all tables in database `sampleDb`;
+* sampleDb.sample* -> tables whose names start with `sample` in database `sampleDb`.
+
+The key `hive.dataset.copy.target.database` specifies the target database to create tables under. If omitted, the source database name is used.
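+
+For example, a hedged sketch combining these keys (the database and table names are purely illustrative):
+
+```properties
+# Copy all tables starting with "sample" in sampleDb, except temporary ones,
+# and register them under sampleDb_backup on the target
+hive.dataset.whitelist=sampleDb.sample*
+hive.dataset.blacklist=sampleDb.sampleTmp*
+hive.dataset.copy.target.database=sampleDb_backup
+```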
+
+## Target path computation
+
+This specifies where copied files should be placed. There are a few options on how the target paths will be computed:
+
+* Prefix replacement: simply replace a prefix in each file copied, e.g. /a/b to /a/btest. Use the keys `hive.dataset.copy.target.table.prefixToBeReplaced` and `hive.dataset.copy.target.table.prefixReplacement` (see the sketch after this list). Any path that is not a descendant of `prefixToBeReplaced` will throw an error and fail the dataset. Note that setting both keys to "/" effectively replicates all paths exactly.
+* New table root: Puts files in a new table root. The source table root is its location (which is a Hive registration parameter). This mode will simply do a prefix replacement of the table root for each path in that table. Use the key `hive.dataset.copy.target.table.root` to specify the replacement. Note there is some primitive token replacement in the value of the key if using the tokens $DB and $TABLE, which will be replaced by the database and table name respectively. If the token $TA [...]
+    * /data/$DB/$TABLE -> /data/databaseName/tableName
+    * /data/$TABLE -> /data/tableName
+    * /data -> /data/tableName
+* Relocate files: This mode will move all files in a table to a structure matching Hive's native directory structure. I.e. all files for a partition "abc" of table "myDb.myTable" will be placed at path `<prefix>/abc` where prefix is specified using the key `hive.dataset.copy.target.table.root` and processed with the token replacements explained in "new table root". To enable this mode set `hive.dataset.copy.relocate.data.files` to true and set `hive.dataset.copy.target.table.root` approp [...]
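+
+As an illustration of the prefix replacement mode above (the paths are hypothetical), the following keys rewrite `/data/tracking/...` on the source to `/backup/tracking/...` on the target:
+
+```properties
+# Prefix replacement: /data/tracking/... on the source becomes /backup/tracking/... on the target
+hive.dataset.copy.target.table.prefixToBeReplaced=/data/tracking
+hive.dataset.copy.target.table.prefixReplacement=/backup/tracking
+data.publisher.final.dir=${hive.dataset.copy.target.table.prefixReplacement}
+```
+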
+## Conflicting table and partitions treatment
+
+If distcp-ng finds that a partition or table it needs to create already exists, it will determine whether the existing table / partition is identical to what it would register (e.g. by comparing schema, location, etc.). If not, it will use a policy to determine how to proceed. The policy is specified using the key `hive.dataset.existing.entity.conflict.policy` and can take the following values:
+
+* ABORT: the conflicting table will not be copied (default)
+* REPLACE_PARTITIONS: replace any conflicting partitions, but not tables
+* REPLACE_TABLES: replace any conflicting tables by deregistering the previous tables first.
+* UPDATE_TABLES: keep the originally registered table but apply the necessary modifications.
+
+## Deregistering tables / partitions
+
+Sometimes distcp-ng must deregister a table / partition, for example if it doesn't exist in the source, or if it must be replaced. In this case, distcp-ng offers options on what to do with the files under the deregistered partition. Set this policy using the key `hive.dataset.copy.deregister.fileDeleteMethod` which can take the following values:
+
+* NO_DELETE: do not delete the files (default)
+* INPUT_FORMAT: use the table / partition input format to infer which files are actually used by that table / partition, and delete only those files.
+* RECURSIVE: delete the entire directory in the table / partition location.
+
+## Finding copy files
+
+To specify the files that distcp will copy for each table / partition, use the key `hive.dataset.copy.location.listing.method` which can take the values:
+
+* INPUT_FORMAT: use the table / partition input format to infer which files are actually used by that table / partition. (default)
+* RECURSIVE: copy all files under the directory in the table / partition location recursively.
+
+If the recursive method is used, the user can additionally set `hive.dataset.copy.locations.listing.skipHiddenPaths`; if true, hidden files will not be copied.
+
+## Partition Filter
+
+A partition filter can be applied when copying partitioned tables. Filters can only be applied to text partition columns. To specify a partition filter, use the key `hive.dataset.copy.partition.filter.generator` (an example configuration is sketched after the list below).
+
+* `gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator`: Filters date-representing partitions by a lookback (i.e. only copy recent partitions). Use the keys `hive.dataset.partition.filter.datetime.column`, `hive.dataset.partition.filter.datetime.lookback`, and `hive.dataset.partition.filter.datetime.format` to configure the filter.
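+
+A hedged sketch of such a filter configuration (the column name, format, and lookback value are illustrative and depend on how the table is partitioned):
+
+```properties
+hive.dataset.copy.partition.filter.generator=gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator
+# Only copy partitions whose datepartition value falls within the lookback window
+hive.dataset.partition.filter.datetime.column=datepartition
+hive.dataset.partition.filter.datetime.format=yyyy-MM-dd-HH
+hive.dataset.partition.filter.datetime.lookback=3d
+```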
+
+## Fast partition skip predicate
+
+A predicate that operates on partitions can be provided to distcp-ng to allow it to quickly skip partitions without having to list all of the source and target files and do a diff on those sets (a costly operation). To set this predicate, provide the class name of the predicate with the key `hive.dataset.copy.fast.partition.skip.predicate`. Currently only one such predicate exists:
+
+* `RegistrationTimeSkipPredicate`: This predicate compares the Hive partition attribute `registrationGenerationTimeMillis` in the target with the modification time of the partition directory in the source. The partition is skipped unless the directory was modified more recently than the registrationGenerationTime. The attribute `registrationGenerationTimeMillis` is an attribute set by distcp-ng representing (for all practical purposes) the time at which the distcp-ng job that registered  [...]
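+
+A minimal sketch of wiring in this predicate (the value must be the predicate's fully-qualified class name, which is left as a placeholder here):
+
+```properties
+hive.dataset.copy.fast.partition.skip.predicate=<fully-qualified class name of RegistrationTimeSkipPredicate>
+```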
+
diff --git a/gobblin-website/docs/case-studies/Kafka-HDFS-Ingestion.md b/gobblin-website/docs/case-studies/Kafka-HDFS-Ingestion.md
new file mode 100644
index 0000000..3806e7d
--- /dev/null
+++ b/gobblin-website/docs/case-studies/Kafka-HDFS-Ingestion.md
@@ -0,0 +1,304 @@
+---
+title: Kafka-HDFS Ingestion
+sidebar_label: Kafka-HDFS Ingestion
+---
+
+# Getting Started
+
+This section helps you set up a quick-start job for ingesting Kafka topics on a single machine. We provide quick start examples in both standalone and MapReduce mode.
+
+## Standalone
+
+* Set up a single-node Kafka broker by following the [Kafka quick start guide](http://kafka.apache.org/documentation.html#quickstart). Suppose your broker URI is `localhost:9092`, and you've created a topic "test" with two events "This is a message" and "This is another message".
+
+* The remaining steps are the same as the [Wikipedia example](../Getting-Started), except using the following job config properties:
+
+```properties
+job.name=GobblinKafkaQuickStart
+job.group=GobblinKafka
+job.description=Gobblin quick start job for Kafka
+job.lock.enabled=false
+
+kafka.brokers=localhost:9092
+
+source.class=org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleSource
+extract.namespace=org.apache.gobblin.extract.kafka
+
+writer.builder.class=org.apache.gobblin.writer.SimpleDataWriterBuilder
+writer.file.path.type=tablename
+writer.destination.type=HDFS
+writer.output.format=txt
+
+data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisher
+
+mr.job.max.mappers=1
+
+metrics.reporting.file.enabled=true
+metrics.log.dir=${gobblin.cluster.work.dir}/metrics
+metrics.reporting.file.suffix=txt
+
+bootstrap.with.offset=earliest
+```
+
+After the job finishes, the following messages should be in the job log:
+
+```
+INFO Pulling topic test
+INFO Pulling partition test:0 from offset 0 to 2, range=2
+INFO Finished pulling partition test:0
+INFO Finished pulling topic test
+INFO Extracted 2 data records
+INFO Actual high watermark for partition test:0=2, expected=2
+INFO Task <task_id> completed in 31212ms with state SUCCESSFUL
+```
+
+The output file will be in `{gobblin.cluster.work.dir}/job-output/test`, with the two messages you've just created in the Kafka broker. `{gobblin.cluster.work.dir}/metrics` will contain metrics collected from this run.
+
+## MapReduce
+
+* Set up a single-node Kafka broker, the same as in standalone mode.
+* Set up a single-node Hadoop cluster by following the steps in [Hadoop: Setting up a Single Node Cluster](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html). Suppose your HDFS URI is `hdfs://localhost:9000`.
+* Create a job config file with the following properties:
+
+```properties
+job.name=GobblinKafkaQuickStart
+job.group=GobblinKafka
+job.description=Gobblin quick start job for Kafka
+job.lock.enabled=false
+
+kafka.brokers=localhost:9092
+
+source.class=org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleSource
+extract.namespace=org.apache.gobblin.extract.kafka
+
+writer.builder.class=org.apache.gobblin.writer.SimpleDataWriterBuilder
+writer.file.path.type=tablename
+writer.destination.type=HDFS
+writer.output.format=txt
+
+data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisher
+
+mr.job.max.mappers=1
+
+metrics.reporting.file.enabled=true
+metrics.log.dir=/gobblin-kafka/metrics
+metrics.reporting.file.suffix=txt
+
+bootstrap.with.offset=earliest
+
+fs.uri=hdfs://localhost:9000
+writer.fs.uri=hdfs://localhost:9000
+state.store.fs.uri=hdfs://localhost:9000
+
+mr.job.root.dir=/gobblin-kafka/working
+state.store.dir=/gobblin-kafka/state-store
+task.data.root.dir=/jobs/kafkaetl/gobblin/gobblin-kafka/task-data
+data.publisher.final.dir=/gobblintest/job-output
+```
+
+* Run `gobblin-mapreduce.sh`:
+
+`gobblin-mapreduce.sh --conf <path-to-job-config-file>`
+
+After the job finishes, the job output file will be in `/gobblintest/job-output/test` in HDFS, and the metrics will be in `/gobblin-kafka/metrics`.
+
+
+# Job Constructs
+
+## Source and Extractor
+
+Gobblin provides two abstract classes, [`KafkaSource`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/source/extractor/extract/kafka/KafkaSource.java) and [`KafkaExtractor`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/source/extractor/extract/kafka/KafkaExtractor.java). `KafkaSource` creates a workunit for each Kafka topic partition to be pulled, th [...]
+
+To use them in a Kafka-HDFS ingestion job, one should subclass `KafkaExtractor` and implement method `decodeRecord(MessageAndOffset)`, which takes a `MessageAndOffset` object pulled from the Kafka broker and decodes it into a desired object. One should also subclass `KafkaSource` and implement `getExtractor(WorkUnitState)` which should return an instance of the Extractor class.
+
+As examples, take a look at [`KafkaSimpleSource`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/source/extractor/extract/kafka/KafkaSimpleSource.java), [`KafkaSimpleExtractor`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/source/extractor/extract/kafka/KafkaSimpleExtractor.java), and [`KafkaAvroExtractor`](https://github.com/apache/gobblin/blob/mast [...]
+
+`KafkaSimpleExtractor` simply returns the payload of the `MessageAndOffset` object as a byte array. A job that uses `KafkaSimpleExtractor` may use a `Converter` to convert the byte array to whatever format is desired. For example, if the desired output format is JSON, one may implement a `ByteArrayToJsonConverter` to convert the byte array to JSON. Alternatively, one may implement a `KafkaJsonExtractor`, which extends `KafkaExtractor` and converts the `MessageAndOffset` object into a JSON ob [...]
+
+## Writer and Publisher
+
+Any desired writer and publisher can be used, e.g., one may use the [`AvroHdfsDataWriter`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/writer/AvroHdfsDataWriter.java) and the [`BaseDataPublisher`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/publisher/BaseDataPublisher.java), similar as the [Wikipedia example job](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/ [...]
+
+# Job Config Properties
+
+These are some of the job config properties used by `KafkaSource` and `KafkaExtractor`.
+
+| Property Name | Semantics     |
+| ------------- |-------------| 
+| `topic.whitelist` (regex)      | Kafka topics to be pulled. Default value = .* | 
+| `topic.blacklist` (regex)     | Kafka topics not to be pulled. Default value = empty | 
+| `kafka.brokers` | Comma separated Kafka brokers to ingest data from.      |  
+| `mr.job.max.mappers` | Number of tasks to launch. In MR mode, this will be the number of mappers launched. If the number of topic partitions to be pulled is larger than the number of tasks, `KafkaSource` will assign partitions to tasks in a balanced manner.      |  
+| `bootstrap.with.offset` | For new topics / partitions, this property controls whether they start at the earliest offset or the latest offset. Possible values: earliest, latest, skip. Default: latest      |
+| `reset.on.offset.out.of.range` | This property controls what to do if a partition's previously persisted offset is out of the range of the currently available offsets. Possible values: earliest (always move to earliest available offset), latest (always move to latest available offset), nearest (move to earliest if the previously persisted offset is smaller than the earliest offset, otherwise move to latest), skip (skip this partition). Default: nearest |
+| `topics.move.to.latest.offset` (no regex) | Topics in this list will always start from the latest offset (i.e., no records will be pulled). To move all topics to the latest offset, use "all". This property should rarely, if ever, be used. |
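+
+For instance, a hedged sketch combining a few of the keys above (the topic pattern is illustrative):
+
+```properties
+# Pull only topics matching the pattern; start new partitions from the earliest offset
+topic.whitelist=myTopic.*
+bootstrap.with.offset=earliest
+reset.on.offset.out.of.range=nearest
+```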
+
+It is also possible to set a time limit for each task. For example, to set the time limit to 15 minutes, set the following properties:
+
+```properties
+extract.limit.enabled=true
+# other possible values for extract.limit.type: rate, count, pool
+extract.limit.type=time
+extract.limit.timeLimit=15
+extract.limit.timeLimitTimeunit=minutes
+```
+
+# Metrics and Events
+
+## Task Level Metrics
+
+Task level metrics can be created in `Extractor`, `Converter` and `Writer` by extending [`InstrumentedExtractor`](https://github.com/apache/gobblin/blob/master/gobblin-core-base/src/main/java/org/apache/gobblin/instrumented/extractor/InstrumentedExtractor.java), [`InstrumentedConverter`](https://github.com/apache/gobblin/blob/master/gobblin-core-base/src/main/java/org/apache/gobblin/instrumented/converter/InstrumentedConverter.java) and [`InstrumentedDataWriter`](https://github.com/apach [...]
+
+For example, `KafkaExtractor` extends `InstrumentedExtractor`. So you can do the following in subclasses of `KafkaExtractor`:
+
+```java
+Counter decodingErrorCounter = this.getMetricContext().counter("num.of.decoding.errors");
+decodingErrorCounter.inc();
+```
+
+Besides Counter, Meter and Histogram are also supported.
+
+## Task Level Events
+
+Task level events can be submitted by creating an [`EventSubmitter`](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/event/EventSubmitter.java) instance and using `EventSubmitter.submit()` or `EventSubmitter.getTimingEvent()`.
+
+## Job Level Metrics
+
+To create job level metrics, one may extend [`AbstractJobLauncher`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/AbstractJobLauncher.java) and create metrics there. For example:
+
+```java
+Optional<JobMetrics> jobMetrics = this.jobContext.getJobMetricsOptional();
+if (!jobMetrics.isPresent()) {
+  LOG.warn("job metrics is absent");
+  return;
+}
+Counter recordsWrittenCounter = jobMetrics.get().getCounter("job.records.written");
+recordsWrittenCounter.inc(value);
+```
+
+Job level metrics are often aggregations of task level metrics, such as the `job.records.written` counter above. Since `AbstractJobLauncher` doesn't have access to task-level metrics, one should set these counters in `TaskState`s, and override `AbstractJobLauncher.postProcessTaskStates()` to aggregate them. For example, in `AvroHdfsTimePartitionedWriter.close()`, property `writer.records.written` is set for the `TaskState`. 
+
+## Job Level Events
+
+Job level events can be created by extending `AbstractJobLauncher` and using `this.eventSubmitter.submit()` or `this.eventSubmitter.getTimingEvent()`.
+
+For more details about metrics, events and reporting them, please see the Gobblin Metrics section.
+
+# Grouping Workunits
+
+For each topic partition that should be ingested, `KafkaSource` first retrieves the last offset pulled by the previous run, which should be the first offset of the current run. It also retrieves the earliest and latest offsets currently available from the Kafka cluster and verifies that the first offset is between the earliest and the latest offsets. The latest offset is the last offset to be pulled by the current workunit. Since new records may be constantly published to Kafka and old r [...]
+
+For each partition, after the first and last offsets are determined, a workunit is created. If the number of Kafka partitions exceeds the desired number of workunits specified by property `mr.job.max.mappers`, `KafkaSource` will merge and group them into `n` [`MultiWorkUnit`](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/workunit/MultiWorkUnit.java)s where `n=mr.job.max.mappers`. This is done using [`KafkaWorkUnitPacker`](https://github [...]
+
+## Single-Level Packing
+
+The single-level packer uses a worst-fit-decreasing approach for assigning workunits to mappers: each workunit goes to the mapper that currently has the lightest load. This approach balances the mappers well. However, multiple partitions of the same topic are usually assigned to different mappers. This may cause two issues: (1) many small output files: if multiple partitions of a topic are assigned to different mappers, they cannot share output files. (2) task overhead: when multiple par [...]
+
+## Bi-Level Packing
+
+The bi-level packer packs workunits in two steps.
+
+In the first step, all workunits are grouped into approximately `3n` groups, each of which contains partitions of the same topic. The max group size is set as
+
+`maxGroupSize = totalWorkunitSize/3n`
+
+The best-fit-decreasing algorithm is run on all partitions of each topic. If an individual workunit’s size exceeds `maxGroupSize`, it is put in a separate group. For each group, a new workunit is created which will be responsible for extracting all partitions in the group.
+
+The reason behind `3n` is that if this number is too small (i.e., too close to `n`), it is difficult for the second level to pack these groups into n balanced multiworkunits; if this number is too big, `avgGroupSize` will be small which doesn’t help grouping partitions of the same topic together. `3n` is a number that is empirically selected.
+
+The second step uses the same worst-fit-decreasing method as the first-level packer.
+
+This approach reduces the number of small files and the number of tasks, but it may have more mapper skew for two reasons: (1) in the worst-fit-decreasing approach, the less number of items to be packed, the more skew there will be; (2) when multiple partitions of a topic are assigned to the same mapper, if we underestimate the size of this topic, this mapper may take a much longer time than other mappers and the entire MR job has to wait for this mapper. This, however, can be mitigated  [...]
+
+## Average Record Size-Based Workunit Size Estimator
+
+This size estimator uses the average record size of each partition to estimate the sizes of workunits. When using this size estimator, each job run will record the average record size of each partition it pulled. In the next run, for each partition the average record size pulled in the previous run is considered the average record size
+to be pulled in this run.
+
+If a partition was not pulled in a run, a default value of 1024 will be used in the next run.
+
+## Average Record Time-Based Workunit Size Estimator
+
+This size estimator uses the average time to pull a record in each run to estimate the sizes of the workunits in the next run.
+
+When using this size estimator, each job run will record the average time per record of each partition. In the next run, the estimated average time per record for each topic is the geometric mean of the average time per record of all its partitions. For example, if a topic has two partitions whose average times per record in the previous run were 2 and 8, the next run will use 4 as the estimated average time per record.
+
+If a topic is not pulled in a run, its estimated average time per record is the geometric mean of the estimated average time per record of all topics that are pulled in this run. If no topic was pulled in this run, a default value of 1.0 is used.
+
+The time-based estimator is more accurate than the size-based estimator when the time to pull a record is not proportional to the size of the record. However, the time-based estimator may lose accuracy when there are fluctuations in the Hadoop cluster which causes the average time for a partition to vary between different runs.
+
+# Topic-Specific Configuration
+
+`kafka.topic.specific.state` is a configuration key that allows a user to specify config parameters at a topic-specific level. The value of this config should be a JSON array. Each entry should be a JSON object containing a `dataset` field that identifies the topic name. All configs in each topic entry will be added to the WorkUnit for that topic.
+
+An example value could be:
+
+```json
+[
+  {
+    "dataset": "myTopic1",
+    "writer.partition.columns": "header.memberId"
+  },
+  {
+    "dataset": "myTopic2",
+    "writer.partition.columns": "auditHeader.time"
+  }
+]
+```
+
+The `dataset` field also allows regular expressions. For example, one can specify the key, value pair `"dataset" : "myTopic.\*"`. In this case all topics whose name matches the pattern `myTopic.*` will have all the specified config properties added to their WorkUnit. If a topic matches more than one `dataset` entry, the properties from all matching JSON objects will be added to its WorkUnit.
+
+# Kafka `Deserializer` Integration
+
+Gobblin integrates with Kafka's [Deserializer](https://kafka.apache.org/0100/javadoc/org/apache/kafka/common/serialization/Deserializer.html) API. Kafka's `Deserializer` interface offers a generic way for Kafka clients to deserialize data from Kafka into Java objects. Since Kafka messages are returned as byte arrays, the `Deserializer` class offers a convenient way of transforming those byte arrays into Java objects.
+
+Kafka's Client Library already has a few useful `Deserializer`s such as the [StringDeserializer](https://kafka.apache.org/0100/javadoc/org/apache/kafka/common/serialization/StringDeserializer.html) and the [ByteBufferDeserializer](https://kafka.apache.org/0100/javadoc/org/apache/kafka/common/serialization/ByteBufferDeserializer.html).
+
+Gobblin can integrate with any of these `Deserializer`s; that is, any class that implements the `Deserializer` interface can be used to convert Kafka messages to Java Objects. This is done in the `KafkaDeserializerSource` and the `KafkaDeserializerExtractor` classes.
+
+The type of `Deserializer` to be used in `KafkaDeserializerExtractor` can be specified by the property `kafka.deserializer.type`. This property can either be set to any of the pre-defined `Deserializer`s such as `CONFLUENT_AVRO`, `CONFLUENT_JSON`, `GSON`, `BYTE_ARRAY`, and `STRING` (see the section on [Confluent Integration](#confluent-integration) and [KafkaGsonDeserializer](#kafkagsondeserializer) for more details). The value of this property can point to the full-qualified path of a ` [...]
+
+## Gobblin `Deserializer` Implementations
+
+### KafkaGsonDeserializer
+
+The `KafkaGsonDeserializer` is an implementation of the `Deserializer` class that converts `byte[]` to [JSONObject](https://google.github.io/gson/apidocs/com/google/gson/JsonObject.html)s. It uses [GSON](https://github.com/google/gson) to do this.
+
+This class is useful for converting Kafka data to JSON Objects.
+
+Using this class simply requires setting `kafka.deserializer.type` to `GSON`.
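+
+A minimal sketch of that setting in a job config:
+
+```properties
+# Deserialize Kafka payloads into Gson JsonObject records
+kafka.deserializer.type=GSON
+```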
+
+## Comparison with `KafkaSimpleSource`
+
+Gobblin's `KafkaSimpleSource` and `KafkaSimpleExtractor` are very useful when data just needs to be read from Kafka and written to a text file. However, they do not provide good support for writing to more complex data file formats such as [Avro](https://avro.apache.org/) or [ORC](https://orc.apache.org/). They also don't provide good support for record level manipulations such as Gobblin `Converter`s, and they lack good support for use with Gobblin's `WriterPartitioner`. The reason is tha [...]
+
+# Confluent Integration
+
+[Confluent](http://www.confluent.io/) provides a standardized distribution of [Apache Kafka](http://kafka.apache.org/), along with other useful tools for working with Kafka. One useful tool that Confluent provides is a generic [Schema Registry](http://www.confluent.io/blog/schema-registry-kafka-stream-processing-yes-virginia-you-really-need-one).
+
+Gobblin has integration with [Confluent's Schema Registry Library](https://github.com/confluentinc/schema-registry) which provides a service to register and get [Avro Schemas](https://avro.apache.org/docs/1.8.0/spec.html) and provides a generic [Avro Deserializer](https://github.com/confluentinc/schema-registry/blob/master/avro-serializer/src/main/java/io/confluent/kafka/serializers/KafkaAvroDeserializer.java) and [JSON Deserializer](https://github.com/confluentinc/schema-registry/blob/m [...]
+
+## Confluent Schema Registry
+
+Gobblin integrates with Confluent's [SchemaRegistryClient](https://github.com/confluentinc/schema-registry/blob/master/client/src/main/java/io/confluent/kafka/schemaregistry/client/SchemaRegistryClient.java) class in order to register and get Avro Schema's from the Confluent [SchemaRegistry](https://github.com/confluentinc/schema-registry/blob/master/core/src/main/java/io/confluent/kafka/schemaregistry/storage/SchemaRegistry.java). This is implemented in the `ConfluentKafkaSchemaRegistry [...]
+
+## Confluent Deserializers
+
+Confluent's Schema Registry Library also provides a few useful `Deserializer` implementations:
+
+* [KafkaAvroDeserializer](https://github.com/confluentinc/schema-registry/blob/master/avro-serializer/src/main/java/io/confluent/kafka/serializers/KafkaAvroDeserializer.java) 
+* [KafkaJsonDeserializer](https://github.com/confluentinc/schema-registry/blob/master/json-serializer/src/main/java/io/confluent/kafka/serializers/KafkaJsonDeserializer.java)
+
+With regards to Gobblin, these classes are useful if Confluent's [KafkaAvroSerializer](https://github.com/confluentinc/schema-registry/blob/master/avro-serializer/src/main/java/io/confluent/kafka/serializers/KafkaAvroSerializer.java) or [KafkaJsonSerializer](https://github.com/confluentinc/schema-registry/blob/master/json-serializer/src/main/java/io/confluent/kafka/serializers/KafkaJsonSerializer.java) is used to write data to Kafka.
+
+The [Serializer](https://kafka.apache.org/0100/javadoc/org/apache/kafka/common/serialization/Serializer.html) class is a Kafka interface that is the converse of the `Deserializer` class. The `Serializer` provides a generic way of taking Java Objects and converting them to `byte[]` that are written to Kafka by a `KafkaProducer`.
+
+### KafkaAvroDeserializer
+
+Documentation for the `KafkaAvroDeserializer` can be found [here](http://docs.confluent.io/2.0.1/schema-registry/docs/serializer-formatter.html#serializer).
+
+If data is written to a Kafka cluster using Confluent's `KafkaAvroSerializer`, then the `KafkaAvroDeserializer` should be used in Gobblin. Setting this up simply requires setting the config key `kafka.deserializer.type` to `CONFLUENT_AVRO` (see the section on [Kafka Deserializer Integration](#kafka-deserializer-integration) for more information).
+
+### KafkaJsonDeserializer
+
+The `KafkaJsonDeserializer` class uses [Jackson's Object Mapper](https://fasterxml.github.io/jackson-databind/javadoc/2.7/com/fasterxml/jackson/databind/ObjectMapper.html) to convert `byte[]` to Java Objects. In order for the `KafkaJsonDeserializer` to know which class the `byte[]` should be converted to, the config property `json.value.type` needs to be set to the fully-qualified class name of the Java Object that the `Deserializer` should return. For more information about how the Jac [...]
+
+Using the `KafkaJsonDeserializer` simply requires setting the config key `kafka.deserializer.type` to `CONFLUENT_JSON` (see the section on [Kafka Deserializer Integration](#kafka-deserializer-integration) for more information).
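+
+For example, a hedged sketch (the target class name is purely illustrative):
+
+```properties
+kafka.deserializer.type=CONFLUENT_JSON
+# Fully-qualified class each Kafka message should be deserialized into (hypothetical)
+json.value.type=com.example.MyEvent
+```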
diff --git a/gobblin-website/docs/case-studies/Publishing-Data-to-S3.md b/gobblin-website/docs/case-studies/Publishing-Data-to-S3.md
new file mode 100644
index 0000000..8111697
--- /dev/null
+++ b/gobblin-website/docs/case-studies/Publishing-Data-to-S3.md
@@ -0,0 +1,156 @@
+---
+title: Publishing Data to S3
+sidebar_label: Publishing Data to S3
+---
+
+# Introduction
+
+While Gobblin is not tied to any specific cloud provider, [Amazon Web Services](https://aws.amazon.com/) is a popular choice. This document will outline how Gobblin can publish data to [S3](https://aws.amazon.com/s3/). Specifically, it will provide a step by step guide to help setup Gobblin on Amazon [EC2](https://aws.amazon.com/ec2/), run Gobblin on EC2, and publish data from EC2 to S3.
+
+It is recommended to configure Gobblin to first write data to [EBS](https://aws.amazon.com/ebs/), and then publish the data to S3, because there are a few caveats when working directly with S3. See the [Hadoop and S3](#hadoop-and-s3) section for more details.
+
+This document will also provide a step by step guide for launching and configuring an EC2 instance and creating a S3 bucket. However, it is by no means a source of truth guide to working with AWS, it will only provide high level steps. The best place to learn about how to use AWS is through the [Amazon documentation](https://aws.amazon.com/documentation/).
+
+# Hadoop and S3
+
+A majority of Gobblin's code base uses Hadoop's [FileSystem](https://hadoop.apache.org/docs/r2.4.1/api/org/apache/hadoop/fs/FileSystem.html) object to read and write data. The `FileSystem` object is an abstract class, and typical implementations either write to the local file system, or write to HDFS. There has been significant work to create an implementation of the `FileSystem` object that reads and writes to S3. The best guide to read about the different S3 `FileSystem` implementation [...]
+
+There are a few different S3 `FileSystem` implementations, the two of note are the `s3a` and the `s3` file systems. The `s3a` file system is relatively new and is only available in Hadoop 2.6.0 (see the original [JIRA](https://issues.apache.org/jira/browse/HADOOP-10400) for more information). The `s3` filesystem has been around for a while.
+
+## The `s3a` File System
+
+The `s3a` file system uploads files to a specified bucket. The data uploaded to S3 via this file system is interoperable with other S3 tools. However, there are a few caveats when working with this file system:
+
+* Since S3 does not support renaming of files in a bucket, the `S3AFileSystem.rename(Path, Path)` operation will actually copy data from the source `Path` to the destination `Path`, and then delete the source `Path` (see the [source code](http://grepcode.com/file/repo1.maven.org/maven2/org.apache.hadoop/hadoop-aws/2.6.0/org/apache/hadoop/fs/s3a/S3AFileSystem.java) for more information)
+* When creating a file using `S3AFileSystem.create(...)` data will be first written to a staging file on the local file system, and when the file is closed, the staging file will be uploaded to S3 (see the [source code](http://grepcode.com/file/repo1.maven.org/maven2/org.apache.hadoop/hadoop-aws/2.6.0/org/apache/hadoop/fs/s3a/S3AOutputStream.java) for more information)
+
+Thus, when using the `s3a` file system with Gobblin it is recommended that one configures Gobblin to first write its staging data to the local filesystem, and then to publish the data to S3. The reason this is the recommended approach is that each Gobblin `Task` will write data to a staging file, and once the file has been completely written it publishes the file to a output directory (it does this by using a rename function). Finally, the `DataPublisher` moves the files from the staging [...]
+
+Furthermore, writing directly to S3 requires creating a staging file on the local file system, and then creating a `PutObjectRequest` to upload the data to S3. This is logically equivalent to just configuring Gobblin to write to a local file and then publishing it to S3.
+
+## The `s3` File System
+
+The `s3` file system stores files as blocks, similar to how HDFS stores blocks. This makes renaming of files more efficient, but data written using this file system is not interoperable with other S3 tools. This limitation may make using this file system less desirable, so the majority of this document focuses on the `s3a` file system, although most of the walkthrough should also apply to the `s3` file system.
+
+# Getting Gobblin to Publish to S3
+
+This section will provide a step by step guide to setting up an EC2 instance, a S3 bucket, installing Gobblin on EC2, and configuring Gobblin to publish data to S3.
+
+This guide will use the free-tier provided by AWS to setup EC2 and S3.
+
+## Signing Up For AWS
+
+In order to use EC2 and S3, one first needs to sign up for an AWS account. The easiest way to get started with AWS is to use their [free tier](https://aws.amazon.com/free/).
+
+## Setting Up EC2
+
+### Launching an EC2 Instance
+
+Once you have an AWS account, login to the AWS [console](https://console.aws.amazon.com/console/home). Select the EC2 link, which will bring you to the [EC2 dashboard](https://console.aws.amazon.com/ec2/).
+
+Click on `Launch Instance` to create a new EC2 instance. Before the instance actually starts to run, there are a few more configuration steps necessary:
+
+* Choose an Amazon Machine Image ([AMI](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html))
+    * For this walkthrough we will pick Red Hat Enterprise Linux ([RHEL](https://en.wikipedia.org/wiki/Red_Hat_Enterprise_Linux)) AMI
+* Choose an Instance Type
+    * Since this walkthrough uses the Amazon Free Tier, we will pick the General Purpose `t2.micro` instance
+        * This instance provides us with 1 vCPU and 1 GiB of RAM
+    * For more information on other instance types, check out the AWS [docs](https://aws.amazon.com/ec2/instance-types/)
+* Click Review and Launch
+    * We will use the defaults for all other settings
+    * When reviewing your instance, you will most likely get a warning saying access to your EC2 instance is open to the world
+    * If you want to fix this you have to edit the [Security Groups](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html); how to do that is out of the scope of this document
+* Set Up SSH Keys
+    * After reviewing your instance, click `Launch`
+    * You should be prompted to setup [SSH](https://en.wikipedia.org/wiki/Secure_Shell) keys
+    * Use an existing key pair if you have one, otherwise create a new one and download it
+* SSH to Launched Instance
+    * SSH using the following command: `ssh -i my-private-key-file.pem ec2-user@instance-name`
+        * The `instance-name` can be taken from the `Public DNS` field from the instance information
+        * SSH may complain that the private key file has insufficient permissions
+            * Execute `chmod 600 my-private-key-file.pem` to fix this
+        * Alternatively, one can modify the `~/.ssh/config` file instead of specifying the `-i` option
+
+After following the above steps, you should be able to freely SSH into the launched EC2 instance, and monitor / control the instance from the [EC2 dashboard](https://console.aws.amazon.com/ec2/).
+
+### EC2 Package Installations
+
+Before setting up Gobblin, you need to install [Java](https://en.wikipedia.org/wiki/Java_(programming_language)) first. Depending on the AMI instance you are running Java may or may not already be installed (you can check if Java is already installed by executing `java -version`).
+
+#### Installing Java
+
+* Execute `sudo yum install java-1.8.0-openjdk*` to install Open JDK 8
+* Confirm the installation was successful by executing `java -version`
+* Set the `JAVA_HOME` environment variable in the `~/.bashrc` file
+    * The value for `JAVA_HOME` can be found by executing `` readlink `which java` ``
+
+## Setting Up S3
+
+Go to the [S3 dashboard](https://console.aws.amazon.com/s3)
+
+* Click on `Create Bucket`
+    * Enter a name for the bucket (e.g. `gobblin-demo-bucket`)
+    * Enter a [Region](http://docs.aws.amazon.com/general/latest/gr/rande.html) for the bucket (e.g. `US Standard`)
+
+## Setting Up Gobblin on EC2
+
+* Download and Build Gobblin Locally
+    * On your local machine, clone the [Gobblin repository](https://github.com/apache/gobblin): `git clone git@github.com:apache/gobblin.git` (this assumes you have [Git](https://en.wikipedia.org/wiki/Git_(software)) installed locally)
+    * Build Gobblin using the following commands (it is important to use Hadoop version 2.6.0 as it includes the `s3a` file system implementation):
+```bash
+cd gobblin
+./gradlew clean build -PhadoopVersion=2.6.0 -x test
+```
+* Upload the Gobblin Tar to EC2
+    * Execute the command: 
+```bash
+scp -i my-private-key-file.pem gobblin-dist-[project-version].tar.gz ec2-user@instance-name:
+```
+* Un-tar the Gobblin Distribution
+    * SSH to the EC2 Instance
+    * Un-tar the Gobblin distribution: `tar -xvf gobblin-dist-[project-version].tar.gz`
+* Download AWS Libraries
+    * A few JARs need to be downloaded using some cURL commands:
+```bash
+curl http://central.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar > gobblin-dist/lib/aws-java-sdk-1.7.4.jar
+curl http://central.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.6.0/hadoop-aws-2.6.0.jar > gobblin-dist/lib/hadoop-aws-2.6.0.jar
+```
+
+## Configuring Gobblin on EC2
+
+Assuming we are running Gobblin in [standalone mode](../user-guide/Gobblin-Deployment#Standalone-Deployment), the following configuration options need to be modified in the file `gobblin-dist/conf/gobblin-standalone.properties`.
+
+* Add the key `data.publisher.fs.uri` and set it to `s3a://gobblin-demo-bucket/`
+    * This configures the job to publish data to the S3 bucket named `gobblin-demo-bucket`
+* Add the AWS Access Key Id and Secret Access Key
+    * Set the keys `fs.s3a.access.key` and `fs.s3a.secret.key` to the appropriate values
+    * These keys correspond to [AWS security credentials](http://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html)
+    * For information on how to get these credentials, check out the AWS documentation [here](http://docs.aws.amazon.com/general/latest/gr/aws-security-credentials.html)
+    * The AWS documentation recommends using [IAM roles](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html); how to set this up is out of the scope of this document; for this walkthrough we will use root access credentials
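+
+Putting the settings above together, the additions to `gobblin-dist/conf/gobblin-standalone.properties` look roughly like the sketch below (the bucket name matches this walkthrough; the credentials are placeholders):
+
+```properties
+data.publisher.fs.uri=s3a://gobblin-demo-bucket/
+fs.s3a.access.key=YOUR_AWS_ACCESS_KEY_ID
+fs.s3a.secret.key=YOUR_AWS_SECRET_ACCESS_KEY
+```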
+
+## Launching Gobblin on EC2
+
+Assuming we want Gobblin to run in standalone mode, follow the usual steps for [standalone deployment](../user-guide/Gobblin-Deployment#Standalone-Deployment).
+
+For the sake of this walkthrough, we will launch the Gobblin [wikipedia example](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull). Directions on how to run this example can be found [here](../Getting-Started). The command to launch Gobblin should look similar to:
+```bash
+sh bin/gobblin standalone start --conf-dir /home/ec2-user/gobblin-dist/config
+```
+
+If you are running on the Amazon free tier, you will probably get an error in the `nohup.out` file saying there is insufficient memory for the JVM. To fix this add `--jvmflags "-Xms256m -Xmx512m"` to the `start` command.
+
+Data should be written to S3 during the publishing phase of Gobblin. One can confirm data was successfully written to S3 by looking at the [S3 dashboard](https://console.aws.amazon.com/s3).
+
+### Writing to S3 Outside EC2
+
+It is possible to write to an S3 bucket outside of an EC2 instance. The setup steps are similar to the walkthrough outlined above. For more information on writing to S3 outside of AWS, check out [this article](https://aws.amazon.com/articles/5050).
+
+## Configuration Properties for `s3a`
+
+The `s3a` FileSystem has a number of configuration properties that can be set to tune the behavior and performance of the `s3a` FileSystem. A complete list of the properties can be found here: https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html
+
+# FAQs
+
+### How do I control the directory the `s3a` file system uses when writing to local disk?
+
+The configuration property `fs.s3a.buffer.dir` controls the location where the `s3a` FileSystem will write data locally before uploading it to S3. By default, this property is set to `${hadoop.tmp.dir}/s3a`.
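+
+For example, to point the staging directory at a specific local path (the path below is just an illustration):
+
+```properties
+fs.s3a.buffer.dir=/mnt/gobblin/s3a-staging
+```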
diff --git a/gobblin-website/docs/case-studies/Writing-ORC-Data.md b/gobblin-website/docs/case-studies/Writing-ORC-Data.md
new file mode 100644
index 0000000..8571814
--- /dev/null
+++ b/gobblin-website/docs/case-studies/Writing-ORC-Data.md
@@ -0,0 +1,54 @@
+---
+title: Writing ORC Data
+sidebar_label: Writing ORC Data
+---
+
+# Introduction
+
+Gobblin is capable of writing data to ORC files by leveraging Hive's SerDe library. Gobblin has native integration with Hive's SerDe library via the [HiveSerDeWrapper](https://github.com/apache/gobblin/blob/master/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/HiveSerDeManager.java) class.
+
+This document will briefly explain how Gobblin integrates with Hive's SerDe library, and show an example of writing ORC files.
+
+# Hive SerDe Integration
+
+[Hive's SerDe library](https://cwiki.apache.org/confluence/display/Hive/SerDe) defines the interface Hive uses for serialization and deserialization of data. The Hive SerDe library ships with out-of-the-box SerDes for Avro, ORC, Parquet, CSV, and JSON. However, users are free to define custom SerDes.
+
+Gobblin integrates with the Hive SerDes in a few different places. Here is a list of integration points that are relevant for this document:
+
+* `HiveSerDeWrapper` a wrapper around Hive's SerDe library that provides useful utilities and structure that the rest of Gobblin can interact with
+* `HiveSerDeConverter` takes a `Writable` object in a specific format, and converts it to the Writable of another format (e.g. from `AvroGenericRecordWritable` to `OrcSerdeRow`)
+* `HiveWritableHdfsDataWriter` writes a `Writable` object to a specific file; typically this is the output of a `HiveSerDeConverter`
+
+# Writing to an ORC File
+
+An end-to-end example of writing to an ORC file is provided in the configuration found [here](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/wikipedia-orc.pull). This `.pull` file is almost identical to the Wikipedia example discussed in the [Getting Started Guide](../Getting-Started.md). The only difference is that the output is written in ORC instead of Avro. The configuration file mentioned above can be directly used as a template for writing data to ORC; the key configuration changes are listed below:
+
+* `converter.classes` requires two additional converters: `gobblin.converter.avro.AvroRecordToAvroWritableConverter` and `gobblin.converter.serde.HiveSerDeConverter`
+    * The output of the first converter (the `WikipediaConverter`) returns Avro `GenericRecord`s
+    * These records must be converted to a `Writable` object in order for the Hive SerDe to process them, which is where the `AvroRecordToAvroWritableConverter` comes in
+    * The `HiveSerDeConverter` does the actual heavy lifting of converting the Avro records to ORC records
+* In order to configure the `HiveSerDeConverter` the following properties need to be added:
+    * `serde.deserializer.type=AVRO` says that the records being fed into the converter are Avro records
+        * `avro.schema.literal` or `avro.schema.url` must be set when using this deserializer so that the Hive SerDe knows what Avro Schema to use when converting the record
+    * `serde.serializer.type=ORC` says that the records that should be returned by the converter are ORC records
+* `writer.builder.class` should be set to `gobblin.writer.HiveWritableHdfsDataWriterBuilder`
+    * This writer class will take the output of the `HiveSerDeConverter` and write the actual ORC records to an ORC file
+* `writer.output.format` should be set to `ORC`; this ensures the files produced end with the `.orc` file extension
+* `fork.record.queue.capacity` should be set to `1`
+    * This ensures no caching of records is done before they get passed to the writer; this is necessary because the `OrcSerde` caches the object it uses to serialize records, and it does not allow copying of Orc Records
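+
+Putting the notes above together, the ORC-specific portion of the pull file looks roughly like the following sketch (the converter chain is abbreviated and the schema literal is a placeholder; see the linked `wikipedia-orc.pull` for the full, authoritative configuration):
+
+```properties
+# Append the two additional converters after the WikipediaConverter
+converter.classes=...,gobblin.converter.avro.AvroRecordToAvroWritableConverter,gobblin.converter.serde.HiveSerDeConverter
+
+# HiveSerDeConverter configuration: Avro records in, ORC records out
+serde.deserializer.type=AVRO
+serde.serializer.type=ORC
+# Placeholder schema; alternatively set avro.schema.url
+avro.schema.literal={"type":"record", ...}
+
+# Writer configuration
+writer.builder.class=gobblin.writer.HiveWritableHdfsDataWriterBuilder
+writer.output.format=ORC
+fork.record.queue.capacity=1
+```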
+
+The example job can be run the same way the regular Wikipedia job is run, except the output will be in the ORC format.
+
+## Data Flow
+
+For the Wikipedia to ORC example, data flows in the following manner:
+
+* It is extracted from Wikipedia via the `WikipediaExtractor`, which also converts each Wikipedia entry into a `JsonElement`
+* The `WikipediaConverter` then converts the Wikipedia JSON entry into an Avro `GenericRecord`
+* The `AvroRecordToAvroWritableConverter` converts the Avro `GenericRecord` to an `AvroGenericRecordWritable`
+* The `HiveSerDeConverter` converts the `AvroGenericRecordWritable` to an `OrcSerdeRow`
+* The `HiveWritableHdfsDataWriter` uses the `OrcOutputFormat` to write the `OrcSerdeRow` to an `OrcFile`
+
+# Extending Gobblin's SerDe Integration
+
+While this tutorial only discusses Avro to ORC conversion, it should be relatively straightforward to use the approach mentioned in this document to convert CSV, JSON, etc. data into ORC.
diff --git a/gobblin-website/docs/css/extra.css b/gobblin-website/docs/css/extra.css
new file mode 100644
index 0000000..306093c
--- /dev/null
+++ b/gobblin-website/docs/css/extra.css
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Make the code font size bigger so it's easier to read */
+
+code {
+  font-size: 90%;
+}
+
+/* Work around until the TOC becomes collapsible */
+
+a.toctree-l4 {
+  display: none;
+}
+
+a.toctree-l5 {
+  display: none;
+}
+
+a.toctree-l6 {
+  display: none;
+}
+
+.wy-side-nav-search {
+  background-color: #333333;
+}
\ No newline at end of file
diff --git a/gobblin-website/docs/data-management/DistcpNgEvents.md b/gobblin-website/docs/data-management/DistcpNgEvents.md
new file mode 100644
index 0000000..2d73b40
--- /dev/null
+++ b/gobblin-website/docs/data-management/DistcpNgEvents.md
@@ -0,0 +1,33 @@
+---
+title: Gobblin Distcp
+sidebar_label: Gobblin Distcp
+---
+
+# Copy publisher monitoring events
+
+The following metadata attributes are shared across all events:
+
+- Standard execution metadata (TODO add link)
+- `namespace=org.apache.gobblin.copy.CopyDataPublisher`
+- `metadata["class"]=org.apache.gobblin.data.management.copy.publisher.CopyDataPublisher`
+
+Events by `name`:
+
+- `DatasetPublished` - a dataset gets successfully copied and published
+    - `datasetUrn` - the URN of the dataset (dataset-specific) that was copied
+    - `partition` - the partition (dataset-specific) that was copied
+    - `originTimestamp` - the timestamp of the dataset partition as generated in the origin (e.g. the database)
+    - `upstreamTimestamp` - the timestamp of the dataset partition as made available in the upstream system; this will be equal to `originTimestamp` if the data is read directly from the origin.
+
+- `DatasetPublishFailed` - a dataset failed to publish 
+    - `datasetUrn` - the URN of the dataset (dataset-specific) whose publish failed
+
+
+- `FilePublished` - sent when an individual file gets published
+    - `datasetUrn` - the URN of the dataset (dataset-specific) of which the file is part
+    - `partition` - the partition (dataset-specific) of which this file is part
+    - `SourcePath` - the full source path for the file 
+    - `TargetPath` - the full destination path for the file 
+    - `originTimestamp` - similar to `originTimestamp` for `DatasetPublished` events
+    - `upstreamTimestamp` - similar to `upstreamTimestamp` for `DatasetPublished` events
+    - `SizeInBytes` - the size in bytes of the file
diff --git a/gobblin-website/docs/data-management/Gobblin-Retention.md b/gobblin-website/docs/data-management/Gobblin-Retention.md
new file mode 100644
index 0000000..3fc5f0f
--- /dev/null
+++ b/gobblin-website/docs/data-management/Gobblin-Retention.md
@@ -0,0 +1,365 @@
+---
+title: Retention
+sidebar_label: Retention
+---
+
+# Introduction
+Gobblin retention management is a framework to manage the retention of Hadoop datasets. The system allows users to configure retention policies for individual datasets using the Gobblin config store. This framework gives the flexibility to associate retention configurations both at a dataset level and a cluster level.
+For HDFS datasets, the framework comes with several standard policies, such as a time-based policy, a policy to retain the top k files in a dataset, and many more. It also has built-in support for standard data layouts like daily/hourly partitioned data and snapshot data. Gobblin retention management supports several retention actions. The most basic action is deleting files that satisfy a policy. Gobblin also supports actions like access control, which sets permissions on files that satisfy a policy.
+
+# Design
+The design has two parts. The first part describes constructs like dataset finders, version finders and policies. The second part describes the configuration aspects of Gobblin retention management.
+
+## Overview of Gobblin Config Management Library
+To support all the retention configuration requirements, we use the Gobblin Dataset Config Management library. This is a short overview; in the Gobblin code base it can be found in the module ```gobblin-config-management```.
+
+The Gobblin Dataset Config Management Library is a library for storing, managing and accessing configuration. The library is an extension to TypeSafe Config with additional features like dataset awareness and tags.
+
+The library provides a mapping from a config key to a config object. Each config key is represented through a URI. The config object is a map from property name to a property value.
+
+A config key K can import one or more config keys I1, I2, ... . The config key K will inherit any properties from I1, I2, … that are not defined in K. The inheritance is resolved in the order of the keys I1, I2, … etc., i.e. the property will be resolved to the value in the last Im that defines the property. Applications can create tags T1, T2 etc and import them explicitly in K.
+
+We also use the path in the config key URI for implicit tagging. For example, /trackingData/someEvent implicitly imports /trackingData, which in turn implicitly imports /.
+
+**ConfigClient** - The client APIs that an application uses to interact with the library
+
+**ConfigLibrary** - Core implementation that stores the topology of configs in the store. Business logic such as substitution resolution and interpolation of configs happens here.
+
+**ConfigStore** - The physical store for all the configs and tags. Currently an HDFS-based
+ConfigStore is implemented, but other physical stores can be implemented as well
+
+## Retention Constructs
+![Gobblin Retention Architecture](../../static/img/Gobblin-Retention-Architecture.png)
+
+### DatasetCleaner
+The ```DatasetCleaner``` is the retention runner. The class takes in job properties as key-value pairs. A single ```DatasetCleaner``` can manage retention for different kinds of datasets. Each kind of dataset gets its own ```DatasetFinder```. ```DatasetCleaner``` is responsible for instantiating all the ```DatasetFinder```s. For each ```DatasetFinder``` it finds all the ```CleanableDataset```s and calls the ```CleanableDataset.clean()``` method to delete data.
+
+To instantiate all the dataset finders, it uses the ```gobblin.retention.tag``` job property. This is a comma-separated list of tag URIs in the ```ConfigStore```. A ```DatasetFinder``` will be created for every dataset that imports any of these tags.
+
+For instance, let's say we have all the event-based datasets at ```/datasets/trackingData``` in the ```ConfigStore``` and it is tagged with the tag ```/tags/retention/TimeBased```. When ```gobblin.retention.tag``` is set to ```/tags/retention/TimeBased```, all datasets that are tagged with ```/tags/retention/TimeBased``` in the ```ConfigStore``` will be processed by this retention job. So in this case a ```DatasetFinder``` will be created for ```/datasets/trackingData```. More details about the retention configuration are covered in the Retention Configuration section below.
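+
+As a minimal sketch, the corresponding job property for this example would be (tag URI taken from the example above):
+
+```properties
+# Comma-separated list of ConfigStore tag URIs; a DatasetFinder is created for every dataset importing any of them
+gobblin.retention.tag=/tags/retention/TimeBased
+```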
+
+### DatasetFinder
+A ```DatasetFinder``` is an interface to find all ```CleanableDataset```s.
+
+### ManagedCleanableDatasetFinder
+This is the most basic implementation of a ```DatasetFinder``` that extends a ```ConfigurableGlobDatasetFinder``` to find HDFS datasets based on a glob pattern. It uses the ```ConfigClient``` to connect to the ```ConfigStore``` and get the dataset specific configs for each dataset found.
+
+### ConfigurableCleanableDataset
+The ```ManagedCleanableDatasetFinder``` instantiates a ```ConfigurableCleanableDataset``` for every match in the glob pattern. This class reads the dataset config to instantiate a list of ```VersionFinder``` and ```VersionSelectionPolicy``` pairs. The Retention Configuration section provides details on config keys used to specify the ```VersionFinder``` and ```VersionSelectionPolicy``` classes.
+
+### VersionFinder
+A version is defined as a deletable entity (or a path) in a dataset. A version can either be retained or deleted. The ```VersionFinder``` finds all the versions of a dataset.
+
+### VersionSelectionPolicy
+A predicate to select a subset of versions from the list of all versions discovered by the `VersionFinder`. By default all the versions selected by the ```VersionSelectionPolicy``` will be **deleted**. Apart from delete, Gobblin also provides other `RetentionAction`s on the selected versions.
+
+### RetentionAction
+An abstraction for the kind of action to be performed on all the versions discovered by the ```VersionFinder``` or a subset of versions filtered by the ```VersionSelectionPolicy```. Delete is the default action on selected versions. Gobblin also supports ```AccessControlAction``` which sets permissions on selected versions.
+
+## Retention Configuration
+Gobblin Retention is configured through Gobblin config management. All dataset configs are stored in a config store that can be accessed through a ```ConfigClient```. The gobblin config management uses [TypeSafe Config](https://github.com/typesafehub/config). The language used is [HOCON](https://github.com/typesafehub/config/blob/master/HOCON.md#hocon-human-optimized-config-object-notation), a more readable JSON superset.
+
+The gobblin config management library allows any implementation of the config store, but for the scope of this document we assume an HDFS-based ConfigStore that stores dataset configs in files on HDFS.
+
+Let us take an example ConfigStore instance on HDFS as below.
+
+<pre>
+_CONFIG_STORE
+└── 2.0
+    ├── data
+    │   └── events
+    │       ├── main.conf
+    │       ├── includes.conf
+    │       └── loginEvent
+    │           ├── main.conf
+    │           └── includes.conf
+    └── tags
+        └── retention
+            ├── main.conf
+            └── timebased
+                └── main.conf
+
+</pre>
+
+Every config store has a store root directory named ```_CONFIG_STORE```. Each new deployment of a store creates a new version (2.0 shown above). Each directory in the store may have a main.conf file and an includes.conf file. The main.conf file holds the config key/value pairs, and includes.conf is used to import other directory paths in the same store. For instance, ```_CONFIG_STORE/2.0/data/events``` can import ```/tags/retention``` in its includes.conf file. All the key/value pairs in the imported path are then inherited by ```/data/events```, unless they are overridden locally.
+
+Note that the directory structure under the config store corresponds to the directory structure of data on HDFS. In this case, ```hdfs://data/events/loginEvent```'s retention configs are at ```hdfs://_CONFIG_STORE/2.0/data/events/loginEvent/main.conf``` in the config store.
+
+### Tags
+For maintainability and reusability, we define all the configs as tags and import them into the dataset.
+
+- Below is a sample timebased retention tag, ```/tags/retention/timebased/main.conf```
+
+```hocon
+gobblin.retention : {
+
+    ##Alias
+    TimeBasedSelectionPolicy=org.apache.gobblin.data.management.policy.SelectBeforeTimeBasedPolicy
+    DateTimeDatasetVersionFinder=org.apache.gobblin.data.management.version.finder.DateTimeDatasetVersionFinder
+
+    dataset : {
+      finder.class=org.apache.gobblin.data.management.retention.profile.ManagedCleanableDatasetFinder
+      partitions=[${gobblin.retention.daily}]
+    }
+
+    daily : {
+      selection {
+        policy.class = ${gobblin.retention.TimeBasedSelectionPolicy}
+        timeBased.lookbackTime=1000d
+      }
+      version : {
+        finder.class=${gobblin.retention.DateTimeDatasetVersionFinder}
+        globPattern = "daily/*/*/*"
+        datetime.pattern = "yyyy/MM/dd"
+      }
+    }
+}
+```
+
+- To apply this retention config to ```hdfs://data/events```, the tag ```/tags/retention/timebased``` can be imported in ```_CONFIG_STORE/2.0/data/events/includes.conf```, shown below.
+
+```text
+###### Include files for /data/events ######
+tags/retention/timebased
+```
+
+- ```_CONFIG_STORE/2.0/data/events/main.conf``` will have the configs specific to ```data/events```, shown below.
+
+```properties
+##### Common configs for all of /data/events ######
+# Glob pattern to use to find datasets
+gobblin.dataset.pattern = "/data/events/*"
+```
+
+Similarly the same tag ```/tags/retention/timebased``` can be imported by other datasets as well.
+
+### Dataset overrides
+By default all the event datasets under ```hdfs://data/events``` get the configs from ```_CONFIG_STORE/2.0/data/events```, but sometimes it becomes necessary to override the retention for a specific dataset under ```hdfs://data/events```. This can be done by creating a directory under ```_CONFIG_STORE/2.0/data/events``` with the name of the dataset and overriding config keys there. For instance, if we want a retention of 1d for ```loginEvent```, we can create ```_CONFIG_STORE/2.0/data/events/loginEvent/main.conf``` with the override shown below.
+All other event datasets will have the default retention of 1000d.
+
+```hocon
+gobblin.retention : {
+    daily : {
+      selection {
+        timeBased.lookbackTime=1d
+      }
+    }
+}
+```
+
+### Examples
+Browse the [gobblin-data-management/config-example](https://github.com/apache/gobblin/tree/master/gobblin-data-management/config-example) directory to see example configuration.
+
+## Supported Retention Configurations
+Below is a list of ready-to-use retention configurations. Users can always implement their own ```DatasetFinder```, ```VersionFinder``` and ```VersionSelectionPolicy``` and plug them in.
+
+### 1. Time based retention
+To delete data older than a given amount of time:
+
+```hocon
+gobblin.retention : {
+
+    dataset : {
+      pattern="/user/gobblin/*"
+      finder.class=org.apache.gobblin.data.management.retention.profile.ManagedCleanableDatasetFinder
+    }
+
+    selection : {
+      policy.class=org.apache.gobblin.data.management.policy.SelectBeforeTimeBasedPolicy
+      timeBased.lookbackTime=7d
+    }
+
+    version : {
+      finder.class=org.apache.gobblin.data.management.version.finder.GlobModTimeDatasetVersionFinder
+    }
+}
+```
+
+### 2. Newest K retention
+To always keep the newest k versions and delete the rest:
+
+```hocon
+gobblin.retention : {
+
+    dataset : {
+      pattern="/user/gobblin/*"
+      finder.class=org.apache.gobblin.data.management.retention.profile.ManagedCleanableDatasetFinder
+    }
+
+    selection : {
+      policy.class=org.apache.gobblin.data.management.policy.NewestKSelectionPolicy
+      newestK.versionsNotSelected=2
+    }
+
+    version : {
+      finder.class=org.apache.gobblin.data.management.version.finder.GlobModTimeDatasetVersionFinder
+    }
+}
+```
+
+### 3. Combining multiple policies
+The config below deletes versions older than 3 days while making sure we always keep at least 2 versions. So if we have only 1 version and it is 4 days old, it is not deleted.
+
+```hocon
+gobblin.retention : {
+
+    dataset : {
+      pattern="/user/gobblin/snapshots/*/*"
+      finder.class=org.apache.gobblin.data.management.retention.profile.ManagedCleanableDatasetFinder
+    }
+
+    selection : {
+      policy.class=org.apache.gobblin.data.management.policy.CombineSelectionPolicy
+      combine.operation=INTERSECT
+      combine.policy.classes=[
+        org.apache.gobblin.data.management.policy.SelectBeforeTimeBasedPolicy,
+        org.apache.gobblin.data.management.policy.NewestKSelectionPolicy
+      ]
+      timeBased.lookbackTime=3d
+      newestK.versionsNotSelected=2
+
+    }
+
+    version : {
+      finder.class=org.apache.gobblin.data.management.version.finder.GlobModTimeDatasetVersionFinder
+    }
+}
+```
+
+### 4. Datasets with multiple kinds of versions
+This is mostly useful for retention management of datasets that have different kinds of versions, each with its own policies. For example, an event dataset may have daily and hourly partitions. For daily partitions we may want a higher retention of 5 days, while for hourly partitions the retention may be set to 2 days.
+
+```hocon
+gobblin.retention : {
+
+    TimeBasedSelectionPolicy=org.apache.gobblin.data.management.policy.SelectBeforeTimeBasedPolicy
+    DateTimeDatasetVersionFinder=org.apache.gobblin.data.management.version.finder.DateTimeDatasetVersionFinder
+
+    dataset : {
+      pattern="/user/gobblin/data/*"
+      finder.class=org.apache.gobblin.data.management.retention.profile.ManagedCleanableDatasetFinder
+      partitions=[${gobblin.retention.hourly}, ${gobblin.retention.daily}]
+    }
+
+    daily : {
+      selection {
+        policy.class = ${gobblin.retention.TimeBasedSelectionPolicy}
+        timeBased.lookbackTime = 5d
+      }
+      version : {
+        finder.class=${gobblin.retention.DateTimeDatasetVersionFinder}
+        globPattern = "daily/*/*/*"
+        datetime.pattern = "yyyy/MM/dd"
+      }
+    }
+
+    hourly : {
+      selection {
+        policy.class = ${gobblin.retention.TimeBasedSelectionPolicy}
+        timeBased.lookbackTime = 2d
+      }
+      version : {
+        finder.class=${gobblin.retention.DateTimeDatasetVersionFinder}
+        globPattern = "hourly/*/*/*/*"
+        datetime.pattern = "yyyy/MM/dd/hh"
+      }
+  }
+}
+
+```
+
+
+### 5. Time based Hive Retention
+Gobblin supports retention for Hive-partitioned tables. Partitions older than n days can be dropped using this policy. A job can optionally choose to delete the data associated with the partition. By default the job does NOT delete data; it only drops the Hive partition.
+
+```hocon
+
+gobblin.retention : {
+
+    is.blacklisted=false
+
+    dataset : {
+      finder.class=org.apache.gobblin.data.management.retention.dataset.finder.CleanableHiveDatasetFinder
+    }
+
+    selection : {
+      policy.class=org.apache.gobblin.data.management.policy.SelectBeforeTimeBasedPolicy
+
+      ## Partitions older than 3 days will be deleted
+      timeBased.lookbackTime=3d
+    }
+
+    version.finder.class=org.apache.gobblin.data.management.version.finder.DatePartitionHiveVersionFinder
+
+    hive {
+      partition {
+        key.name=datepartition
+        value.datetime.pattern=yyyy-MM-dd-HH
+      }
+    }
+}
+
+```
+
+Job level configuration to enable data deletion
+```properties
+gobblin.retention.hive.shouldDeleteData=true
+```
+
+### 6. Setting permissions/owner/group for versions of a dataset
+Gobblin retention can set permissions and change the owner/group for certain versions of a dataset. The configuration below is an extension of example #4: along with deleting daily versions older than 5 days, it also restricts access for daily versions older than 4 days to the owner only.
+All the access control policies to apply are discovered through the key ```accessControl.policies```. The below example shows one such policy called ```ownerOnly```. Users can define any arbitrary policy and add them to ```accessControl.policies```.
+
+```hocon
+gobblin.retention : {
+
+    TimeBasedSelectionPolicy=org.apache.gobblin.data.management.policy.SelectBeforeTimeBasedPolicy
+    DateTimeDatasetVersionFinder=org.apache.gobblin.data.management.version.finder.DateTimeDatasetVersionFinder
+
+    dataset : {
+      pattern="/user/gobblin/data/*"
+      finder.class=org.apache.gobblin.data.management.retention.profile.ManagedCleanableDatasetFinder
+      partitions=[${gobblin.retention.hourly}, ${gobblin.retention.daily}]
+    }
+
+    daily : {
+      selection {
+        policy.class = ${gobblin.retention.TimeBasedSelectionPolicy}
+        timeBased.lookbackTime = 5d
+      }
+      version : {
+        finder.class=${gobblin.retention.DateTimeDatasetVersionFinder}
+        globPattern = "daily/*/*/*"
+        datetime.pattern = "yyyy/MM/dd"
+      }
+      accessControl {
+
+            ## Provide a list of comma separated policies to apply. Each entry in this list should have a corresponding config section.
+            policies = [ownerOnly]
+
+            ownerOnly {
+                 selection {
+                    policy.class = ${gobblin.retention.TimeBasedSelectionPolicy}
+                    timeBased.lookbackTime=4d
+                 }
+                 mode : 700
+                 user : myUser
+                 group : noAccess
+            }
+        }
+    }
+
+    hourly : {
+      selection {
+        policy.class = ${gobblin.retention.TimeBasedSelectionPolicy}
+        timeBased.lookbackTime = 2d
+      }
+      version : {
+        finder.class=${gobblin.retention.DateTimeDatasetVersionFinder}
+        globPattern = "hourly/*/*/*/*"
+        datetime.pattern = "yyyy/MM/dd/hh"
+      }
+  }
+}
+
+```
diff --git a/gobblin-website/docs/developer-guide/CodingStyle.md b/gobblin-website/docs/developer-guide/CodingStyle.md
new file mode 100644
index 0000000..a75c606
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/CodingStyle.md
@@ -0,0 +1,55 @@
+---
+title: Code Style Guide
+sidebar_label: Code Style Guide
+---
+
+Overview
+--------
+
+The code formatting standard in this project is based on the [Oracle/Sun Code Convention](http://www.oracle.com/technetwork/java/codeconventions-150003.pdf) and [Google Java Style](http://google-styleguide.googlecode.com/svn/trunk/javaguide.html).  
+
+Guideline
+-------
+
+The coding style is consistent with most open source projects, with the following callouts:
+
+1. Naming Conventions
+    * Variables are camel case beginning with a lowercase letter, e.g. `fooBar`
+    * Constant variables are declared as static final and should be all uppercase ASCII letters delimited by underscore ("_"), e.g. `FOO_BAR`
+
+1. Import statement
+    * Do not use 'star' imports, e.g. `import java.io.*`;
+    * Import order: `java`, `org`, `com`, `gobblin`.
+
+1. Indentation
+    * Two spaces should be used as the unit of indentation;
+    * Tabs must expand to spaces and the tab width should be set to two;
+    * Line length: lines should not exceed 120 characters;
+
+1. White space
+    * Blank lines should be provided to improve readability:
+        * Between the local variables in a method and its first statement
+        * Between methods
+    * Blank spaces should be used in the following circumstances:
+        * A keyword followed by a parenthesis should be separated by a space (e.g. `while (true) {`)
+        * Binary operators except `.` should be separated from their operands by spaces (e.g. `a + b`);
+
+1. Comments:
+    * Implementation comments: Block comments (`/* ... */`), end-of-line comments (`//...`) can be used to illustrate a particular implementation;
+    * Documentation comments (`/** ... */`) should be used to describe Java classes, interfaces, methods;
+
+1. Compound statements are lists of statements enclosed in curly braces and should be formatted according to the following conventions:
+    * The enclosed statements should be indented one more level than the enclosing statement
+    * The opening brace should be on the same line as the enclosing statement (e.g. the 'if' clause)
+    * The closing brace should be on a line by itself indented to match the enclosing statement
+    * Braces are used around all statements, even single statements, when they are part of a control structure, such as if-else or for statements. This makes it easier to add statements without accidentally introducing bugs due to forgetting to add braces.
+
+Code Style Template File
+-------------------------
+* Eclipse
+    * Download [codestyle-eclipse.xml](files/codestyle-eclipse.xml) and import the file through Preferences > Java > Code Style > Formatter
+    * Download [prefs-eclipse.epf](files/prefs-eclipse.epf) and import the file via File > Import > General > Preferences
+* IntelliJ
+    * Download [codestyle-intellij-gobblin.xml](files/codestyle-intellij-gobblin.xml) and copy the file to the appropriate codestyles directory for your installation. This is typically `~/.INTELLIJ_VERSION/config/codestyles` on Linux (or `$HOME/Library/Preferences/INTELLIJ_VERSION/codestyles` on Mac). The specific INTELLIJ_VERSION identifier will depend on your version; examples are IntelliJIdea13, IdeaIC15, etc.
+    * Restart the IDE
+    * Go to File > Settings > Code Style > General > Scheme to select the new style (LinkedIn Gobblin Style)
diff --git a/gobblin-website/docs/developer-guide/Contributing.md b/gobblin-website/docs/developer-guide/Contributing.md
new file mode 100644
index 0000000..609bd8b
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/Contributing.md
@@ -0,0 +1,23 @@
+---
+title: Contributing
+sidebar_label: Contributing
+---
+
+# Contributing to Gobblin
+
+You can contribute to Gobblin in multiple ways. For resources and guides, please refer [here](http://gobblin.apache.org/contributor/).
+
+## Code Contributions
+
+We observe standard Apache practices for code contributions. For code changes, we recommend forking the repository and making your local changes on a feature branch, then updating the Jira, and opening a pull request (PR). A committer will review the changes and merge it in once it is approved. For first time contributors to Gobblin, we do request that you fill out a [one-time survey](https://docs.google.com/a/linkedin.com/forms/d/e/1FAIpQLSeH-8so0m68et6kPvxEiCNqezL7k6cyOlz9W-6eXnk7LEkwi [...]
+
+## Documentation Contributions
+
+To make changes to the documentation, modify the files under `gobblin-docs` as you would any other version-controlled file. All documentation is checked into GitHub, so the process for making documentation changes is similar to how code changes are made (creating Pull Requests). If one wants to see what the rendered documentation looks like, they simply need to take the following steps:
+
+1. Install MkDocs locally, this page has directions on how to do so: http://www.mkdocs.org/#installation
+2. Make sure you are in the top level directory for the Gobblin repo and execute `mkdocs serve`
+
+These steps will start a local server to serve the documentation; simply go to the URL shown by the output of `mkdocs serve` and you should be able to see the documentation.
+
+Once the changes have been made and tested, create a PR and a committer will review and merge the documentation changes. Updates to the documentation page happen automatically every time a commit is merged into the master branch; however, there may be a 10 to 15 minute delay before the changes actually show up.
diff --git a/gobblin-website/docs/developer-guide/Customization-for-Converter-and-Operator.md b/gobblin-website/docs/developer-guide/Customization-for-Converter-and-Operator.md
new file mode 100644
index 0000000..2d59e72
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/Customization-for-Converter-and-Operator.md
@@ -0,0 +1,6 @@
+---
+title: Customization for Converter and Operator
+sidebar_label: Customization for Converter and Operator
+---
+
+To be updated.
diff --git a/gobblin-website/docs/developer-guide/Customization-for-New-Source.md b/gobblin-website/docs/developer-guide/Customization-for-New-Source.md
new file mode 100644
index 0000000..857ef9a
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/Customization-for-New-Source.md
@@ -0,0 +1,6 @@
+---
+title: Customization for New Source
+sidebar_label: Customization for New Source
+---
+
+To be updated.
diff --git a/gobblin-website/docs/developer-guide/Documentation-Architecture.md b/gobblin-website/docs/developer-guide/Documentation-Architecture.md
new file mode 100644
index 0000000..942534b
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/Documentation-Architecture.md
@@ -0,0 +1,24 @@
+---
+title: Documentation Architecture
+sidebar_label: Documentation Architecture
+---
+
+# Documentation Overview
+
+The documentation for Gobblin is based on [ReadTheDocs](https://readthedocs.org/) and [MkDocs](http://www.mkdocs.org/). MkDocs is used to convert MarkDown files to HTML, and ReadTheDocs is used to host the documentation.
+
+# GitHub Wiki Limitations
+
+Historically, documentation was hosted using the GitHub wiki. The problem is that only Gobblin committers can modify the wiki; any external contributors who want to update or add documentation cannot do so. Editing the Gobblin Wiki is also not PR based, so any committer can make changes without going through a review process.
+
+# MkDocs
+
+MkDocs is an open source tool that converts Python Flavored Markdown to HTML files. MkDocs has a number of pre-defined themes that can be used to display the MarkDown files. New themes can be added, or custom CSS and JavaScript can be added to modify existing themes. MkDocs is configured using the `mkdocs.yml` file. This file also specifies a master Table of Contents for the entire website.
+
+# ReadTheDocs
+
+ReadTheDocs is an open source, free tool that can build documentation in a GitHub repository and host it for public use. ReadTheDocs links to a specified GitHub project, and on every push to the repository the documentation is updated. ReadTheDocs essentially clones the repo and builds the documentation using either Sphinx or MkDocs (for Gobblin we only use MkDocs). It then hosts the documentation on internal servers so any end user can view the documentation. ReadTheDocs has a number of other useful features as well.
+
+# Additional Information
+
+For more information on how this architecture was decided and the different tradeoffs between other documentation services, check out the original PR: https://github.com/apache/gobblin/pull/788
diff --git a/gobblin-website/docs/developer-guide/Gobblin-Compliance-Design.md b/gobblin-website/docs/developer-guide/Gobblin-Compliance-Design.md
new file mode 100644
index 0000000..8236d5d
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/Gobblin-Compliance-Design.md
@@ -0,0 +1,100 @@
+---
+title: Gobblin Compliance Design
+sidebar_label: Gobblin Compliance Design
+---
+
+# Introduction
+--------------
+The Gobblin Compliance module allows for data purging to meet regulatory compliance requirements. The module includes purging, retention and restore functionality for datasets.
+
+Purging is performed using Hive, which means that purging is supported for any dataset format that Hive can read from and write to, including, for example, ORC and Parquet. Further, the purger is built on top of the Gobblin framework, so it takes full advantage of the fault tolerance, scalability and flexibility that Gobblin provides.
+
+The [User Guide](../user-guide/Gobblin-Compliance) describes how to onboard a dataset for purging.
+
+# Design
+-------
+
+The elements of the Compliance design are:
+
+* The onboarding process
+* The purge process
+* The retention process
+* The restore process
+
+## Onboarding
+-------------
+A dataset is onboarded to the Purger with these steps:
+
+1. The whitelist includes either the database or the table that will be considered for purging
+2. Every table that is to be purged includes the necessary information for purging (dataset descriptor) as a JSON string in its TBLPROPERTIES
+
+The purger iterates over all the whitelisted tables, and for those tables looks for the presence of the dataset descriptor, which specifies the information the purger requires to proceed with the purge process.
+
+With this information, the purger iterates over the partitions of the table that needs to be purged and proceeds to purge each partition of the table individually.
+
+## Purger
+---------
+The purger code is mostly in the `gobblin.compliance.purger` package.
+
+The elements of the purger are:
+
+* The Gobblin constructs
+* The Hive operations
+
+### Gobblin constructs
+----------------------
+The Gobblin constructs that make up the Purger are:
+
+* `HivePurgerSource` generates a WorkUnit per partition that needs to be purged
+* `HivePurgerExtractor` instantiates a `PurgeableHivePartitionDataset` object that encapsulates all the information required to purge the partition
+* For each partition, `HivePurgerConverter` populates the purge queries into the `PurgeableHivePartitionDataset` object
+* The purge queries are executed by `HivePurgerWriter` 
+* The `HivePurgerPublisher` moves successful Workunits to the `COMMITTED` state
+
+### Hive operations
+------------------
+The purging process operates as follows:
+
+* The partition information including location and partitioning scheme is determined from the metadata of the partition
+* A new external staging table is created using the Hive `LIKE` construct of the current table that is being purged
+* The location of this staging table on HDFS is a new folder within the table location with the current timestamp
+* The purge query executes a `LEFT OUTER JOIN` of the original table against the table containing the ids whose data is to be purged and `INSERT OVERWRITE`s this data into the staging table, and thereby location. Once this query returns, the location will contain the purged data
+* Before we `ALTER` the original partition location to point to the new staging table location, we preserve the current/original location of the partition by creating a backup table pointing to it. We do not move this data immediately to avoid breaking any in-flight queries.
+* The next step is to `ALTER` the partition location to the location containing the purged data
+* The final step is to `DROP` the staging table, this only drops the metadata and not the data
+
+Taking as an example a `tracking.event` table and the `datepartition=2017-02-16-00/is_guest=0` partition, the purge process would be the following:
+
+* Let's assume the `tracking.event` table is located at the location `/user/tracking/event/`
+* The full partition name would be `tracking@event@datepartition=2017-02-16-00/is_guest=0` per Hive, and let's assume the data is located at `/user/tracking/event/original/datepartition=2017-02-16-00/is_guest=0/`
+* A staging table `tracking.event_staging_1234567890123` (`1234567890123` is the example timestamp we will use for clarity, a real timestamp looks more like '1487154972824') is created `LIKE tracking.event` with the location `/user/tracking/event/1234567890123/datepartition=2017-02-16-00/is_guest=0/`. This would be within the original table location
+* The purge query would be similar to (assuming u_purger.guestids has the ids whose data is to be purged):
+```sql
+INSERT OVERWRITE TABLE tracking.event_staging_1234567890123
+PARTITION (datepartition='2017-02-16-00',is_guest='0') 
+SELECT /*+MAPJOIN(b) */ a.metadata.guestid, a.col_a, a.col_b 
+FROM tracking.event a 
+LEFT JOIN u_purger.guestids b
+ON a.metadata.guestid=b.guestid
+WHERE b.guestid IS NULL AND a.datepartition='2017-02-16-00' AND a.is_guest='0'
+```
+* A backup table `tracking.event_backup_1234567890123` is created with PARTITION `datepartition=2017-02-16-00,is_guest=0` pointing to the original location `/user/tracking/event/original/datepartition=2017-02-16-00/is_guest=0`
+* The partition location of `tracking@event@2017-02-16-00` is updated to be `/user/tracking/event/1234567890123/datepartition=2017-02-16-00/is_guest=0`
+* The `tracking.event_staging_1234567890123` table is dropped
+
+## Retention
+------------
+The retention code is mostly in the `gobblin.compliance.retention` package.
+
+The retention process builds on top of [Gobblin Retention](../data-management/Gobblin-Retention) and performs the following operations:
+
+* Cleanup of backup data beyond a specified policy
+* Cleanup of any staging tables not cleaned up in case of failures
+* Reaping of backup locations from the original location
+* Cleanup of trash data from the restore process beyond a specified policy
+
+## Restore
+----------
+The restore code is mostly in the `gobblin.compliance.restore` package.
+
+The restore process allows for restoration to a backup dataset if required.
diff --git a/gobblin-website/docs/developer-guide/GobblinModules.md b/gobblin-website/docs/developer-guide/GobblinModules.md
new file mode 100644
index 0000000..ff5837d
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/GobblinModules.md
@@ -0,0 +1,80 @@
+---
+title: Gobblin Modules
+sidebar_label: Gobblin Modules
+---
+
+# Introduction
+
+*Gobblin-modules* is a way to support customization of the gobblin-distribution build.
+
+One of the core features of Gobblin is its ability to integrate with a number of systems for data management (sources, targets, monitoring, etc.). Often this leads to the inclusion of libraries specific to those systems. Sometimes, such systems also introduce incompatible changes in their APIs (e.g. Kafka 0.8 vs Kafka 0.9).
+
+As the adoption of Gobblin grows and we see an increased number of such dependencies, it is no longer easy (or possible) to maintain a single monolithic gobblin-distribution build. This is where gobblin-modules comes in.
+
+# How it works
+
+## gobblin-modules/
+
+We are moving non-core functionality which may bring conflicting or large external dependencies to a new location: `gobblin-modules/`. This contains the collection of libraries (modules) which bring external dependencies.
+
+For example, currently we have:
+
+- `gobblin-kafka-08` - source, writer, metrics reporter using Kafka 0.8 API
+- `gobblin-metrics-graphite` - metrics reporter to Graphite
+
+Other libraries can refer to those modules using standard Gradle dependencies.
+
+## Gobblin flavor
+
+We have added a build property `gobblinFlavor` which controls which modules are built and included in the gobblin-distribution tarball. The property can be used as follows:
+```bash
+    ./gradlew -PgobblinFlavor=minimal build
+```
+
+Gobblin libraries that support customization can add build files like `gobblin-flavor-<FLAVOR>.gradle` which declare the dependencies. For example, let's look at the current `gobblin-core/gobblin-flavor-standard.gradle` :
+
+```text
+dependencies {
+  compile project(':gobblin-modules:gobblin-kafka-08')
+}
+```
+
+That specifies that the "standard" flavor of Gobblin will include the Kafka 0.8 source, writer and metric reporter.
+
+When one specifies `-PgobblinFlavor=<FLAVOR>` at build time, the build script will automatically include the dependencies specified in the corresponding `gobblin-flavor-<FLAVOR>.gradle` files in any library that contains such a file.
+
+Currently, Gobblin defines the following flavors out of the box:
+
+- minimal - no modules
+- standard - standard modules for frequently used components. This is the flavor used if none is explicitly specified
+- cluster - modules for running Gobblin clusters (YARN, AWS, stand-alone)
+- full - all non-conflicting modules
+- custom - by default, like minimal but lets users/developers modify and customize the dependencies to be included.
+
+Users/developers can define their own flavor files.
+
+# Current flavors and modules
+
+| Module           | Flavors         | Description |
+|------------------|----------------|-------------|
+| gobblin-azkaban | standard, full | Classes to run gobblin jobs in Azkaban |
+| gobblin-aws | cluster, full | Classes to run gobblin clusters on AWS |
+| gobblin-cluster | cluster, full | Generic classes for running Gobblin clusters |
+| gobblin-compliance | full | Source, converters, writer for cleaning existing datasets for compliance purposes |
+| gobblin-helix | full | State store implementation using Helix/ZK |
+| gobblin-kafka-08 | standard, full | Source, writer and metrics reporter using Kafka 0.8 APIs |
+| gobblin-kafka-09 |  | Source, writer and metrics reporter using Kafka 0.9 APIs |
+| gobblin-metrics-graphite | standard, full | metrics reporter to Graphite |
+| gobblin-metrics-influxdb | standard, full | metrics reporter to InfluxDB |
+| gobblin-metrics-hadoop | standard, full | metrics reporter to Hadoop counters |
+| gobblin-yarn | cluster, full | Classes to run gobblin clusters on YARN as a native app |
+| google-ingestion | standard, full | Source/extractors for GoogleWebMaster, GoogleAnalytics, GoogleDrive |
+| gobblin-azure-datalake | full | FileSystem for Azure Data lake |
+
+Note: Some grandfathered modules may not be in the gobblin-modules/ directory yet. Typically, those are in the root directory.
+
+# What's next
+
+We are in the process of moving existing external dependencies out of `gobblin-core` into separate modules. To preserve backwards compatibility, we will preserve package and class names and make the "standard" flavor of `gobblin-core` depend on these modules.
+
+In the future, new external source, writer and other dependencies are expected to be added directly to gobblin-modules/. Further, we may decide to switch modules between flavors to control the number of external dependencies. This will always be done with advance notice.
diff --git a/gobblin-website/docs/developer-guide/HighLevelConsumer.md b/gobblin-website/docs/developer-guide/HighLevelConsumer.md
new file mode 100644
index 0000000..3ff240c
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/HighLevelConsumer.md
@@ -0,0 +1,72 @@
+---
+title: High Level Consumer
+sidebar_label: High Level Consumer
+---
+
+Problem Statement
+=================
+
+Current Gobblin Kafka [`High Level Consumer`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/kafka/HighLevelConsumer.java) uses Kafka Consumer (0.8) APIs and gobblin support for them will be deprecated. The Re-design's primary goal is to replace old kafka consumer APIs like [`ConsumerConnector`](https://archive.apache.org/dist/kafka/0.8.2.2/scaladoc/index.html#kafka.consumer.ConsumerConnector) and [`MessageAndMetadata`](https://arch [...]
+Additionally, the old design uses the Kafka auto-commit feature, which can cause potential loss of messages when offsets are committed but the system fails before the messages are processed.
+
+Detailed design and implementation details can be found [here](https://cwiki.apache.org/confluence/display/GOBBLIN/GIP+5%3A+High+Level+Consumer+Re-design)
+
+New Design & Details 
+====================
+
+GobblinKafkaConsumerClient
+--------------------------
+
+The new design uses [`GobblinKafkaConsumerClient`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/kafka/client/GobblinKafkaConsumerClient.java), which is a simplified, generic wrapper client to communicate with Kafka. This class does not depend on classes defined in the kafka-clients library, which allows the high level consumer to work with different versions of Kafka. Concrete classes implementing this interface use a specific version of the Kafka client library.
+
+
+Manual Offset Commit
+--------------------
+
+`GobblinKafkaConsumerClient` API has been enhanced to allow manual committing of offsets.
+
+```java
+  /**
+   * Commit offsets manually to Kafka asynchronously
+   */
+  default void commitOffsetsAsync(Map<KafkaPartition, Long> partitionOffsets) {
+    return;
+  }
+
+  /**
+   * Commit offsets manually to Kafka synchronously
+   */
+  default void commitOffsetsSync(Map<KafkaPartition, Long> partitionOffsets) {
+    return;
+  }
+
+  /**
+   * returns the last committed offset for a KafkaPartition
+   * @param partition
+   * @return last committed offset or -1 for invalid KafkaPartition
+   */
+  default long committed(KafkaPartition partition) {
+    return -1L;
+  }
+```
+
+The high level consumer records topic partitions and their offsets AFTER the messages are processed and commits them periodically to Kafka. This ensures at-least-once delivery in case of a failure.
+
+Additionally, APIs are provided to subscribe to a topic along with a [`GobblinConsumerRebalanceListener`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/kafka/client/GobblinConsumerRebalanceListener.java) that provides hooks for when a consumer joins/leaves a consumer group.
+In this case, we commit the remaining offsets and clear the offset caches.
+
+```java
+  /**
+   * Subscribe to a topic
+   * @param topic
+   */
+  default void subscribe(String topic) {
+    return;
+  }
+
+  /**
+   * Subscribe to a topic along with a GobblinKafkaRebalanceListener
+   * @param topic
+   */
+  default void subscribe(String topic, GobblinConsumerRebalanceListener listener) {
+    return;
+  }
+```
diff --git a/gobblin-website/docs/developer-guide/IDE-setup.md b/gobblin-website/docs/developer-guide/IDE-setup.md
new file mode 100644
index 0000000..22b41aa
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/IDE-setup.md
@@ -0,0 +1,26 @@
+---
+title: IDE Setup
+sidebar_label: IDE Setup
+---
+
+# Introduction
+This document is for users who want to import the Gobblin code base into an [IDE](https://en.wikipedia.org/wiki/Integrated_development_environment) and directly modify that Gobblin code base. This is not for users who want to just set up Gobblin as a Maven dependency.
+
+# IntelliJ Integration
+Gobblin uses standard build tools to import code into an IntelliJ project. Execute the following command to build the necessary `*.iml` files:
+```bash
+./gradlew clean idea
+```
+Once the command finishes, use standard practices (File > Open; navigate to root of gobblin codebase on filesystem) to import the project into IntelliJ.
+If you plan to write code, remember to read the [coding style guide](CodingStyle) and import the coding style template file. 
+
+# Eclipse Integration
+Gobblin uses standard build tools to import code into an Eclipse project. Execute the following command to build the necessary `*.classpath` and `*.project` files:
+```bash
+./gradlew clean eclipse
+```
+Once the command finishes, use standard practices to import the project into Eclipse.
+If you plan to write code, remember to read the [coding style guide](CodingStyle) and import the coding style template file.  
+
+# Lombok
+Gobblin uses [Lombok](https://projectlombok.org/) to reduce boilerplate code. Lombok auto-generates the boilerplate code at compile time when building Gobblin from the command line. If you are using an IDE, you will see compile errors in some of the classes that use Lombok. Please follow the [IDE setup instructions](https://projectlombok.org/download.html) for your IDE to set up Lombok.
diff --git a/gobblin-website/docs/developer-guide/Monitoring-Design.md b/gobblin-website/docs/developer-guide/Monitoring-Design.md
new file mode 100644
index 0000000..7561646
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/Monitoring-Design.md
@@ -0,0 +1,9 @@
+---
+title: Monitoring Design
+sidebar_label: Monitoring Design
+---
+
+Metrics Collection Basics
+-----------------
+
+Please refer to [Gobblin Metrics Architecture](../metrics/Gobblin-Metrics-Architecture) section.
diff --git a/gobblin-website/docs/developer-guide/files/codestyle-eclipse.xml b/gobblin-website/docs/developer-guide/files/codestyle-eclipse.xml
new file mode 100644
index 0000000..f374968
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/files/codestyle-eclipse.xml
@@ -0,0 +1,308 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<profiles version="12">
+				<profile kind="CodeFormatterProfile" name="LinkedIn Style" version="12">
+								<setting id="org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.disabling_tag" value="@formatter:off"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.new_lines_at_block_boundaries" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_annotation_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_field" value="0"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.use_on_off_tags" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_ellipsis" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_multiple_fields" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer" value="0"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_conditional_expression" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_binary_operator" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_array_initializer" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_after_package" value="1"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.continuation_indentation" value="2"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk" value="0"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_binary_operator" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_package" value="0"/>
+								<setting id="org.eclipse.jdt.core.compiler.source" value="1.6"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.format_line_comments" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.join_wrapped_lines" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_member_type" value="1"/>
+								<setting id="org.eclipse.jdt.core.formatter.align_type_members_on_columns" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_unary_operator" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.indent_parameter_description" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_type_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.lineSplit" value="120"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration" value="0"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_method" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.indentation.size" value="2"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.enabling_tag" value="@formatter:on"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_assignment" value="16"/>
+								<setting id="org.eclipse.jdt.core.compiler.problem.assertIdentifier" value="error"/>
+								<setting id="org.eclipse.jdt.core.formatter.tabulation.char" value="space"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_try_resources" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_body" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_method" value="1"/>
+								<setting id="org.eclipse.jdt.core.formatter.wrap_outer_expressions_when_nested" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_method_body" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_method_declaration" value="0"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_try" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_switch" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_try" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.preserve_white_space_between_code_and_line_comments" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.compiler.problem.enumIdentifier" value="error"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_ellipsis" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_block" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_method_declaration" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.compact_else_if" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.wrap_before_or_operator_multicatch" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.format_line_comment_starting_on_first_column" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_field" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_enum_constant" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.indent_root_tags" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_union_type_in_multicatch" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.tabulation.size" value="2"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_constant" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_empty_lines" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_block_in_case" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve" value="1"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter" value="insert"/>
+								<setting id="org.eclipse.jdt.core.compiler.compliance" value="1.6"/>
+								<setting id="org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer" value="2"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_unary_operator" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_binary_expression" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_type" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode" value="enabled"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_try" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_label" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.format_javadoc_comments" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.line_length" value="120"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_package" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_between_import_groups" value="1"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body" value="0"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.wrap_before_binary_operator" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations" value="1"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_block" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.join_lines_in_comments" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_compact_if" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_imports" value="1"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.format_html" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_anonymous_type_declaration" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.format_source_code" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer" value="insert"/>
+								<setting id="org.eclipse.jdt.core.compiler.codegen.targetPlatform" value="1.6"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_resources_in_try" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation" value="16"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.format_header" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.format_block_comments" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.alignment_for_enum_constants" value="49"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_block" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.brace_position_for_type_declaration" value="end_of_line"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.comment.new_lines_at_javadoc_boundaries" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.blank_lines_after_imports" value="2"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header" value="true"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for" value="insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_try_resources" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments" value="do not insert"/>
+								<setting id="org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column" value="false"/>
+								<setting id="org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line" value="false"/>
+				</profile>
+</profiles>
diff --git a/gobblin-website/docs/developer-guide/files/codestyle-intellij-gobblin.xml b/gobblin-website/docs/developer-guide/files/codestyle-intellij-gobblin.xml
new file mode 100644
index 0000000..88c814f
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/files/codestyle-intellij-gobblin.xml
@@ -0,0 +1,540 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<code_scheme name="LinkedIn Gobblin Style">
+  <option name="JAVA_INDENT_OPTIONS">
+    <value>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+      <option name="USE_TAB_CHARACTER" value="false" />
+      <option name="SMART_TABS" value="false" />
+      <option name="LABEL_INDENT_SIZE" value="0" />
+      <option name="LABEL_INDENT_ABSOLUTE" value="false" />
+      <option name="USE_RELATIVE_INDENTS" value="false" />
+    </value>
+  </option>
+  <option name="OTHER_INDENT_OPTIONS">
+    <value>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+      <option name="USE_TAB_CHARACTER" value="false" />
+      <option name="SMART_TABS" value="false" />
+      <option name="LABEL_INDENT_SIZE" value="0" />
+      <option name="LABEL_INDENT_ABSOLUTE" value="false" />
+      <option name="USE_RELATIVE_INDENTS" value="false" />
+    </value>
+  </option>
+  <option name="FIELD_NAME_PREFIX" value="_" />
+  <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="1000" />
+  <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="5" />
+  <option name="IMPORT_LAYOUT_TABLE">
+    <value>
+      <package name="java" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="org" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="com" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="org.apache.gobblin" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="" withSubpackages="true" static="true" />
+    </value>
+  </option>
+  <option name="ENABLE_JAVADOC_FORMATTING" value="false" />
+  <option name="JD_ADD_BLANK_AFTER_PARM_COMMENTS" value="true" />
+  <option name="JD_ADD_BLANK_AFTER_RETURN" value="true" />
+  <option name="JD_KEEP_INVALID_TAGS" value="false" />
+  <option name="KEEP_LINE_BREAKS" value="false" />
+  <option name="KEEP_BLANK_LINES_IN_DECLARATIONS" value="1" />
+  <option name="KEEP_BLANK_LINES_IN_CODE" value="1" />
+  <option name="KEEP_BLANK_LINES_BEFORE_RBRACE" value="0" />
+  <option name="BLANK_LINES_AFTER_PACKAGE" value="2" />
+  <option name="BLANK_LINES_AFTER_IMPORTS" value="2" />
+  <option name="BRACE_STYLE" value="2" />
+  <option name="CLASS_BRACE_STYLE" value="2" />
+  <option name="METHOD_BRACE_STYLE" value="2" />
+  <option name="ELSE_ON_NEW_LINE" value="true" />
+  <option name="WHILE_ON_NEW_LINE" value="true" />
+  <option name="CATCH_ON_NEW_LINE" value="true" />
+  <option name="FINALLY_ON_NEW_LINE" value="true" />
+  <option name="ALIGN_MULTILINE_PARAMETERS_IN_CALLS" value="true" />
+  <option name="ALIGN_MULTILINE_THROWS_LIST" value="true" />
+  <option name="ALIGN_MULTILINE_EXTENDS_LIST" value="true" />
+  <option name="CALL_PARAMETERS_WRAP" value="5" />
+  <option name="METHOD_PARAMETERS_WRAP" value="5" />
+  <option name="THROWS_LIST_WRAP" value="1" />
+  <option name="THROWS_KEYWORD_WRAP" value="2" />
+  <option name="WRAP_COMMENTS" value="true" />
+  <XML>
+    <option name="XML_LEGACY_SETTINGS_IMPORTED" value="true" />
+  </XML>
+  <ADDITIONAL_INDENT_OPTIONS fileType="scala">
+    <option name="INDENT_SIZE" value="2" />
+    <option name="TAB_SIZE" value="2" />
+  </ADDITIONAL_INDENT_OPTIONS>
+  <ADDITIONAL_INDENT_OPTIONS fileType="txt">
+    <option name="INDENT_SIZE" value="2" />
+  </ADDITIONAL_INDENT_OPTIONS>
+  <codeStyleSettings language="CFML">
+    <option name="KEEP_LINE_BREAKS" value="false" />
+    <option name="KEEP_BLANK_LINES_IN_CODE" value="1" />
+    <option name="BRACE_STYLE" value="2" />
+    <option name="ELSE_ON_NEW_LINE" value="true" />
+    <option name="WHILE_ON_NEW_LINE" value="true" />
+    <option name="CATCH_ON_NEW_LINE" value="true" />
+    <option name="ALIGN_MULTILINE_PARAMETERS_IN_CALLS" value="true" />
+    <option name="CALL_PARAMETERS_WRAP" value="5" />
+    <option name="METHOD_PARAMETERS_WRAP" value="5" />
+    <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+  </codeStyleSettings>
+  <codeStyleSettings language="CSS">
+    <indentOptions>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="CoffeeScript">
+    <option name="KEEP_LINE_BREAKS" value="false" />
+    <option name="KEEP_BLANK_LINES_IN_CODE" value="1" />
+    <option name="ALIGN_MULTILINE_PARAMETERS_IN_CALLS" value="true" />
+    <option name="METHOD_PARAMETERS_WRAP" value="1" />
+    <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+    <indentOptions>
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="ECMA Script Level 4">
+    <option name="KEEP_LINE_BREAKS" value="false" />
+    <option name="KEEP_BLANK_LINES_IN_CODE" value="1" />
+    <option name="BLANK_LINES_AFTER_PACKAGE" value="2" />
+    <option name="BLANK_LINES_AFTER_IMPORTS" value="2" />
+    <option name="BRACE_STYLE" value="2" />
+    <option name="CLASS_BRACE_STYLE" value="2" />
+    <option name="METHOD_BRACE_STYLE" value="2" />
+    <option name="ELSE_ON_NEW_LINE" value="true" />
+    <option name="WHILE_ON_NEW_LINE" value="true" />
+    <option name="CATCH_ON_NEW_LINE" value="true" />
+    <option name="FINALLY_ON_NEW_LINE" value="true" />
+    <option name="ALIGN_MULTILINE_PARAMETERS_IN_CALLS" value="true" />
+    <option name="ALIGN_MULTILINE_EXTENDS_LIST" value="true" />
+    <option name="CALL_PARAMETERS_WRAP" value="5" />
+    <option name="METHOD_PARAMETERS_WRAP" value="5" />
+    <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+  </codeStyleSettings>
+  <codeStyleSettings language="GSP">
+    <indentOptions>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="Groovy">
+    <option name="KEEP_LINE_BREAKS" value="false" />
+    <option name="KEEP_BLANK_LINES_IN_DECLARATIONS" value="1" />
+    <option name="KEEP_BLANK_LINES_IN_CODE" value="1" />
+    <option name="KEEP_BLANK_LINES_BEFORE_RBRACE" value="0" />
+    <option name="BLANK_LINES_AFTER_PACKAGE" value="2" />
+    <option name="BLANK_LINES_AFTER_IMPORTS" value="2" />
+    <option name="ALIGN_MULTILINE_PARAMETERS" value="false" />
+    <option name="CALL_PARAMETERS_WRAP" value="1" />
+    <option name="METHOD_PARAMETERS_WRAP" value="1" />
+    <option name="EXTENDS_LIST_WRAP" value="1" />
+    <option name="THROWS_LIST_WRAP" value="1" />
+    <option name="THROWS_KEYWORD_WRAP" value="2" />
+    <option name="METHOD_CALL_CHAIN_WRAP" value="1" />
+    <option name="BINARY_OPERATION_WRAP" value="1" />
+    <option name="TERNARY_OPERATION_WRAP" value="1" />
+    <option name="KEEP_SIMPLE_METHODS_IN_ONE_LINE" value="false" />
+    <option name="KEEP_SIMPLE_CLASSES_IN_ONE_LINE" value="false" />
+    <option name="FOR_STATEMENT_WRAP" value="1" />
+    <option name="IF_BRACE_FORCE" value="3" />
+    <option name="WHILE_BRACE_FORCE" value="3" />
+    <option name="FOR_BRACE_FORCE" value="3" />
+    <option name="ENUM_CONSTANTS_WRAP" value="5" />
+    <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+    <indentOptions>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="HTML">
+    <indentOptions>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="JAVA">
+    <option name="KEEP_LINE_BREAKS" value="false" />
+    <option name="KEEP_BLANK_LINES_IN_DECLARATIONS" value="1" />
+    <option name="KEEP_BLANK_LINES_IN_CODE" value="1" />
+    <option name="KEEP_BLANK_LINES_BEFORE_RBRACE" value="0" />
+    <option name="BLANK_LINES_AFTER_IMPORTS" value="2" />
+    <option name="ALIGN_MULTILINE_PARAMETERS" value="false" />
+    <option name="ALIGN_MULTILINE_RESOURCES" value="false" />
+    <option name="ALIGN_MULTILINE_FOR" value="false" />
+    <option name="ALIGN_MULTILINE_THROWS_LIST" value="true" />
+    <option name="ALIGN_MULTILINE_EXTENDS_LIST" value="true" />
+    <option name="CALL_PARAMETERS_WRAP" value="1" />
+    <option name="METHOD_PARAMETERS_WRAP" value="1" />
+    <option name="RESOURCE_LIST_WRAP" value="1" />
+    <option name="THROWS_LIST_WRAP" value="1" />
+    <option name="THROWS_KEYWORD_WRAP" value="2" />
+    <option name="METHOD_CALL_CHAIN_WRAP" value="1" />
+    <option name="BINARY_OPERATION_WRAP" value="1" />
+    <option name="BINARY_OPERATION_SIGN_ON_NEXT_LINE" value="true" />
+    <option name="TERNARY_OPERATION_WRAP" value="1" />
+    <option name="TERNARY_OPERATION_SIGNS_ON_NEXT_LINE" value="true" />
+    <option name="FOR_STATEMENT_WRAP" value="1" />
+    <option name="ASSIGNMENT_WRAP" value="1" />
+    <option name="IF_BRACE_FORCE" value="3" />
+    <option name="DOWHILE_BRACE_FORCE" value="3" />
+    <option name="WHILE_BRACE_FORCE" value="3" />
+    <option name="FOR_BRACE_FORCE" value="3" />
+    <option name="VARIABLE_ANNOTATION_WRAP" value="2" />
+    <option name="ENUM_CONSTANTS_WRAP" value="5" />
+    <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+    <indentOptions>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+    <arrangement>
+      <rules>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <FINAL />
+                <PUBLIC />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <FINAL />
+                <PROTECTED />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <FINAL />
+                <PACKAGE_PRIVATE />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <FINAL />
+                <PRIVATE />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <PUBLIC />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <PROTECTED />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <PACKAGE_PRIVATE />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <PRIVATE />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <FINAL />
+                <PUBLIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <FINAL />
+                <PROTECTED />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <FINAL />
+                <PACKAGE_PRIVATE />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <FINAL />
+                <PRIVATE />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <PUBLIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <PROTECTED />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <PACKAGE_PRIVATE />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <FIELD />
+                <PRIVATE />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <FIELD />
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <CONSTRUCTOR />
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <METHOD />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <METHOD />
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <ENUM />
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <INTERFACE />
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <AND>
+                <CLASS />
+                <STATIC />
+              </AND>
+            </match>
+          </rule>
+        </section>
+        <section>
+          <rule>
+            <match>
+              <CLASS />
+            </match>
+          </rule>
+        </section>
+      </rules>
+    </arrangement>
+  </codeStyleSettings>
+  <codeStyleSettings language="JSP">
+    <indentOptions>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="JavaScript">
+    <option name="KEEP_LINE_BREAKS" value="false" />
+    <option name="KEEP_BLANK_LINES_IN_CODE" value="1" />
+    <option name="ALIGN_MULTILINE_PARAMETERS" value="false" />
+    <option name="ALIGN_MULTILINE_FOR" value="false" />
+    <option name="CALL_PARAMETERS_WRAP" value="1" />
+    <option name="METHOD_PARAMETERS_WRAP" value="1" />
+    <option name="BINARY_OPERATION_WRAP" value="1" />
+    <option name="BINARY_OPERATION_SIGN_ON_NEXT_LINE" value="true" />
+    <option name="TERNARY_OPERATION_WRAP" value="1" />
+    <option name="TERNARY_OPERATION_SIGNS_ON_NEXT_LINE" value="true" />
+    <option name="FOR_STATEMENT_WRAP" value="1" />
+    <option name="ARRAY_INITIALIZER_WRAP" value="1" />
+    <option name="IF_BRACE_FORCE" value="3" />
+    <option name="DOWHILE_BRACE_FORCE" value="3" />
+    <option name="WHILE_BRACE_FORCE" value="3" />
+    <option name="FOR_BRACE_FORCE" value="3" />
+    <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+    <indentOptions>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="LESS">
+    <indentOptions>
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="SASS">
+    <indentOptions>
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="SCSS">
+    <indentOptions>
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+  <codeStyleSettings language="SQL">
+    <option name="KEEP_BLANK_LINES_IN_CODE" value="1" />
+    <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+  </codeStyleSettings>
+  <codeStyleSettings language="TypeScript">
+    <option name="KEEP_LINE_BREAKS" value="false" />
+    <option name="KEEP_BLANK_LINES_IN_CODE" value="1" />
+    <option name="BRACE_STYLE" value="2" />
+    <option name="CLASS_BRACE_STYLE" value="2" />
+    <option name="METHOD_BRACE_STYLE" value="2" />
+    <option name="ELSE_ON_NEW_LINE" value="true" />
+    <option name="WHILE_ON_NEW_LINE" value="true" />
+    <option name="CATCH_ON_NEW_LINE" value="true" />
+    <option name="FINALLY_ON_NEW_LINE" value="true" />
+    <option name="ALIGN_MULTILINE_PARAMETERS_IN_CALLS" value="true" />
+    <option name="ALIGN_MULTILINE_EXTENDS_LIST" value="true" />
+    <option name="CALL_PARAMETERS_WRAP" value="5" />
+    <option name="METHOD_PARAMETERS_WRAP" value="5" />
+    <option name="PARENT_SETTINGS_INSTALLED" value="true" />
+  </codeStyleSettings>
+  <codeStyleSettings language="XML">
+    <indentOptions>
+      <option name="INDENT_SIZE" value="2" />
+      <option name="CONTINUATION_INDENT_SIZE" value="4" />
+      <option name="TAB_SIZE" value="2" />
+    </indentOptions>
+  </codeStyleSettings>
+</code_scheme>
diff --git a/gobblin-website/docs/developer-guide/files/prefs-eclipse.epf b/gobblin-website/docs/developer-guide/files/prefs-eclipse.epf
new file mode 100644
index 0000000..d15b357
--- /dev/null
+++ b/gobblin-website/docs/developer-guide/files/prefs-eclipse.epf
@@ -0,0 +1,23 @@
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.argumentPrefixes=
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.argumentSuffixes=
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.fieldPrefixes=_
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.fieldSuffixes=
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.localPrefixes=
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.localSuffixes=
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.staticFieldPrefixes=
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.staticFieldSuffixes=
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.staticFinalFieldPrefixes=
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.staticFinalFieldSuffixes=
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.codeComplete.visibilityCheck=enabled
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.compiler.compliance=1.6
+/instance/org.eclipse.jdt.core/org.eclipse.jdt.core.compiler.source=1.6
+/instance/org.eclipse.jdt.ui/formatter_profile=_LinkedIn Style
+/instance/org.eclipse.jdt.ui/importsOnPaste=false
+/instance/org.eclipse.jdt.ui/org.eclipse.jdt.ui.exception.name=e
+/instance/org.eclipse.ui.editors/lineNumberRuler=true
+/instance/org.eclipse.ui.editors/printMargin=true
+/instance/org.eclipse.ui.editors/printMarginColumn=120
+/instance/org.eclipse.ui.editors/spacesForTabs=true
+/instance/org.eclipse.ui.editors/textDragAndDropEnabled=false
+file_export_version=3.0
diff --git a/gobblin-website/docs/index.md b/gobblin-website/docs/index.md
new file mode 100644
index 0000000..3cd582b
--- /dev/null
+++ b/gobblin-website/docs/index.md
@@ -0,0 +1,16 @@
+---
+id: index
+title: Overview
+slug: /
+---
+
+![Gobblin Logo](../static/img/Gobblin-Logo.png)
+
+Over the years, LinkedIn's data infrastructure team built custom solutions for ingesting diverse data entities into our Hadoop ecosystem. At one point, we were running 15 types of ingestion pipelines, which created significant data quality, metadata management, development, and operation challenges.
+ 
+Our experiences and challenges motivated us to build _Gobblin_. Gobblin is a universal data ingestion framework for extracting, transforming, and loading large volumes of data from a variety of data sources, e.g., databases, REST APIs, FTP/SFTP servers, filers, etc., onto Hadoop. Gobblin handles the common routine tasks required for all data ingestion ETLs, including job/task scheduling, task partitioning, error handling, state management, data quality checking, data publishing, etc. Gobb [...]
+
+You can find a lot of useful resources in our wiki pages, including [how to get started with Gobblin](Getting-Started), an [architecture overview of Gobblin](Gobblin-Architecture), and
+the [Gobblin user guide](user-guide/Gobblin-Deployment). We also provide a discussion group: [Google Gobblin-Users Group](https://groups.google.com/forum/#!forum/gobblin-users). Please feel free to post any questions or comments.
+
+For a detailed overview, please take a look at the [VLDB 2015 paper](http://www.vldb.org/pvldb/vol8/p1764-qiao.pdf) and the [LinkedIn's Gobblin blog post](https://engineering.linkedin.com/data-ingestion/gobblin-big-data-ease).
diff --git a/gobblin-website/docs/mdx.md b/gobblin-website/docs/mdx.md
new file mode 100644
index 0000000..f0210fb
--- /dev/null
+++ b/gobblin-website/docs/mdx.md
@@ -0,0 +1,17 @@
+---
+id: mdx
+title: Powered by MDX
+---
+
+You can write JSX and use React components within your Markdown thanks to [MDX](https://mdxjs.com/).
+
+export const Highlight = ({children, color}) => ( <span style={{
+      backgroundColor: color,
+      borderRadius: '2px',
+      color: '#fff',
+      padding: '0.2rem',
+    }}>{children}</span> );
+
+<Highlight color="#25c2a0">Docusaurus green</Highlight> and <Highlight color="#1877F2">Facebook blue</Highlight> are my favorite colors.
+
+I can write **Markdown** alongside my _JSX_!
diff --git a/gobblin-website/docs/metrics/Existing-Reporters.md b/gobblin-website/docs/metrics/Existing-Reporters.md
new file mode 100644
index 0000000..5f708e1
--- /dev/null
+++ b/gobblin-website/docs/metrics/Existing-Reporters.md
@@ -0,0 +1,20 @@
+---
+title: Existing Reporters
+sidebar_label: Existing Reporters
+---
+
+Metric Reporters
+================
+
+* [Output Stream Reporter](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/reporter/OutputStreamReporter.java): allows printing metrics to any OutputStream, including STDOUT and files.
+* [Kafka Reporter](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/metrics/kafka/KafkaReporter.java): emits metrics to a Kafka topic as JSON messages.
+* [Kafka Avro Reporter](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/metrics/kafka/KafkaAvroReporter.java): emits metrics to a Kafka topic as Avro messages using the schema [MetricReport](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/avro/MetricReport.avsc).
+* [Graphite Reporter](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-metrics-graphite/src/main/java/org/apache/gobblin/metrics/graphite/GraphiteReporter.java): emits metrics to Graphite. Its javadoc also documents an older, deprecated construction API.
+* [Influx DB Reporter](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-metrics-influxdb/src/main/java/org/apache/gobblin/metrics/influxdb/InfluxDBReporter.java): emits metrics to InfluxDB. Its javadoc also documents an older, deprecated construction API.
+* [Hadoop Counter Reporter](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-metrics-hadoop/src/main/java/org/apache/gobblin/metrics/hadoop/HadoopCounterReporter.java): emits metrics as Hadoop counters at the end of the execution. Available for both the old and new Hadoop APIs. Its javadoc also documents an older, deprecated construction API. Due to limits on the number of Hadoop counters that can be created, this reporter is not recommended except for applic [...]
+
+Event Reporters
+===============
+* [Output Stream Event Reporter](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/reporter/OutputStreamEventReporter.java): Emits events to any output stream, including STDOUT and files.
+* [Kafka Event Reporter](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/metrics/kafka/KafkaEventReporter.java): Emits events to a Kafka topic as JSON messages.
+* [Kafka Avro Event Reporter](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/metrics/kafka/KafkaAvroEventReporter.java): Emits events to a Kafka topic as Avro messages using the schema [GobblinTrackingEvent](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/avro/GobblinTrackingEvent.avsc).
diff --git a/gobblin-website/docs/metrics/Gobblin-Metrics-Architecture.md b/gobblin-website/docs/metrics/Gobblin-Metrics-Architecture.md
new file mode 100644
index 0000000..c97b4de
--- /dev/null
+++ b/gobblin-website/docs/metrics/Gobblin-Metrics-Architecture.md
@@ -0,0 +1,74 @@
+---
+title: Gobblin Metrics Architecture
+sidebar_label: Gobblin Metrics Architecture
+---
+
+![Gobblin Metrics Architecture Diagram](../../static/img/Gobblin-Metrics-Architecture.png)
+
+Metric Context
+==============
+
+Metric contexts are organized hierarchically in a tree. Each metric context has a set of Tags, each of which is just a key-value pair. The keys of all tags are strings, while the values are allowed to be of any type. However, most reporters will serialize the tag values using their `toString()` method.
+
+Child contexts automatically inherit the tags of their parent context, and can add more tags or override tags present in the parent. Tags can only be defined during construction of each metric context, and are immutable afterwards. This simplifies the inheritance and overriding of tags.
+
+Metric Contexts are created using `MetricContext.Builder`, which allows adding tags and specifying the parent. This is the only time tags can be added to the context. When building, the tags of the parent and the new tags are merged to obtain the final tags for this context. When building a child context for Metric Context `context`, calling `context.childBuilder(String)` generates a Builder with the correct parent.
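+
+For illustration (following the Quick Start), building a root-level context and a child context looks like this; the names and tags are placeholders, and the child automatically inherits the parent's tags:
+
+```java
+// Build a metric context with a tag; tags can only be added at build time.
+MetricContext context = MetricContext.builder("JobContext").addTag(new Tag<String>("jobId", "job_123")).build();
+
+// Build a child context; it inherits the "jobId" tag and adds its own tag.
+MetricContext taskContext = context.childBuilder("TaskContext").addTag(new Tag<String>("taskId", "task_0")).build();
+```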
+
+Each metric context contains the following instance variables:
+
+* A `String` `name`. The name is not used by the core metrics engine, but can be accessed by users to identify the context.
+* A reference to the parent metric context, or null if it has no parent.
+* A list of child metric context references, stored as soft references.
+* An object of type [Tagged](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/Tagged.java) containing the tags for this metric context.
+* A `Set` of notification targets. Notification targets are objects of type [Function](https://google.github.io/guava/releases/15.0/api/docs/com/google/common/base/Function.html)<[Notification](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/notification/Notification.java), Void> which are all called every time there is a new notification. Notifications can be submitted to the Metric Context using the method [...]
+* A lazily instantiated `ExecutorService` used for asynchronously executing the notification targets. The executor service will only be started the first time there is a notification and the number of notification targets is positive.
+* A `ConcurrentMap` from metric names to `Metric` for all metrics registered in this Metric Context. Metrics can be added to this map using the `register(Metric)`, `register(String, Metric)`, or `registerAll(MetricSet)`, although it is recommended to instead use the methods to create and register the metrics. Metric Context implements getter methods for all metrics, as well as for each type of metric individually (`getMetrics`, `getGauges`, `getCounters`, `getHistograms`, `getMeters`, `g [...]
+
+Metrics
+=======
+
+All metrics extend the interface [ContextAwareMetric](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/ContextAwareMetric.java). Each metric type in Dropwizard Metrics is extended to a Context Aware type: `ContextAwareCounter`, `ContextAwareGauge`, `ContextAwareHistogram`, `ContextAwareMeter`, `ContextAwareTimer`.
+
+Context Aware metrics are always created from the Metric Context where they will be registered. For example, to get a counter under Metric Context `context`, the user would call `context.counter("counter.name")`. This method first checks all registered metrics in the Metric Context to find a counter with that name; if it succeeds, it simply returns that counter. If a counter with that name has not been registered in `context`, then a new `ContextAwareCounter` is created and registered in [...]
+
+On creation, each Context Aware metric (except Gauges) checks whether its parent Metric Context itself has a parent. If so, it automatically creates a metric of the same type, with the same name, in that parent. This is repeated recursively until, at the end, all ancestor Metric Contexts contain a context aware metric of the same type and with the same name. Every time the context aware metric is updated, the metric will automatically call the same update method, with the sa [...]
+
+Users can also register objects of type `com.codahale.metrics.Metric` with any Metric Context, but they will not be auto-aggregated.
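+
+A short sketch of the auto-aggregation behavior described above (context and metric names are illustrative):
+
+```java
+MetricContext parent = MetricContext.builder("parent").build();
+MetricContext child = parent.childBuilder("child").build();
+
+// Creating the counter in the child also creates a counter with the same name in the parent.
+Counter counter = child.counter("records.processed");
+counter.inc();
+
+// Both the child counter and the auto-aggregated parent counter now read 1.
+parent.counter("records.processed").getCount();
+```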
+
+Events
+======
+
+Events are objects of type [GobblinTrackingEvent](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/avro/GobblinTrackingEvent.avsc), which is a type generated from an Avro schema. Events have:
+
+* A `namespace`.
+* A `name`.
+* A `timestamp`.
+* A `Map<String,String>` of `metadata`.
+
+Events are submitted using the `MetricContext#submitEvent(GobblinTrackingEvent)` method. When called, this method packages the event into an [EventNotification](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/notification/EventNotification.java) and submits it to the metric context using the method `MetricContext#sendNotification(Notification)`. This notification is passed to all metric context ancestors. E [...]
+
+Events can be created manually using Avro constructors, and using the method `context.submitEvent(GobblinTrackingEvent)`, but this is unfriendly when trying to build events incrementally, especially when using metadata. To address this, users can instead use [EventSubmitter](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/event/EventSubmitter.java) which is an abstraction around the Avro constructor for Gobbl [...]
+
+Event Submitter
+---------------
+
+An event submitter is created using an `EventSubmitter.Builder`. It is associated with a Metric Context where it will submit all events, and it contains a `namespace` and default `metadata` that will be applied to all events generated through the event submitter. The user can then call `EventSubmitter#submit` which will package the event with the provided metadata and submit it to the Metric Context.
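+
+For example (mirroring the Quick Start; the namespace, event name, and metadata keys are illustrative):
+
+```java
+// The submitter is bound to `context`; its namespace and default metadata apply to every event it submits.
+EventSubmitter eventSubmitter = new EventSubmitter.Builder(context, "gobblin.example").addMetadata("datasetUrn", "myDataset").build();
+
+// Submits an event named "FilePublished" carrying the default metadata plus the extra key-value pair.
+eventSubmitter.submit("FilePublished", "filePath", "/data/output/part-0.avro");
+```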
+
+Reporters
+=========
+
+Reporters export the metrics and/or events of a metric context to a sink. Reporters extend the interface `com.codahale.metrics.Reporter`. Most reporters will attach themselves to a Metric Context. The reporter can then navigate the Metric Context tree where the Metric Context belongs, get tags and metrics, get notified of events, and export them to the sink.
+
+The two best entry points for developing reporters are [RecursiveScheduledMetricReporter](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/reporter/RecursiveScheduledMetricReporter.java) and [EventReporter](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/reporter/EventReporter.java). These classes do most of the heavy lifting for [...]
+
+RecursiveScheduledMetricReporter
+--------------------------------
+
+This abstract reporter base is used for emitting metrics on a schedule. The reporter, on creation, is attached to a particular Metric Context. Every time the reporter is required to emit metrics, the reporter selects the attached Metric Context and all descendant Metric Contexts. For each of these metric contexts, it queries the Metric Context for all metrics, filtered by an optional user supplied filter, and then calls `RecursiveScheduledMetricReporter#report`, providing the method with a [...]
+
+EventReporter
+-------------
+
+This abstract reporter base is used for emitting events. The EventReporter, on creation, takes a Metric Context it should listen to. It registers a callback function as a notification target for that Metric Context. Every time the callback is called, if the notification is of type `EventNotification`, the EventReporter unpacks the event and adds it to a `LinkedBlockingQueue` of events.
+
+On a configurable schedule, the event reporter calls the abstract method `EventReporter#reportEventQueue(Queue<GobblinTrackingEvent>)`, which should be implemented by the concrete subclass. To keep memory bounded, the event queue has a maximum size. Whenever the queue reaches 2/3 of its maximum size, `EventReporter#reportEventQueue` is called immediately.
diff --git a/gobblin-website/docs/metrics/Gobblin-Metrics-Performance.md b/gobblin-website/docs/metrics/Gobblin-Metrics-Performance.md
new file mode 100644
index 0000000..93e8436
--- /dev/null
+++ b/gobblin-website/docs/metrics/Gobblin-Metrics-Performance.md
@@ -0,0 +1,56 @@
+---
+title: Gobblin Metrics Performance
+sidebar_label: Gobblin Metrics Performance
+---
+
+Generalities
+============
+These are the main resources used by Gobblin Metrics:
+
+* CPU time for updating metrics: scales with number of metrics and frequency of metric update
+* CPU time for metric emission and lifecycle management: scales with number of metrics and frequency of emission
+* Memory for storing metrics: scales with number of metrics and metric contexts
+* I/O for reporting metrics: scales with number of metrics and frequency of emission
+* External resources for metrics emission (e.g. HDFS space, Kafka queue space, etc.): scales with number of metrics and frequency of emission
+
+This page focuses on the CPU time for updating metrics, as these updates are usually in the critical performance path of an application. Each metric requires bounded memory, and having a few metrics should have no major effect on memory usage. Metrics and Metric Contexts are cleaned when no longer needed to further reduce this impact. Resources related to metric emission can always be reduced by reporting fewer metrics or decreasing the reporting frequency when necessary.
+
+How to interpret these numbers
+==============================
+This document provides the maximum QPS achievable by Gobblin Metrics. If the application attempts to update metrics at a higher rate than this, the metrics will effectively throttle the application. If, on the other hand, the application only updates metrics at 10% or less of the maximum QPS, the performance impact of Gobblin Metrics should be minimal.
+
+### What if I need larger QPS?
+If your application needs a larger QPS, the recommendation is to batch metric updates. Counters and Meters offer the option to increase their values by multiple units at a time. Histograms and Timers don't offer this option, but for very high throughput applications, randomly recording, for example, only 10% of the values will not affect the statistics significantly (although you will have to adjust timer and histogram counts manually).
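+
+For instance, instead of updating a counter or meter once per record, an application can accumulate a local count and flush it periodically. A sketch, given a `counter` and a `meter` created from a metric context (the `Record` type, `records` collection, `process` call, and batch size are placeholders):
+
+```java
+// Batch 1000 record-level updates into a single metric update.
+long batched = 0;
+for (Record record : records) {
+  process(record);
+  if (++batched == 1000) {
+    counter.inc(batched);   // Counter supports incrementing by an arbitrary amount
+    meter.mark(batched);    // Meter supports marking multiple occurrences at once
+    batched = 0;
+  }
+}
+counter.inc(batched);
+meter.mark(batched);
+```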
+
+Update Metrics Performance
+==========================
+Metric updates are the most common interaction with Gobblin Metrics in an application. Every time a counter is increased, a meter is marked, or entries are added to histograms and timers, an update happens. As such, metric updates are the most likely to impact application performance.
+
+We measured the max number of metric updates that can be executed per second. The performance of different metric types is different. Also, the performance of metrics depends on the depth in the Metric Context tree at which they are created. Metrics in the Root Metric Context are the fastest, while metrics deep in the tree are slower because they have to update all ancestors as well. The following table shows reference max QPS in updates per second as well as the equivalent single update [...]
+
+| Metric | Root level | Depth: 1 | Depth: 2 | Depth: 3 |
+|--------|------------|----------|----------|----------|
+| Counter | 76M (13ns) | 39M (25ns) | 29M (34ns) | 24M (41ns) |
+| Meter | 11M (90ns) | 7M (142ns) | 4.5M (222ns) | 3.5M (285ns) |
+| Histogram | 2.4M (416ns) | 2.4M (416ns) | 1.8M (555ns) | 1.3M (769ns) |
+| Timer | 1.4M (714ns) | 1.4M (714ns) | 1M (1us) | 1M (1us) |
+
+Multiple metric updates per iteration
+-------------------------------------
+If a single thread updates multiple metrics, the average delay for metric updates will be the sum of the delays of each metric independently. For example, if each iteration the application is updating two counters, one timer, and one histogram at the root metric context level, the total delay will be `13ns + 13ns + 416ns + 714ns = 1156ns` for a max QPS of `865k`.
+
+Multi-threading
+---------------
+Updating metrics with different names can be parallelized efficiently: threads updating differently named metrics will not interfere with each other. However, multiple threads updating metrics with the same name will interfere with each other, as the updates of common ancestor metrics are synchronized (to provide auto-aggregation). In experiments we observed that updating metrics with the same name from multiple threads increases the maximum QPS sub-linearly, satu [...]
+
+On the other hand, if each thread is updating multiple metrics, the updates might interleave with each other, potentially increasing the max total QPS. In the example with two counters, one timer, and one histogram, one thread could be updating the timer while another could be updating the histogram, reducing interference, but never exceeding the max QPS of the single most expensive metric. Note that there is no optimization in code to produce this interleaving, it is merely an effect of [...]
+
+Running Performance Tests
+-------------------------
+To run the performance tests
+```bash
+cd gobblin-metrics
+../gradlew performance
+```
+
+After finishing, it should create a TestNG report at `build/gobblin-metrics/reports/tests/packages/gobblin.metrics.performance.html`. Nicely printed performance results are available on the Output tab. 
diff --git a/gobblin-website/docs/metrics/Gobblin-Metrics-next-generation-instrumentation-for-applications.md b/gobblin-website/docs/metrics/Gobblin-Metrics-next-generation-instrumentation-for-applications.md
new file mode 100644
index 0000000..4ff5a89
--- /dev/null
+++ b/gobblin-website/docs/metrics/Gobblin-Metrics-next-generation-instrumentation-for-applications.md
@@ -0,0 +1,26 @@
+---
+title: Next Generation Instrumentation for Applications
+sidebar_label: Next Generation Instrumentation for Applications
+---
+
+Long-running, complex applications are prone to operational issues. Good instrumentation, monitoring, and accessible historical information about their execution help diagnose these issues, and often even prevent them. For Gobblin ingestion, we wanted to add this instrumentation to all parts of the application. Some of the requirements we had were:
+
+* Report progress of the ingestion processing for each job, task, and module. Many reports would be almost identical, just covering different instances of the same module.
+* Report major milestones in the processing: when a Gobblin job starts, when the ingestion of a dataset finishes, when files of a dataset get committed, etc.
+* Provide various levels of granularity: aggregated totals give a quick view of the performance of the application, but detailed, instance-level reports are essential for debugging.
+* Easily switch between sinks where reports and events are emitted.
+* Generate queryable reports.
+
+Among existing solutions, we found <a href="http://metrics.dropwizard.io/">Dropwizard Metrics</a> to be the closest to what we needed, but it was not enough, so we developed Gobblin Metrics.
+
+Gobblin Metrics is a metrics library, which is based on Dropwizard Metrics but extends it considerably to provide all the amazing features that make monitoring and execution auditing easy. The library is designed for modular applications: the application is a set of module instances, organized hierarchically. Following this pattern, the metrics library uses Metric Contexts organized hierarchically to instrument instances of classes and modules (see figure below for an example of this hie [...]
+
+![Gobblin Metrics Example](../../static/img/Gobblin-Metrics-Example.png)
+
+Each metric context manages a set of metrics (like counters, timers, meters, and histograms), providing information on, for instance, the throughput of each reader and writer, serialization/deserialization times, etc. Metrics are automatically aggregated in the metric context tree: for example, while each writer computes its throughput independently, we also compute in real time the throughput across each task (containing many writers) and each job (containing many tasks).
+
+Gobblin Metrics also introduces the concept of events. Events are fire-and-forget reports of milestones of the execution, enriched by metadata relevant to that milestone, plus all of the context information derived from tags. For example, every time we finish processing a file, we emit an event containing detailed information like the number of records read, number of records written, and the location where the file was published. The events can be used to get historical information on pr [...]
+
+Finally, the library would not be complete without options to actually export metrics and events to external sinks. Following the Dropwizard Metrics model, we use Reporters to write out metrics and events. A few sinks are implemented by default, which we already use heavily: Kafka, OutputStream, Graphite, and InfluxDB. However, any developer can easily implement their own sinks. There is already logic to publish metrics and events as Avro records. Combining this with Hive / Pig, or any othe [...]
+
+To learn more about Gobblin Metrics, check out the <a href="https://gobblin.readthedocs.io/en/latest/metrics/Gobblin-Metrics/">Wiki</a> and the <a href="https://github.com/apache/gobblin">Gobblin project</a> in Github.
diff --git a/gobblin-website/docs/metrics/Gobblin-Metrics.md b/gobblin-website/docs/metrics/Gobblin-Metrics.md
new file mode 100644
index 0000000..38874ff
--- /dev/null
+++ b/gobblin-website/docs/metrics/Gobblin-Metrics.md
@@ -0,0 +1,109 @@
+---
+title: Quick Start
+sidebar_label: Quick Start
+---
+
+Gobblin Metrics is a metrics library for emitting metrics and events to instrument Java applications.
+Metrics and events are easy to use and enriched with tags. Metrics allow full granularity, auto-aggregation, and configurable 
+reporting schedules. Gobblin Metrics is based on [Dropwizard Metrics](http://metrics.dropwizard.io/), enhanced to better support 
+modular applications (by providing hierarchical, auto-aggregated metrics) and their monitoring / auditing.
+
+Quick Start
+===========
+
+The following code excerpt shows the functionality of Gobblin Metrics.
+
+```java
+// ========================================
+// METRIC CONTEXTS
+// ========================================
+
+// Create a Metric context with a Tag
+MetricContext context = MetricContext.builder("MyMetricContext").addTag(new Tag<Integer>("key", value)).build();
+// Create a child metric context. It will automatically inherit tags from parent.
+// All metrics in the child context will be auto-aggregated in the parent context.
+MetricContext childContext = context.childBuilder("childContext").build();
+
+// ========================================
+// METRICS
+// ========================================
+
+// Create a reporter for metrics. This reporter will write metrics to STDOUT.
+OutputStreamReporter.Factory.newBuilder().build(new Properties());
+// Start all metric reporters.
+RootMetricContext.get().startReporting();
+
+// Create a counter.
+Counter counter = childContext.counter("my.counter.name");
+// Increase the counter. The next time metrics are reported, "my.counter.name" will be reported as 1.
+counter.inc();
+
+// ========================================
+// EVENTS
+// ========================================
+
+// Create a reporter for events. This reporter will write events to STDOUT.
+ScheduledReporter eventReporter = OutputStreamEventReporter.forContext(context).build();
+eventReporter.start();
+
+// Create an event submitter, can include default metadata.
+EventSubmitter eventSubmitter = new EventSubmitter.Builder(context, "events.namespace").addMetadata("metadataKey", "value").build();
+// Submit an event. Its metadata will contain all tags in context, all metadata in eventSubmitter,
+// and any additional metadata specified in the call.
+// This event will be displayed the next time the event reporter flushes.
+eventSubmitter.submit("EventName", "additionalMetadataKey", "value");
+```
+
+Metric Contexts
+===============
+
+A metric context is a context from which users can emit metrics and events. These contexts contain a set of tags, each tag 
+being a key-value pair. Contexts are hierarchical in nature: each context has one parent and children. They automatically 
+inherit the tags of their parent, and can define or override more tags.
+
+Generally, a metric context is associated with a specific instance of an object that should be instrumented. 
+Different instances of the same object will have separate instrumentations. However, each context also aggregates 
+all metrics defined by its descendants, providing a full range of granularities for reporting.
+With this functionality, if an application has, for example, 10 different data writers, users can monitor each writer
+individually, or all of them at the same time.
+
+Metrics
+=======
+
+Metrics are used to monitor the progress of an application. Metrics are emitted regularly following a schedule and represent 
+the current state of the application. The metrics supported by Gobblin Metrics are the same ones as those supported 
+by [Dropwizard Metrics Core](http://metrics.dropwizard.io/3.1.0/manual/core/), adapted for tagging and auto-aggregation. 
+The types supported are:
+
+* Counter: simple long counter.
+* Meter: counter with added computation of the rate at which the counter is changing.
+* Histogram: stores a histogram of a value, divides all of the values observed into buckets, and reports the count for each bucket.
+* Timer: a histogram for timing information.
+* Gauge: simply stores a value. Gauges are not auto-aggregated because the aggregation operation is context-dependent.
+
+Events
+======
+
+Events are fire-and-forget messages indicating a milestone in the execution of an application, 
+along with metadata that can provide further information about that event (all tags of the metric context used to generate 
+the event are also added as metadata).
+
+Reporters
+=========
+
+Reporters periodically output the metrics and events to particular sinks following a configurable schedule. Events and Metrics reporters are kept separate to allow users more control in case they want to emit metrics and events to separate sinks (for example, different files). Reporters for a few sinks are implemented by default, but additional sinks can be implemented by extending the `RecursiveScheduledMetricReporter` and the `EventReporter`. Each of the included reporters has a simple [...]
+
+The metric reporter implementations included with Gobblin Metrics are:
+
+* OutputStreamReporter: Supports any output stream, including STDOUT and files.
+* KafkaReporter: Emits metrics to a Kafka topic as Json messages.
+* KafkaAvroReporter: Emits metrics to a Kafka topic as Avro messages.
+* InfluxDBReporter: Emits metrics to Influx DB.
+* GraphiteReporter: Emits metrics to Graphite.
+* HadoopCounterReporter: Emits metrics as Hadoop counters.
+
+The event reporter implementations included with Gobblin metrics are:
+
+* OutputStreamEventReporter: Supports any output stream, including STDOUT and files.
+* KafkaEventReporter: Emits events to Kafka as Json messages.
+* KafkaAvroEventReporter: Emits events to Kafka as Avro messages.
diff --git a/gobblin-website/docs/metrics/Implementing-New-Reporters.md b/gobblin-website/docs/metrics/Implementing-New-Reporters.md
new file mode 100644
index 0000000..8bce710
--- /dev/null
+++ b/gobblin-website/docs/metrics/Implementing-New-Reporters.md
@@ -0,0 +1,104 @@
+---
+title: Implementing New Reporters
+sidebar_label: Implementing New Reporters
+---
+
+The two best entry points for implementing custom reporters are [RecursiveScheduledMetricReporter](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/reporter/RecursiveScheduledMetricReporter.java) and [EventReporter](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/reporter/EventReporter.java). Each of these classes automatically s [...]
+
+In the interest of giving more control to the users, metric and event reporters are kept separate, allowing users to more easily specify separate sinks for events and metrics. However, it is possible to implement a single reporter that handles both events and metrics.
+
+> It is recommended that each reporter have a constructor with signature `<init>(Properties)`. In the near future we are planning to implement auto-starting, file-configurable reporting similar to the Log4j architecture, and compliant reporters will be required to have such a constructor.
+
+Extending Builders
+==================
+
+The builder patterns implemented in the base reporters are designed to be extendable. The architecture is a bit complicated, but a subclass of the base reporters wanting to use builder patterns should follow this pattern (replacing EventReporter with RecursiveScheduledMetricReporter in the case of a metrics reporter):
+
+```java
+class MyReporter extends EventReporter {
+
+  private MyReporter(Builder<?> builder) throws IOException {
+    super(builder);
+    // Other initialization logic.
+  }
+
+  // Concrete implementation of extendable Builder.
+  public static class BuilderImpl extends Builder<BuilderImpl> {
+    private BuilderImpl(MetricContext context) {
+      super(context);
+    }
+
+    @Override
+    protected BuilderImpl self() {
+      return this;
+    }
+  }
+
+  public static class Factory {
+    /**
+     * Returns a new {@link MyReporter.Builder} for {@link MyReporter}.
+     * Will automatically add all Context tags to the reporter.
+     *
+     * @param context the {@link org.apache.gobblin.metrics.MetricContext} to report
+     * @return MyReporter builder
+     */
+    public static BuilderImpl forContext(MetricContext context) {
+      return new BuilderImpl(context);
+    }
+  }
+
+  /**
+   * Builder for {@link MyReporter}.
+   */
+  public static abstract class Builder<T extends EventReporter.Builder<T>>
+      extends EventReporter.Builder<T> {
+
+    // Additional instance variables needed to construct MyReporter.
+    private int myBuilderVariable;
+
+    protected Builder(MetricContext context) {
+      super(context);
+      this.myBuilderVariable = 0;
+    }
+
+    /**
+     * Set myBuilderVariable.
+     */
+    public T withMyBuilderVariable(int value) {
+      this.myBuilderVariable = value;
+      return self();
+    }
+
+    // Other setters for Builder variables.
+
+    /**
+     * Builds and returns {@link MyReporter}.
+     */
+    public MyReporter build() throws IOException {
+      return new MyReporter(this);
+    }
+
+  }
+}
+```
+
+This pattern allows users to simply call
+```java
+MyReporter reporter = MyReporter.Factory.forContext(context).build();
+```
+to generate an instance of the reporter. Additionally, if you want to further extend MyReporter, following the exact same pattern except extending MyReporter instead of EventReporter will work correctly (which would not be true with the standard Builder pattern).
+
+Metric Reporting
+================
+
+Developers should extend `RecursiveScheduledMetricReporter` and implement the method `RecursiveScheduledMetricReporter#report`. The base class will call `report` when appropriate, providing the lists of metrics, separated by type, and the tags that should be reported.
+
+Event Reporting
+===============
+
+Developers should extend `EventReporter` and implement the method `EventReporter#reportEventQueue(Queue<GobblinTrackingEvent>)`. The base class will call this method with a queue of all events to report as needed.
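+
+Continuing the `MyReporter` sketch above, a minimal `reportEventQueue` implementation that simply prints each queued event might look like the following (the surrounding class, constructor, and builder are as shown earlier; the printed format is arbitrary):
+
+```java
+@Override
+public void reportEventQueue(Queue<GobblinTrackingEvent> queue) {
+  // Drain the queue; each entry is one event submitted since the last report.
+  GobblinTrackingEvent event;
+  while ((event = queue.poll()) != null) {
+    System.out.println(event.getNamespace() + ":" + event.getName() + " " + event.getMetadata());
+  }
+}
+```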
+
+Other Reporters
+===============
+
+It is also possible to implement a reporter without using the suggested classes. Reporters are recommended, but not required, to implement the interface `Reporter`. Reporters can use the public methods of `MetricContext` to navigate the Metric Context tree, query metrics, and register for notifications.
diff --git a/gobblin-website/docs/metrics/Metrics-for-Gobblin-ETL.md b/gobblin-website/docs/metrics/Metrics-for-Gobblin-ETL.md
new file mode 100644
index 0000000..0d483cf
--- /dev/null
+++ b/gobblin-website/docs/metrics/Metrics-for-Gobblin-ETL.md
@@ -0,0 +1,155 @@
+---
+title: Metrics for Gobblin ETL
+sidebar_label: Metrics for Gobblin ETL
+---
+
+Gobblin ETL comes equipped with instrumentation using [Gobblin Metrics](Gobblin-Metrics), as well as extension points to easily extend this instrumentation.
+
+Configuring Metrics and Event emission
+======================================
+
+The following configurations are used for metrics and event emission:
+
+|Configuration Key                | Definition           | Default        |
+|---------------------------------|----------------------|----------------|
+|metrics.enabled                  | Whether metrics are enabled. If false, will not report metrics. | true |
+|metrics.report.interval          | Metrics report interval in milliseconds.    | 30000 |
+|metrics.reporting.file.enabled   | Whether metrics will be reported to a file. | false |
+|metrics.log.dir                  | If file enabled, the directory where metrics will be written. If missing, will not report to file. | N/A |
+|metrics.reporting.kafka.enabled  | Whether metrics will be reported to Kafka. | false |
+|metrics.reporting.kafka.brokers  | Kafka brokers for Kafka metrics emission.  | N/A   |
+|metrics.reporting.kafka.topic.metrics | Kafka topic where metrics (but not events) will be reported. | N/A   |
+|metrics.reporting.kafka.topic.events  | Kafka topic where events (but not metrics) will be reported. | N/A   |
+|metrics.reporting.kafka.format   | Format of metrics / events emitted to Kafka. (Options: json, avro) | json |
+|metrics.reporting.kafka.avro.use.schema.registry | Whether to use a schema registry for Kafka emitting. | false |
+|kafka.schema.registry.url        | If using schema registry, the url of the schema registry. | N/A   |
+|metrics.reporting.jmx.enabled    | Whether to report metrics to JMX.      | false  |
+|metrics.reporting.custom.builders | Comma-separated list of classes for custom metrics reporters. (See [Custom Reporters](Metrics-for-Gobblin-ETL#custom-reporters)) |    |
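+
+For example, a job configuration that reports metrics and events to Kafka in Avro format might include the following settings (shown here as a `java.util.Properties` fragment; broker and topic names are placeholders):
+
+```java
+Properties props = new Properties();
+props.setProperty("metrics.enabled", "true");
+props.setProperty("metrics.report.interval", "60000");
+props.setProperty("metrics.reporting.kafka.enabled", "true");
+props.setProperty("metrics.reporting.kafka.brokers", "broker1:9092,broker2:9092");
+props.setProperty("metrics.reporting.kafka.topic.metrics", "GobblinMetrics");
+props.setProperty("metrics.reporting.kafka.topic.events", "GobblinTrackingEvents");
+props.setProperty("metrics.reporting.kafka.format", "avro");
+```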
+
+ 
+Operational Metrics
+===================
+
+Each construct in a Gobblin ETL run computes metrics regarding its performance / progress. Each metric is tagged by default with the following tags:
+
+* jobName: Gobblin generated name for the job.
+* jobId: Gobblin generated id for the job.
+* clusterIdentifier: string identifying the cluster / host where the job was run. Obtained from the resource manager, the job tracker, or the name of the host.
+* taskId: Gobblin generated id for the task that generated the metric.
+* construct: construct type that generated the metric (e.g. extractor, converter, etc.)
+* class: specific class of the construct that generated the metric.
+* finalMetricReport: metrics are emitted regularly. Sometimes it is useful to select only the last report from each context. To aid with this, some reporters will add this tag with value "true" only to the final report from a metric context.
+
+This is the list of operational metrics implemented by default, grouped by construct.
+
+Extractor Metrics
+-----------------
+* gobblin.extractor.records.read: meter for records read.
+* gobblin.extractor.records.failed: meter for records failed to read.
+* gobblin.extractor.extract.time: timer for reading of records.
+
+Converter Metrics
+-----------------
+* gobblin.converter.records.in: meter for records going into the converter.
+* gobblin.converter.records.out: meter for records outputted by the converter.
+* gobblin.converter.records.failed: meter for records that failed to be converted.
+* gobblin.converter.convert.time: timer for conversion time of each record.
+
+Fork Operator Metrics
+---------------------
+* gobblin.fork.operator.records.in: meter for records going into the fork operator.
+* gobblin.fork.operator.forks.out: meter for records going out of the fork operator (each record is counted once for each fork it is emitted to).
+* gobblin.fork.operator.fork.time: timer for forking of each record.
+
+Row Level Policy Metrics
+------------------------
+* gobblin.qualitychecker.records.in: meter for records going into the row level policy.
+* gobblin.qualitychecker.records.passed: meter for records passing the row level policy check.
+* gobblin.qualitychecker.records.failed: meter for records failing the row level policy check.
+* gobblin.qualitychecker.check.time: timer for row level policy checking of each record.
+
+Data Writer Metrics
+-------------------
+* gobblin.writer.records.in: meter for records requested to be written.
+* gobblin.writer.records.written: meter for records actually written.
+* gobblin.writer.records.failed: meter for records failed to be written.
+* gobblin.writer.write.time: timer for writing each record.
+
+Runtime Events
+==============
+
+The Gobblin ETL runtime emits events marking its progress. All events have the following metadata:
+
+* jobName: Gobblin generated name for the job.
+* jobId: Gobblin generated id for the job.
+* clusterIdentifier: string identifying the cluster / host where the job was run. Obtained from the resource manager, the job tracker, or the name of the host.
+* taskId: Gobblin generated id for the task that generated the metric (if applicable).
+
+This is the list of events that are emitted by the Gobblin runtime:
+
+Job Progression Events
+----------------------
+
+* LockInUse: emitted if a job fails because it fails to get a lock.
+* WorkUnitsMissing: emitted if a job exits because the source failed to get work units.
+* WorkUnitsEmpty: emitted if a job exits because there were no work units to process.
+* WorkUnitsCreated: emitted when workunits are created for a task. Metadata: workUnitsCreated(Number of bin-packed workunits created).
+* TasksSubmitted: emitted when tasks are submitted for execution. Metadata: tasksCount(number of tasks submitted).
+* TaskFailed: emitted when a task fails. Metadata: taskId(id of the failed task).
+* Job_Successful: emitted at the end of a successful job.
+* Job_Failed: emitted at the end of a failed job.
+
+Job Timing Events
+-----------------
+These events give information on timing on certain parts of the execution. Each timing event contains the following metadata:
+
+* startTime: timestamp when the timed processing started.
+* endTime: timestamp when the timed processing finished.
+* durationMillis: duration in milliseconds of the timed processing.
+* eventType: always "timingEvent" for timing events.
+
+The following timing events are emitted:
+
+* FullJobExecutionTimer: times the entire job execution.
+* WorkUnitsCreationTimer: times the creation of work units.
+* WorkUnitsPreparationTime: times the preparation of work units.
+* JobRunTimer: times the actual running of job (i.e. processing of all work units).
+* JobCommitTimer: times the committing of work units.
+* JobCleanupTimer: times the job cleanup.
+* JobLocalSetupTimer: times the setup of a local job.
+* JobMrStagingDataCleanTimer: times the deletion of staging directories from previous work units (MR mode).
+* JobMrDistributedCacheSetupTimer: times the setting up of distributed cache (MR mode).
+* JobMrSetupTimer: times the setup of the MR job (MR mode).
+* JobMrRunTimer: times the execution of the MR job (MR mode).
+
+Customizing Instrumentation
+===========================
+
+Custom constructs
+-----------------
+When using a custom construct (for example a custom extractor for your data source), you will get the above mentioned instrumentation for free. However, you may want to implement additional metrics. To aid with this, instead of extending the usual class Extractor, you can extend the class `gobblin.instrumented.extractor.InstrumentedExtractor`. Similarly, for each construct there is an instrumented version that allows extension of the default metrics ([InstrumentedExtractor](https://githu [...]
+
+All of the instrumented constructs have javadoc providing additional information. In general, when extending an instrumented construct, you will have to implement a different method. For example, when extending an InstrumentedExtractor, instead of implementing `readRecord`, you will implement `readRecordImpl`. To make this clearer for the user, attempting to implement `readRecord` results in a compilation error, and the javadoc of each method specifies the method that should be implemented.
+
+### Instrumentable Interface
+
+Instrumented constructs extend the interface [Instrumentable](https://github.com/apache/gobblin/blob/master/gobblin-core-base/src/main/java/org/apache/gobblin/instrumented/Instrumentable.java). It contains the following methods:
+
+* `getMetricContext()`: get the default metric context generated for that instance of the construct, with all the appropriate tags. Use this metric context to create any additional metrics.
+* `isInstrumentationEnabled()`: returns true if instrumentation is enabled.
+* `switchMetricContext(List<Tag<?>>)`: switches the default metric context returned by `getMetricContext()` to a metric context containing the supplied tags. All default metrics will be reported to the new metric context. This method is useful when the state of a construct changes during the execution, and the user desires to reflect that in the emitted tags (for example, the Kafka extractor can handle multiple topics in the same extractor, and we want to reflect this in the metrics).
+* `switchMetricContext(MetricContext)`: similar to the above method, but uses the supplied metric context instead of generating a new metric context. It is the responsibility of the caller to ensure the new metric context has the correct tags and parent.
+
+The following method can be re-implemented by the user:
+* `generateTags(State)`: this method should return a list of tags to use for metric contexts created for this construct. If overriding this method, it is always a good idea to call `super()` and only append tags to this list.
+
+### Callback Methods
+
+Instrumented constructs have a set of callback methods that are called at different points in the processing of each record, and which can be used to update metrics. For example, the `InstrumentedExtractor` has the callbacks `beforeRead()`, `afterRead(D, long)`, and `onException(Exception)`. The javadoc for the instrumented constructs has further descriptions for each callback. Users should always call the corresponding `super` method when overriding these callbacks, as the default metrics depend on it.
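+
+As a hedged sketch, an overridden callback inside a subclass of `InstrumentedExtractor` could update an additional metric while preserving the default instrumentation (the record type, the `isOversized()` check, the parameter name, and the metric name are placeholders; the rest of the class is omitted):
+
+```java
+@Override
+public void afterRead(MyRecord record, long startTimeNanos) {
+  super.afterRead(record, startTimeNanos);  // keep the default extractor metrics updated
+  if (record != null && record.isOversized()) {
+    // Additional metric created on the construct's own metric context.
+    getMetricContext().counter("my.extractor.oversized.records").inc();
+  }
+}
+```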
+
+Custom Reporters
+----------------
+
+Besides the reporters implemented by default (file, Kafka, and JMX), users can add custom reporters to the classpath and instruct Gobblin to use these reporters. To do this, users should implement the interface [CustomReporterFactory](https://github.com/apache/gobblin/blob/master/gobblin-metrics-libs/gobblin-metrics-base/src/main/java/org/apache/gobblin/metrics/CustomReporterFactory.java), and specify a comma-separated list of CustomReporterFactory classes in the configuration key `metrics. [...]
+
+Gobblin will automatically search for these CustomReporterFactory implementations, instantiate each one with a parameter-less constructor, and then call the method `newScheduledReporter(MetricContext, Properties)`, where the properties contain all of the input configurations supplied to Gobblin. Gobblin will then manage this `ScheduledReporter`.
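+
+A minimal sketch of such a factory, assuming the method signature described above (the class name, the choice of reporter, and the exception handling are illustrative):
+
+```java
+public class StdoutReporterFactory implements CustomReporterFactory {
+
+  // Gobblin instantiates the factory through this parameter-less constructor.
+  public StdoutReporterFactory() {}
+
+  @Override
+  public ScheduledReporter newScheduledReporter(MetricContext context, Properties properties) {
+    try {
+      // Build any ScheduledReporter here; this sketch reuses the bundled output stream event reporter.
+      return OutputStreamEventReporter.forContext(context).build();
+    } catch (Exception e) {
+      throw new RuntimeException("Failed to build reporter", e);
+    }
+  }
+}
+```
+
+The factory class is then listed in `metrics.reporting.custom.builders` so that Gobblin can discover and instantiate it.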
diff --git a/gobblin-website/docs/miscellaneous/Camus-to-Gobblin-Migration.md b/gobblin-website/docs/miscellaneous/Camus-to-Gobblin-Migration.md
new file mode 100644
index 0000000..d1de974
--- /dev/null
+++ b/gobblin-website/docs/miscellaneous/Camus-to-Gobblin-Migration.md
@@ -0,0 +1,106 @@
+---
+title: Camus to Gobblin Migration
+sidebar_label: Camus to Gobblin Migration
+---
+
+This page is a guide for [Camus](https://github.com/linkedin/camus) → Gobblin migration, intended for users and organizations currently using Camus. Camus is LinkedIn's previous-generation Kafka-HDFS pipeline.
+
+It is recommended that one read [Kafka-HDFS Ingestion](../case-studies/Kafka-HDFS-Ingestion) before reading this page. This page focuses on the Kafka-related configuration properties in Gobblin vs Camus.
+
+## Advantages of Migrating to Gobblin
+
+**Operability**: Gobblin is a generic data ingestion pipeline that supports not only Kafka but several other data sources, and new data sources can be easily added. If you have multiple data sources, using a single tool to ingest data from these sources is a lot more pleasant operationally than deploying a separate tool for each source.
+
+**Performance**: The performance of Gobblin in MapReduce mode is comparable to Camus', and faster in some cases (e.g., when the time to pull a topic is not proportional to its average record size) due to a better mapper load balancing algorithm. In the new continuous ingestion mode (currently under development), the performance of Gobblin will further improve.
+
+**Metrics and Monitoring**: Gobblin has a powerful end-to-end metrics collection and reporting module for monitoring purposes, making it much easier to spot problems in time and find the root causes. See the "Gobblin Metrics" section in the wiki and [this post](../metrics/Gobblin-Metrics-next-generation-instrumentation-for-applications) for more details.
+
+**Features**: In addition to the above, there are several other useful features for Kafka-HDFS ingestion in Gobblin that are not available in Camus, e.g., [handling late events in data compaction](../user-guide/Compaction#handling-late-records); dataset retention management; converter and quality checker; all-or-nothing job commit policy, etc. Also, Gobblin is under active development and new features are added frequently.
+
+## Kafka Ingestion Related Job Config Properties
+
+This list contains Kafka-specific properties. For general configuration properties please refer to [Configuration Properties Glossary](../user-guide/Configuration-Properties-Glossary).
+
+### Config properties for pulling Kafka topics
+
+| Gobblin Property   |  Corresponding Camus Property | Default value |
+|----------|-------------|:------:|
+| topic.whitelist |  kafka.whitelist.topics | .*|
+| topic.blacklist |  kafka.blacklist.topics  | a^ |
+| mr.job.max.mappers | mapred.map.tasks | 100 |
+| kafka.brokers  | kafka.host.url | (required) |
+| topics.move.to.latest.offset  | kafka.move.to.last.offset.list | empty |
+| bootstrap.with.offset  | none | latest |
+| reset.on.offset.out.of.range | none | nearest |
+
+Remarks:
+
+* topic.whitelist and topic.blacklist support regular expressions.
+* topics.move.to.latest.offset: Topics in this list will always start from the latest offset (i.e., no records will be pulled). To move all topics to the latest offset, use "all". This property is useful in Camus for moving a new topic to the latest offset, but in Gobblin it should rarely, if ever, be used, since you can use bootstrap.with.offset to achieve the same purpose more conveniently.
+* bootstrap.with.offset: For new topics / partitions, this property controls whether they start at the earliest offset or the latest offset. Possible values: earliest, latest, skip.
+* reset.on.offset.out.of.range: This property controls what to do if a partition's previously persisted offset is out of the range of the currently available offsets. Possible values: earliest (always move to earliest available offset), latest (always move to latest available offset), nearest (move to earliest if the previously persisted offset is smaller than the earliest offset, otherwise move to latest), skip (skip this partition).
+
+### Config properties for compaction
+
+Gobblin compaction is comparable to Camus sweeper, which can deduplicate records in an input folder. Compaction is useful for Kafka-HDFS ingestion for two reasons:
+
+1. Although Gobblin guarantees no loss of data, in rare circumstances where data is published on HDFS but checkpoints fail to be persisted into the state store, it may pull the same records twice.
+
+2. If you have a hierarchy of Kafka clusters where topics are replicated among the Kafka clusters, duplicate records may be generated during replication.
+
+Below are the configuration properties related to compaction. For more information please visit the MapReduce Compaction section in the [Compaction](../user-guide/Compaction) page.
+
+| Gobblin Property   |  Corresponding Camus Property | Default value |
+|----------|-------------|:------:|
+| compaction.input.dir |  camus.sweeper.source.dir | (required) |
+| compaction.dest.dir |  camus.sweeper.dest.dir | (required) |
+| compaction.input.subdir |  camus.sweeper.source.dir | hourly |
+| compaction.dest.subdir |  camus.sweeper.dest.dir | daily |
+| compaction.tmp.dest.dir | camus.sweeper.tmp.dir | /tmp/gobblin-compaction |
+| compaction.whitelist |  camus.sweeper.whitelist | .* |
+| compaction.blacklist |  camus.sweeper.blacklist | a^ |
+| compaction.high.priority.topics | none |a^|
+| compaction.normal.priority.topics | none |a^|
+| compaction.input.deduplicated | none | false |
+| compaction.output.deduplicated | none | true |
+| compaction.file.system.uri | none ||
+| compaction.timebased.max.time.ago |  none | 3d |
+| compaction.timebased.min.time.ago | none | 1d |
+| compaction.timebased.folder.pattern | none | YYYY/mm/dd |
+| compaction.thread.pool.size | num.threads | 20 |
+| compaction.max.num.reducers | max.files | 900 |
+| compaction.target.output.file.size | camus.sweeper.target.file.size | 268435456 |
+| compaction.mapred.min.split.size | mapred.min.split.size | 268435456 |
+| compaction.mapred.max.split.size | mapred.max.split.size | 268435456 |
+| compaction.mr.job.timeout.minutes | none | |
+
+Remarks:
+
+* The following properties support regex: compaction.whitelist, compaction.blacklist, compaction.high.priority.topics, compaction.normal.priority.topics
+* compaction.input.dir is the parent folder of input topics, e.g., /data/kafka_topics, which contains topic folders such as /data/kafka_topics/Topic1, /data/kafka_topics/Topic2, etc. Note that Camus uses camus.sweeper.source.dir both as the input folder of Camus sweeper (i.e., compaction), and as the output folder for ingesting Kafka topics. In Gobblin, one should use data.publisher.final.dir as the output folder for ingesting Kafka topics.
+* compaction.dest.dir is the parent folder of output topics, e.g., /data/compacted_kafka_topics.
+* compaction.input.subdir is the subdir name of the input topics, if one exists. For example, if the input topics are partitioned by hour, e.g., /data/kafka_topics/Topic1/hourly/2015/10/06/20, then compaction.input.subdir should be 'hourly'.
+* compaction.dest.subdir is the subdir name of the output topics, if one exists. For example, if you want to publish compacted data into day-partitioned folders, e.g., /data/compacted_kafka_topics/Topic1/daily/2015/10/06, then compaction.dest.subdir should be 'daily'.
+* There are 3 priority levels: high, normal, low. Topics not included in compaction.high.priority.topics or compaction.normal.priority.topics are considered low priority.
+* compaction.input.deduplicated and compaction.output.deduplicated control the behavior of the compaction regarding deduplication. Please see the [Compaction](../user-guide/Compaction) page for more details.
+* compaction.timebased.max.time.ago and compaction.timebased.min.time.ago control the earliest and latest input folders to process, when using `MRCompactorTimeBasedJobPropCreator`. The format is ?m?d?h, e.g., 3m or 2d10h (m = month, not minute). For example, suppose `compaction.timebased.max.time.ago=3d`, `compaction.timebased.min.time.ago=1d` and the current time is 10/07 9am. Folders whose timestamps are before 10/04 9am, or folders whose timestamps are after 10/06 9am, will not be processed.
+* compaction.timebased.folder.pattern: time pattern in the folder path, when using `MRCompactorTimeBasedJobPropCreator`. This should come after `compaction.input.subdir`, e.g., if the input folder to a compaction job is `/data/compacted_kafka_topics/Topic1/daily/2015/10/06`, this property should be `YYYY/mm/dd`.
+* compaction.thread.pool.size: how many compaction MR jobs to run concurrently.
+* compaction.max.num.reducers: max number of reducers for each compaction job.
+* compaction.target.output.file.size: This also controls the number of reducers. The number of reducers will be the smaller of `compaction.max.num.reducers` and `<input data size> / compaction.target.output.file.size`.
+* compaction.mapred.min.split.size and compaction.mapred.max.split.size are used to control the number of mappers.
+
+## Deployment and Checkpoint Management
+
+For deploying Gobblin in standalone or MapReduce mode, please see the [Deployment](../user-guide/Gobblin-Deployment) page.
+
+Gobblin and Camus checkpoint management are similar in the sense that they both create checkpoint files in each run, and the next run will load the checkpoint files created by the previous run and start from there. Their difference is that Gobblin creates a single checkpoint file per job run or per dataset per job run, and provides two job commit policies: `full` and `partial`. In `full` mode, data are only committed for the job/dataset if all workunits of the job/dataset succeeded. Other [...]
+
+## Migrating from Camus to Gobblin in Production
+
+If you are currently running in production, you can use the following steps to migrate to Gobblin:
+
+1. Deploy Gobblin based on the instructions in [Deployment](../user-guide/Gobblin-Deployment) and [Kafka-HDFS Ingestion](../case-studies/Kafka-HDFS-Ingestion), and set the properties mentioned in this page as well as other relevant properties in [Configuration Glossary](../user-guide/Configuration-Properties-Glossary) to the appropriate values.
+2. Whitelist the topics in Gobblin ingestion, and schedule Gobblin to run at your desired frequency.
+3. Once Gobblin starts running, blacklist these topics in Camus.
+4. If compaction is applicable to you, set up the compaction jobs based on instructions in [Kafka-HDFS Ingestion](../case-studies/Kafka-HDFS-Ingestion) and [Compaction](../user-guide/Compaction). Whitelist the topics you want to migrate in Gobblin and blacklist them in Camus.
diff --git a/gobblin-website/docs/miscellaneous/Exactly-Once-Support.md b/gobblin-website/docs/miscellaneous/Exactly-Once-Support.md
new file mode 100644
index 0000000..d2b60b4
--- /dev/null
+++ b/gobblin-website/docs/miscellaneous/Exactly-Once-Support.md
@@ -0,0 +1,173 @@
+---
+title: Exactly Once Support
+sidebar_label: Exactly Once Support
+---
+
+This page outlines the design for exactly-once support in Gobblin. 
+
+Currently the flow of publishing data in Gobblin is:
+
+1. DataWriter writes to staging folder 
+2. DataWriter moves files from staging folder to task output folder
+3. Publisher moves files from task output folder to job output folder
+4. Persists checkpoints (watermarks) to state store
+5. Delete staging folder and task-output folder.
+
+This flow does not theoretically guarantee exactly-once delivery; rather, it guarantees at-least-once delivery. If something bad happens in step 4, or between steps 3 and 4, it is possible that data is published but checkpoints are not, and the next run will re-extract and re-publish those records.
+
+To guarantee exactly-once, steps 3 & 4 should be atomic.
+
+## Achieving Exactly-Once Delivery with `CommitStepStore`
+
+The idea is similar to write-ahead logging. Before doing the atomic steps (i.e., steps 3 & 4), first write all these steps (referred to as `CommitStep`s) into a `CommitStepStore`. In this way, if a failure happens during the atomic steps, the next run can finish the remaining steps before ingesting more data for this dataset.
+
+**Example**: Suppose we have a Kafka-HDFS ingestion job, where each Kafka topic is a dataset. Suppose a task generates three output files for topic 'MyTopic':
+
+```text
+task-output/MyTopic/2015-12-09/1.avro
+task-output/MyTopic/2015-12-09/2.avro
+task-output/MyTopic/2015-12-10/1.avro
+```
+
+which should be published to
+```text
+job-output/MyTopic/2015-12-09/1.avro
+job-output/MyTopic/2015-12-09/2.avro
+job-output/MyTopic/2015-12-10/1.avro
+```
+
+Also suppose this topic has two partitions, and their checkpoints, i.e., the actual high watermarks, are `offset=100` and `offset=200`.
+
+In this case, there will be 5 CommitSteps for this dataset:
+
+1. `FsRenameCommitStep`: rename `task-output/MyTopic/2015-12-09/1.avro` to `job-output/MyTopic/2015-12-09/1.avro`
+2. `FsRenameCommitStep`: rename `task-output/MyTopic/2015-12-09/2.avro` to `job-output/MyTopic/2015-12-09/2.avro`
+3. `FsRenameCommitStep`: rename `task-output/MyTopic/2015-12-10/1.avro` to `job-output/MyTopic/2015-12-10/1.avro`
+4. `HighWatermarkCommitStep`: set the high watermark for partition `MyTopic:0 = 100`
+5. `HighWatermarkCommitStep`: set the high watermark for partition `MyTopic:1 = 200`
+
+If all these `CommitStep`s are successful, we can proceed with deleting task-output folder and deleting the above `CommitStep`s from the `CommitStepStore`. If any of these steps fails, these steps will not be deleted. When the next run starts, for each dataset, it will check whether there are `CommitStep`s for this dataset in the CommitStepStore. If there are, it means the previous run may not have successfully executed some of these steps, so it will verify whether each step has been do [...]
+
+## Scalability
+
+The above approach potentially affects scalability for two reasons:
+
+1. The driver needs to write all `CommitStep`s to the `CommitStepStore` for each dataset, once it determines that all tasks for the dataset have finished. This may cause scalability issues if there are too many `CommitStep`s, too many datasets, or too many tasks.
+2. Upon the start of the next run, the driver needs to verify all `CommitStep`s and redo the `CommitStep`s that the previous run failed to do. This may also cause scalability issues if there are too many `CommitStep`s.
+
+Both issues can be resolved by moving the majority of the work to containers, rather than doing it in the driver. 
+
+For #1, we can make each container responsible for writing `CommitStep`s for a subset of the datasets. Each container will keep polling the `TaskStateStore` to determine whether all tasks for each dataset that it is responsible for have finished, and if so, it writes `CommitStep`s for this dataset to the `CommitStepStore`.
+
+For #2, the work can also easily be parallelized by making each container responsible for a subset of the datasets.
+
+## APIs
+
+**CommitStep**:
+```java
+/**
+ * A step during committing in a Gobblin job that should be atomically executed with other steps.
+ */
+public abstract class CommitStep {
+
+  private static final Gson GSON = new Gson();
+
+  public static abstract class Builder<T extends Builder<?>> {
+  }
+
+  protected CommitStep(Builder<?> builder) {
+  }
+
+  /**
+   * Verify whether the CommitStep has been done.
+   */
+  public abstract boolean verify() throws IOException;
+
+  /**
+   * Execute a CommitStep.
+   */
+  public abstract boolean execute() throws IOException;
+
+  public static CommitStep get(String json, Class<? extends CommitStep> clazz) throws IOException {
+    return GSON.fromJson(json, clazz);
+  }
+}
+```
+
+**CommitSequence**:
+```java
+@Slf4j
+public class CommitSequence {
+  private final String storeName;
+  private final String datasetUrn;
+  private final List<CommitStep> steps;
+  private final CommitStepStore commitStepStore;
+
+  public CommitSequence(String storeName, String datasetUrn, List<CommitStep> steps, CommitStepStore commitStepStore) {
+    this.storeName = storeName;
+    this.datasetUrn = datasetUrn;
+    this.steps = steps;
+    this.commitStepStore = commitStepStore;
+  }
+
+  public boolean commit() {
+    try {
+      for (CommitStep step : this.steps) {
+        if (!step.verify()) {
+          step.execute();
+        }
+      }
+      this.commitStepStore.remove(this.storeName, this.datasetUrn);
+      return true;
+    } catch (Throwable t) {
+      log.error("Commit failed for dataset " + this.datasetUrn, t);
+      return false;
+    }
+  }
+}
+```
+
+**CommitStepStore**:
+```java
+/**
+ * A store for {@link CommitStep}s.
+ */
+public interface CommitStepStore {
+
+  /**
+   * Create a store with the given name.
+   */
+  public boolean create(String storeName) throws IOException;
+
+  /**
+   * Create a new dataset URN in a store.
+   */
+  public boolean create(String storeName, String datasetUrn) throws IOException;
+
+  /**
+   * Whether a dataset URN exists in a store.
+   */
+  public boolean exists(String storeName, String datasetUrn) throws IOException;
+
+  /**
+   * Remove a given store.
+   */
+  public boolean remove(String storeName) throws IOException;
+
+  /**
+   * Remove all {@link CommitStep}s for the given dataset URN from the store.
+   */
+  public boolean remove(String storeName, String datasetUrn) throws IOException;
+
+  /**
+   * Put a {@link CommitStep} with the given dataset URN into the store.
+   */
+  public boolean put(String storeName, String datasetUrn, CommitStep step) throws IOException;
+
+  /**
+   * Get the {@link CommitSequence} associated with the given dataset URN in the store.
+   */
+  public CommitSequence getCommitSequence(String storeName, String datasetUrn) throws IOException;
+
+}
+```
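+
+For illustration, a concrete step such as the `FsRenameCommitStep` used in the example above might be sketched against this API as follows. This is a hypothetical sketch, not the actual Gobblin implementation; the builder methods and the Hadoop `FileSystem`/`Path` usage are assumptions.
+
+```java
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Illustrative sketch of a CommitStep that renames a task-output file to its
+ * final job-output location.
+ */
+public class FsRenameCommitStep extends CommitStep {
+
+  private final FileSystem fs;
+  private final Path src;
+  private final Path dst;
+
+  public static class Builder extends CommitStep.Builder<Builder> {
+    private FileSystem fs;
+    private Path src;
+    private Path dst;
+
+    public Builder withFileSystem(FileSystem fs) { this.fs = fs; return this; }
+    public Builder from(Path src) { this.src = src; return this; }
+    public Builder to(Path dst) { this.dst = dst; return this; }
+    public FsRenameCommitStep build() { return new FsRenameCommitStep(this); }
+  }
+
+  private FsRenameCommitStep(Builder builder) {
+    super(builder);
+    this.fs = builder.fs;
+    this.src = builder.src;
+    this.dst = builder.dst;
+  }
+
+  /** The step is considered done once the destination exists and the source is gone. */
+  @Override
+  public boolean verify() throws IOException {
+    return this.fs.exists(this.dst) && !this.fs.exists(this.src);
+  }
+
+  /** Rename (move) the task-output file into the job-output directory. */
+  @Override
+  public boolean execute() throws IOException {
+    return this.fs.rename(this.src, this.dst);
+  }
+}
+```
+
+The driver would then construct one such step per rename, `put` them into the `CommitStepStore` for the dataset, and later retrieve them as a `CommitSequence` via `getCommitSequence` when committing the dataset.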
diff --git a/gobblin-website/docs/project/Feature-List.md b/gobblin-website/docs/project/Feature-List.md
new file mode 100644
index 0000000..57014a3
--- /dev/null
+++ b/gobblin-website/docs/project/Feature-List.md
@@ -0,0 +1,25 @@
+---
+title: Feature List
+---
+
+Currently, Gobblin supports the following features:
+
+# Different Data Sources
+
+|Source Type|Protocol|Vendors|
+|-----------|--------|-------|
+|RDBMS|JDBC|MySQL/SQLServer|
+|Files|HDFS/SFTP/LocalFS|N/A|
+|Salesforce|REST|Salesforce|
+
+* Different Pulling Types
+    * SNAPSHOT-ONLY: Pull the snapshot of one dataset.
+    * SNAPSHOT-APPEND: Pull delta changes since last run, optionally merge delta changes into snapshot (Delta changes include updates to the dataset since last run).
+    * APPEND-ONLY: Pull delta changes since last run, and append to dataset.
+
+* Different Deployment Types
+    * Standalone deployment on a single machine
+    * Cluster deployment on Hadoop 2.3.0
+
+* Compaction
+    * Merge delta changes into snapshot.
diff --git a/gobblin-website/docs/project/Posts.md b/gobblin-website/docs/project/Posts.md
new file mode 100644
index 0000000..4ef894e
--- /dev/null
+++ b/gobblin-website/docs/project/Posts.md
@@ -0,0 +1,5 @@
+---
+title: Posts
+---
+
+* [Gobblin Metrics: next generation instrumentation for applications](../metrics/Gobblin-Metrics-next-generation-instrumentation-for-applications)
diff --git a/gobblin-website/docs/project/Talks-and-Tech-Blogs.md b/gobblin-website/docs/project/Talks-and-Tech-Blogs.md
new file mode 100644
index 0000000..b4f28e8
--- /dev/null
+++ b/gobblin-website/docs/project/Talks-and-Tech-Blogs.md
@@ -0,0 +1,19 @@
+---
+title: Talks and Tech Blog Posts
+---
+
+Gobblin Talks and Tech Blogs
+
+* QCon presentation <small><b>| [Presentation](http://www.slideshare.net/LinQiao1/gobblin-big-data-with-ease) | (Nov 5th, 2014)</b></small>
+* Engineering Blog Post <small><b>| [Presentation](http://engineering.linkedin.com/data-ingestion/gobblin-big-data-ease) | (Nov 25th, 2014)</b></small>
+* Bigger, Faster, Easier: Building a Real-Time Self Service Data Analytics Ecosystem at LinkedIn <small><b>| [Presentation](http://www.slideshare.net/Hadoop_Summit/bigger-faster-easier-building-a-realtime-self-service-data-analytics-ecosystem-at-linkedin?qid=9c8f8c33-0083-495b-a6e2-572ac45f7f2c&v=qf1&b=&from_search=9) | (Hadoop Summit 2015)</b></small>
+* Gobblin: Unifying Data Ingestion for Hadoop <small><b>| [Presentation](http://www.vldb.org/pvldb/vol8/p1764-qiao.pdf) | (VLDB 2015)</b></small>
+* Gobblin: Unifying Data Ingestion for Hadoop <small><b>| [Presentation](http://www.slideshare.net/YinanLi/gobblin-unifying-data-ingestion-for-hadoop) | (VLDB 2015 slides)</b></small>
+* Ingestion from Kafka using Gobblin <small><b>| [Presentation](http://www.slideshare.net/ZiyangLiu1/ingestion-from-kafka-using-gobblin?qid=b7dce13f-85f6-49f2-94df-feedd6057cbe&v=qf1&b=&from_search=4) | (Gobblin Meetup November 2015)</b></small>
+* Gobblin on Yarn: A Preview <small><b>| [Presentation](http://www.slideshare.net/YinanLi/gobblinmeetupyarn?qid=bda2e238-f302-402b-8c02-9dca1a3b7f4e&v=qf1&b=&from_search=6) | (Gobblin Meetup November 2015)</b></small>
+* Gobblin@NerdWallet  External Use Case 1 <small><b>| [Presentation](http://www.slideshare.net/NerdWalletHQ/gobblin-nerdwallet-nov-2015?qid=33ba50e5-8122-4668-89d5-bbf3302adb31&v=default&b=&from_search=2) | (Gobblin Meetup November 2015)</b></small>
+* Gobblin@Intel  External Use Case 2 <small><b>| [Presentation](http://www.slideshare.net/IntelITCenter/gobblin-for-data-analytics) | (Gobblin Meetup November 2015)</b></small>
+* Gobblin: Beyond ingest to big data management <small><b>| [Video](https://www.youtube.com/watch?v=MvohU8rSFqU) | [Presentation](http://www.slideshare.net/VasanthRajamani/gobblin-meetupwhats-new-in-07) | (Gobblin Meetup June 2016)</b></small>
+* Gobblin: Inter cluster replication <small><b>| [Video](https://www.youtube.com/watch?v=o1BnaovUObE) | [Presentation](http://www.slideshare.net/VasanthRajamani/distcp-gobblin) | (Gobblin Meetup June 2016)</b></small>
+* Gobblin: Configuration and Orchestration <small><b>| [Video](https://www.youtube.com/watch?v=O9KbPDLsy_c) | [Presentation](http://www.slideshare.net/VasanthRajamani/gobbin-configmeetupjune2016) | (Gobblin Meetup June 2016)</b></small>
+* Gobblin on AWS <small><b>| [Video](https://youtu.be/_c7agtS5bI8?t=107) | [Presentation](http://www.slideshare.net/VasanthRajamani/gobblin-onaws-63970489) | (Gobblin Meetup June 2016)</b></small>
diff --git a/gobblin-website/docs/sinks/AvroHdfsDataWriter.md b/gobblin-website/docs/sinks/AvroHdfsDataWriter.md
new file mode 100644
index 0000000..e5af4b4
--- /dev/null
+++ b/gobblin-website/docs/sinks/AvroHdfsDataWriter.md
@@ -0,0 +1,28 @@
+---
+title: Avro HDFS
+sidebar_label: Avro HDFS
+---
+
+# Description
+
+Writes Avro records to Avro data files on Hadoop file systems.
+
+
+# Usage
+
+```properties
+    writer.builder.class=org.apache.gobblin.writer.AvroDataWriterBuilder
+    writer.destination.type=HDFS
+```
+
+For more info, see [`AvroHdfsDataWriter`](https://github.com/apache/gobblin/search?utf8=%E2%9C%93&q=AvroHdfsDataWriter)
+
+
+# Configuration
+
+
+| Key | Type | Description | Default Value |
+|-----|------|-------------|---------------|
+| writer.codec.type | One of null,deflate,snappy,bzip2,xz | Type of the compression codec | deflate |
+| writer.deflate.level | 1-9 | The compression level for the "deflate" codec | 9 |
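+
+For example, to switch from the default deflate codec to snappy, you might set (value taken from the options in the table above):
+
+```properties
+writer.codec.type=snappy
+```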
+
diff --git a/gobblin-website/docs/sinks/ConsoleWriter.md b/gobblin-website/docs/sinks/ConsoleWriter.md
new file mode 100644
index 0000000..3f3400a
--- /dev/null
+++ b/gobblin-website/docs/sinks/ConsoleWriter.md
@@ -0,0 +1,15 @@
+---
+title: Console
+sidebar_label: Console
+---
+
+# Description
+
+A simple implementation that writes records to stdout.
+
+# Usage
+
+```properties
+    writer.builder.class=org.apache.gobblin.writer.ConsoleWriterBuilder
+```
diff --git a/gobblin-website/docs/sinks/CouchbaseWriter.md b/gobblin-website/docs/sinks/CouchbaseWriter.md
new file mode 100644
index 0000000..4ff36f0
--- /dev/null
+++ b/gobblin-website/docs/sinks/CouchbaseWriter.md
@@ -0,0 +1,150 @@
+---
+title: Couchbase
+sidebar_label: Couchbase 
+---
+
+# Introduction
+The [CouchbaseWriter](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-couchbase/src/main/java/org/apache/gobblin/couchbase/writer/CouchbaseWriter.java) supports writing documents to a Couchbase bucket through the [Couchbase Java SDK](https://docs.couchbase.com/java-sdk/current/start-using-sdk.html). Note that CouchbaseWriter only supports writing to a single bucket, as there should be only one CouchbaseEnvironment per JVM.
+
+
+# Record format
+The Couchbase writer currently supports `AVRO` and `JSON` as data inputs. For both, it requires the following structured schema:
+
+
+| Document field | Description |
+| -------------- | ----------- |
+| `key` | Unique key used to store the document on the bucket. For more info view [Couchbase docs](https://developer.couchbase.com/documentation/server/3.x/developer/dev-guide-3.0/keys-values.html)  |
+| `data.data` | Object or value containing the information associated with the `key` for this document |
+| `data.flags` | [Couchbase flags](https://docs.couchbase.com/server/4.1/developer-guide/transcoders.html). To store JSON in `data.data` use `0x02 << 24`; for UTF-8 plain text use `0x04 << 24`. |
+
+The following is a sample input record with JSON data
+
+```json
+{
+ "key": "myKey123",
+ "data": {
+    "data": {
+        "field1": "field1Value",
+        "field2": 123
+    },
+    "flags": 33554432
+  }
+}
+```
+
+or to store plain text:
+
+```json
+{
+ "key": "myKey123",
+ "data": {
+    "data": "singleValueData",
+    "flags": 67108864
+  }
+}
+```
+
+If using AVRO, use the following schema:
+
+```json
+{
+  "type" : "record",
+  "name" : "topLevelRecord",
+  "fields" : [ {
+    "name" : "key",
+    "type" : "string"
+  }, {
+    "name" : "data",
+    "type" : {
+      "type" : "record",
+      "name" : "data",
+      "namespace" : "topLevelRecord",
+      "fields" : [ {
+        "name" : "data",
+        "type" : [ "bytes", "null" ]
+      }, {
+        "name" : "flags",
+        "type" : "int"
+      } ]
+    }
+  } ]
+}
+```
+Note that the key can be of a type other than string if needed.
+
+# Configuration
+## General configuration values
+| Configuration Key | Default Value | Description |
+| ----------------- | ------------- | ----------- |
+| `writer.couchbase.bucket` | Optional | Name of the couchbase bucket. Change if using other than default bucket |
+| `writer.couchbase.default` | `"default"` | Name of the default bucket if `writer.couchbase.bucket` is not provided |
+| `writer.couchbase.dnsSrvEnabled` | `"false"` | Enable DNS SRV bootstrapping [docs](https://docs.couchbase.com/java-sdk/current/managing-connections.html) | 
+| `writer.couchbase.bootstrapServers` | `localhost` | URL to bootstrap servers. If using DNS SRV set `writer.couchbase.dnsSrvEnabled` to true |
+| `writer.couchbase.sslEnabled` | `false` | Use SSL to connect to couchbase |
+| `writer.couchbase.password` | Optional | Bucket password. Will be ignored if `writer.couchbase.certAuthEnabled` is true |
+| `writer.couchbase.certAuthEnabled` | `false` | Set to true if using certificate authentication. Must also specify `writer.couchbase.sslKeystoreFile`, `writer.couchbase.sslKeystorePassword`, `writer.couchbase.sslTruststoreFile`, and `writer.couchbase.sslTruststorePassword` |
+| `writer.couchbase.sslKeystoreFile` | Optional | Path to the keystore file location |
+| `writer.couchbase.sslKeystorePassword` | Optional | Keystore password |
+| `writer.couchbase.sslTruststoreFile` | Optional | Path to the trustStore file location |
+| `writer.couchbase.sslTruststorePassword` | Optional | TrustStore password |
+| `writer.couchbase.documentTTL` | `0` | Time To Live of each document. Units are specified in `writer.couchbase.documentTTLUnits` |
+| `writer.couchbase.documentTTLUnits` | `SECONDS` | Unit for `writer.couchbase.documentTTL`. Must be one of [java.util.concurrent.TimeUnit](https://docs.oracle.com/javase/7/docs/api/java/util/concurrent/TimeUnit.html). Case insensitive |
+| `writer.couchbase.documentTTLOriginField` | Optional | Field in the record that holds the origin timestamp from which the TTL is counted. Units of that timestamp are specified in `writer.couchbase.documentTTLOriginUnits` |
+| `writer.couchbase.documentTTLOriginUnits` | `MILLISECONDS` | Unit for the timestamp in `writer.couchbase.documentTTLOriginField`. Must be one of [java.util.concurrent.TimeUnit](https://docs.oracle.com/javase/7/docs/api/java/util/concurrent/TimeUnit.html). Case insensitive. As an example, a `writer.couchbase.documentTTLOriginField` value of `1568240399000` with `MILLISECONDS` as the unit corresponds to `Wed Sep 11 15:19:59 PDT 2019` |
+| `writer.couchbase.retriesEnabled` | `false` | Enable write retries on failures |
+| `writer.couchbase.maxRetries` | `5` | Maximum number of retries |
+| `writer.couchbase.failureAllowancePercentage` | `0.0` | The percentage of failures that you are willing to tolerate while writing to Couchbase. Gobblin will mark the workunit successful and move on if there are failures, but not enough to trip the failure threshold. Only successfully acknowledged writes are counted as successful; all others are considered failures. For example, if the value is set to 0.2, then as long as 80% of the writes are acknowledged by Couchbase, Gobblin will consider the workunit successful |
+| `operationTimeoutMillis` | `10000` | Global timeout for Couchbase communication operations |
+
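+As a rough illustration, a minimal set of these properties might look like the following (a sketch only; the `writer.builder.class` value for the Couchbase writer is not shown here and should be taken from the `gobblin-couchbase` module):
+
+```properties
+# Sketch assembled from the keys in the table above
+writer.couchbase.bootstrapServers=localhost
+writer.couchbase.bucket=default
+writer.couchbase.retriesEnabled=true
+writer.couchbase.maxRetries=5
+```
+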
+## Authentication
+### No credentials
+NOT RECOMMENDED FOR PRODUCTION.
+
+Do not set `writer.couchbase.certAuthEnabled` or `writer.couchbase.password`.
+### Using certificates
+Set `writer.couchbase.certAuthEnabled` to `true` and values for `writer.couchbase.sslKeystoreFile`, `writer.couchbase.sslKeystorePassword`, `writer.couchbase.sslTruststoreFile`, and `writer.couchbase.sslTruststorePassword`.
+
+The `writer.couchbase.password` setting will be ignored if `writer.couchbase.certAuthEnabled` is set.
+### Using bucket password
+Set `writer.couchbase.password`
+
+## Document level expiration
+The Couchbase writer allows setting expiration at the document level using the [expiry](https://docs.couchbase.com/java-sdk/current/document-operations.html) property of the Couchbase document. Please note that the current Couchbase implementation, which uses timestamps, limits expiration to January 19, 2038 03:14:07 GMT because the expiry field is an int. CouchbaseWriter only works with absolute timestamps and, for simplicity, does not use relative expiration in seconds (< 30 days).
+Currently the following modes are supported:
+### 1 - Expiration from write time
+Define only `writer.couchbase.documentTTL` and `writer.couchbase.documentTTLUnits`. For example, for a 2-day expiration the configuration would look like:
+
+| Configuration Key | Value |
+| ----------------- | ------------- |
+| `writer.couchbase.documentTTL` | `2` |
+| `writer.couchbase.documentTTLUnits` | `DAYS` |
+
+### 2 - Expiration from an origin timestamp
+Define `writer.couchbase.documentTTL`, `writer.couchbase.documentTTLUnits`, and `writer.couchbase.documentTTLOriginField` (plus `writer.couchbase.documentTTLOriginUnits` if the origin timestamp is not in milliseconds).
+
+For example, for a 2-day expiration counted from the `header.time` field, which holds a timestamp in MILLISECONDS, the configuration would look like:
+
+| Configuration Key | Value |
+| ----------------- | ------------- |
+| `writer.couchbase.documentTTL` | `2` |
+| `writer.couchbase.documentTTLUnits` | `DAYS` |
+| `writer.couchbase.documentTTLOriginField` | `header.time` |
+| `writer.couchbase.documentTTLOriginUnits` | `MILLISECONDS` |
+
+So a sample document with origin on 1568240399 (Wed Sep 11 15:19:59 PDT 2019) would expire on 1568413199 (Fri Sep 13 15:19:59 PDT 2019). The following is a sample record format.
+
+```json
+{
+ "key": "sampleKey",
+ "data": {
+    "data": {
+        "field1": "field1Value",
+        "header": {
+            "time": 1568240399000
+        }
+    },
+    "flags": 33554432
+  }
+}
+```
+
diff --git a/gobblin-website/docs/sinks/Gobblin-JDBC-Writer.md b/gobblin-website/docs/sinks/Gobblin-JDBC-Writer.md
new file mode 100644
index 0000000..833de18
--- /dev/null
+++ b/gobblin-website/docs/sinks/Gobblin-JDBC-Writer.md
@@ -0,0 +1,194 @@
+---
+title: JDBC
+sidebar_label: JDBC
+---
+
+Gobblin JDBC writer & publisher
+--------------------
+Gobblin is a general data ingestion framework that can extract, convert, and publish data. Publishing into a JDBC-compatible RDBMS is not currently supported in Gobblin, so here we introduce a JDBC writer (and publisher) that lets Gobblin write into JDBC-compatible RDBMSs while reusing the existing extraction and conversion logic.
+
+Proposed design
+--------------------
+
+### Requirements
+1. The user can choose to replace the destination table.
+2. The user can supply their own staging table, for the case where they do not have permission to create tables. In that case, the user can also choose to truncate the staging table.
+3. The user should be able to skip the staging table for performance reasons.
+4. The user can choose the level of parallelism.
+5. For the Avro → JDBC use case, the user should be able to cherry-pick the fields to be copied.
+
+### Design summary
+- New JdbcWriter, JdbcPublisher will be introduced, along with AvroFieldsPickConverter and AvroToJdbcEntry.
+- Figure 1 shows Gobblin general flow and figure 2 shows the specific flow; Avro → RDBMS through JDBC.
+- JdbcWriter will use staging table mainly for failure handling. Since Gobblin breaks down the job into multiple task and process it in parallel, each writer will hold transaction and there will be more than one transaction against the database. If there’s no staging table and writer writes directly to destination table, partial failure among the writers may result partial data push into destination table. Having partial data is mostly bad for consumption and could also result subsequent [...]
+- For performance reasons, and per the requirements, the user may skip staging. This comes at the cost of giving up failure handling, and in this case Gobblin does not guarantee recovery from failures.
+- A WriterInitializer will be introduced for the case where a writer needs initialization that must be done before parallel processing starts (more below).
+
+
+![Gobblin-Constructs](../../static/img/Gobblin-Constructs.png)
+figure 1. Gobblin general flow
+
+![Gobblin-Constructs](../../static/img/jdbc/HDFS_JDBC_Flow.png)
+figure 2. Gobblin Avro → RDBMS through JDBC specific flow
+
+- AvroFieldsPickConverter will cherry pick the columns that user wants.
+- AvroToJdbcEntryConverter will convert Avro to JDBC entry.
+- JdbcWriter will write JdbcEntry to staging table. (user can skip staging which is addressed below in JDBC Writer/Publisher section).
+- JdbcPublisher will write into destination table.
+
+### Design detail
+
+#### WriterInitializer
+Note that this is a new interface whose responsibility is to initialize the writer, which means it is tied to the writer's implementation, and to clean up what it initialized.
+
+Reasons for introducing the writer initializer:
+
+- The main reason for this initializer is to perform initialization but not in parallel environment. As writer subjects to run in parallel, certain task that needs to be done only once across all writers is hard to be done. For example, if user chose to skip staging table and also chose to replace destination table, the initialization task would be truncating destination table which needs to be done only once before all writers start writing. This is simply hard to be done in parallel en [...]
+
+- Another reason for writer initializer is to simplify the cleanup process. As writer initializer initializes things, it also knows what to clean up. Instead of having other code to figure out what to clean up on which branch and condition all over again, closing writer initializer at the end of the job will just simply clean it up.(This pattern is widely used in JDK(e.g: Stream), where many classes implements interface Closeable). This clean up can be done in writer, but writer is curre [...]
+
+- Currently, Gobblin has logic to clean up the staging data in JobLauncherUtils which is specific to FsDataWriter, where AbstractJobLauncher has the logic to figure out which clean-up method of JobLauncherUtils needs to be called. Ideally, we will hide this clean-up implementation behind the WriterInitializer interface.
+
+- Figure 3 shows the WriterInitializer interface. WriterInitializer will be extensible in Gobblin via the factory method pattern, where any writer can plug in its initialization code if needed.
+
+![JdbcWriterInitializer](../../static/img/jdbc/WriterInitializer.png)
+figure 3. Class diagram for WriterInitializer interface
+
+#### JdbcWriterInitializer
+- JdbcWriterInitializer will be the first class implementing the WriterInitializer interface. It will be instantiated by AbstractJobLauncher after the Source creates WorkUnits, and is always closed by AbstractJobLauncher when the job finishes, regardless of failure or success.
+- By default, JdbcWriterInitializer will create staging tables based on the structure of the target table. Therefore it is necessary to create the target table first. 
+  Staging tables are created per WorkUnit, so there can be more than one staging table. Having multiple staging tables makes parallelism easier for the publisher when moving data from the staging tables to the destination. Any table created by JdbcWriterInitializer is remembered and later dropped when the initializer is closed.
+- Staging tables will be placed on the same destination host, in the same database, under a temporary table name, with the same structure. The main purpose of the staging table is handling failure. Without a staging table, it is hard to recover from failure, because the writers write into the table in multiple transactions. Additionally, the staging table also enables a data integrity check before publishing into the destination.
+- Before creating the staging table, JdbcWriterInitializer will validate that the user has the drop table privilege, to make sure it can drop the table later on.
+- The user can choose to use their own staging table. This supports the use case where the user does not have the privilege to create tables. When the user chooses to use their own staging table, JdbcWriterInitializer will truncate the table later when it is closed.
+- The staging table should always be empty initially. A user who chose to use their own staging table can also choose to truncate it. If the staging table is not empty and the user does not choose to truncate it, JdbcWriterInitializer will fail the job.
+- If the user chose to skip the staging table and replace the output, JdbcWriterInitializer will truncate the destination table. This is because the destination table needs to be emptied before going into parallel processing, as more than one writer will start writing simultaneously.
+- Figure 4 shows the overall flow of JdbcWriterInitializer.
+
+![JdbcWriterInitializer](../../static/img/jdbc/WriterInitializer.png)
+figure 4. JdbcWriterInitializer flow
+
+#### AvroFieldsPickConverter
+- The user can define which fields are to be copied. Given the user's input, it narrows down the number of columns by updating the schema and data accordingly.
+
+#### AvroToJdbcEntryConverter
+- Converts the Avro schema and data into JdbcEntrySchema and JdbcEntryData. JdbcEntrySchema consists of pairs of column names and JDBC types, and a JdbcEntry consists of pairs of column names and objects that can be used directly with PreparedStatement.setObject().
+- Although an Avro schema can be a recursive structure (a record inside a record), an RDBMS table structure is not recursive. Thus, AvroToJdbcEntryConverter does not accept an Avro schema that has a record type inside a record type.
+- Both JdbcEntrySchema and JdbcEntry will be case sensitive, because Avro field names are case sensitive and many widely used RDBMSs are case sensitive on column names as well.
+- In case there is a mismatch in column names, AvroToJdbcEntryConverter accepts a column name mapping between Avro field names and JDBC column names.
+
+#### JdbcWriter
+- Uses JDBC to persist data into the staging table at the task level.
+- By default it will persist into the staging area (and the data will be put into the final destination by the publisher).
+- The staging table has already been prepared by the WriterInitializer.
+- Input column names should exactly match the JDBC column names. The user can convert the names using AvroToJdbcEntryConverter.
+- Schema evolution: The number of input columns is expected to be equal to or smaller than the number of columns in the JDBC table. This is to prevent unintended outcomes from schema evolution, such as an additional column. As the underlying RDBMS can declare constraints on its schema, the writer allows the number of columns in the JDBC table to be greater than the number of input columns.
+    - number of input columns <= number of columns in Jdbc
+- Each writer will open one transaction. Having one transaction per writer has its tradeoffs:
+    - Pro: Simple failure handling, as you can just execute a rollback on failure. Basically, it reverts back to the previous state so that the job can retry the task.
+    - Con: It can lead to long-lived transactions and scalability issues (not enough disk space for the transaction log, a limit on the number of records in one transaction (2.1B for PostgreSQL), etc.).
+    - JdbcWriter will go with one transaction per writer for its simplicity of failure handling. The scalability issue with long transactions can be overcome by increasing the number of partitions, which makes each transaction short.
+    - During the design meeting, we’ve discussed that long transaction could be a problem. One suggestion came out during the meeting was commit periodically. This will address long transaction problem, but we also discussed it would be hard on failure handling. Currently, Gobblin does task level retry on failure and there were three options we’ve discussed. (There was no silver bullet solution from the meeting.) Note that these are all with committing periodically.
+        - Revert to previous state: for the writer, this would mean deleting the records it wrote. The JdbcWriter could use its own staging table or share a staging table with other writers. Since the staging table can be passed in by the user, where we have no control and cannot add partition information, it is hard to revert back to the previous state in all cases.
+        - Ignore duplicates: the idea is to use upsert to perform insert-or-update. As it needs to check for existing records in the dataset, it is expected to show performance degradation. The possibility of duplicate entries was also discussed.
+        - Watermark: in order to use a watermark at the task level, the writer would need to send records in the same order when retried, which is not guaranteed.
+        - Data with over 200M records was tested in a single transaction and had no problem on MySQL 5.6.
+- Operation:
+    - Write operation will write into staging table. (If user chose to skip staging table, write operation will write into destination table directly.)
+    - Commit operation will commit the transaction.
+    - Close operation will close database connection. If there was a failure, it will execute rollback before closing connection.
+
+#### Skipping staging table
+- From the requirements, the user can choose to skip staging for performance reasons. In this case, the writer will persist directly into the final destination. Without a staging table, it is hard to recover from failure as mentioned above, and **for this reason, if the user does not want staging, the framework does not guarantee any recovery from failure.**
+- If the user configures "job.commit.policy=partial" and "publish.at.job.level=false", this means the data won't be published at the job level and partially successful commits are allowed. This makes Gobblin skip the staging table, as it aligns with the behavior of skipping staging (see the sketch after figure 5). The reason to reuse these two parameters instead of introducing a new parameter is to avoid parameters contradicting each other.
+- Figure 5 shows the overall flow of the JDBC writer.
+
+![Gobblin JDBC Writer](../../static/img/jdbc/Gobblin_JDBC_Writer.png)
+figure 5. JDBC Writer flow
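+
+As referenced above, a minimal sketch of the two properties that make Gobblin skip the staging table:
+
+```properties
+# Allow partial commits and disable job-level publishing, which makes Gobblin skip the staging table
+job.commit.policy=partial
+publish.at.job.level=false
+```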
+
+#### JDBC Publisher
+- Uses JDBC to publish the final result into the output.
+- If the user chose not to use a staging table for performance reasons, the JDBC publisher won't do anything, as the output has already been updated by the writer(s). More precisely, the parameters that make Gobblin skip the staging table also make the publisher be skipped. 
+- The JDBC publisher will copy data from the staging table via an SQL command provided by the underlying database. (Renaming the staging table is not viable, mainly because 1. it is hard to copy the exact structure of the original table (constraints, indexes, sequences, foreign keys, etc.), and 2. on a couple of underlying database systems, renaming requires no active connections to the table.)
+- A publisher will open one transaction. Being processed in a single transaction, any failure can be reverted by rolling back the transaction.
+- Operation:
+    - The PublishData operation opens a single transaction and writes into the destination table from the staging table. If there is any error, the transaction will be rolled back. Once completed successfully, the transaction will be committed. 
+    - Parallelism: currently, parallelism at the publisher level is not supported. For example, the MySQL writer fails on deleting all rows from the table before inserting new data when using a global transaction with multiple connections.
+    - PublishMeta won’t do anything.
+- Figure 6 shows overall flow of JDBC publisher.
+
+![Gobblin JDBC Publisher](../../static/img/jdbc/Gobblin_JDBC_Publisher.png)
+figure 6. Gobblin_JDBC_Publisher
+
+### Concrete implementations
+
+To configure a concrete writer, please refer the [JDBC Writer Properties](/docs/user-guide/Configuration-Properties-Glossary#JdbcWriter-Properties) section in the [Configuration Glossary](/docs/user-guide/Configuration-Properties-Glossary).
+
+
+#### MySQL Writer
+
+The MySQL writer uses [buffered inserts](http://dev.mysql.com/doc/refman/5.0/en/insert-speed.html) to increase performance.  
+The sink configuration for MySQL in a Gobblin job is as follows:
+```properties
+writer.destination.type=MYSQL
+writer.builder.class=org.apache.gobblin.writer.JdbcWriterBuilder
+
+data.publisher.type=org.apache.gobblin.publisher.JdbcPublisher
+jdbc.publisher.url=jdbc:mysql://host:3306
+jdbc.publisher.driver=com.mysql.jdbc.Driver
+
+converter.classes=org.apache.gobblin.converter.jdbc.AvroToJdbcEntryConverter
+# If field name mapping is needed between the input Avro and the target table:
+converter.avro.jdbc.entry_fields_pairs={\"src_fn\":\"firstname\",\"src_ln\":\"lastname\"}
+```
+
+#### Teradata Writer
+
+Similarly to the MySQL Writer, this writer also inserts data in batches, configured by  ```writer.jdbc.batch_size```.
+Ideally, for performance reasons the target table is advised to be set to type MULTISET, without a primary index.  
+Please note that the Teradata JDBC drivers are *not* part of Gobblin; one needs to obtain them from 
+[Teradata](http://downloads.teradata.com/download/connectivity/jdbc-driver) and pass them as job-specific jars to the 
+Gobblin submitter scripts. Teradata may use the FASTLOAD option during the insert if conditions are met.  
+The sink configuration for Teradata in a Gobblin job is as follows:
+```properties
+writer.destination.type=TERADATA
+writer.builder.class=org.apache.gobblin.writer.JdbcWriterBuilder
+
+data.publisher.type=org.apache.gobblin.publisher.JdbcPublisher
+jdbc.publisher.url=jdbc:teradata://host/TMODE=ANSI,CHARSET=UTF16,TYPE=FASTLOAD
+jdbc.publisher.driver=com.teradata.jdbc.TeraDriver
+
+converter.classes=org.apache.gobblin.converter.jdbc.AvroToJdbcEntryConverter
+# If field name mapping is needed between the input Avro and the target table:
+converter.avro.jdbc.entry_fields_pairs={\"src_fn\":\"firstname\",\"src_ln\":\"lastname\"}
+```
+
+### Performance and Scalability
+As Gobblin can dial up parallelism, the performance bottleneck will be the underlying RDBMS; performance and scalability are therefore mainly determined by the underlying RDBMS.
+
+Benchmark:  
+MySQL Writer performance test on 80k records. Each entry consists of 14 fields with sparse density. 
+
+A few observations:  
+- Starting from a batch insert size of 1,000, the performance gain diminishes.  
+- Parallelism does not show much gain, mostly because of its overhead. It is expected to show more performance gain on bigger record sets.  
+
+
+Batch insert size | Parallelism | Elapsed
+----------------- | ----------- | -------
+10 | 1 | 17 minutes
+30 | 1 | 5 minutes 30 seconds
+100 | 1 | 2 minutes 17 seconds
+1,000 | 1 | 40 seconds
+10,000 | 1 | 43 seconds
+1,000 | 2 | 42 seconds
+1,000 | 4 | 44 seconds
+1,000 | 8 | 57 seconds
+
+All tests used:
+
+- Hadoop 2.3 in Pseudo-distributed mode on CPU: 12 cores @1.2GHz, Memory: 64GBytes
+- MySQL server 5.6.21 with InnoDB storage engine, and compression on. Server runs on Dual CPU Intel Xeon 6 cores @2.5GHz, Memory: 48 GBytes, and HDD: 2.4TB (10K RPM)
+
+### Important note
+- As the Gobblin framework comes with parallel processing via Hadoop, it can easily overburden the underlying RDBMS. The user needs to choose the parallelism level conservatively.
diff --git a/gobblin-website/docs/sinks/Http.md b/gobblin-website/docs/sinks/Http.md
new file mode 100644
index 0000000..9e41d9d
--- /dev/null
+++ b/gobblin-website/docs/sinks/Http.md
@@ -0,0 +1,109 @@
+---
+title: HTTP
+sidebar_label: HTTP
+---
+
+# Introduction
+
+Writing to an HTTP-based sink is done by sending an HTTP or RESTful request and handling the response. Given
+the endpoint URI, query parameters, and body, it is straightforward to construct an HTTP request. The idea
+is to build a writer that writes an HTTP record, which contains those elements of a request. The writer
+builds an HTTP or REST request from multiple HTTP records, sends the request with a client that knows the server,
+and handles the response.
+
+## Note
+The old http write framework under [`AbstractHttpWriter`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/writer/http/AbstractHttpWriter.java)
+and [`AbstractHttpWriterBuilder`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/writer/http/AbstractHttpWriterBuilder.java)
+is deprecated (Deprecation date: 05/15/2018)! Use `AsyncHttpWriter` and `AsyncHttpWriterBuilder` instead
+
+# Constructs
+![Http write flow](../../static/img/Http-Write.png)
+Figure 1. Http write flow
+
+## `HttpOperation`
+An HTTP record is represented as an `HttpOperation` object. It has 4 fields.
+
+| Field Name | Description | Example
+|---|---|---|
+| `keys` | Optional, a key/value map to interpolate the url template | ```{"memberId": "123"}``` |
+| `queryParams` | Optional, a map from query parameter to its value| ```{"action": "update"}``` |
+| `headers` | Optional, a map from header key to its value | ```{"version": "2.0"}``` |
+| `body` | Optional, the request body in string or json string format | ```"{\"email\": \"httpwrite@test.com\"}"``` |
+
+Given a URL template, ```http://www.test.com/profiles/${memberId}```, from the job configuration, the resolved 
+example request URL with the `keys` and `queryParams` information will be ```http://www.test.com/profiles/123?action=update```.
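+
+Putting the example values from the table together, a single `HttpOperation` record for the request above might look like this (a sketch assembled from the table examples; the actual record is produced by a `Converter` and its serialized form may differ):
+
+```json
+{
+  "keys": {"memberId": "123"},
+  "queryParams": {"action": "update"},
+  "headers": {"version": "2.0"},
+  "body": "{\"email\": \"httpwrite@test.com\"}"
+}
+```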
+
+## `AsyncRequestBuilder`
+An `AsyncRequestBuilder` builds an `AsyncRequest` from a collection of `HttpOperation` records. It could build one
+request per record or batch multiple records into a single request. A builder is also responsible for
+putting the `headers` and setting the `body` to the request.
+
+## `HttpClient`
+A `HttpClient` sends a request and returns a response. If necessary, it should set up the connection to the server, for
+example, sending an authorization request to get an access token. How authorization is done is use-case specific. Gobblin does
+not provide general support for authorization yet.
+
+## `ResponseHandler`
+A `ResponseHandler` handles a response of a request. It returns a `ResponseStatus` object to the framework, which
+would resend the request if it's a `SERVER_ERROR`.
+
+# Build an asynchronous writer
+`AsyncHttpWriterBuilder` is the base builder to build an asynchronous http writer. A specific writer can be created by 
+providing the 3 major components: a `HttpClient`, a `AsyncRequestBuilder`, and a `ResponseHandler`.
+
+Gobblin offers 2 implementations of async
+http writers. As long as your write requirement can be expressed as a `HttpOperation` through a `Converter`, the
+2 implementations should work through configuration alone.
+
+## `AvroHttpWriterBuilder`
+An `AvroHttpWriterBuilder` builds an `AsyncHttpWriter` on top of the [apache httpcomponents framework](https://hc.apache.org/), sending vanilla http request.
+The 3 major components are:
+
+  - `ApacheHttpClient`. It uses [`CloseableHttpClient`](https://github.com/apache/httpcomponents-client/blob/master/httpclient5/src/main/java/org/apache/hc/client5/http/impl/classic/CloseableHttpClient.java) to 
+  send [`HttpUriRequest`](https://github.com/apache/httpcomponents-client/blob/master/httpclient5/src/main/java/org/apache/hc/client5/http/classic/methods/HttpUriRequest.java)
+  and receive [`CloseableHttpResponse`](https://github.com/apache/httpcomponents-client/blob/master/httpclient5/src/main/java/org/apache/hc/client5/http/impl/classic/CloseableHttpResponse.java)
+  - `ApacheHttpRequestBuilder`. It builds a `ApacheHttpRequest`, which is an `AsyncRequest` that wraps the `HttpUriRequest`, from one `HttpOperation`
+  - `ApacheHttpResponseHandler`. It handles a `HttpResponse`
+
+Configurations for the builder are:
+
+| Configuration | Description | Example
+|---|---|---|
+| `gobblin.writer.http.urlTemplate` | Required, the URL template (scheme and port included), together with `keys` and `queryParams`, to be resolved to the request URL | ```http://www.test.com/profiles/${memberId}``` |
+| `gobblin.writer.http.verb` | Required, [http verbs](http://www.restapitutorial.com/lessons/httpmethods.html) | get, update, delete, etc |
+| `gobblin.writer.http.errorCodeWhitelist` | Optional, http error codes allowed to pass through | 404, 500, etc. No error code is allowed by default |
+| `gobblin.writer.http.maxAttempts` | Optional, max number of attempts including initial send | Default is 3 |
+| `gobblin.writer.http.contentType` | Optional, content type of the request body | ```"application/json"```, which is the default value |
+
+## `R2RestWriterBuilder`
+A `R2RestWriterBuilder` builds an `AsyncHttpWriter` on top of [restli r2 framework](https://github.com/linkedin/rest.li/wiki/Request---Response-API-(R2)), sending
+rest request. The 3 major components are:
+
+  - `R2Client`. It uses a R2 [`Client`](https://github.com/linkedin/rest.li/blob/master/r2-core/src/main/java/com/linkedin/r2/transport/common/Client.java) to
+  send [`RestRequest`](https://github.com/linkedin/rest.li/blob/master/r2-core/src/main/java/com/linkedin/r2/message/rest/RestRequest.java) and
+  receive [`RestResponse`](https://github.com/linkedin/rest.li/blob/master/r2-core/src/main/java/com/linkedin/r2/message/rest/RestResponse.java)
+  - `R2RestRequestBuilder`. It builds a `R2Request`, which is an `AsyncRequest` that wraps the `RestRequest`, from one `HttpOperation`
+  - `R2RestResponseHandler`. It handles a `RestResponse`
+  
+ `R2RestWriterBuilder` has [d2](https://github.com/linkedin/rest.li/wiki/Dynamic-Discovery) and ssl support. Configurations(`(d2.)` part should be added in d2 mode) for the builder are:
+ 
+ | Configuration | Description | Example
+ |---|---|---|
+ | `gobblin.writer.http.urlTemplate` | Required, the URL template (scheme and port included), together with `keys` and `queryParams`, to be resolved to the request URL. If the scheme is `d2`, d2 is enabled | ```http://www.test.com/profiles/${memberId}``` |
+ | `gobblin.writer.http.verb` | Required, [rest(rest.li) verbs](https://github.com/linkedin/rest.li/wiki/Rest.li-User-Guide#resource-methods) | get, update, put, delete, etc |
+ | `gobblin.writer.http.maxAttempts` | Optional, max number of attempts including initial send | Default is 3 |
+ | `gobblin.writer.http.errorCodeWhitelist` | Optional, http error codes allowed to pass through | 404, 500, etc. No error code is allowed by default |
+ | `gobblin.writer.http.d2.zkHosts`| Required for d2, the zookeeper address | |
+ | `gobblin.writer.http.(d2.)ssl`| Optional, enable ssl | Default is false |
+ | `gobblin.writer.http.(d2.)keyStoreFilePath`| Required for ssl | /tmp/identity.p12 |
+ | `gobblin.writer.http.(d2.)keyStoreType`| Required for ssl | PKCS12 |
+ | `gobblin.writer.http.(d2.)keyStorePassword`| Required for ssl | |
+ | `gobblin.writer.http.(d2.)trustStoreFilePath`| Required for ssl | |
+ | `gobblin.writer.http.(d2.)trustStorePassword`| Required for ssl | |
+ | `gobblin.writer.http.protocolVersion` | Optional, protocol version of rest.li | ```2.0.0```, which is the default value |
+
+`R2RestWriterBuilder` isn't integrated with `PasswordManager` to process encrypted passwords yet. The task is tracked as https://issues.apache.org/jira/browse/GOBBLIN-487
+
+# Build a synchronous writer
+The idea is to reuse an asynchronous writer to build its synchronous version. The technical difference between them
+is the size of outstanding writes. Set `gobblin.writer.http.maxOutstandingWrites` to `1` (the default value is `1000`) to make a synchronous writer.
diff --git a/gobblin-website/docs/sinks/Kafka.md b/gobblin-website/docs/sinks/Kafka.md
new file mode 100644
index 0000000..3d4ef25
--- /dev/null
+++ b/gobblin-website/docs/sinks/Kafka.md
@@ -0,0 +1,79 @@
+---
+title: Kafka
+sidebar_label: Kafka
+---
+
+# Introduction
+
+The Kafka writer allows users to create pipelines that ingest data from Gobblin sources into Kafka. This also enables Gobblin users to seamlessly transition their pipelines from ingesting directly to HDFS to ingesting into Kafka first, and then ingesting from Kafka to HDFS.
+
+# Pre-requisites
+
+* The following guide assumes that you are somewhat familiar with running Gobblin. If not, you should follow the [Getting Started](docs/Getting-Started) page first, then come back to this guide.
+
+* Before you can use the Kafka writer, you need to set up a Kafka cluster to write to. You can follow any of the guides listed by the Kafka project such as the [Apache Kafka quickstart guide](http://kafka.apache.org/documentation.html#quickstart).
+
+# Steps
+
+* Edit the [wikipedia-kafka.pull](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/wikipedia-kafka.pull) example to get started with setting up ingestion into Kafka. This is a very similar pipeline to the [wikipedia.pull](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull) example which pulls pages from 5 titles from Wikipedia to HDFS. The main differences to note are: 
+    * The `writer.builder.class` is set to `gobblin.kafka.writer.KafkaDataWriterBuilder`. This is the class that creates a Kafka writer.
+    * The `writer.kafka.topic` is set to `WikipediaExample`. This is the topic that the writer will write the records to.
+    * The `writer.kafka.producerConfig.bootstrap.servers` is set to `localhost:9092`. This is the address of the kafka broker(s) that the writer must write to.
+    * There is no partitioner class specified. This implementation of the Kafka writer does not support partitioning and will use the default Kafka partitioner. 
+    * The `data.publisher.type` is set to `gobblin.publisher.NoopPublisher`. This is because Kafka doesn't offer transactional semantics, so it isn't possible to have a separate publish step to finally commit the data. 
+    * There is configuration for setting up the Schema Registry and Serializers that you will be using to write the data to Kafka. If you're using the Apache Kafka distribution, this file should work out of the box. 
+    * If you're using the Confluent distribution and want to use the Confluent schema registry, comment out the Local Schema Registry section and un-comment the Confluent schema registry section. The result should match the text below for Confluent users.
+
+```properties
+#Confluent Schema Registry and serializers
+writer.kafka.producerConfig.value.serializer=io.confluent.kafka.serializers.KafkaAvroSerializer
+writer.kafka.producerConfig.key.serializer=io.confluent.kafka.serializers.KafkaAvroSerializer
+# Set this to the correct schema registry url
+writer.kafka.producerConfig.schema.registry.url=http://localhost:8081
+
+##Use Local Schema Registry and serializers
+#writer.kafka.producerConfig.value.serializer=org.apache.gobblin.kafka.serialize.LiAvroSerializer
+#writer.kafka.producerConfig.kafka.schemaRegistry.class=org.apache.gobblin.kafka.schemareg.ConfigDrivenMd5SchemaRegistry
+#writer.kafka.producerConfig.schemaRegistry.schema.name=WikipediaExample
+#writer.kafka.producerConfig.schemaRegistry.schema.value={"namespace": "example.wikipedia.avro","type": "record","name": "WikipediaArticle","fields": [{"name": "pageid", "type": ["double", "null"]},{"name": "title", "type": ["string", "null"]},{"name": "user", "type": ["string", "null"]},{"name": "anon", "type": ["string", "null"]},{"name": "userid",  "type": ["double", "null"]},{"name": "timestamp", "type": ["string", "null"]},{"name": "size",  "type": ["double", "null"]},{"name": "cont [...]
+```
+
+
+* Run the standalone launcher with the wikipedia-kafka.pull file. You should see something like this. 
+
+```
+INFO  [TaskExecutor-0] gobblin.example.wikipedia.WikipediaExtractor  243 - 5 record(s) retrieved for title LinkedIn
+INFO  [TaskExecutor-0] gobblin.example.wikipedia.WikipediaExtractor  243 - 5 record(s) retrieved for title Parris_Cues
+INFO  [TaskExecutor-0] gobblin.example.wikipedia.WikipediaExtractor  243 - 5 record(s) retrieved for title Barbara_Corcoran
+INFO  [TaskExecutor-0] gobblin.runtime.Task  176 - Extracted 20 data records
+INFO  [TaskExecutor-0] gobblin.runtime.Task  177 - Row quality checker finished with results:
+INFO  [TaskExecutor-0] gobblin.publisher.TaskPublisher  43 - All components finished successfully, checking quality tests
+INFO  [TaskExecutor-0] gobblin.publisher.TaskPublisher  45 - All required test passed for this task passed.
+INFO  [TaskExecutor-0] gobblin.publisher.TaskPublisher  47 - Cleanup for task publisher executed successfully.
+INFO  [TaskExecutor-0] gobblin.runtime.Fork  261 - Committing data for fork 0 of task task_PullFromWikipediaToKafka_1472246706122_0
+INFO  [TaskExecutor-0] gobblin.kafka.writer.KafkaDataWriter  211 - Successfully committed 20 records.
+```
+
+* To verify that the records have indeed been ingested into Kafka, you can run a kafka console consumer or run Gobblin's [kafka-console pull file](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/kafka-console.pull) which prints the events from Kafka onto the console.
+
+# Configuration Details
+
+At this time, Gobblin supports integration with Kafka 0.8 and 0.9. The Kafka writer supports all the configuration parameters supported by the version-specific Kafka Producer (e.g. [Latest Producer Configs](https://kafka.apache.org/documentation/#producerconfigs)). All you have to do is prefix `writer.kafka.producerConfig.` to each configuration property that the producer supports. For example, if you want to set the `acks` parameter to `all` to ensure full acknowledgement of writes, you prefix it accordingly.
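+
+As an illustration of the prefixing convention described above:
+
+```properties
+# Forwarded to the Kafka producer as "acks=all"
+writer.kafka.producerConfig.acks=all
+```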
+
+There are a few key parameters at the Gobblin level that control the behavior of the data writer. 
+
+| Property Name | Semantics | 
+|---|---|
+| `writer.kafka.topic` | The topic that the writer will be writing to. At this time, the writer can only write to a single topic per pipeline. | 
+| `writer.kafka.failureAllowancePercentage` | The percentage of failures that you are willing to tolerate while writing to Kafka. Gobblin will mark the workunit successful and move on if there are failures but not enough to trip the failure threshold. Only successfully acknowledged writes are counted as successful, all others are considered as failures. The default for the failureAllowancePercentage is set to 20.0. This means that as long as 80% of the data is acknowledged by Kafka, Gobb [...]
+| `writer.kafka.commitTimeoutMillis` | The amount of time that the Gobblin committer will wait before abandoning its wait for unacknowledged writes. This defaults to 1 minute. | 
+| `writer.kafka.keyed` | When set to true, enables key-based writes to Kafka. This defaults to false. If you set this to true, make sure to set the keyField configuration property. Serialization of the key is controlled by the Kafka Producer specific configuration property (`writer.kafka.producerConfig.key.serializer`) |
+| `writer.kafka.keyField` | The field of the record to use as the key for writing to Kafka. The field path follows a nested notation. So a top-level field "name" would be set to "name", a nested field "name" within a top-level struct "header" would be named "header.name" | 
+| `writer.kafka.typeMapperClass` | The class that the writer should use to extract keys and values from the input record. The default if not specified assumes that AvroGenericRecordTypeMapper is being used | 
+| `writer.kafka.valueField` | The field of the record to use as the value for writing to Kafka. Defaults to "*" which indicates that the entire record should be written. For nested records such as a pair of key, value, one would set the value of this configuration to the field-name for the value structure. | 
+
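+As a sketch, enabling keyed writes using the properties above might look like the following (the key field name and serializer choice are illustrative):
+
+```properties
+writer.kafka.keyed=true
+writer.kafka.keyField=header.name
+# Key serializer is forwarded to the Kafka producer; pick one appropriate for your key type
+writer.kafka.producerConfig.key.serializer=org.apache.kafka.common.serialization.StringSerializer
+```
+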
+# What Next?
+
+You can now set up Kafka as the destination for any of your sources. All you have to do is set up the writer configuration correctly in your pull files. Happy Ingesting!
+
+
+
diff --git a/gobblin-website/docs/sinks/ParquetHdfsDataWriter.md b/gobblin-website/docs/sinks/ParquetHdfsDataWriter.md
new file mode 100644
index 0000000..a40d01e
--- /dev/null
+++ b/gobblin-website/docs/sinks/ParquetHdfsDataWriter.md
@@ -0,0 +1,48 @@
+---
+title: Parquet HDFS
+sidebar_label: Parquet HDFS
+---
+
+# Description
+
+An extension to [`FsDataWriter`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/writer/FsDataWriter.java) that writes in Parquet format in the form of either Avro, Protobuf or [`ParquetGroup`](https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/example/data/Group.java). This implementation allows users to specify the CodecFactory to use through the configuration property [`writer.codec.type`](https: [...]
+
+# Usage
+```properties
+writer.builder.class=org.apache.gobblin.writer.ParquetDataWriterBuilder
+writer.destination.type=HDFS
+writer.output.format=PARQUET
+```
+
+# Example Pipeline Configuration
+* [`example-parquet.pull`](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/example-parquet.pull) contains an example of generating test data and writing to Parquet files.
+
+
+# Configuration
+
+| Key                    | Description | Default Value | Required |
+|------------------------|-------------|---------------|----------|
+| writer.parquet.page.size | The page size threshold. | 1048576 | No |
+| writer.parquet.dictionary.page.size | The block size threshold for the dictionary pages. | 134217728 | No |
+| writer.parquet.dictionary | To turn dictionary encoding on. Parquet has a dictionary encoding for data with a small number of unique values ( < 10^5 ) that aids in significant compression and boosts processing speed. | true | No |
+| writer.parquet.validate | To turn on validation using the schema. This validation is done by [`ParquetWriter`](https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java) not by Gobblin. | false | No |
+| writer.parquet.version | Version of parquet writer to use. Available versions are v1 and v2. | v1 | No |
+| writer.parquet.format | In-memory format of the record being written to Parquet. [`Options`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-parquet-common/src/main/java/org/apache/gobblin/parquet/writer/ParquetRecordFormat.java) are AVRO, PROTOBUF and GROUP | GROUP | No |
+
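+For example, the defaults above can be overridden alongside the usage properties (values shown are illustrative):
+
+```properties
+writer.parquet.page.size=1048576
+writer.parquet.dictionary=true
+writer.parquet.validate=true
+writer.parquet.format=AVRO
+```
+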
+# Developer Notes
+
+Gobblin provides integration with two different versions of Parquet through its modules. Use the appropriate jar based on the Parquet library you use in your code.
+
+| Jar | Dependency | Gobblin Release |
+|-----|-------------|--------|
+| [`gobblin-parquet`](https://mvnrepository.com/artifact/org.apache.gobblin/gobblin-parquet) | [`com.twitter:parquet-hadoop-bundle`](https://mvnrepository.com/artifact/com.twitter/parquet-hadoop-bundle) | >= 0.12.0 |
+| [`gobblin-parquet-apache`](https://mvnrepository.com/artifact/org.apache.gobblin/gobblin-parquet-apache) | [`org.apache.parquet:parquet-hadoop`](https://mvnrepository.com/artifact/org.apache.parquet/parquet-hadoop) | >= 0.15.0 |
+
+If you want to look at the code, check out:
+
+| Module | File |
+| ------ | ---- |
+| gobblin-parquet | [`ParquetHdfsDataWriter`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-parquet/src/main/java/org/apache/gobblin/writer/ParquetHdfsDataWriter.java) |
+| gobblin-parquet | [`ParquetDataWriterBuilder`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-parquet/src/main/java/org/apache/gobblin/writer/ParquetDataWriterBuilder.java) |
+| gobblin-parquet-apache | [`ParquetHdfsDataWriter`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-parquet-apache/src/main/java/org/apache/gobblin/writer/ParquetHdfsDataWriter.java) |
+| gobblin-parquet-apache | [`ParquetDataWriterBuilder`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-parquet-apache/src/main/java/org/apache/gobblin/writer/ParquetDataWriterBuilder.java) |
diff --git a/gobblin-website/docs/sinks/SimpleBytesWriter.md b/gobblin-website/docs/sinks/SimpleBytesWriter.md
new file mode 100644
index 0000000..fa26465
--- /dev/null
+++ b/gobblin-website/docs/sinks/SimpleBytesWriter.md
@@ -0,0 +1,23 @@
+---
+title: HDFS Bytes Array
+sidebar_label: HDFS Bytes Array
+---
+
+# Description
+
+
+A simple writer for byte arrays to a Hadoop file system file. The byte arrays can be optionally prefixed by a long-sized length and/or record delimiter byte.
+
+# Usage
+
+```properties
+    writer.builder.class=org.apache.gobblin.writer.SimpleDataWriterBuilder
+```
+# Configuration
+
+
+| Key | Type | Description | Default Value |
+|-----|------|-------------|---------------|
+| simple.writer.delimiter | character | An optional character to be used as records separator |  |
+| simple.writer.prepend.size | boolean | Enables/disables pre-pending the bytes written with a long size | false |
+
diff --git a/gobblin-website/docs/sources/AvroFileSource.md b/gobblin-website/docs/sources/AvroFileSource.md
new file mode 100644
index 0000000..cd08cba
--- /dev/null
+++ b/gobblin-website/docs/sources/AvroFileSource.md
@@ -0,0 +1,17 @@
+---
+title: Avro Files
+sidebar_label: Avro Files
+
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/CopySource.md b/gobblin-website/docs/sources/CopySource.md
new file mode 100644
index 0000000..3abedb0
--- /dev/null
+++ b/gobblin-website/docs/sources/CopySource.md
@@ -0,0 +1,17 @@
+---
+title: File Copy
+sidebar_label: File Copy
+
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/GoogleAnalyticsSource.md b/gobblin-website/docs/sources/GoogleAnalyticsSource.md
new file mode 100644
index 0000000..acd1c2b
--- /dev/null
+++ b/gobblin-website/docs/sources/GoogleAnalyticsSource.md
@@ -0,0 +1,18 @@
+---
+title: Google Analytics
+sidebar_label: Google Analytics
+
+---
+
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/GoogleDriveSource.md b/gobblin-website/docs/sources/GoogleDriveSource.md
new file mode 100644
index 0000000..9162066
--- /dev/null
+++ b/gobblin-website/docs/sources/GoogleDriveSource.md
@@ -0,0 +1,17 @@
+---
+title: Google Drive
+sidebar_label: Google Drive
+
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/GoogleWebmaster.md b/gobblin-website/docs/sources/GoogleWebmaster.md
new file mode 100644
index 0000000..9a02251
--- /dev/null
+++ b/gobblin-website/docs/sources/GoogleWebmaster.md
@@ -0,0 +1,84 @@
+---
+title: Google Webmaster
+sidebar_label: Google Webmaster
+
+---
+
+# Introduction
+The Google Search Console data ingestion project downloads query and analytics data from Google Search Console for the purpose of doing search analytics on your verified sites. Available analytics measures are clicks, impressions, CTR and position. The dimensions used are dates, pages, countries and queries. 
+
+Details about this Google service and API can be found at [https://developers.google.com/webmaster-tools/](https://developers.google.com/webmaster-tools/). This service can be run on a daily or weekly basis to download data at a daily granularity. 
+
+Other useful links:
+
+* API Java documentation: [https://developers.google.com/resources/api-libraries/documentation/webmasters/v3/java/latest/](https://developers.google.com/resources/api-libraries/documentation/webmasters/v3/java/latest/)
+
+* Google API Manager: [https://console.developers.google.com/apis/dashboard](https://console.developers.google.com/apis/dashboard)
+
+
+# Implementation
+## Summary
+This connector implements sources, extractors, and iterators for the extractors, where each iterator is responsible for downloading the data of one market. Due to the limitations of the Google API, the service has to deal with a lot of asynchronous API calls to figure out problems like
+
+* what is the total size of all unique pages
+* what is the full list of all unique pages
+* how to download queries and analytics data for each page
+* how to improve the overall performance
+
+There are two implementations for this service to download analytics data for each page, V1 and V2. V1 is the initial design, which is very straightforward: after we get the full list of all unique URL pages, we send a request for the queries and analytics data for each page, with a page filter saying that the page must exactly equal that page. However, if the number of pages is large, for example above 100,000, then given the actual API request speed (less than 4 pages/second), the [...]
+![Conversion Ratio by Group Size](../../static/img/Trie-Conversion-Ratio.png)
+
+In short, a large group size can convert a large percentage of pages into groups, each of which results in a single API call.
+
+The user still has the ability to choose which algorithm or implementation to use when starting the service by configuring the key `source.google_webmasters.request.tuning.get_queries.apply_trie`.
+
+## Entities
+Here is a table briefly explaining the responsibility of each class:
+
+
+Name     | Description
+-------- | ---
+GoogleWebmasterClient | GoogleWebmasterClient provides basic accesses to Google Search Console by utilizing Google Webmaster API.
+GoogleWebmasterDataFetcher | GoogleWebmasterDataFetcher implements the features to get all pages, and download analytics data (e.g. queries, clicks, impressions, CTR, position) for a given set of constraints like dates, pages, and countries.
+GoogleWebmasterFilter | This is a util class providing enums and utility functions relevant to Google webmaster filters.
+GoogleWebMasterSource | This is an abstract class that extends Gobblin's standard QueryBasedSource. It provides basic checks and configuration processing for google-webmaster-pull configuration files.
+GoogleWebMasterSourceDaily | This implementation gives you the ability to do a daily extract from Google Search Console.
+GoogleWebmasterExtractor | An implementation of Gobblin's extractor. <p><br />It relies on a bunch of GoogleWebmasterExtractorIterator generated for each market to extract the data.</p>
+GoogleWebmasterExtractorIterator | The core piece used by GoogleWebmasterExtractor to iterate through the downloaded dataset.
+GoogleWebmasterDayPartitioner | The output partitioner that partitions output by the date of fetched data set
+ProducerJob | This is a partitionable request unit used by GoogleWebmasterExtractorIterator for sending detailed API requests to Google Search Console. It includes the filter dimensions like date range, page URL and page URL filter type(e.g. contains, non-contains, equals). These jobs are generated in a producer thread while requesting queries and analytics data for pages. They are placed into a ConcurrentLinkedDeque and dispatched or processed by a pool of working threads. The downloade [...]
+SimpleProducerJob | SimpleProducerJob is a basic implementation of ProducerJob, utilizing the default partition logic.
+TrieBasedProducerJob | TrieBasedProducerJob is a trie-based implementation of ProducerJob. <p><br/>For the partition logic, it first tries to partition the pages by splitting the trie into smaller ones based on a new group size, which is half of the previous value. When it is not partitionable at the page level, the partition logic falls back to the default one provided by the base class.</p>
+UrlTrie | The trie that holds all URL pages. All fetched pages are saved into a trie in order to use TrieBasedProducerJobs.
+UrlTrieNode | The trie node in the URL trie
+UrlTriePostOrderIterator | This is a post-order iterator that traverses the nodes on the URL trie with a stopping rule: it will not go deeper into nodes whose size (defined as the number of descendant URLs, plus itself if it is a URL page) is less than or equal to the stopping size. In other words, nodes with size less than or equal to the stopping size are treated as leaf nodes.
+UrlTriePrefixGrouper | UrlTriePrefixGrouper will package the URL pages/nodes into groups given the group size while traversing the UrlTrie by utilizing a TrieIterator. If the current node is not a "leaf" node defined by the TrieIterator, then a "fake" group of size 1 will be created by only including this node. <p><br/>A group of URL pages will share the same common longest prefix and will be sent in one API request by using the "containing" page filter. A fake group containing only one  [...]
+
+
+## Work Flow
+Starting with GoogleWebMasterSource, it consumes the job or pull configuration file, does some basic checks, and decides the date range to work on based on the type of GoogleWebMasterSource specified. Then it passes the date range and the list of markets to GoogleWebmasterExtractor to work on. The GoogleWebmasterExtractor will create a GoogleWebmasterExtractorIterator for each market and start the downloading process, which is the same for every market. For the downloading process, first [...]
+
+
+# Configuration
+
+Configuration Key | Default Value | Description
+----------------- | ------------- | ---
+source.google_webmasters.property_url|Must Provide|Provide the property site URL whose google search analytics data you want to download
+source.google_webmasters.request.filters|Optional|The filters that will be passed to all your API requests. <p><br/>Filter format is [GoogleWebmasterFilter.Dimension].[DimensionValue].</p>Currently, this filter operator is "EQUALS" and only Country dimension is supported. Will extend this feature according to more use cases in the future.
+source.google_webmasters.request.dimensions|Must Provide|Allowed dimensions are DATE, PAGE, COUNTRY, QUERY, DEVICE, SEARCH_TYPE, SEARCH_APPEARANCE
+source.google_webmasters.request.metrics|Must Provide|Allowed metrics are CLICKS, IMPRESSIONS, CTR, POSITION
+source.google_webmasters.request.page_limit|5000|The response row limit when you ask for pages. Set it to 5000 when you want to get all pages. Defaults to 5000, which is the maximum allowed.
+source.google_webmasters.request.query_limit|5000|The response row limit when you ask for queries. Defaults to 5000, which is the maximum allowed.
+source.google_webmasters.request.hot_start|Optional|Hot start this service with pre-set pages. Once this is set, the service will ignore source.google_webmasters.request.page_limit, and won't get all pages, but use the pre-set pages instead. <p><br/>This is useful for debugging or resuming your failed work.</p>
+source.google_webmasters.request.tuning.get_queries.time_out|120|Set the time out in minutes for each round.
+source.google_webmasters.request.tuning.get_queries.max_retries|30|Tune the maximum number of retry rounds allowed when API calls fail because of exceeding the quota.
+source.google_webmasters.request.tuning.get_queries.cool_down_time|250|Tune the cool-down time in milliseconds between each round.
+source.google_webmasters.request.tuning.get_queries.batches_per_second|2.25|Tune the speed of API requests, in batches per second.
+source.google_webmasters.request.tuning.get_queries.batch_size|2|Tune the size of a batch. Batch API calls together to reduce the number of HTTP connections. <p><br/>Note: A set of n requests batched together counts toward your usage limit as n requests, not as one request. The batch request is taken apart into a set of requests before processing.</p>Read more at [https://developers.google.com/webmaster-tools/v3/how-tos/batch](https://developers.google.com/webmaster-tools/v3/how-tos/batch).
+source.google_webmasters.request.tuning.get_queries.trie_group_size|500|Set the group size for the URL trie
+source.google_webmasters.request.tuning.get_queries.apply_trie|false|Set to true to use the trie-based algorithm; otherwise, set to false.<p><br/>If set to true, you also need to set page_limit to 5000, indicating that you want to get all pages, because the trie-based algorithm won't give the expected results if you only need a subset of all pages.</p>
+source.google_webmasters.request.tuning.get_pages.requests_per_second|5.0|Tune the speed of API requests while getting all pages.
+source.google_webmasters.request.tuning.get_pages.max_retries|120|Tune the number of maximum retries while getting all pages. Consider the following affecting factors while setting this number: <ol><li>the length of shared prefix path may be very long.</li><li>the Quota Exceeded exception.</li></ol>
+source.google_webmasters.request.tuning.get_pages.time_out|2|Set the time out in minutes while getting all pages.
+
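+A daily pull using the keys above might look roughly like the following sketch. The property URL, dimensions and metrics are illustrative placeholders, and the usual Gobblin job settings (source class, extract/writer configuration, Google API credentials) still need to be added for a real job.
+
+```properties
+# Illustrative values only
+source.google_webmasters.property_url=https://www.example.com/
+source.google_webmasters.request.dimensions=DATE,PAGE,COUNTRY,QUERY
+source.google_webmasters.request.metrics=CLICKS,IMPRESSIONS,CTR,POSITION
+source.google_webmasters.request.page_limit=5000
+source.google_webmasters.request.tuning.get_queries.apply_trie=true
+source.google_webmasters.request.tuning.get_queries.trie_group_size=500
+```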
+
diff --git a/gobblin-website/docs/sources/HadoopTextInputSource.md b/gobblin-website/docs/sources/HadoopTextInputSource.md
new file mode 100644
index 0000000..6ffded6
--- /dev/null
+++ b/gobblin-website/docs/sources/HadoopTextInputSource.md
@@ -0,0 +1,17 @@
+---
+title: Hadoop Text Input
+sidebar_label: Hadoop Text Input
+
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/HelloWorldSource.md b/gobblin-website/docs/sources/HelloWorldSource.md
new file mode 100644
index 0000000..64e7207
--- /dev/null
+++ b/gobblin-website/docs/sources/HelloWorldSource.md
@@ -0,0 +1,16 @@
+---
+title: Hello World
+sidebar_label: Hello World
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/HiveAvroToOrcSource.md b/gobblin-website/docs/sources/HiveAvroToOrcSource.md
new file mode 100644
index 0000000..a039989
--- /dev/null
+++ b/gobblin-website/docs/sources/HiveAvroToOrcSource.md
@@ -0,0 +1,16 @@
+---
+title: Hive Avro-to-ORC
+sidebar_label: Hive Avro-to-ORC
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/HivePurgerSource.md b/gobblin-website/docs/sources/HivePurgerSource.md
new file mode 100644
index 0000000..beba3ed
--- /dev/null
+++ b/gobblin-website/docs/sources/HivePurgerSource.md
@@ -0,0 +1,16 @@
+---
+title: Hive Compliance Purging
+sidebar_label: Hive Compliance Purging
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/HiveSource.md b/gobblin-website/docs/sources/HiveSource.md
new file mode 100644
index 0000000..f8e9f66
--- /dev/null
+++ b/gobblin-website/docs/sources/HiveSource.md
@@ -0,0 +1,16 @@
+---
+title: Hive
+sidebar_label: Hive
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/KafkaSource.md b/gobblin-website/docs/sources/KafkaSource.md
new file mode 100644
index 0000000..33c0454
--- /dev/null
+++ b/gobblin-website/docs/sources/KafkaSource.md
@@ -0,0 +1,16 @@
+---
+title: Kafka
+sidebar_label: Kafka
+---
+
+# Description
+
+See [Kafka-to-HDFS case study](../case-studies/Kafka-HDFS-Ingestion.md)
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/MySQLSource.md b/gobblin-website/docs/sources/MySQLSource.md
new file mode 100644
index 0000000..7c74ed5
--- /dev/null
+++ b/gobblin-website/docs/sources/MySQLSource.md
@@ -0,0 +1,16 @@
+---
+title: MySQL
+sidebar_label: MySQL 
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/OracleSource.md b/gobblin-website/docs/sources/OracleSource.md
new file mode 100644
index 0000000..f0e565d
--- /dev/null
+++ b/gobblin-website/docs/sources/OracleSource.md
@@ -0,0 +1,16 @@
+---
+title: Oracle
+sidebar_label: Oracle
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/QueryBasedSource.md b/gobblin-website/docs/sources/QueryBasedSource.md
new file mode 100644
index 0000000..01150b0
--- /dev/null
+++ b/gobblin-website/docs/sources/QueryBasedSource.md
@@ -0,0 +1,107 @@
+---
+title: Query Based
+sidebar_label: Query Based
+---
+
+# Introduction
+[`QueryBasedSource`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/QueryBasedSource.java)
+represents a category of sources whose data is pulled by sending queries. A dataset of a source is identified as a
+[`SourceEntity`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/QueryBasedSource.java#L96).
+Queries can be issued as HTTP requests or SQL commands. A source often, but not always, has a corresponding
+[`QueryBasedExtractor`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/QueryBasedExtractor.java),
+which defines the way and implements common routines to extract data from the source.
+
+# Constructs
+## `QueryBasedSource`
+![Query based sources](../../static/img/Gobblin-Query-Based-Sources.png)
+Figure 1: Query based sources
+
+Like other categories of sources, a `QueryBasedSource` focuses on creating work units. The way it does so follows a general pattern:
+
+- calculate low watermark of current run based on previous runs
+- compute a high watermark
+- partition datasets of current run into work units
+- pick up previously failed work units.
+
+Finally, it groups several work units into each
+[`MultiWorkUnit`](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/workunit/MultiWorkUnit.java)
+according to the `mr.job.max.mappers` configuration (note: other categories of sources might group work units into `MultiWorkUnit` differently).
+
+## `QueryBasedExtractor`
+![Query based extractors](../../static/img/Gobblin-Query-Based-Extractors.png)
+Figure 2: Query based extractors
+
+Currently in Gobblin, depending on how an extractor communicates with a source
+(or [different communication protocols](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/ProtocolSpecificLayer.java)),
+a `QueryBasedExtractor` falls into 2 categories:
+[`RestApiExtractor`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/restapi/RestApiExtractor.java)
+and
+[`JdbcExtractor`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-sql/src/main/java/org/apache/gobblin/source/jdbc/JdbcExtractor.java).
+A specific extractor has to provide some
+[source specific logic](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/SourceSpecificLayer.java)
+in order to successfully extract information from the source.
+
+### `build`
+Building a query based extractor may involve three queries:
+
+![Query based extractor build queries](../../static/img/Gobblin-Query-Based-Extractor-Build-Queries.png)
+Figure 3: Query based extractor build queries
+
+- `extractMetadata` sends a query to fetch the data schema. For example:
+```sql
+select  col.column_name,  col.data_type,  
+  case when CHARACTER_OCTET_LENGTH is null then 0 else 0 end as length,
+  case when NUMERIC_PRECISION is null then 0 else NUMERIC_PRECISION end as precesion,
+  case when NUMERIC_SCALE is null then 0 else NUMERIC_SCALE end as scale,
+  case when is_nullable='NO' then 'false' else 'true' end as nullable,  '' as format,
+  case when col.column_comment is null then '' else col.column_comment end as comment 
+from information_schema.COLUMNS col
+WHERE upper(col.table_name)=upper(?) AND upper(col.table_schema)=upper(?)
+order by col.ORDINAL_POSITION 
+```
+- `getMaxWatermark` sends a query for calculating the latest high watermark. For example:
+```sql
+SELECT max(SystemModTime) FROM salesforce.campaign
+where (SystemModTime >= '2014-01-01 00:00:00' and SystemModTime <= '2017-03-09 10:42:10')
+```
+- `getSourceCount` sends a query for the total count of records to be pulled from the source. For example:
+```sql
+SELECT COUNT(1) FROM salesforce.campaign
+where (SystemModTime >= '2014-01-01 00:00:00' and SystemModTime <= '2017-03-01 19:03:07')
+```
+
+The actual implementations of those methods are pushed to an upper layer, which uses its own protocol (e.g., [Rest Api](../sources/RestApiSource.md) or JDBC; the examples given above use JDBC) to query the source.
+
+### `readRecord`
+While querying the record set for the last work unit, the upper bound is removed if appropriate. For a daily open-ended full-dump job, this fetches a more complete data set, since new data may be generated or existing data may change between the creation of the data query and its execution.
+
+There are two separate approaches to fetch the record set:
+
+- `getRecordSet`: A standard way to send a query, e.g., via REST API or JDBC
+```sql
+SELECT id,name,budget,systemmodtime FROM salesforce.campaign
+where (SystemModTime >= '2014-01-01 00:00:00')
+```
+- `getRecordSetFromSourceApi`: A way to send a query using a source-specific API, e.g., Salesforce
+
+Likewise, the actual implementations of those methods are pushed to an upper layer. See chapters: [Rest Api](../sources/RestApiSource.md), [Salesforce](../sources/SalesforceSource.md).
+
+
+# Configuration
+| Configuration Key | Default Value | Description |
+| ----------------- | ------------- | ----------- |
+| `source.querybased.schema` | Must Provide | Database name |
+| `source.entity` | Must Provide | Name of the source entity that will be pulled from the source. It could be a database table, a source topic, a restful entity, etc. |
+| `source.max.number.of.partitions` |  20 | Maximum number of partitions or work units to split this current run across. Only used by the `QueryBasedSource` and `FileBasedSource`. |
+| `source.querybased.watermark.type` | Must Provide | The format of the watermark that is used when extracting data from the source. Possible types are timestamp, date, hour, simple. |
+| `source.querybased.start.value` | Must provide | Value for the watermark to start pulling data from, also the default watermark if the previous watermark cannot be found in the old task states. |
+| `source.querybased.end.value` | Optional | The high watermark the job should pull up to. |
+| `extract.delta.fields` | Optional | List of columns that are associated with the watermark. |
+| `source.querybased.skip.count.calc` | False | Disable calculating the total counts of records to be pulled from the source. |
+| `source.querybased.is.specific.api.active` | False | True if this pull needs to use source-specific APIs instead of standard protocols, e.g., the Salesforce Bulk API instead of the REST API. |
+
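+For orientation, a minimal timestamp-watermarked pull using the keys above might look like the sketch below. The schema, entity and watermark values are placeholders; a real job also needs the source class, connection settings and extract configuration for the specific database or API.
+
+```properties
+# Illustrative values only
+source.querybased.schema=salesforce
+source.entity=campaign
+source.querybased.watermark.type=timestamp
+source.querybased.start.value=20140101000000
+extract.delta.fields=SystemModTime
+source.max.number.of.partitions=20
+```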
+
+
+
+
diff --git a/gobblin-website/docs/sources/RestApiSource.md b/gobblin-website/docs/sources/RestApiSource.md
new file mode 100644
index 0000000..eff4510
--- /dev/null
+++ b/gobblin-website/docs/sources/RestApiSource.md
@@ -0,0 +1,50 @@
+---
+title: Rest API
+sidebar_label: Rest API
+---
+
+# Introduction
+A RestApiSource is a [QueryBasedSource](../sources/QueryBasedSource.md) which uses a [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer)
+API for queries. `RestApiExtractor` is a `QueryBasedExtractor` that uses REST to communicate with the source. To establish the communication,
+a [`RestApiConnector`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/restapi/RestApiConnector.java) is
+required.
+
+# Constructs
+## `RestApiSource`
+Coming soon...
+
+## `RestApiExtractor`
+A `RestApiExtractor` sets up the common routines to query information from a REST source, for example, `extractMetadata`,
+`getMaxWatermark`, `getSourceCount`, `getRecordSet`, which are mentioned in chapter [QueryBasedSource](../sources/QueryBasedSource.md).
+In terms of constructing the actual query and extracting the data from the response, the source specific layer holds the truth,
+for example, `SalesforceExtractor`.
+
+A simplified general flow of routines is depicted in Figure 1:
+
+![Rest api extractor general routine flow](../../static/img/Rest-Api-Extractor-Flow.png)
+Figure 1: RestApiExtractor general routine flow
+
+Depending on the routine, the corresponding [getX], [constructGetXQuery], and [extractXFromResponse] methods are:
+
+| Description | [getX] | [constructGetXQuery] | [extractXFromResponse] |
+| ----------- | ------ | -------------------- | ---------------------- |
+| Get data schema | `extractMetadata` | `getSchemaMetadata` | `getSchema` |
+| Calculate latest high watermark | `getMaxWatermark` | `getHighWatermarkMetadata` | `getHighWatermark` |
+| Get total counts of records to be pulled | `getSourceCount` | `getCountMetadata` | `getCount` |
+| Get records | `getRecordSet` | `getDataMetadata` | `getData` |
+
+There are other interactions between the `RestApiExtractor` layer and `SourceSpecificLayer`. The key points are:
+
+- A [`ProtocolSpecificLayer`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/ProtocolSpecificLayer.java), such as
+`RestApiExtractor`, understands the protocol and sets up a routine to communicate with the source
+- A `SourceSpecificLayer`, such as `SalesforceExtractor`, knows the source and fits into the routine by providing and analyzing source specific information
+
+# Configuration
+| Configuration Key | Default Value | Description |
+| ----------------- | ------------- | ----------- |
+| `source.querybased.query` | Optional | The query that the extractor should execute to pull data. |
+| `source.querybased.excluded.columns` | Optional | Names of columns excluded while pulling data. |
+| `extract.delta.fields` | Optional | List of columns that are associated with the watermark. |
+| `extract.primary.key.fields` | Optional | List of columns that will be used as the primary key for the data. |
+
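+A REST-based pull typically reuses the query-based keys from [QueryBasedSource](../sources/QueryBasedSource.md) together with the ones above. A minimal sketch with placeholder column names follows; the query syntax itself depends on the source-specific layer (e.g., SOQL for Salesforce).
+
+```properties
+# Illustrative values only
+source.querybased.query=SELECT Id, Name, SystemModTime FROM Campaign
+extract.primary.key.fields=Id
+extract.delta.fields=SystemModTime
+source.querybased.excluded.columns=Description
+```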
+
diff --git a/gobblin-website/docs/sources/SalesforceSource.md b/gobblin-website/docs/sources/SalesforceSource.md
new file mode 100644
index 0000000..ff455c9
--- /dev/null
+++ b/gobblin-website/docs/sources/SalesforceSource.md
@@ -0,0 +1,16 @@
+---
+title: Salesforce
+sidebar_label: Salesforce
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/SftpSource.md b/gobblin-website/docs/sources/SftpSource.md
new file mode 100644
index 0000000..6286f34
--- /dev/null
+++ b/gobblin-website/docs/sources/SftpSource.md
@@ -0,0 +1,16 @@
+---
+title: SFTP
+sidebar_label: SFTP
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/SimpleJsonSource.md b/gobblin-website/docs/sources/SimpleJsonSource.md
new file mode 100644
index 0000000..3a33f3a
--- /dev/null
+++ b/gobblin-website/docs/sources/SimpleJsonSource.md
@@ -0,0 +1,16 @@
+---
+title: JSON
+sidebar_label: JSON
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/SqlServerSource.md b/gobblin-website/docs/sources/SqlServerSource.md
new file mode 100644
index 0000000..9d9c329
--- /dev/null
+++ b/gobblin-website/docs/sources/SqlServerSource.md
@@ -0,0 +1,16 @@
+---
+title: SQL Server
+sidebar_label: SQL Server
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/TeradataSource.md b/gobblin-website/docs/sources/TeradataSource.md
new file mode 100644
index 0000000..6c10570
--- /dev/null
+++ b/gobblin-website/docs/sources/TeradataSource.md
@@ -0,0 +1,16 @@
+---
+title: Teradata
+sidebar_label: Teradata
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/sources/WikipediaSource.md b/gobblin-website/docs/sources/WikipediaSource.md
new file mode 100644
index 0000000..44948e4
--- /dev/null
+++ b/gobblin-website/docs/sources/WikipediaSource.md
@@ -0,0 +1,16 @@
+---
+title: Wikipedia
+sidebar_label: Wikipedia
+---
+
+# Description
+
+TODO
+
+# Usage
+
+TODO
+
+# Configuration
+
+TODO
diff --git a/gobblin-website/docs/user-guide/Azure-Kubernetes-Deployment.md b/gobblin-website/docs/user-guide/Azure-Kubernetes-Deployment.md
new file mode 100644
index 0000000..ca19f3d
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Azure-Kubernetes-Deployment.md
@@ -0,0 +1,88 @@
+---
+title: GaaS on Azure Deployment
+sidebar_label: GaaS on Azure Deployment
+---
+
+# GaaS on Azure Deployment Steps
+
+## Create Azure Container Registry [Optional]
+
+1\) Log into Azure Container Registry
+
+```bash
+$ az acr login --name gobblintest
+```
+
+2\) Tag docker images to container registry
+
+```bash
+$ docker tag <gaas_image_id> gobblintest.azurecr.io/gobblin-service
+$ docker tag <standalone_image_id> gobblintest.azurecr.io/gobblin-standalone
+```
+
+3\) Push the images
+
+```bash
+$ docker push gobblintest.azurecr.io/gobblin-service
+$ docker push gobblintest.azurecr.io/gobblin-standalone
+```
+
+The images should now be hosted on Azure with the tag `latest`.
+
+## Deploy the base K8s cluster
+
+1\) Create a resource group on Azure
+
+2\) Create a cluster and deploy it onto the resource group
+
+```bash
+az aks create --resource-group <resource_group_name> --name GaaS-cluster-test --node-count 1 --enable-addons monitoring --generate-ssh-keys
+```
+
+3\) Switch kubectl to use the Azure cluster's credentials
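+
+One way to do this, assuming the resource group and cluster name from step 2 (adjust to your own names), is to merge the cluster credentials into your kubeconfig with the Azure CLI:
+
+```bash
+az aks get-credentials --resource-group <resource_group_name> --name GaaS-cluster-test
+```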
+
+4\) Check status of cluster
+
+```bash
+$ kubectl get pods
+```
+
+## Install the nginx ingress to connect to the Azure Cluster
+
+1\) Install helm if you don't currently have it
+
+```bash
+brew install helm
+helm init
+```
+
+2\) Deploy the nginx helm chart to create the ingress
+
+```bash
+helm install stable/nginx-ingress
+```
+
+If this is the first time deploying Helm (v2.0), you will need to set up Tiller, Helm's in-cluster component, which runs under a service account with cluster-admin permissions inside the cluster. Otherwise you'll run into this [issue](https://github.com/helm/helm/issues/2224).
+
+> Error: configmaps is forbidden: User "system:serviceaccount:kube-system:default" cannot list configmaps in the namespace "kube-system"
+
+To set up the tiller \(steps are also found in the issue link\)
+
+```bash
+kubectl create serviceaccount --namespace kube-system tiller
+kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
+kubectl edit deploy --namespace kube-system tiller-deploy #and add the line serviceAccount: tiller to spec/template/spec
+```
+
+3\) Deploy the ingress controller in `gobblin-kubernetes/gobblin-service/azure-cluster`
+
+4\) Run `kubectl get services`, and the output should look something like this:
+
+```text
+gaas-svc                                        ClusterIP      10.0.176.58    <none>           6956/TCP                     16h
+honorary-possum-nginx-ingress-controller        LoadBalancer   10.0.182.255   <EXTERNAL_IP>    80:30488/TCP,443:31835/TCP   6m13s
+honorary-possum-nginx-ingress-default-backend   ClusterIP      10.0.236.153   <none>           80/TCP                       6m13s
+kubernetes                                      ClusterIP      10.0.0.1       <none>           443/TCP                      10d
+```
+
+5\) Send a request to the IP for the `honorary-possum-nginx-ingress-controller`
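+
+For example, a quick smoke test against the ingress (the exact path depends on how the ingress routes to `gaas-svc`):
+
+```bash
+curl -v http://<EXTERNAL_IP>/
+```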
diff --git a/gobblin-website/docs/user-guide/Building-Gobblin-as-a-Service.md b/gobblin-website/docs/user-guide/Building-Gobblin-as-a-Service.md
new file mode 100644
index 0000000..c9c064c
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Building-Gobblin-as-a-Service.md
@@ -0,0 +1,54 @@
+---
+title: Building Gobblin as a Service
+sidebar_label: Building Gobblin as a Service
+
+---
+
+# Introduction
+Gobblin as a Service (GaaS) takes in a user request (a logical flow), converts it into a series of Gobblin jobs, and monitors these jobs in a distributed manner.
+The design of the service can be found here: https://cwiki.apache.org/confluence/display/GOBBLIN/Gobblin+as+a+Service
+
+# Running Gobblin as a Service
+1. [Build Gobblin](./Building-Gobblin.md) or use one of the [provided distributions](https://github.com/apache/gobblin/releases)
+2. Untar the build file `tar -xvf apache-gobblin-incubating-bin-${GOBBLIN_VERSION}.tar.gz`
+3. Execute the start script `./gobblin-dist/bin/gobblin-service.sh`
+4. View output in `service.out`
+
+Currently the setup only runs a portion of the service, but work will be done to have a basic end-to-end workflow soon.
+
+The service can now be accessed on `localhost:6956`
+
+# Running Gobblin as a Service with Docker
+There are also Dockerfiles to create new images of Gobblin based on the source code that can be easily run independently.
+
+The Docker compose is set up to easily create a working end-to-end workflow of Gobblin as a Service, which communicates with Gobblin Standalone through a local volume filesystem.
+
+To run the full docker compose:
+
+1. `export GOBBLIN_ROOT_DIR=<root_directory_of_gobblin>`
+2. `export LOCAL_DATAPACK_DIR=<local_directory_of_templateUris>`
+3. `export LOCAL_JOB_DIR=<local_directory_to_read_and_write_jobs>`
+4. `docker compose -f gobblin-docker/gobblin-service/alpine-gaas-latest/docker-compose.yml build`
+5. `docker compose -f gobblin-docker/gobblin-service/alpine-gaas-latest/docker-compose.yml up`
+ 
+The docker container exposes the endpoints from Gobblin as a Service which can be accessed on `localhost:6956`
+
+# Running Gobblin as a Service with Kubernetes
+Gobblin as a service also has a kubernetes cluster, which can be deployed to any K8s environment.
+
+Currently, the yamls use [Kustomize](https://kubernetes.io/docs/tasks/manage-kubernetes-objects/kustomization/) for configuration management. In the future, we may utilise Helm instead.
+
+The cluster is split into 3 environments:
+1) base-cluster (deploys one pod of GaaS and Gobblin standalone, where GaaS writes jobSpecs to a folder tracked by the standalone instance)
+2) mysql-cluster (utilises MySQL for storing specStores instead of FS, future work may involve writing to a job queue to be picked by gobblin standalone)
+3) azure-cluster (deploys Dev on Microsoft Azure), more docs [here](./Azure-Kubernetes-Deployment.md)
+
+To add any flow config template for GaaS to use, add the `.template` file to `gobblin-kubernetes/gobblin-service/base-cluster/` and add the file to the configmap.
+For production purposes, flow config templates should be stored in a proper file system or a database instead of being added to the configmap.
+
+To deploy any of these clusters, run the following command from the repository root.
+```bash
+kubectl apply -k gobblin-kubernetes/gobblin-service/<ENV>/
+```
+
+Then, find the external IP of the cluster and start sending requests.
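+
+For example, the external IP exposed by the ingress/load balancer can usually be found with:
+
+```bash
+kubectl get services
+```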
diff --git a/gobblin-website/docs/user-guide/Building-Gobblin.md b/gobblin-website/docs/user-guide/Building-Gobblin.md
new file mode 100644
index 0000000..fdf9484
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Building-Gobblin.md
@@ -0,0 +1,62 @@
+---
+title: Building Gobblin
+sidebar_label: Building Gobblin
+
+---
+
+# Introduction
+
+This page outlines all the options that can be specified when building Gobblin using Gradle. The typical way of building Gobblin is to first checkout the code-base from GitHub and then build the code-base using Gradle.
+
+```bash
+git clone https://github.com/apache/gobblin.git
+cd gobblin
+./gradlew assemble
+```
+
+If one wants to compile the code as well as run the tests, use `./gradlew assemble test`
+or `./gradlew build`.
+
+There are a number of parameters that can be passed into the above command to customize the build process.
+
+# Options
+
+These options just need to be added to the command above to take effect.
+
+### Versions
+
+#### Hadoop Version
+
+The Hadoop version can be specified by adding the option `-PhadoopVersion=[my-hadoop-version]`.
+
+#### Hive Version
+
+The Hive version can be specified by adding the option `-PhiveVersion=[my-hive-version]`.
+
+#### Pegasus Version
+
+The Pegasus version can be specified by adding the option `-PpegasusVersion=[my-pegasus-version]`.
+
+#### Byteman Version
+
+The Byteman version can be specified by adding the option `-PbytemanVersion=[my-byteman-version]`.
+
+### Exclude Hadoop Dependencies from `gobblin-dist.tar.gz`
+
+Add the option `-PexcludeHadoopDeps` to exclude all Hadoop libraries from `gobblin-dist.tar.gz`.
+
+### Exclude Hive Dependencies from `gobblin-dist.tar.gz`
+
+Add the option `-PexcludeHiveDeps` to exclude all Hive libraries from `gobblin-dist.tar.gz`.
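+
+These options can be combined in a single invocation. For example, to build against a specific Hadoop version while leaving the Hadoop libraries out of the tarball:
+
+```bash
+./gradlew assemble -PhadoopVersion=<my-hadoop-version> -PexcludeHadoopDeps
+```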
+
+# Custom Gradle Tasks
+
+A few custom built Gradle tasks.
+
+### Print Project Dependencies
+
+Executing this command will print out all the dependencies between the different Gobblin Gradle sub-projects: `./gradlew dotProjectDependencies`.
+
+# Useful Gradle Commands
+
+These commands make working with Gradle a little easier.
diff --git a/gobblin-website/docs/user-guide/Compaction.md b/gobblin-website/docs/user-guide/Compaction.md
new file mode 100644
index 0000000..74e7039
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Compaction.md
@@ -0,0 +1,299 @@
+---
+title: Compaction
+sidebar_label: Compaction
+---
+
+Compaction can be used to post-process files pulled by Gobblin with certain semantics. Deduplication is one of the common reasons to do compaction, e.g., you may want to
+
+* deduplicate on all fields of the records.
+* deduplicate on key fields of the records, keep the one with the latest timestamp for records with the same key.
+
+This is because duplicates can be generated for multiple reasons including both intended and unintended:
+
+* For ingestion from data sources with mutable records (e.g., relational databases), instead of ingesting a full snapshot of a table every time, one may wish to ingest only the records that were changed since the previous run (i.e., delta records), and merge these delta records with previously generated snapshots in a compaction. In this case, for records with the same primary key, the one with the latest timestamp should be kept.
+* The data source you ingest from may have duplicate records, e.g., if you have a hierarchy of Kafka clusters where topics are replicated among the Kafka clusters, duplicate records may be generated during the replication. In some data sources duplicate records may also be produced by the data producer.
+* In rare circumstances, Gobblin may pull the same data twice, thus creating duplicate records. This may happen if Gobblin publishes the data successfully, but for some reason fails to persist the checkpoints (watermarks) into the state store.
+
+Gobblin provides two compactors out-of-the-box, a MapReduce compactor and a Hive compactor.
+
+# MapReduce Compactor
+
+The MapReduce compactor can be used to deduplicate on all or certain fields of the records. For duplicate records, one of them will be preserved; there is no guarantee which one will be preserved.
+
+A common use case of the MapReduce Compactor is Kafka record deduplication. We will use the following example use case to explain the MapReduce Compactor.
+
+## Example Use Case
+
+Suppose we ingest data from a Kafka broker, and we would like to publish the data by hour and by day, both of which are deduplicated:
+
+- Data in the Kafka broker is first ingested into an `hourly_staging` folder, e.g., `/data/kafka_topics/PageViewEvent/hourly_staging/2015/10/29/08...`
+- A compaction with deduplication runs hourly, consumes data in `hourly_staging` and publishes data into `hourly`, e.g., `/data/kafka_topics/PageViewEvent/hourly/2015/10/29/08...`
+- A non-deduping compaction runs daily, consumes data in `hourly` and publishes data into `daily`, e.g., `/data/kafka_topics/PageViewEvent/daily/2015/10/29...`
+
+## Basic Usage
+
+`MRCompactor.compact()` is the entry point for MapReduce-based compaction. The compaction unit is [`Dataset`](https://github.com/apache/gobblin/blob/master/gobblin-compaction/src/main/java/org/apache/gobblin/compaction/dataset/Dataset.java). `MRCompactor` uses a [`DatasetsFinder`](https://github.com/apache/gobblin/blob/master/gobblin-compaction/src/main/java/org/apache/gobblin/compaction/dataset/DatasetsFinder.java) to find all datasets eligible for compaction. Implementations of `Datase [...]
+
+In the above example use case, for hourly compaction, each dataset contains an hour's data in the `hourly_staging` folder, e.g., `/data/kafka_topics/PageViewEvent/hourly_staging/2015/10/29/08`; for daily compaction, each dataset contains the 24 hourly folders of a day, e.g., `/data/kafka_topics/PageViewEvent/hourly/2015/10/29`. In hourly compaction, you may use the following config properties:
+
+```properties
+compaction.datasets.finder=org.apache.gobblin.compaction.dataset.TimeBasedSubDirDatasetsFinder
+compaction.input.dir=/data/kafka_topics
+compaction.dest.dir=/data/kafka_topics
+compaction.input.subdir=hourly_staging
+compaction.dest.subdir=hourly
+compaction.folder.pattern=YYYY/MM/dd
+compaction.timebased.max.time.ago=3h
+compaction.timebased.min.time.ago=1h
+compaction.jobprops.creator.class=org.apache.gobblin.compaction.mapreduce.MRCompactorTimeBasedJobPropCreator
+compaction.job.runner.class=org.apache.gobblin.compaction.mapreduce.avro.MRCompactorAvroKeyDedupJobRunner (if your data is Avro)
+```
+
+If your data format is not Avro, you can implement a different job runner class for deduplicating your data format. 
+
+## Non-deduping Compaction via Map-only Jobs
+
+There are two types of Non-deduping compaction.
+
+- **Type 1**: deduplication is not needed, for example you simply want to consolidate files in 24 hourly folders into a single daily folder.
+- **Type 2**: deduplication is needed, i.e., the published data should not contain duplicates, but the input data are already deduplicated. The daily compaction in the above example use case is of this type.
+
+Property `compaction.input.deduplicated` specifies whether the input data are already deduplicated (default is false), and property `compaction.output.deduplicated` specifies whether the output data should be deduplicated (default is true). For type 1 compaction, set both to false. For type 2 compaction, set both to true.
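+
+For the daily compaction in the example use case (type 2), that means:
+
+```properties
+compaction.input.deduplicated=true
+compaction.output.deduplicated=true
+```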
+
+The reason these two types of compaction need to be separated is late data handling, which we will explain next.
+
+## Handling Late Records
+
+Late records are records that arrived at a folder after compaction on this folder has started. We explain how Gobblin handles late records using the following example.
+
+In this use case, both hourly compaction and daily compaction need a mechanism to handle late records. For hourly compaction, late records are records that arrived at an `hourly_staging` folder after the hourly compaction of that folder has started. It is similar for daily compaction.
+
+**Compaction with Deduplication**
+
+For a compaction with deduplication (i.e., hourly compaction in the above use case), there are two options to deal with late data:
+
+- **Option 1**: if there are late data, re-do the compaction. For example, you may run the hourly compaction multiple times per hour. The first run will do the normal compaction, and in each subsequent run, if it detects late data in a folder, it will re-do compaction for that folder.
+
+To do so, set `compaction.job.overwrite.output.dir=true` and `compaction.recompact.from.input.for.late.data=true`.
+
+Please note the following when you use this option: (1) this means that your already-published data will be re-published if late data are detected; (2) this is potentially dangerous if your input folders have short retention periods. For example, suppose `hourly_staging` folders have a 2-day retention period, i.e., folder `/data/kafka_topics/PageViewEvent/hourly_staging/2015/10/29` will be deleted on 2015/10/31. If, after 2015/10/31, new data arrived at this folder and you re-compact thi [...]
+
+- **Option 2**: (this is the default option) if there are late data, copy the late data into a `[output_subdir]/_late` folder, e.g., for hourly compaction, late data in `hourly_staging` will be copied to `hourly_late` folders, e.g., `/data/kafka_topics/PageViewEvent/hourly_late/2015/10/29...`. 
+
+If re-compaction is not necessary, this is all you need to do. If re-compaction is needed, you may schedule or manually invoke a re-compaction job which will re-compact by consuming data in both `hourly` and `hourly_late`. For this job, you need to set `compaction.job.overwrite.output.dir=true` and `compaction.recompact.from.dest.paths=true`.
+
+Note that this re-compaction is different from the re-compaction in Option 1: this re-compaction consumes data in output folders (i.e., `hourly`) whereas the re-compaction in Option 1 consumes data in input folders (i.e., `hourly_staging`).
+
+**Compaction without Deduplication**
+
+For a compaction without deduplication, if it is type 2, the same two options above apply. If it is type 1, late data will simply be copied to the output folder.
+
+**How to Determine if a Data File is Late**
+
+Every time a compaction finishes (except the case below), Gobblin will create a file named `_COMPACTION_COMPLETE` in the compaction output folder. This file contains the timestamp of when the compaction job starts. All files in the input folder with earlier modification timestamps have been compacted. Next time the compaction runs, files in the input folder with later timestamps are considered late data.
+
+The `_COMPACTION_COMPLETE` file will only be created if it is a regular compaction that consumes input data (including compaction jobs that just copy late data to the output folder or the `[output_subdir]/_late` folder without launching an MR job). It will not be created if it is a re-compaction that consumes output data. This is because whether a file in the input folder is a late file depends on whether it has been compacted or moved into the output folder, which is not affected by [...]
+
+One way of reducing the chance of seeing late records is to verify data completeness before running compaction, which will be explained next.
+
+## Verifying Data Completeness Before Compaction
+
+Besides aborting the compaction job for a dataset if new data in the input folder is found, another way to reduce the chance of seeing late events is to verify the completeness of input data before running compaction. To do so, set `compaction.completeness.verification.enabled=true`, extend `DataCompletenessVerifier.AbstractRunner` and put in your verification logic, and pass it via `compaction.completeness.verification.class`.
+
+When data completeness verification is enabled, `MRCompactor` will verify data completeness for the input datasets, and meanwhile speculatively start the compaction MR jobs. When the compaction MR job for a dataset finishes, if the completeness of the dataset is verified, its compacted data will be published, otherwise it is discarded, and the compaction MR job for this dataset will be launched again with a reduced priority.
+
+It is possible to control which topics should or should not be verified via `compaction.completeness.verification.whitelist` and `compaction.completeness.verification.blacklist`. It is also possible to set a timeout for data completeness verification via `compaction.completeness.verification.timeout.minutes`. A dataset whose completeness verification timed out can be configured to be either compacted anyway or not compacted.
+
+# Hive Compactor
+
+The Hive compactor can be used to merge a snapshot with one or multiple deltas. It assumes the snapshot and the deltas meet the following requirements:
+
+1. Snapshot and all deltas are in Avro format.
+2. Snapshot and all deltas have the same primary key attributes (they do not need to have the same schema).
+3. Snapshot is pulled earlier than all deltas. Therefore if a key appears in both snapshot and deltas, the one in the snapshot should be discarded.
+4. The deltas are pulled one after another, and ordered in ascending order of pull time. If a key appears in both the ith delta and the jth delta (i < j), the one in the jth delta survives.
+
+The merged data will be written to the HDFS directory specified in `output.datalocation`, as one or more Avro files. The schema of the output data will be the same as the schema of the last delta (which is the last pulled data and thus has the latest schema).
+
+In the near future we also plan to support selecting records by timestamp (rather than by which file they appear in). This is useful if the snapshot and the deltas are pulled in parallel: if a key has multiple occurrences, we should keep the one with the latest timestamp.
+
+Note that since delta tables don't have information of deleted records, such information is only available the next time the full snapshot is pulled.
+
+## Basic Usage
+
+A Hive Compactor job consists of one global configuration file, which refers to one or more job configurations.
+
+### Global Config Properties (example: compaction.properties)
+
+(1) Required:
+
+- _**compaction.config.dir**_
+
+This is the compaction jobconfig directory. Each file in this directory should be a jobconfig file (described in the next section).
+
+(2) Optional:
+
+- _**hadoop.configfile.***_
+
+Hadoop configuration files that should be loaded
+(e.g., hadoop.configfile.coresite.xml=/export/apps/hadoop/latest/etc/hadoop/core-site.xml)
+
+- _**hdfs.uri**_
+
+If property `fs.defaultFS` (or `fs.default.name`) is specified in the hadoop config file, then this property is not needed. However, if it is specified, it will override `fs.defaultFS` (or `fs.default.name`).
+
+If `fs.defaultFS` or `fs.default.name` is not specified in the hadoop config file, and this property is also not specified, then the default value "hdfs://localhost:9000" will be used.
+
+- _**hiveserver.version**_ (default: 2)
+
+Either 1 or 2.
+
+- _**hiveserver.connection.string**_
+
+- _**hiveserver.url**_
+
+- _**hiveserver.user**_ (default: "")
+
+- _**hiveserver.password**_ (default: "")
+
+If `hiveserver.connection.string` is specified, it will be used to connect to hiveserver.
+
+If `hiveserver.connection.string` is not specified but `hiveserver.url` is specified, then it uses (`hiveserver.url`, `hiveserver.user`, `hiveserver.password`) to connect to hiveserver.
+
+If neither `hiveserver.connection.string` nor `hiveserver.url` is specified, then embedded hiveserver will be used (i.e., `jdbc:hive://` if `hiveserver.version=1`, `jdbc:hive2://` if `hiveserver.version=2`)
+
+- _**hivesite.dir**_
+
+Directory that contains hive-site.xml, if hive-site.xml should be loaded.
+
+- _**hive.***_
+
+Any hive config property. (e.g., `hive.join.cache.size`). If specified, it will override the corresponding property in hive-site.xml.
+
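+Putting these together, a minimal global config might look like the sketch below; the paths and URIs are placeholders for your environment.
+
+```properties
+# compaction.properties (illustrative)
+compaction.config.dir=/path/to/compaction/jobconf
+hdfs.uri=hdfs://namenode-host:9000
+hiveserver.version=2
+hivesite.dir=/path/to/hive/conf
+```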
+
+### Job Config Properties (example: jobconf/task1.conf)
+
+(1) Required:
+
+- _**snapshot.pkey**_
+
+comma separated primary key attributes of the snapshot table
+
+- _**snapshot.datalocation**_
+
+snapshot data directory in HDFS
+
+- _**delta.i.pkey**_ (i = 1, 2...)
+
+the primary key of ith delta table
+(the primary key of snapshot and all deltas should be the same)
+
+- _**delta.i.datalocation**_ (i = 1, 2...)
+
+ith delta table's data directory in HDFS
+
+- _**output.datalocation**_
+
+the HDFS data directory for the output
+(make sure you have write permission on this directory)
+
+(2) Optional:
+
+- _**snapshot.name**_ (default: randomly generated name)
+
+prefix name of the snapshot table. The table name will be snapshot.name + random suffix
+
+- _**snapshot.schemalocation**_
+
+snapshot table's schema location in HDFS. If not specified, schema will be extracted from the data.
+
+- _**delta.i.name**_ (default: randomly generated name)
+
+prefix name of the ith delta table. The table name will be delta.i.name + random suffix
+
+- _**delta.i.schemalocation**_
+
+ith delta table's schema location in HDFS. If not specified, schema will be extracted from the data.
+
+- _**output.name**_ (default: randomly generated name)
+
+prefix name of the output table. The table name will be output.name + random suffix
+
+- _**hive.db.name**_ (default: default)
+
+the database name to be used. This database should already exist, and you should have write permission on it.
+
+- _**hive.queue.name**_ (default: default)
+
+queue name to be used.
+
+- _**hive.use.mapjoin**_ (default: if not specified in the global config file, then false)
+
+whether map-side join should be turned on. If specified both in this property and in the global config file (hive.*), this property takes precedence.
+
+- _**hive.mapjoin.smalltable.filesize**_ (default: if not specified in the global config file, then use Hive's default value)
+
+if hive.use.mapjoin = true, mapjoin will be used if the small table size is smaller than hive.mapjoin.smalltable.filesize (in bytes).
+If specified both in this property and in the global config file (hive.*), this property takes precedence.
+
+- _**hive.tmpschema.dir**_ (default: the parent dir of the data location dir where the data is used to extract the schema)
+
+If we need to extract schema from data, this dir is for the extracted schema.
+Note that if you do not have write permission on the default dir, you must specify this property as a dir where you do have write permission.
+
+- _**snapshot.copydata**_ (default: false)
+
+Set to true if you don't want to (or are unable to) create an external table on snapshot.datalocation. A copy of the snapshot data will be created in `hive.tmpdata.dir`, and will be removed after the compaction.
+
+This property should be set to true if either of the following two situations applies:
+
+(i) You don't have write permission to `snapshot.datalocation`. If so, once you create an external table on `snapshot.datalocation`, you may not be able to drop it. This is a Hive bug and for more information, see [this page](https://issues.apache.org/jira/browse/HIVE-9020), which includes a Hive patch for the bug.
+
+(ii) You want to use a certain subset of files in `snapshot.datalocation` (e.g., `snapshot.datalocation` contains both .csv and .avro files but you only want to use .avro files)
+
+- _**delta.i.copydata**_ (i = 1, 2...) (default: false)
+
+Similar to `snapshot.copydata`.
+
+- _**hive.tmpdata.dir**_ (default: "/")
+
+If `snapshot.copydata` = true or `delta.i.copydata` = true, the data will be copied to this dir. You should have write permission to this dir.
+
+- _**snapshot.dataformat.extension.name**_ (default: "")
+
+If `snapshot.copydata` = true, then only those data files whose extension is `snapshot.dataformat` will be moved to `hive.tmpdata.dir`.
+
+- _**delta.i.dataformat.extension.name**_ (default: "")
+
+Similar to `snapshot.dataformat.extension.name`.
+
+- _**mapreduce.job.num.reducers**_
+
+Number of reducers for the job.
+
+- _**timing.file**_ (default: time.txt)
+
+A file where the running time of each compaction job is printed.
+
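+For reference, a job config (e.g., one of the files under `compaction.config.dir`) covering the required keys might look like the following sketch; the primary key and HDFS paths are placeholders for your own tables.
+
+```properties
+# Illustrative job config
+snapshot.pkey=member_id
+snapshot.datalocation=/data/MyDB/MyTable/snapshot
+delta.1.pkey=member_id
+delta.1.datalocation=/data/MyDB/MyTable/delta_1
+delta.2.pkey=member_id
+delta.2.datalocation=/data/MyDB/MyTable/delta_2
+output.datalocation=/data/MyDB/MyTable/compacted
+hive.db.name=default
+```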
+
+# Running a Compaction Job
+
+Both the MapReduce and Hive-based compaction configurations can be executed with `bin/gobblin-compaction.sh`.
+
+The usage is as follows:
+```text
+gobblin-compaction.sh [OPTION] --type <compaction type: hive or mr> --conf <compaction configuration file>
+Where OPTION can be:
+  --projectversion <version>    Gobblin version to be used. If set, overrides the distribution build version
+  --logdir <log dir>            Gobblin's log directory: if not set, taken from ${GOBBLIN_LOG_DIR} if present. 
+  --help                        Display this help and exit
+```
+
+Example:
+```bash
+cd gobblin-dist
+bin/gobblin-compaction.sh --type hive --conf compaction.properties
+```
+
+The log4j configuration is read from `conf/log4j-compaction.xml`.
+Please note that in the case of a Hive compaction, for drop-table queries (`DROP TABLE IF EXISTS <tablename>`) the Hive JDBC client will throw `NoSuchObjectException` if the table doesn't exist. This is normal and such exceptions should be ignored.
diff --git a/gobblin-website/docs/user-guide/Config-Management.md b/gobblin-website/docs/user-guide/Config-Management.md
new file mode 100644
index 0000000..1500f8c
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Config-Management.md
@@ -0,0 +1,140 @@
+---
+title: Config Management
+sidebar_label: Config Management
+
+---
+
+# Introduction
+There are multiple challenges in dataset configuration management in the context of ETL data processing, as ETL infrastructure employs multi-stage processing flows to ingest and publish data on HDFS. Here are some example types of datasets and types of processing:
+
+* OLTP Snapshots: ingest, publishing, replication, retention management, compliance post-processing
+* OLTP Increments: ingest, publishing, replication, roll-up, compaction, retention management
+* Streaming data: ingest, publishing, roll-up of streaming data, retention management, compliance post-processing
+* Opaque (derived) data: replication, retention management
+
+A typical dataset could be a database table, a Kafka topic, etc. Currently, the customization of dataset processing is typically achieved through file/directory blacklists/whitelists in job/flow-level configurations. This approach suffers from a number of issues:
+
+* Dataset unaware - control is done through low-level file/directory wildcards which can be hard to understand for more complex data layouts. 
+* Difficulty in using/applying policies, i.e. applying the same configuration settings to a large number of datasets.
+* Non-intuitive - the use of blacklists and whitelists can lead to properties whose effect is not always clear. 
+* A large potential of inconsistencies across different jobs/flows.
+* Lack of version control
+* Lack of easy, aggregated view of the setup/configuration for a given dataset across all flows (including consumer access) 
+
+We want a new way to customize the processing of each dataset (enabling/disabling certain types of processing, specific SLAs, access restrictions, retention policies, etc.) without the previously mentioned problems.
+
+# Dataset Config Management Requirement
+Design a backend and a flexible client library for storing, managing and accessing configuration that can be used to customize the processing of thousands of datasets across multiple systems/flows.
+
+### Data Model
+
+* (Dataset) configs are identified by a string config key
+* Each config key is mapped to a config object (a collection of properties)
+* The config object should be extensible, i.e. we should be able to add properties with arbitrary names
+* Hierarchical system for overrides
+	* Global default values
+	* Children datasets override/inherit parent dataset configs
+	* Ability to group properties that are specific for a group of datasets (aka tags). For example, we should be able to tag a group of Kafka datasets as "High-priority" and associate specific configuration properties with these settings.
+	* Tags can be applied to both datasets and other tags.
+* Support for references to other properties (parameter expansion)
+* Late expansion of references (at time of access)
+
+### Versioning
+
+* Generated configs have a monotonically increasing version number
+* Generated configs have a date of generation
+* Once published configurations are immutable
+* Easy rollback to a previous version
+* Even if rolled back, a configuration is still available for processes that already use it
+* Audit traces about changes to configuration
+
+### Client library
+
+* Always loads the latest (non-rolled-back) configuration version (as of the time of initialization). Once the version is fixed, the same version should be used for the remainder of the processing. The consistency is needed only within a process.
+* Non-requirement: cross-process version consistency. The application needs to enforce consistency across processes if necessary, e.g. by copying the config to a stable location.
+* Ability to list the tags (policies) associated with a dataset. 
+* Ability to list datasets associated with a tag. For example, we should be able to have a flow which can discover and process only "High-priority" datasets.
+* Debug info how values were derived in generated configs (e.g. from where a property value was inherited)
+
+### Config Store
+
+* Each config store is represented by an URI path
+* The URI path is significant so that configs can be associated at every level. For example, for the dataset URI hdfs_store:/data/databases/DB/Table, we should be able to associate config at every level: /data/databases/DB/Table, /data/databases/DB/, /data/databases/, etc.
+
+# Current Dataset Config Management Implementation
+
+At a very high-level, we extend [typesafe config](https://github.com/typesafehub/config) with:
+
+* Support for logical include URIs
+* Abstraction of a Config Store
+* Config versioning
+* Ability to traverse the "import" relationships
+
+### Data model
+
+**Config key (configuration node) / config value**
+
+For our use cases, we can define each configuration node per data set. All the configuration related to that dataset are specified together.
+
+Essentially, the system provides a mapping from a config key to a config object. Each config key is represented through a URI. The config object is a map from property name to a property value. We refer to this as own config (object) and refer to it through the function own_config(K, property_name) = property_value.
+
+A config key K can import one or more config keys I1, I2, ... . The config key K will inherit any properties from I1, I2, … that are not defined in K. The inheritance is resolved in the order of the keys I1, I2, … etc., i.e. the property will be resolved to the value in the last one that defines the property. This is similar to including configs in typesafe config. We will refer to the resulting configuration as the resolved config (object) and denote it through the function resolved_config(K, propert [...]
+
+We also use the path in the config key URI for implicit tagging. For example, `/data/tracking/TOPIC` implicitly imports `/data/tracking/`, which implicitly imports `/data/`, which implicitly imports `/`. Note that all of these URIs are themselves config keys, so each path level implicitly indicates an import. For a given config key, all implicit imports come before the explicit imports, i.e. they have lower priority in resolution. A typical use case for this implicit importation can be a glo [...]
+ 
+**Tags**
+
+For our use cases, we can define the static tags in a well-known file per dataset.
+
+**Dynamic tags**
+Some tags cannot be applied statically at "compile" time. For example, cluster-specific tags depend on the environment where the client application runs. We will support such tags by allowing the use of a limited number of variables when importing another config key. For example, such a variable can be "local_cluster.name". Then, importing /data/tracking/${local_cluster.name} can provide cluster-specific overrides.
+
+**Config Store**
+The configuration is partitioned into a number of Config Stores. Each Config Store is:
+
+* mapped to a unique URI scheme;
+* responsible for managing the mapping of config keys (represented through URIs with the Config Store scheme) to unresolved configs;
+
+![Config store data model](../../static/img/configStoreDataModel.png)
+
+### Client application
+The client application interacts with the system using the **ConfigClient** API. The ConfigClient maintains a set of **ConfigStoreAccessor** objects which interact through the **ConfigStore** API with the appropriate ConfigStore implementation, depending on the scheme of the ConfigStore URI. There can be a native implementation of the API like the **HadoopFS ConfigStore**, or an adapter to an existing config/metadata store like the Hive MetaStore, etc.
+
+![Config store client api](../../static/img/configStoreClientApi.png)
+
+### File System layout
+
+1. All configurations in one configuration store reside in its ROOT directory
+2. A _CONFIG_STORE file in the ROOT directory (identification file for the configuration store)
+3. One or multiple version directories under ROOT
+4. In each version, each directory represents one configuration node
+	* In each directory, the main.conf file specifies the configuration for that node
+	* In each directory, the includes file specifies the import links
+
+### Example of a config store
+
+```
+  ROOT  
+  ├── _CONFIG_STORE (contents = latest non-rolled-back version)  
+  └── 1.0.53	       (version directory)  
+    ├── data
+    │   └── tracking
+    │           ├── TOPIC
+    │           │   ├── includes		     (imports links)
+    │           │   └── main.conf		(configuration file)
+    │           ├── includes
+    │           └── main.conf
+    └── tags
+        ├── tracking
+        │   └── retention
+        │       └── LONG
+        │       │   ├── includes
+        │       │   └── main.conf
+        │       └── main.conf
+        └── acl
+            └── restricted
+                ├── main.conf
+                └── secdata
+                    ├── includes
+                    └── main.conf
+```
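+For illustration, the `includes` and `main.conf` files for a node such as `data/tracking/TOPIC` might contain something along these lines (the tag path and property names are hypothetical examples, not part of the store format itself):
+
+```
+# data/tracking/TOPIC/includes -- one imported config key per line;
+# explicit imports take priority over the implicit path-based imports
+tags/tracking/retention/LONG
+
+# data/tracking/TOPIC/main.conf -- typesafe-config (HOCON) properties owned by this node
+dataset.owner="tracking-team"
+retention.days=${retention.default.days}
+```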
diff --git a/gobblin-website/docs/user-guide/Configuration-Properties-Glossary.md b/gobblin-website/docs/user-guide/Configuration-Properties-Glossary.md
new file mode 100644
index 0000000..1a466f6
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Configuration-Properties-Glossary.md
@@ -0,0 +1,476 @@
+---
+title: Configuration Glossary
+sidebar_label: Configuration Glossary
+
+---
+
+Configuration properties are key/value pairs that are set in text files. They include system properties that control how Gobblin will pull data, and control what source Gobblin will pull the data from. Configuration files end in some user-specified suffix (by default text files ending in `.pull` or `.job` are recognized as config files, although this is configurable). Each file represents some unit of work that needs to be done in Gobblin. For example, there will typically be a separate [...]
+  
+The first section of this document contains all the required properties needed to run a basic Gobblin job. The rest of the document is dedicated to other properties that can be used to configure Gobblin jobs. The description of each configuration parameter will often refer to core Gobblin concepts and terms. If any of these terms are confusing, check out the [Gobblin Architecture](../Gobblin-Architecture) page for a more detailed explanation of how Gobblin works. The GitHub repo also cont [...]
+
+Gobblin also allows you to specify a global configuration file that contains common properties that are shared across all jobs. The [Job Launcher Properties](#Job-Launcher-Properties) section has more information on how to specify a global properties file.  
+
+# Table of Contents
+* [Properties File Format](#Properties-File-Format)
+* [Creating a Basic Properties File](#Creating-a-Basic-Properties-File)   
+* [Job Launcher Properties](#Job-Launcher-Properties)  
+    * [Common Job Launcher Properties](#Common-Launcher-Properties)  
+    * [SchedulerDaemon Properties](#SchedulerDaemon-Properties)  
+    * [CliMRJobLauncher Properties](#CliMRJobLauncher-Properties)  
+    * [AzkabanJobLauncher Properties](#AzkabanJobLauncher-Properties)  
+* [Job Type Properties](#Job-Type-Properties)  
+    * [Common Job Type Properties](#Common-Job-Type-Properties)  
+    * [LocalJobLauncher Properties](#LocalJobLauncher-Properties)  
+    * [MRJobLauncher Properties](#MRJobLauncher-Properties)  
+* [Retry Properties](#Retry-Properties)
+* [Task Execution Properties](#Task-Execution-Properties)  
+* [State Store Properties](#State-Store-Properties)  
+* [Metrics Properties](#Metrics-Properties)  
+* [Email Alert Properties](#Email-Alert-Properties)  
+* [Source Properties](#Source-Properties)  
+    * [Common Source Properties](#Common-Source-Properties)  
+    * [Distcp CopySource Properties](#Distcp-CopySource-Properties)
+        * [RecursiveCopyableDataset Properties](#RecursiveCopyableDataset-Properties)
+        * [DistcpFileSplitter Properties](#DistcpFileSplitter-Properties)
+        * [WorkUnitBinPacker Properties](#WorkUnitBinPacker-Properties)
+    * [QueryBasedExtractor Properties](#QueryBasedExtractor-Properties) 
+        * [JdbcExtractor Properties](#JdbcExtractor-Properties)  
+    * [FileBasedExtractor Properties](#FileBasedExtractor-Properties)  
+        * [SftpExtractor Properties](#SftpExtractor-Properties)  
+* [Converter Properties](#Converter-Properties)
+    * [CsvToJsonConverter Properties](#CsvToJsonConverter-Properties)    
+    * [JsonIntermediateToAvroConverter Properties](#JsonIntermediateToAvroConverter-Properties)
+    * [JsonStringToJsonIntermediateConverter Properties](#JsonStringToJsonIntermediateConverter-Properties)
+    * [AvroFilterConverter Properties](#AvroFilterConverter-Properties)  
+    * [AvroFieldRetrieverConverter Properties](#AvroFieldRetrieverConverter-Properties)  
+    * [AvroFieldsPickConverter Properties](#AvroFieldsPickConverter-Properties)  
+    * [AvroToJdbcEntryConverter Properties](#AvroToJdbcEntryConverter-Properties)  
+* [Fork Properties](#Fork-Properties)
+* [Quality Checker Properties](#Quality-Checker-Properties)  
+* [Writer Properties](#Writer-Properties)  
+* [Data Publisher Properties](#Data-Publisher-Properties)  
+* [Generic Properties](#Generic-Properties)  
+* [FileBasedJobLock Properties](#FileBasedJobLock-Properties)
+* [ZookeeperBasedJobLock Properties](#ZookeeperBasedJobLock-Properties)
+* [JDBC Writer Properties](#JdbcWriter-Properties)
+
+# Properties File Format <a name="Properties-File-Format"></a>
+
+Configuration properties files follow the [Java Properties text file format](http://docs.oracle.com/javase/7/docs/api/java/util/Properties.html#load(java.io.Reader)). Further, file includes and variable expansion/interpolation as defined in [Apache Commons Configuration](http://commons.apache.org/proper/commons-configuration/userguide_v1.10/user_guide.html) are also supported.
+
+Example:
+
+* common.properties
+
+```properties
+    writer.staging.dir=/path/to/staging/dir/
+    writer.output.dir=/path/to/output/dir/
+```
+* my-job.properties
+
+```properties    
+    include=common.properties
+    
+    job.name=MyFirstJob
+```
+
+# Creating a Basic Properties File <a name="Creating-a-Basic-Properties-File"></a>
+In order to create a basic configuration properties file, there is a small set of required properties that need to be set. The following properties are required to run any Gobblin job:
+
+* `job.name` - Name of the job  
+* `source.class` - Fully qualified path to the Source class responsible for connecting to the data source  
+* `writer.staging.dir` - The directory each task will write staging data to  
+* `writer.output.dir` - The directory each task will commit data to  
+* `data.publisher.final.dir` - The final directory where all the data will be published
+* `state.store.dir` - The directory where state-store files will be written  
+
+For more information on each property, check out the comprehensive list below.  
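+Putting these together, a minimal job file might look like the following sketch (the source class is the Wikipedia example source referenced later in this document; the paths are placeholders):
+
+```properties
+job.name=MyFirstJob
+source.class=org.apache.gobblin.example.wikipedia.WikipediaSource
+writer.staging.dir=/tmp/gobblin/task-staging
+writer.output.dir=/tmp/gobblin/task-output
+data.publisher.final.dir=/tmp/gobblin/job-output
+state.store.dir=/tmp/gobblin/state-store
+```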
+
+If only these properties are set, then by default, Gobblin will run in Local mode, as opposed to running on Hadoop M/R. This means Gobblin will write Avro data to the local filesystem. In order to write to HDFS, set the `writer.fs.uri` property to the URI of the HDFS NameNode that data should be written to. Since the default version of Gobblin writes data in Avro format, the writer expects Avro records to be passed to it. Thus, any data pulled from an external source must be converted to [...]
+
+The `source.class` property is one of the most important properties in Gobblin. It specifies what Source class to use. The Source class is responsible for determining what work needs to be done during each run of the job, and specifies what Extractor to use in order to read over each sub-unit of data. Examples of Source classes are [WikipediaSource](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/wikipedia/WikipediaSource.java) and [ [...]
+
+Typically, Gobblin jobs will be launched using the launch scripts in the `bin` folder. These scripts allow jobs to be launched on the local machine (e.g. SchedulerDaemon) or on Hadoop (e.g. CliMRJobLauncher). Check out the Job Launcher section below to see the configuration differences between the launch modes. The [Deployment](Gobblin-Deployment) page also has more information on the different ways a job can be launched.
+
+# Job Launcher Properties <a name="Job-Launcher-Properties"></a>
+Gobblin jobs can be launched and scheduled in a variety of ways. They can be scheduled via a Quartz scheduler or through [Azkaban](https://github.com/azkaban/azkaban). Jobs can also be run without a scheduler via the Command Line. For more information on launching Gobblin jobs, check out the [Deployment](Gobblin-Deployment) page.
+
+## Common Job Launcher Properties <a name="Common-Launcher-Properties"></a>
+These properties are common to both the Job Launcher and the Command Line.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `job.name` | The name of the job to run. This name must be unique within a single Gobblin instance. | Yes | None |
+| `job.group` | A way to group logically similar jobs together. | No | None |
+| `job.description` | A description of what the jobs does. | No | None |
+| `job.lock.enabled` | If set to true, job locks are enabled; if set to false, they are disabled. | No | True |
+| `job.lock.type` | The fully qualified name of the JobLock class to run. The JobLock is responsible for ensuring that only a single instance of a job runs at a time. <br/><br/> Allowed values: [gobblin.runtime.locks.FileBasedJobLock](#FileBasedJobLock-Properties), [gobblin.runtime.locks.ZookeeperBasedJobLock](#ZookeeperBasedJobLock-Properties) | No | `gobblin.runtime.locks.FileBasedJobLock` |
+| `job.runonce` | A boolean specifying whether the job will be run only once or multiple times. If set to true, the job will only be run once even if a job.schedule is specified. If set to false and a job.schedule is specified, it will run according to the schedule. If set to false and no job.schedule is specified, it will run only once. | No | False |
+| `job.disabled` | Whether the job is disabled or not. If set to true, then Gobblin will not run this job. | No | False |
+
+## SchedulerDaemon Properties <a name="SchedulerDaemon-Properties"></a>
+This class is used to schedule Gobblin jobs on Quartz. The job can be launched via the command line, and takes in the location of a global configuration file as a parameter. This configuration file should have the property `jobconf.dir` in order to specify the location of all the `.job` or `.pull` files. Another core difference is that the global configuration file for the SchedulerDaemon must specify the following properties:
+
+* `writer.staging.dir`  
+* `writer.output.dir`  
+* `data.publisher.final.dir`  
+* `state.store.dir`  
+
+They should not be set in individual job files, as they are system-level parameters.
+For more information on how to set the configuration parameters for jobs launched through the SchedulerDaemon, check out the [Deployment](Gobblin-Deployment) page.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `job.schedule` | Cron-Based job schedule. This schedule only applies to jobs that run using Quartz. | No | None |
+| `jobconf.dir` | When running in local mode, Gobblin will check this directory for any configuration files. Each configuration file should correspond to a separate Gobblin job, and each one should end in a suffix specified by the jobconf.extensions parameter. | No | None |
+| `jobconf.extensions` | Comma-separated list of supported job configuration file extensions. When running in local mode, Gobblin will only pick up job files ending in these suffixes. | No | pull,job |
+| `jobconf.monitor.interval` | Controls how often Gobblin checks the jobconf.dir for new configuration files, or for configuration file updates. The parameter is measured in milliseconds. | No | 300000 |
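+For example, a global configuration file passed to the SchedulerDaemon might look like this sketch (all paths are placeholders):
+
+```properties
+# Directory the scheduler monitors for .pull/.job files
+jobconf.dir=/path/to/job-conf
+jobconf.monitor.interval=60000
+
+# System-level directories shared by all jobs
+writer.staging.dir=/path/to/task-staging
+writer.output.dir=/path/to/task-output
+data.publisher.final.dir=/path/to/job-output
+state.store.dir=/path/to/state-store
+```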
+
+## CliMRJobLauncher Properties <a name="CliMRJobLauncher-Properties"></a>
+There are no configuration parameters specific to CliMRJobLauncher. This class is used to launch Gobblin jobs on Hadoop from the command line; the jobs are not scheduled. Common properties are set using the `--sysconfig` option when launching jobs via the command line. For more information on how to set the configuration parameters for jobs launched through the command line, check out the [Deployment](Gobblin-Deployment) page.
+  
+## AzkabanJobLauncher Properties <a name="AzkabanJobLauncher-Properties"></a>
+There are no configuration parameters specific to AzkabanJobLauncher. This class is used to schedule Gobblin jobs on Azkaban. Common properties can be set through Azkaban by creating a `.properties` file; check out the [Azkaban Documentation](http://azkaban.github.io/) for more information. For more information on how to set the configuration parameters for jobs scheduled through Azkaban, check out the [Deployment](Gobblin-Deployment) page.
+
+# Job Type Properties <a name="Job-Type-Properties"></a>
+## Common Job Type Properties <a name="Common-Job-Type-Properties"></a>
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `launcher.type` | Job launcher type; one of LOCAL, MAPREDUCE, YARN. LOCAL mode runs on a single machine (LocalJobLauncher), MAPREDUCE runs on a Hadoop cluster (MRJobLauncher), and YARN runs on a YARN cluster (not implemented yet). | No | LOCAL |
+
+## LocalJobLauncher Properties <a name="LocalJobLauncher-Properties"></a>
+There are no configuration parameters specific to LocalJobLauncher. The LocalJobLauncher launches a Gobblin job on a single machine; if launcher.type is set to LOCAL, this class will be used to launch the job.
+The following properties are required by the MRJobLauncher class.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `framework.jars` | Comma-separated list of jars the Gobblin framework depends on. These jars will be added to the classpath of the job, and to the classpath of any containers the job launches. | No | None |
+| `job.jars` | Comma-separated list of jar files the job depends on. These jars will be added to the classpath of the job, and to the classpath of any containers the job launches. | No | None |
+| `job.hdfs.jars` | Comma-separated list of jar files the job depends on located in HDFS. These jars will be added to the classpath of the job, and to the classpath of any containers the job launches. | No | None |
+| `job.local.files` | Comma-separated list of local files the job depends on. These files will be available to any map tasks that get launched via the DistributedCache. | No | None |
+| `job.hdfs.files` | Comma-separated list of files on HDFS the job depends on. These files will be available to any map tasks that get launched via the DistributedCache. | No | None |
+
+## MRJobLauncher Properties <a name="MRJobLauncher-Properties"></a>
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `mr.job.root.dir` | Working directory for a Gobblin Hadoop MR job. Gobblin uses this to write intermediate data, such as the workunit state files that are used by each map task. This has to be a path on HDFS. | Yes | None |
+| `mr.job.max.mappers` | Maximum number of mappers to use in a Gobblin Hadoop MR job. If no explicit limit is set then a map task for each workunit will be launched. If the value of this property is less than the number of workunits created, then each map task will run multiple Gobblin tasks. | No | None |
+| `mr.include.task.counters` | Whether to include task-level counters in the set of counters reported as Hadoop counters. Hadoop imposes a system-level limit (which defaults to 120) on the number of counters, so a Gobblin MR job may easily go beyond that limit if the job has a large number of tasks and each task has a few counters. This property gives users an option to exclude task-level counters to avoid going over that limit. | Yes | False | 
+
+# Retry Properties <a name="Retry-Properties"></a>
+Properties that control how tasks and jobs get retried on failure.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `workunit.retry.enabled` | Whether retries of failed work units across job runs are enabled or not. | No | True |
+| `workunit.retry.policy` | Work unit retry policy, can be one of {always, never, onfull, onpartial}. | No | always |
+| `task.maxretries` | Maximum number of task retries. A task will be re-tried this many times before it is considered a failure. | No | 5 |
+| `task.retry.intervalinsec` | Interval in seconds between task retries. The interval increases linearly with each retry. For example, if the first interval is 300 seconds, then the second one is 600 seconds, etc. | No | 300 |
+| `job.max.failures` | Maximum number of failures before an alert email is triggered. | No | 1 |
+
+# Task Execution Properties <a name="Task-Execution-Properties"></a>
+These properties control how tasks get executed for a job. Gobblin uses thread pools in order to execute the tasks for a specific job. In local mode there is a single thread pool per job that executes all the tasks for a job. In MR mode there is a thread pool for each map task (or container), and all Gobblin tasks assigned to that mapper are executed in that thread pool.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `taskexecutor.threadpool.size` | Size of the thread pool used by the task executor for task execution. Each task executor will spawn this many threads to execute any Tasks it has been allocated. | No | 10 |
+| `tasktracker.threadpool.coresize` | Core size of the thread pool used by task tracker for task state tracking and reporting. | No | 10 |
+| `tasktracker.threadpool.maxsize` | Maximum size of the thread pool used by task tracker for task state tracking and reporting. | No | 10 |
+| `taskretry.threadpool.coresize` | Core size of the thread pool used by the task executor for task retries. | No | 2 |
+| `taskretry.threadpool.maxsize` | Maximum size of the thread pool used by the task executor for task retries. | No | 2 |
+| `task.status.reportintervalinms` | Task status reporting interval in milliseconds. | No | 30000 |
+
+# State Store Properties <a name="State-Store-Properties"></a>
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `state.store.dir` | Root directory where job and task state files are stored. The state-store is used by Gobblin to track state between different executions of a job. All state-store files will be written to this directory. | Yes | None |
+| `state.store.fs.uri` | File system URI for file-system-based state stores. | No | file:/// |
+
+# Metrics Properties <a name="Metrics-Properties"></a>
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `metrics.enabled` | Whether metrics collecting and reporting are enabled or not. | No | True |
+| `metrics.report.interval` | Metrics reporting interval in milliseconds. | No | 60000 |
+| `metrics.log.dir` | The directory where metric files will be written to. | No | None |
+| `metrics.reporting.file.enabled` | A boolean indicating whether or not metrics should be reported to a file. | No | True |
+| `metrics.reporting.jmx.enabled` | A boolean indicating whether or not metrics should be exposed via JMX. | No | False |
+
+# Email Alert Properties <a name="Email-Alert-Properties"></a>
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `email.alert.enabled` | Whether alert emails are enabled or not. Email alerts are only sent out when jobs fail consecutively job.max.failures number of times. | No | False |
+| `email.notification.enabled` | Whether job completion notification emails are enabled or not. Notification emails are sent whenever the job completes, regardless of whether it failed or not. | No | False |
+| `email.host` | Host name of the email server. | Yes, if email notifications or alerts are enabled. | None |
+| `email.smtp.port` | SMTP port number. | Yes, if email notifications or alerts are enabled. | None |
+| `email.user` | User name of the sender email account. | No | None |
+| `email.password` | User password of the sender email account. | No | None |
+| `email.from` | Sender email address. | Yes, if email notifications or alerts are enabled. | None |
+| `email.tos` | Comma-separated list of recipient email addresses. | Yes, if email notifications or alerts are enabled. | None |
+
+# Source Properties <a name="Source-Properties"></a>
+## Common Source Properties <a name="Common-Source-Properties"></a>
+These properties are common properties that are used among different Source implementations. Depending on what source class is being used, these parameters may or may not be necessary. These parameters are not tied to a specific source, and thus can be used in new source classes.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `source.class` | Fully qualified name of the Source class. For example, `org.apache.gobblin.example.wikipedia.WikipediaSource` | Yes | None |
+| `source.entity` | Name of the source entity that needs to be pulled from the source. The parameter represents a logical grouping of data that needs to be pulled from the source. Often this logical grouping comes in the form of a database table, a source topic, etc. In many situations, such as when using the QueryBasedExtractor, it will be the name of the table that needs to be pulled from the source. | Required for QueryBasedExtractors, FileBasedExtractors. | None |
+| `source.timezone` | Timezone of the data being pulled in by the extractor. Examples include "PST" or "UTC". | Required for QueryBasedExtractors | None |
+| `source.max.number.of.partitions` | Maximum number of partitions to split this current run across. Only used by the QueryBasedSource and FileBasedSource. | No | 20 |
+| `source.skip.first.record` | True if you want to skip the first record of each data partition. Only used by the FileBasedExtractor. | No | False |
+| `extract.namespace` | Namespace for the extract data. The namespace will be included in the default file name of the outputted data. | No | None |
+| `source.conn.use.proxy.url` | The URL of the proxy to connect to when connecting to the source. This parameter is only used for SFTP and REST sources. | No | None |
+| `source.conn.use.proxy.port` | The port of the proxy to connect to when connecting to the source. This parameter is only used for SFTP and REST sources. | No | None |
+| `source.conn.username` | The username to authenticate with the source. This parameter is only used for SFTP and JDBC sources. | No | None |
+| `source.conn.password` | The password to use when authenticating with the source. This parameter is only used for JDBC sources. | No | None |
+| `source.conn.host` | The name of the host to connect to. | Required for SftpExtractor, MySQLExtractor, OracleExtractor, SQLServerExtractor and TeradataExtractor. | None |
+| `source.conn.rest.url` | URL to connect to for REST requests. This parameter is only used for the Salesforce source. | No | None |
+| `source.conn.version` | Version number of communication protocol. This parameter is only used for the Salesforce source. | No | None |
+| `source.conn.timeout` | The timeout set for connecting to the source in milliseconds. | No | 500000 |
+| `source.conn.port` | The value of the port to connect to. | Required for SftpExtractor, MySQLExtractor, OracleExtractor, SQLServerExtractor and TeradataExtractor. | None |
+| `source.conn.sid` | The Oracle System ID (SID) that identifies the database to connect to. | Required for OracleExtractor. | None |
+| `extract.table.name` | Table name to use in Hadoop if it differs from the source table name. | No | Source table name  |
+| `extract.is.full` | True if this pull should treat the data as a full dump of table from the source, false otherwise. | No | false |
+| `extract.delta.fields` | List of columns that will be used as the delta field for the data. | No | None |
+| `extract.primary.key.fields ` | List of columns that will be used as the primary key for the data. | No | None |
+| `extract.pull.limit` | This limits the number of records read by Gobblin. In Gobblin's extractor the readRecord() method is expected to return records until there are no more to pull, in which case it returns null. This parameter limits the number of times readRecord() is executed. This parameter is useful for pulling a limited sample of the source data for testing purposes. | No | Unbounded |
+| `extract.full.run.time` | TODO | TODO | TODO |
+
+
+## Distcp CopySource Properties <a name="Distcp-CopySource-Properties"></a>
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `gobblin.copy.simulate` | Performs the copy file listing but does not execute the actual copy. | No | False |
+| `gobblin.copy.includeEmptyDirectories` | Whether to include empty directories from the source in the copy. | No | False |
+
+### RecursiveCopyableDataset Properties <a name="RecursiveCopyableDataset-Properties"></a>
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `gobblin.copy.recursive.deleteEmptyDirectories` | Whether to delete newly empty directories found, up to the dataset root.| No | False |
+| `gobblin.copy.recursive.delete` | Whether to delete files in the target that don't exist in the source. | No | False | 
+| `gobblin.copy.recursive.update` | Will update files that are different between the source and target, and skip files already in the target. | No | False |
+
+### DistcpFileSplitter Properties <a name="DistcpFileSplitter-Properties"></a>
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `gobblin.copy.split.enabled` | Will split files into block level granularity work units, which can be copied independently, then merged back together before publishing. To actually achieve splitting, the max split size property also needs to be set. | No | False| 
+| `gobblin.copy.file.max.split.size` | If splitting is enabled, the split size (in bytes) for the block level work units is calculated based on rounding down the value of this property to the nearest integer multiple of the block size. If the value of this property is less than the block size, it gets adjusted up. | No | Long.MAX_VALUE |
+
+### WorkUnitBinPacker Properties <a name="WorkUnitBinPacker-Properties"></a>
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `gobblin.copy.binPacking.maxSizePerBin` | Limits the maximum weight that can be packed into a multi work unit produced from bin packing. A value of 0 means packing is not done. | No | 0 |
+| `gobblin.copy.binPacking.maxWorkUnitsPerBin` | Limits the maximum number/amount of work units that can be packed into a multi work unit produced from bin packing. | No | 50 |
+
+## QueryBasedExtractor Properties <a name="QueryBasedExtractor-Properties"></a>
+The following table lists the query based extractor configuration properties.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `source.querybased.watermark.type`  | The format of the watermark that is used when extracting data from the source. Possible types are timestamp, date, hour, simple. | Yes | timestamp | 
+| `source.querybased.start.value`  | Value for the watermark to start pulling data from, also the default watermark if the previous watermark cannot be found in the old task states. | Yes | None |
+| `source.querybased.partition.interval`  | Number of hours to pull in each partition. | No | 1 | 
+| `source.querybased.hour.column`  | Delta column with hour for hourly extracts (Ex: hour_sk) | No | None |
+| `source.querybased.skip.high.watermark.calc`  | If true, skips the high watermark calculation in the source and uses the partition's upper range as the high watermark instead of getting it from the source. | No | False | 
+| `source.querybased.query`  | The query that the extractor should execute to pull data. | No | None |
+| `source.querybased.hourly.extract`  | True if hourly extract is required. | No | False | 
+| `source.querybased.extract.type`  | "snapshot" for incremental dimension pulls; "append_daily", "append_hourly" and "append_batch" for append data; "append_batch" is used for data with sequence numbers as watermarks. | No | None |
+| `source.querybased.end.value`  | The high watermark which this entire job should pull up to. If this is not specified, the entire table is pulled. | No | None |
+| `source.querybased.append.max.watermark.limit`  | Max limit of the high watermark for append data, e.g. CURRENT_DATE - X or CURRENT_HOUR - X where X >= 1. | No | CURRENT_DATE for daily extract, CURRENT_HOUR for hourly extract | 
+| `source.querybased.is.watermark.override`  | True if this pull should override previous watermark with start.value and end.value. False otherwise. | No | False | 
+| `source.querybased.low.watermark.backup.secs`  | Number of seconds to back up from the previous high watermark, in order to cover late-arriving data. Ex: set to 3600 to cover 1 hour of late data. | No | 0 | 
+| `source.querybased.schema`  | Database name | No | None |
+| `source.querybased.is.specific.api.active`  | True if this pull needs to use source specific apis instead of standard protocols.  Ex: Use salesforce bulk api instead of rest api | No | False | 
+| `source.querybased.skip.count.calc` | A boolean, if true then the QueryBasedExtractor will skip the source count calculation. | No | False | 
+| `source.querybased.fetch.size` | This parameter is currently only used in the JDBCExtractor. The JDBCExtractor will process this many records from the JDBC ResultSet at a time. It will then take these records and return them to the rest of the Gobblin flow so that they can get processed by the rest of the Gobblin components. | No  | 1000 |
+| `source.querybased.is.metadata.column.check.enabled` | When a query is specified in the configuration file, it is possible a user accidentally adds in a column name that does not exist on the source side. By default, this parameter is set to false, which means that if a column is specified in the query and it does not exist in the source data set, Gobblin will just skip over that column. If it is set to true, Gobblin will actually take the config specified column and check to see if it [...]
+| `source.querybased.is.compression.enabled` | A boolean specifying whether or not compression should be enabled when pulling data from the source. This parameter is only used for MySQL sources. If set to true, the MySQL server will send compressed data to Gobblin. | No | False |
+| `source.querybased.jdbc.resultset.fetch.size` | The number of rows to pull through JDBC at a time. This is useful when the JDBC ResultSet is too big to fit into memory, so only "x" number of records will be fetched at a time. | No | 1000 |
+
+### JdbcExtractor Properties <a name="JdbcExtractor-Properties"></a>
+The following table lists the jdbc based extractor configuration properties.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `source.conn.driver` | The fully qualified path of the JDBC driver used to connect to the external source. | Yes | None |
+| `source.column.name.case` | An enum specifying whether or not to convert the column names to a specific case before performing a query. Possible values are TOUPPER or TOLOWER. | No | NOCHANGE  |
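+As a rough sketch, a query-based JDBC pull could combine the common source properties and the query-based properties as follows. The source class, driver and connection details below are assumptions for a MySQL source; substitute the classes and values appropriate for your database:
+
+```properties
+source.class=org.apache.gobblin.source.extractor.extract.jdbc.MysqlSource
+source.conn.driver=com.mysql.jdbc.Driver
+source.conn.host=db.example.com
+source.conn.port=3306
+source.conn.username=gobblin
+source.conn.password=changeme
+source.querybased.schema=mydb
+source.entity=my_table
+source.timezone=UTC
+source.querybased.extract.type=snapshot
+source.querybased.watermark.type=timestamp
+extract.delta.fields=last_modified
+extract.primary.key.fields=id
+```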
+
+## FileBasedExtractor Properties <a name="FileBasedExtractor-Properties"></a>
+The following table lists the file based extractor configuration properties.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `source.filebased.data.directory` |  The data directory from which to pull data. | Yes | None |
+| `source.filebased.files.to.pull` |  A list of files to pull - this should be set in the Source class and the extractor will pull the specified files. | Yes   | None |
+| `filebased.report.status.on.count` | The FileBasedExtractor will report its status every time it processes the number of records specified by this parameter. The way it reports status is by logging out how many records it has seen. | No | 10000 |  
+| `source.filebased.fs.uri` | The URI of the filesystem to connect to. | Required for HadoopExtractor. | None |
+| `source.filebased.preserve.file.name` | A boolean; if true, the original file names will be preserved when the files are written out. | No | False |
+| `source.schema` | The schema of the data that will be pulled by the source. | Yes | None |
+
+### SftpExtractor Properties <a name="SftpExtractor-Properties"></a>
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `source.conn.private.key` | File location of the private key used for key based authentication. This parameter is only used for the SFTP source. | Yes | None |
+| `source.conn.known.hosts` | File location of the known hosts file used for key based authentication. | Yes | None |
+
+# Converter Properties <a name="Converter-Properties"></a>
+Properties for Gobblin converters.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `converter.classes` |  Comma-separated list of fully qualified names of the Converter classes. The order is important as the converters will be applied in this order. | No | None |
+
+## CsvToJsonConverter Properties <a name="CsvToJsonConverter-Properties"></a>
+This converter takes in text data separated by a delimiter (converter.csv.to.json.delimiter), and splits the data into a JSON format recognized by JsonIntermediateToAvroConverter.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `converter.csv.to.json.delimiter` | The regex delimiter between fields in CSV based files, only necessary when using the CsvToJsonConverter - e.g. ",", "\t" or some other regex | Yes | None |
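+As an illustration, a job that reads delimited text and writes Avro might chain this converter with the JsonIntermediateToAvroConverter described below (the converter class names are assumptions; check the converter packages in your Gobblin distribution):
+
+```properties
+converter.classes=org.apache.gobblin.converter.csv.CsvToJsonConverter,org.apache.gobblin.converter.avro.JsonIntermediateToAvroConverter
+converter.csv.to.json.delimiter=,
+converter.is.epoch.time.in.seconds=true
+```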
+
+## JsonIntermediateToAvroConverter Properties <a name="JsonIntermediateToAvroConverter-Properties"></a>
+This converter takes in JSON data in a specific schema, and converts it to Avro data.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `converter.avro.date.format` | Source format of the date columns for Avro-related converters. | No | None |
+| `converter.avro.timestamp.format` | Source format of the timestamp columns for Avro-related converters. | No | None |
+| `converter.avro.time.format` | Source format of the time columns for Avro-related converters. | No | None |
+| `converter.avro.binary.charset` | Charset of the binary columns for Avro-related converters. | No | UTF-8 |
+| `converter.is.epoch.time.in.seconds` | A boolean specifying whether an epoch time field in the JSON object is in seconds or not. | Yes | None |
+| `converter.avro.max.conversion.failures` | This converter will fail for this many records before throwing an exception. | No | 0 |
+| `converter.avro.nullify.fields.enabled` | Generate a new avro schema by nullifying fields that previously existed but are not in the current schema. | No | false |
+| `converter.avro.nullify.fields.original.schema.path` | Path of the original avro schema which will be used for merging and nullifying fields. | No | None |
+
+## JsonStringToJsonIntermediateConverter Properties <a name="JsonStringToJsonIntermediateConverter-Properties"></a>
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `gobblin.converter.jsonStringToJsonIntermediate.unpackComplexSchemas` | Parse nested JSON record using source.schema. | No | True |
+
+## AvroFilterConverter Properties <a name="AvroFilterConverter-Properties"></a>
+This converter takes in an Avro record, and filters out records by performing an equality operation on the value of the field specified by converter.filter.field and the value specified in converter.filter.value. It returns the record unmodified if the equality operation evaluates to true, and filters the record out otherwise.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `converter.filter.field` | The name of the field in the Avro record that the converter will filter records on. | Yes | None |
+| `converter.filter.value` | The value that will be used in the equality operation to filter out records. | Yes | None |
+
+## AvroFieldRetrieverConverter Properties <a name="AvroFieldRetrieverConverter-Properties"></a>
+This converter takes a specific field from an Avro record and returns its value.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `converter.avro.extractor.field.path` | The field in the Avro record to retrieve. If it is a nested field, then each level must be separated by a period. | Yes | None |
+
+## AvroFieldsPickConverter Properties <a name="AvroFieldsPickConverter-Properties"></a>
+Unlike AvroFieldRetriever, this converter takes multiple fields from the Avro schema and converts both the schema and the generic record.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `converter.avro.fields` | Comma-separated list of the fields in the Avro record. If it is a nested field, then each level must be separated by a period. | Yes | None |
+
+## AvroToJdbcEntryConverter Properties <a name="AvroToJdbcEntryConverter-Properties"></a>
+Converts an Avro schema and generic record into a JDBC entry schema and data.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `converter.avro.jdbc.entry_fields_pairs` | Maps Avro field name(s) to column names in the underlying JDBC database. The input format is a JSON array of key/value pairs, where the key is the Avro field name and the value is the corresponding JDBC column name. | No | None |
+
+# Fork Properties <a name="Fork-Properties"></a>
+Properties for Gobblin's fork operator.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `fork.operator.class` |  Fully qualified name of the ForkOperator class. | No | `org.apache.gobblin.fork.IdentityForkOperator` |
+| `fork.branches` |  Number of fork branches. | No | 1 |
+| `fork.branch.name.${branch index}` |  Name of a fork branch with the given index, e.g., 0 and 1. | No | fork_${branch index}, e.g., fork_0 and fork_1. |
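+For example, a job that duplicates every record into two identically-processed branches (using the default IdentityForkOperator) might declare the following; the branch names are arbitrary examples:
+
+```properties
+fork.operator.class=org.apache.gobblin.fork.IdentityForkOperator
+fork.branches=2
+fork.branch.name.0=primary_branch
+fork.branch.name.1=backup_branch
+```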
+
+# Quality Checker Properties <a name="Quality-Checker-Properties"></a>
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `qualitychecker.task.policies` | Comma-separated list of fully qualified names of the TaskLevelPolicy classes that will run at the end of each Task. | No | None |
+| `qualitychecker.task.policy.types` | OPTIONAL implies the corresponding class in qualitychecker.task.policies is optional and the Task will still succeed if it fails; FAIL implies that if the corresponding class fails then the Task will fail too. | No | OPTIONAL | 
+| `qualitychecker.row.policies` | Comma-separated list of fully qualified names of the RowLevelPolicy classes that will run on each record. | No | None |
+| `qualitychecker.row.policy.types` | OPTIONAL implies the corresponding class in qualitychecker.row.policies is optional and the Task will still succeed if it fails; FAIL implies that if the corresponding class fails then the Task will fail too; ERR_FILE implies that if the record does not pass the test then the record will be written to an error file. | No | OPTIONAL | 
+| `qualitychecker.row.err.file` | The quality checker will write the current record to the location specified by this parameter, if the current record fails to pass the quality checkers specified by qualitychecker.row.policies; this file will only be written to if the quality checker policy type is ERR_FILE. | No | None |
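+A sketch of how these properties fit together is shown below; the policy class names are placeholders for your own TaskLevelPolicy and RowLevelPolicy implementations:
+
+```properties
+# Task-level policy that is allowed to fail without failing the Task
+qualitychecker.task.policies=com.example.policies.MyRowCountPolicy
+qualitychecker.task.policy.types=OPTIONAL
+
+# Row-level policy that routes failing records to an error file
+qualitychecker.row.policies=com.example.policies.MyNotNullPolicy
+qualitychecker.row.policy.types=ERR_FILE
+qualitychecker.row.err.file=/path/to/quality/err
+```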
+
+# Writer Properties <a name="Writer-Properties"></a>
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `writer.destination.type` | Writer destination type. Can be HDFS, KAFKA, MYSQL or TERADATA | No | HDFS | 
+| `writer.output.format` | Writer output format; currently only Avro is supported. | No | AVRO | 
+| `writer.fs.uri` | File system URI for writer output. | No | file:/// | 
+| `writer.staging.dir` | Staging directory of writer output. All staging data that the writer produces will be placed in this directory, but all the data will be eventually moved to the writer.output.dir. | Yes | None |
+| `writer.output.dir` | Output directory of writer output. All output data that the writer produces will be placed in this directory, but all the data will be eventually moved to the final directory by the publisher. | Yes | None |
+| `writer.builder.class` | Fully qualified name of the writer builder class. | No | `org.apache.gobblin.writer.AvroDataWriterBuilder` |
+| `writer.file.path` | The path where the writer will write its data. Data in this directory will be copied to its final output directory by the DataPublisher. | Yes | None |
+| `writer.file.name` | The name of the file the writer writes to. | Yes | part | 
+| `writer.partitioner.class` | Partitioner used for distributing records into multiple output files. `writer.builder.class` must be a subclass of `PartitionAwareDataWriterBuilder`, otherwise Gobblin will throw an error.  | No | None (will not use partitioner) |
+| `writer.buffer.size` |  Writer buffer size in bytes. This parameter is only applicable for the AvroHdfsDataWriter. | No | 4096 | 
+| `writer.deflate.level` |  Writer deflate level. Deflate is a type of compression for Avro data. | No | 9 | 
+| `writer.codec.type` |  This is used to specify the type of compression used when writing data out. Possible values are NOCOMPRESSION, DEFLATE, SNAPPY. | No | DEFLATE | 
+| `writer.eager.initialization` | This is used to control the writer creation. If the value is set to true, writer is created before records are read. This means an empty file will be created even if no records were read. | No | False | 
+| `writer.parquet.page.size` | The page size threshold | No | 1048576 |
+| `writer.parquet.dictionary.page.size` | The block size threshold. | No | 134217728 |
+| `writer.parquet.dictionary` | To turn dictionary encoding on. | No | true |
+| `writer.parquet.validate` | To turn on validation using the schema. | No | false |
+| `writer.parquet.version` | Version of parquet writer to use. Available versions are v1 and v2. | No | v1 |
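+As an example, a typical HDFS Avro writer configuration might look like the following sketch (the file system URI and paths are placeholders):
+
+```properties
+writer.builder.class=org.apache.gobblin.writer.AvroDataWriterBuilder
+writer.destination.type=HDFS
+writer.output.format=AVRO
+writer.fs.uri=hdfs://namenode.example.com:8020
+writer.staging.dir=/gobblin/task-staging
+writer.output.dir=/gobblin/task-output
+writer.file.path=mydb/my_table
+writer.codec.type=SNAPPY
+```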
+
+# Data Publisher Properties <a name="Data-Publisher-Properties"></a>
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `data.publisher.type` |  The fully qualified name of the DataPublisher class to run. The DataPublisher is responsible for publishing task data once all Tasks have been completed. | Yes | None |
+| `data.publisher.final.dir` |  The final output directory where the data should be published. | Yes | None |
+| `data.publisher.replace.final.dir` | A boolean; if true and the final output directory already exists, then the data will not be committed. If false and the final output directory already exists then it will be overwritten. | Yes | None |
+| `data.publisher.final.name` | The final name of the file that is produced by Gobblin. By default, Gobblin already assigns a unique name to each file it produces. If that default name needs to be overridden then this parameter can be used. Typically, this parameter should be set on a per workunit basis so that file names don't collide. | No | None |
+ 
+# Generic Properties <a name="Generic-Properties"></a>
+These properties are used throughout multiple Gobblin components.
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `fs.uri` | Default file system URI for all file storage; over-writable by more specific configuration properties. | No | file:/// |
+
+# FileBasedJobLock Properties <a name="FileBasedJobLock-Properties"></a>
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `job.lock.dir` | Directory where job locks are stored. Job locks are used by the scheduler to ensure two executions of a job do not run at the same time. If a job is scheduled to run, Gobblin will first check this directory to see if there is a lock file for the job. If there is one, it will not run the job; if there isn't one, it will run the job. | No | None |
+
+# ZookeeperBasedJobLock Properties <a name="ZookeeperBasedJobLock-Properties"></a>
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `zookeeper.connection.string` | The connection string to the ZooKeeper cluster used to manage the lock. | No | localhost:2181 |
+| `zookeeper.session.timeout.seconds` | The zookeeper session timeout. | No | 180 |
+| `zookeeper.connection.timeout.seconds` | The zookeeper connection timeout. | No | 30 |
+| `zookeeper.retry.backoff.seconds` | The amount of time in seconds to wait between retries.  This will increase exponentially when retries occur. | No | 1 |
+| `zookeeper.retry.count.max` | The maximum number of times to retry. | No | 10 |
+| `zookeeper.locks.acquire.timeout.milliseconds` | The amount of time in milliseconds to wait while attempting to acquire the lock. | No | 5000 |
+| `zookeeper.locks.reaper.threshold.seconds` | The threshold in seconds that determines when a lock path can be deleted. | No | 300 |
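+To switch a job from the default file-based lock to the ZooKeeper-based lock, a configuration along these lines could be used (the connection string is a placeholder for your ZooKeeper quorum):
+
+```properties
+job.lock.type=gobblin.runtime.locks.ZookeeperBasedJobLock
+zookeeper.connection.string=zk1.example.com:2181,zk2.example.com:2181
+zookeeper.locks.acquire.timeout.milliseconds=10000
+```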
+
+# JDBC Writer properties <a name="JdbcWriter-Properties"></a>
+Writer (and publisher) that writes to a JDBC database. Configure the two properties below to use the JDBC writer & publisher.
+
+*  writer.builder.class=org.apache.gobblin.writer.JdbcWriterBuilder
+*  data.publisher.type=org.apache.gobblin.publisher.JdbcPublisher
+
+| Name | Description | Required | Default Value |
+| --- | --- | --- | --- |
+| `jdbc.publisher.database_name` | Destination database name | Yes | None | 
+| `jdbc.publisher.table_name` | Destination table name | Yes | None | 
+| `jdbc.publisher.replace_table` | Gobblin will replace the data in destination table. | No | false | 
+| `jdbc.publisher.username` | User name to connect to destination database | Yes | None | 
+| `jdbc.publisher.password` | Password to connect to destination database. Also, accepts encrypted password. | Yes | None | 
+| `jdbc.publisher.encrypt_key_loc` | Location of a key to decrypt an encrypted password | No | None | 
+| `jdbc.publisher.url` | Connection URL | Yes | None | 
+| `jdbc.publisher.driver` | JDBC driver class | Yes | None |  
+| `writer.staging.table` | Users can pass a staging table for Gobblin to use instead of having Gobblin create one (e.g. a user who does not have the create table privilege can pass a staging table for Gobblin to use). | No | None | 
+| `writer.truncate.staging.table` | Truncate staging table if user passed their own staging table via "writer.staging.table". | No | false | 
+| `writer.jdbc.batch_size` | Batch size for Insert operation | No | 30 | 
+| `writer.jdbc.insert_max_param_size` | Maximum number of parameters for JDBC insert operation (for MySQL Writer). | No | 100,000 (MySQL limitation) | 
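+Putting it together, a hypothetical MySQL-backed job might configure the writer and publisher like this (connection details and table names are placeholders):
+
+```properties
+writer.builder.class=org.apache.gobblin.writer.JdbcWriterBuilder
+data.publisher.type=org.apache.gobblin.publisher.JdbcPublisher
+
+jdbc.publisher.url=jdbc:mysql://db.example.com:3306
+jdbc.publisher.driver=com.mysql.jdbc.Driver
+jdbc.publisher.database_name=mydb
+jdbc.publisher.table_name=my_table
+jdbc.publisher.username=gobblin
+jdbc.publisher.password=changeme
+writer.jdbc.batch_size=100
+```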
diff --git a/gobblin-website/docs/user-guide/Docker-Integration.md b/gobblin-website/docs/user-guide/Docker-Integration.md
new file mode 100644
index 0000000..ec49363
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Docker-Integration.md
@@ -0,0 +1,115 @@
+---
+title: Docker Integration
+sidebar_label: Docker Integration
+
+---
+
+# Introduction
+
+Gobblin integrates with Docker by running a Gobblin standalone service inside a Docker container. The Gobblin service inside the container can monitor the host filesystem for new job configuration files, run the jobs, and write the resulting data to the host filesystem. The Gobblin Docker images can be found on Docker Hub at: https://hub.docker.com/u/gobblin/
+
+# Docker
+
+For more information on Docker, including how to install it, check out the documentation at: https://docs.docker.com/
+
+# Docker Repositories
+
+GitHub Actions pushes the latest Docker image to the Apache Docker Hub repository [here](https://hub.docker.com/r/apache/gobblin) from `gobblin-docker/gobblin/alpine-gobblin-latest/Dockerfile`.
+
+To run this image, you will need to pass in the corresponding execution mode. The execution modes can be found [here](https://gobblin.readthedocs.io/en/latest/user-guide/Gobblin-Deployment/).
+
+```bash
+docker pull apache/gobblin
+docker run apache/gobblin --mode <execution mode> <additional args>
+```
+
+For example, to run Gobblin in standalone mode
+```bash
+docker run apache/gobblin --mode standalone
+```
+
+To pass your own configuration to Gobblin standalone, use a docker volume. Due to the nature of the startup script, the volumes
+will need to be declared before the arguments are passed to the execution mode. E.g.
+```bash
+docker run -v <path to local configuration files>:/home/gobblin/conf/standalone apache/gobblin --mode standalone
+```
+
+Before running docker containers, set a working directory for Gobblin jobs:
+
+`export LOCAL_JOB_DIR=<local_gobblin_directory>`
+
+We will use this directory as the [volume](https://docs.docker.com/storage/volumes/) for Gobblin jobs and outputs. Make sure your Docker has the [access](https://docs.docker.com/docker-for-mac/#file-sharing) to this folder. This is the prerequisite for all following example jobs.
+
+### Run the docker image with simple wikipedia jobs
+
+Run these commands to start the docker image:
+
+`docker pull apache/gobblin:latest`
+
+`docker run -v $LOCAL_JOB_DIR:/etc/gobblin-standalone/jobs apache/gobblin:latest --mode standalone`
+
+After the container spins up, put the [wikipedia.pull](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull) in ${LOCAL_JOB_DIR}. You will see the Gobblin daemon pick up the job, and the result output is in ${LOCAL_JOB_DIR}/job-output/.
+
+This example job corresponds to the [getting started guide](https://gobblin.readthedocs.io/en/latest/Getting-Started/). With the docker image, you can focus on the Gobblin functionalities, avoiding the hassle of building a distribution.
+
+### Use Gobblin Standalone on Docker for Kafka and HDFS Ingestion 
+
+* To ingest from Kafka to HDFS with Gobblin, you need to start services for Zookeeper, Kafka and HDFS along with Gobblin. We use Docker [Compose](https://docs.docker.com/compose/) with images contributed to Docker Hub. First, you need to create a [docker-compose.yml](https://github.com/apache/gobblin/blob/master/gobblin-docker/gobblin-recipes/kafka-hdfs/docker-compose.yml) file.
+
+* Second, in the same folder as the yml file, create a [hadoop.env](https://github.com/apache/gobblin/blob/master/gobblin-docker/gobblin-recipes/kafka-hdfs/hadoop.env) file to specify all HDFS-related config (copy the content into your .env file).
+
+* Open a terminal in the same folder, pull and run these docker services:
+
+    `docker-compose -f ./docker-compose.yml pull`
+
+    `docker-compose -f ./docker-compose.yml up`
+    
+    Here we expose Zookeeper at port 2128 and Kafka at 9092, with an auto-created Kafka topic "test". All Hadoop-related configs are specified in the .env file.
+
+* You should see all services running. Now we can push some events into the Kafka topic. Open a terminal from [docker desktop](https://docs.docker.com/desktop/dashboard/) dashboard or [docker exec](https://docs.docker.com/engine/reference/commandline/exec/) to interact with Kafka. Inside the Kafka container terminal:
+
+    `cd /opt/kafka`
+
+    `./bin/kafka-console-producer.sh --broker-list kafka:9092 --topic test`
+
+    You can type messages for the topic “test”, and press ctrl+c to exit.
+
+* Put the [kafka-hdfs.pull](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/kafka-hdfs.pull) in ${LOCAL_JOB_DIR}. The Gobblin daemon will pick up this job and write the result to HDFS.
+
+After the job finishes, open a terminal in the HDFS namenode container:
+
+`hadoop fs -ls /gobblintest/job-output/test/`
+
+You will see the result file in this HDFS folder. You can use this command to verify the content in the text file:
+
+`hadoop fs -cat /gobblintest/job-output/test/<output_file.txt>`
+
+# Run Gobblin as a Service
+
+The goal of GaaS (Gobblin as a Service) is to enable self service so that different users can automatically provision and execute various supported Gobblin applications, limiting the need for development and operations teams to be involved in the provisioning process. You can take a look at our [design details](https://cwiki.apache.org/confluence/display/GOBBLIN/Gobblin+as+a+Service).
+
+### Set working directory
+
+Similar to standalone working directory settings:
+
+`export GAAS_JOB_DIR=<gaas_gobblin_directory>`
+
+`export LOCAL_DATAPACK_DIR=<local_directory_of_templateUris>`
+
+### Start Gobblin as a Service
+
+Run these commands to start the docker image:
+
+`docker run -p 6956:6956 -v $GAAS_JOB_DIR:/etc/gobblin-as-service/jobs -v $LOCAL_DATAPACK_DIR:/etc/templateCatalog apache/gobblin --mode gobblin-as-service`
+
+GaaS will be started, and the service can now be accessed at localhost:6956.
+
+### Interact with GaaS
+
+##### TODO: Add an end-to-end workflow example in GaaS.
+
+# Future Work
+
+* Complete the `gobblin-service` docker guidance so that it serves as a quick-start for GaaS users
+* Implement a simple converter and inject it into the docker service. Create a corresponding doc that guides users in implementing their own logic without needing to tangle with the Gobblin codebase
+* Finish the GitHub Action to automate the docker build
diff --git a/gobblin-website/docs/user-guide/FAQs.md b/gobblin-website/docs/user-guide/FAQs.md
new file mode 100644
index 0000000..131c6db
--- /dev/null
+++ b/gobblin-website/docs/user-guide/FAQs.md
@@ -0,0 +1,81 @@
+---
+title: FAQs
+sidebar_label: FAQs
+---
+
+# Gobblin
+
+## General Questions <a name="General-Questions"></a>
+
+##### What is Gobblin?
+
+Gobblin is a universal ingestion framework. Its goal is to pull data from any source into an arbitrary data store. One major use case for Gobblin is pulling data into Hadoop. Gobblin can pull data from file systems, SQL stores, and data that is exposed by a REST API. See the Gobblin [Home](/docs/index) page for more information.
+
+##### What programming languages does Gobblin support?
+
+Gobblin currently only supports Java 7 and up.
+
+##### Does Gobblin require any external software to be installed?
+
+The machine that Gobblin is built on must have Java installed, and the `$JAVA_HOME` environment variable must be set.
+
+##### What Hadoop versions can Gobblin run on?
+
+Gobblin can only be run on Hadoop 2.x. By default, Gobblin compiles against Hadoop 2.3.0.
+
+##### How do I run and schedule a Gobblin job?
+
+Check out the [Deployment](Gobblin Deployment) page for information on how to run and schedule Gobblin jobs. Check out the [Configuration](Configuration Properties Glossary) page for information on how to set proper configuration properties for a job.
+
+##### How is Gobblin different from Sqoop?
+
+Sqoop's main focus is bulk import and export of data between relational databases and HDFS; it lacks the ETL functionality of data cleansing, data transformation, and data quality checks that Gobblin provides. Gobblin is also capable of pulling from any data source (e.g. file systems, RDBMSs, REST APIs).
+
+## Technical Questions <a name="Technical-Questions"></a>
+
+##### When running on Hadoop, each map task quickly reaches 100 Percent completion, but then stalls for a long time. Why does this happen?
+
+Gobblin currently uses Hadoop map tasks as a container for running Gobblin tasks. Each map task runs 1 or more Gobblin workunits, and the progress of each workunit is not hooked into the progress of each map task. Even though the Hadoop job reports 100% completion, Gobblin is still doing work. See the [Gobblin Deployment](Gobblin Deployment) page for more information.
+
+##### Why does Gobblin on Hadoop stall for a long time between adding files to the DistributedCache and launching the actual job?
+
+Gobblin takes all WorkUnits created by the Source class and serializes each one into a file on Hadoop. These files are read by each map task, and are deserialized into Gobblin Tasks. These Tasks are then run by the map-task. The reason the job stalls is that Gobblin is writing all these files to HDFS, which can take a while especially if there are a lot of tasks to run. See the [Gobblin Deployment](Gobblin Deployment) page for more information.
+
+##### How do I fix `UnsupportedFileSystemException: No AbstractFileSystem for scheme: null`?
+
+This error typically occurs due to Hadoop version conflict issues. If Gobblin is compiled against a specific Hadoop version, but then deployed on a different Hadoop version or installation, this error may be thrown. For example, if you simply compile Gobblin using `./gradlew clean build`, but deploy Gobblin to a cluster with [CDH](https://www.cloudera.com/content/www/en-us/products/apache-hadoop/key-cdh-components.html) installed, you may hit this error.
+
+It is important to realize that the `gobblin-dist.tar.gz` file produced by `./gradlew clean build` will include all the Hadoop jar dependencies; and if one follows the [MR deployment guide](Gobblin-Deployment#Hadoop-MapReduce-Deployment), Gobblin will be launched with these dependencies on the classpath.
+
+To fix this take the following steps:
+
+* Delete all the Hadoop jars from the Gobblin `lib` folder
+* Ensure that the environment variable `HADOOP_CLASSPATH` is set and points to a directory containing the Hadoop libraries for the cluster
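+
+A hedged sketch of what these steps might look like on a cluster node (the jar pattern and library directory below are hypothetical and depend on your distribution):
+
+```bash
+# Remove the Hadoop jars that were bundled into the Gobblin distribution
+rm gobblin-dist/lib/hadoop-*.jar
+
+# Point Gobblin at the cluster's own Hadoop libraries instead
+export HADOOP_CLASSPATH=/usr/lib/hadoop/client/*
+```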
+
+##### How do I compile Gobblin against CDH?
+
+[Cloudera Distributed Hadoop](https://www.cloudera.com/content/www/en-us/products/apache-hadoop/key-cdh-components.html) (often abbreviated as CDH) is a popular Hadoop distribution. Typically, when running Gobblin on a CDH cluster it is recommended that one also compile Gobblin against the same CDH version. Not doing so may cause unexpected runtime behavior. To compile against a specific CDH version simply use the `hadoopVersion` parameter. For example, to compile against version `2.5.0- [...]
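+
+For instance, assuming the version is passed as a standard Gradle project property (`-P`), the build command might look like the following, where `<cdh-hadoop-version>` is a placeholder for the CDH Hadoop version string you need:
+
+```bash
+./gradlew clean build -PhadoopVersion=<cdh-hadoop-version>
+```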
+
+##### Resolve Gobblin-on-MR Exception `IOException: Not all tasks running in mapper attempt_id completed successfully`
+
+This exception usually just means that a Hadoop Map Task running Gobblin Tasks threw some exception. Unfortunately, the exception isn't truly indicative of the underlying problem, all it is really saying is that something went wrong in the Gobblin Task. Each Hadoop Map Task has its own log file and it is often easiest to look at the logs of the Map Task when debugging this problem. There are multiple ways to do this, but one of the easiest ways is to execute `yarn logs -applicationId <ap [...]
+
+##### Gradle Build Fails With `Cannot invoke method getURLs on null object`
+
+Add `-x test` to build the project without running the tests; this will make the exception go away. If one needs to run the tests then make sure [Java Cryptography Extension](https://en.wikipedia.org/wiki/Java_Cryptography_Extension) is installed.
+
+# Gradle
+
+## Technical Questions
+
+#### How do I add a new external dependency?
+
+Say I want to add [`oozie-core-4.2.0.jar`](http://mvnrepository.com/artifact/org.apache.oozie/oozie-core/4.2.0) as a dependency to the `gobblin-scheduler` subproject. I would first open the file `build.gradle` and add the following entry to the `ext.externalDependency` array: `"oozieCore": "org.apache.oozie:oozie-core:4.2.0"`.
+
+Then in the `gobblin-scheduler/build.gradle` file I would add the following line to the dependency block: `compile externalDependency.oozieCore`.
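+
+Putting the two edits together, the changes would look roughly like this (excerpts only; surrounding entries are omitted):
+
+```
+// build.gradle: register the artifact under a short name
+ext.externalDependency = [
+    // ... existing entries ...
+    "oozieCore": "org.apache.oozie:oozie-core:4.2.0",
+]
+
+// gobblin-scheduler/build.gradle: reference it from the subproject
+dependencies {
+    compile externalDependency.oozieCore
+}
+```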
+
+#### How do I add a new Maven Repository to pull artifacts from?
+
+Oftentimes, one may have important artifacts stored in a local or private Maven repository. As of 01/21/2016 Gobblin only pulls artifacts from the following Maven repositories: [Maven Central](http://repo1.maven.org/maven/), [Conjars](http://conjars.org/repo), and [Cloudera](https://repository.cloudera.com/artifactory/cloudera-repos/).
+
+In order to add another Maven repository, modify the `defaultEnvironment.gradle` file and add the new repository using the same pattern as the existing ones.
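+
+For example, a private repository entry (the URL below is hypothetical) might be added next to the existing ones:
+
+```
+// defaultEnvironment.gradle (excerpt)
+repositories {
+    maven {
+        url "https://repo.example.org/maven2"
+    }
+}
+```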
diff --git a/gobblin-website/docs/user-guide/Gobblin-CLI.md b/gobblin-website/docs/user-guide/Gobblin-CLI.md
new file mode 100644
index 0000000..58bda1e
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Gobblin-CLI.md
@@ -0,0 +1,261 @@
+---
+title: Gobblin CLI
+sidebar_label: Gobblin CLI
+---
+
+Gobblin Commands & Execution Modes
+-----------
+
+The Gobblin distribution comes with a script `./bin/gobblin` for all commands and services.
+Here is the usage:  
+
+```text
+Usage:
+gobblin.sh  cli     <cli-command>    <params>
+gobblin.sh  service <execution-mode> <start|stop|status>
+
+Use "gobblin <cli|service> --help" for more information.         (Gobblin Version: 0.15.0)
+```
+
+For Gobblin CLI commands, run following:  
+```text
+Usage:
+gobblin.sh  cli     <cli-command>    <params>
+
+options:
+    cli-commands:
+                passwordManager             Encrypt or decrypt strings for the password manager.
+                decrypt                     Decryption utilities
+                run                         Run a Gobblin application.
+                config                      Query the config library
+                jobs                        Command line job info and operations
+                stateMigration              Command line tools for migrating state store
+                job-state-to-json           To convert Job state to JSON
+                cleaner                     Data retention utility
+                keystore                    Examine JCE Keystore files
+                watermarks                  Inspect streaming watermarks
+                job-store-schema-manager    Database job history store schema manager
+
+    --conf-dir <gobblin-conf-dir-path> Gobblon config path. default is '$GOBBLIN_HOME/conf/<exe-mode-name>'.
+    --log4j-conf <path-of-log4j-file>  default is '<gobblin-conf-dir-path>/<execution-mode>/log4j.properties'.
+    --jvmopts <jvm or gc options>      String containing JVM flags to include, in addition to "-Xmx1g -Xms512m".
+    --jars <csv list of extra jars>    Column-separated list of extra jars to put on the CLASSPATH.
+    --enable-gc-logs                   enables gc logs & dumps.
+    --show-classpath                   prints gobblin runtime classpath.
+    --help                             Display this help.
+    --verbose                          Display full command used to start the process.
+                                       Gobblin Version: 0.15.0
+```
+
+
+Argument details:
+* `--conf-dir`: specifies the path to the directory containing Gobblin system configuration files, like `application.conf` or `reference.conf`, `log4j.properties` and `quartz.properties`.
+* `--log4j-conf`: specifies the path of a log4j config file to override the one in the config directory (default is `<conf>/<gobblin-mode>/log4j.properties`). Gobblin uses [SLF4J](http://www.slf4j.org/) and the [slf4j-log4j12](http://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12) binding for logging.
+* `--jvmopts`: specifies any JVM parameters; the default is `-Xmx1g -Xms512m`.
+* `--enable-gc-logs`: adds GC options to JVM parameters:  ``` -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:+UseCompressedOops -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintTenuringDistribution -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=$GOBBLIN_LOGS/ -Xloggc:$GOBBLIN_LOGS/gobblin-$GOBBLIN_MODE-gc.log ```
+* `--show-classpath`: prints the full classpath that Gobblin uses.
+* all other arguments are self-explanatory.
+
+Gobblin Commands
+-------------------
+
+Gobblin provides following CLI commands:
+```text
+    Available commands:
+        job-state-to-json	        To convert Job state to JSON
+        jobs		                Command line job info and operations
+        passwordManager		        Encrypt or decrypt strings for the password manager.
+        run		                    Run a Gobblin application.
+        decrypt		                Decryption utilities
+        job-store-schema-manager	Database job history store schema manager
+        stateMigration		        Command line tools for migrating state store
+        keystore		            Examine JCE Keystore files
+        config		                Query the config library
+        watermarks		            Inspect streaming watermarks
+        cleaner		                Data retention utility
+```
+Details on how to use the `run` command:
+ 
+Gobblin ingestion applications can be accessed through the following command:
+```text
+    gobblin cli run [listQuickApps] [<quick-app>] -jobName <jobName> [OPTIONS]
+```
+For usage run `./bin/gobblin cli run`.
+
+`gobblin cli run` uses [Embedded Gobblin](Gobblin-as-a-Library.md) and subclasses to run Gobblin ingestion jobs, giving CLI access to most functionality that could be achieved using `EmbeddedGobblin`. For example, the following command will run a Hello World job (it will print "Hello World 1 !" somewhere in the logs).
+
+```bash   
+    gobblin cli run -jobName helloWorld -setTemplate resource:///templates/hello-world.template
+```
+
+Obviously, it is daunting to have to know the path to templates and exactly which configurations to set. The alternative is to use a quick app. Running:
+```bash   
+    gobblin cli run listQuickApps
+```
+will provide a list of available quick apps. To run a quick app:
+    
+    gobblin cli run <quick-app-name>
+
+Quick apps may require additional arguments. For the usage of a particular app, run `bin/gobblin cli run <quick-app-name> -h`.
+
+The Distcp Quick App
+--------------------
+
+For example, consider the quick app distcp:
+```bash
+$ gobblin cli run distcp -h
+usage: gobblin cli run distcp [OPTIONS] <source> <target>
+ -delete                         Delete files in target that don't exist
+                                 on source.
+ -deleteEmptyParentDirectories   If deleting files on target, also delete
+                                 newly empty parent directories.
+ -distributeJar <arg>
+ -h,--help
+ -l                              Uses log to print out erros in the base CLI code.
+ -mrMode
+ -setConfiguration <arg>
+ -setJobTimeout <arg>
+ -setLaunchTimeout <arg>
+ -setShutdownTimeout <arg>
+ -simulate
+ -update                         Specifies files should be updated if they're different in the source.
+ -useStateStore <arg>
+```
+This provides usage for the app distcp, as well as listing all available options. Distcp could then be run:
+```bash
+gobblin cli run distcp file:///source/path file:///target/path
+```
+
+The OneShot Quick App
+----------------------
+
+The Gobblin cli also ships with a generic job runner, the **oneShot** quick app. You can use it to run a single job using a standard config file. This is very useful during development, testing and also makes it easy to integrate with schedulers that just need to fire off a command line job. The **oneShot** app allows you to run a job in standalone mode or in map-reduce mode.
+```bash
+$ gobblin cli run oneShot -baseConf <base-config-file> -appConf <path-to-job-conf-file>
+# The Base Config file is an optional parameter and contains defaults for your mode of
+# execution (e.g. standalone modes would typically use
+# gobblin-dist/conf/standalone/application.conf and
+# mapreduce mode would typically use gobblin-dist/conf/mapreduce/application.conf)
+#
+# The Job Config file is your regular .pull or .conf file and is a required parameter.
+# You should use a fully qualified URI to your pull file. Otherwise Gobblin will pick the
+# default FS configured in the environment, which may not be what you want.
+# e.g file:///gobblin-conf/my-job/wikipedia.pull or hdfs:///gobblin-conf/my-job/kafka-hdfs.pull
+```
+
+The **oneShot** app comes with certain hardcoded defaults (that it inherits from EmbeddedGobblin [here](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/resources/embedded/embedded.conf)), that you may not be expecting. Make sure you understand what they do and override them in your baseConf or appConf files if needed.
+
+Notable differences at the time of this writing include:
+
+* state.store.enabled = false (set this to true in your appConfig or baseConfig if you want state storage for repeated oneshot runs)
+* data.publisher.appendExtractToFinalDir = false (set this to true in your appConfig or baseConfig if you want to see the extract name appended to the job output directory)
+
+The **oneShot** app allows for specifying the log4j file of your job execution which can be very helpful while debugging pesky failures.
+You can launch the job in MR-Mode by using the -mrMode switch.
+
+* oneShot execution of standalone with a log4j file.
+```bash
+$ gobblin cli run oneShot -baseConf /app/gobblin-dist/conf/standalone/application.conf -appConf file:///app/kafkaConfDir/kafka-simple-hdfs.pull --log4j-conf /app/gobblin-dist/conf/standalone/log4j.properties
+```
+* oneShot execution of map-reduce job with a log4j file
+```bash
+$ gobblin cli run oneShot -mrMode -baseConf /app/gobblin-dist/conf/standalone/application.conf -appConf file:///app/kafkaConfDir/kafka-simple-hdfs.pull --log4j-conf /app/gobblin-dist/conf/standalone/log4j.properties
+```
+
+Developing quick apps for the CLI
+--------------------------------------------
+    
+It is very easy to convert a subclass of `EmbeddedGobblin` into a quick application for the Gobblin CLI. All that is needed is to implement an `EmbeddedGobblinCliFactory`, which knows how to instantiate the `EmbeddedGobblin` from a `CommandLine` object, and annotate it with the `Alias` annotation. There are two utility classes that make this very easy:
+
+* `PublicMethodsGobblinCliFactory`: this class will automatically infer CLI options from the public methods of a subclass of `EmbeddedGobblin`. All the developer has to do is implement the method `constructEmbeddedGobblin(CommandLine)` that calls the appropriate constructor of the desired `EmbeddedGobblin` subclass with parameters extracted from the CLI. Additionally, it is a good idea to override `getUsageString()` with the appropriate usage string. For an example, see `gobblin.runtime. [...]
+* `ConstructorAndPublicMethodsGobblinCliFactory`: this class does everything `PublicMethodsGobblinCliFactory` does, but it additionally automatically infers how to construct the `EmbeddedGobblin` object from a constructor annotated with `EmbeddedGobblinCliSupport`. For an example, see `gobblin.runtime.embedded.EmbeddedGobblin.CliFactory`.
+
+Implementing new Gobblin commands
+---------------------------------
+
+To implement a new Gobblin command to list and execute using `./bin/gobblin`, implement the class `gobblin.runtime.cli.CliApplication`, and annotate it with the `Alias` annotation. The Gobblin CLI will automatically find the command, and users can invoke it by the Alias value.
+
+
+
+Gobblin Service Execution Modes ( as Daemon )
+-------------------
+
+For more info on Gobblin service execution modes, run `bin/gobblin service --help`: 
+```bash
+Usage:
+gobblin.sh  service <execution-mode> <start|stop|status>
+
+Argument Options:
+    <execution-mode>                   standalone, cluster-master, cluster-worker, aws,
+                                                 yarn, mapreduce, service-manager.
+
+    --conf-dir <gobblin-conf-dir-path> Gobblon config path. default is '$GOBBLIN_HOME/conf/<exe-mode-name>'.
+    --log4j-conf <path-of-log4j-file>  default is '<gobblin-conf-dir-path>/<execution-mode>/log4j.properties'.
+    --jvmopts <jvm or gc options>      String containing JVM flags to include, in addition to "-Xmx1g -Xms512m".
+    --jars <csv list of extra jars>    Column-separated list of extra jars to put on the CLASSPATH.
+    --enable-gc-logs                   enables gc logs & dumps.
+    --show-classpath                   prints gobblin runtime classpath.
+    --cluster-name                     Name of the cluster to be used by helix & other services. ( default: gobblin_cluster).
+    --jt <resource manager URL>        Only for mapreduce mode: Job submission URL, if not set, taken from ${HADOOP_HOME}/conf.
+    --fs <file system URL>             Only for mapreduce mode: Target file system, if not set, taken from ${HADOOP_HOME}/conf.
+    --help                             Display this help.
+    --verbose                          Display full command used to start the process.
+                                       Gobblin Version: 0.15.0
+```
+
+
+1. Standalone:
+    This mode starts all Gobblin services in a single JVM on a single node. It is useful for development and lightweight usage:
+    ```bash
+    gobblin service standalone start
+    ```
+    For more details and the architecture of each execution mode, refer to [Standalone-Deployment](/docs/user-guide/Gobblin-Deployment)
+    
+2. Mapreduce:
+
+    This mode is dependent on Hadoop (both MapReduce and HDFS) running locally or remote cluster. Before launching any Gobblin jobs on Hadoop MapReduce, check the Gobblin system configuration file located at `conf/mapreduce/application.properties` for property `fs.uri`, which defines the file system URI used. The default value is `hdfs://localhost:8020`, which points to the local HDFS on the default port 8020. Change it to the right value depending on your Hadoop/HDFS setup. For example, [...]
+``` fs.uri=hdfs://<namenode host name>:9000/ ```
+    * `--jt`: resource manager URL
+    * `--fs`: file system type value for `fs.uri`
+    
+    This mode will have the minimum set of Gobblin jars, selected using `libs/gobblin-<module_name>-$GOBBLIN_VERSION.jar`, which is passed as `-libjar` to the hadoop command while running the job. The same set of jars also gets added to the Hadoop `DistributedCache` for use in the mappers. If a job has additional jars needed for task executions (in the mappers), those jars can also be included by using the `--jars` option or the following job configuration property in the job configuration file:
+    ```
+    job.jars=<comma-separated list of jars the job depends on>
+    ```
+    If `HADOOP_HOME` is set in the environment, Gobblin will add the result of `hadoop classpath` before the default `GOBBLIN_CLASSPATH` to give those jars precedence while running `bin/gobblin`.
+    
+    All job data and persisted job/task states will be written to the specified file system. Before launching any jobs, make sure the environment variable `HADOOP_HOME` is set so that Gobblin can access the Hadoop binaries under `{HADOOP_HOME}/bin`, and the working directory should be set with the configuration `{gobblin.cluster.work.dir}`. Note that the Gobblin working directory will be created on the file system specified above.
+    
+    An important side effect of this is that (depending on the application) non-fully-qualified paths (like `/my/path`) will default to local file system if `HADOOP_HOME` is not set, while they will default to HDFS if the variable is set. When referring to local paths, it is always a good idea to use the fully qualified path (e.g. `file:///my/path`).
+    
+
+3. Cluster Mode (master & worker)
+    This cluster mode consists of a master process and worker processes.
+    ```bash
+        gobblin service cluster-master start
+        gobblin service cluster-worker start
+    ```
+    
+4. AWS
+   This mode starts Gobblin on an AWS cluster.
+    ```bash
+       gobblin service aws start
+    ```
+
+5. YARN
+    This mode starts Gobblin on a YARN cluster.
+    ```bash
+       gobblin service yarn start
+    ```
+
+
+Gobblin System Configurations
+----------------------
+
+The following values can be overridden by setting them in `gobblin-env.sh`:
+
+* `GOBBLIN_LOGS`: by default the logs are written to `$GOBBLIN_HOME/logs`; the location can be overridden by setting `GOBBLIN_LOGS`.
+* `GOBBLIN_VERSION`: by default the Gobblin version is set by the build process; it can be overridden by setting `GOBBLIN_VERSION`.
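+
+For example, a `gobblin-env.sh` with hypothetical values might look like:
+
+```bash
+# gobblin-env.sh
+export GOBBLIN_LOGS=/var/log/gobblin      # overrides the default $GOBBLIN_HOME/logs
+export GOBBLIN_VERSION=0.15.0             # normally set by the build process
+```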
+
+
+All Gobblin system configuration details can be found here: [Configuration Properties Glossary](/docs/user-guide/Configuration-Properties-Glossary).
+
diff --git a/gobblin-website/docs/user-guide/Gobblin-Compliance.md b/gobblin-website/docs/user-guide/Gobblin-Compliance.md
new file mode 100644
index 0000000..9de349c
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Gobblin-Compliance.md
@@ -0,0 +1,57 @@
+---
+title: Gobblin Compliance
+sidebar_label: Gobblin Compliance
+
+---
+
+# Introduction
+--------------
+The Gobblin Compliance module allows for data purging to meet regulatory compliance requirements. The module includes functionality for purging datasets and the associated operational support in production.
+
+The purging is performed using Hive, meaning that datasets can be purged in any format that Hive can read from and write to, including, for example, ORC and Parquet. Furthermore, the purger is built on top of the Gobblin framework, so it takes full advantage of the fault tolerance, scalability and flexibility that Gobblin provides.
+
+# Usage
+-------
+As an example, let us assume that regulation requires that once a guest has checked out of a hotel, certain guest data needs to be purged within a certain number of days after the guest leaves. Hence, the goal is to purge the hotel's datasets of data associated with their guests after they have left in order to meet regulatory compliance requirements.
+
+Hive databases and tables are set up to be purged by following these steps:
+
+1. Whitelisting of the database, or table for purging
+2. Specifying the dataset descriptor for the tables to be purged
+3. Specifying the JSON path of the compliance field in the dataset descriptor
+4. The table which contains the list of ids whose associated data is to be purged
+5. The name of the id column in the table to match against
+
+For example, in order to purge a Hive table named `tracking.event`, these properties are specified:
+
+* Specify the purger whitelist to include tracking.event
+`gobblin.compliance.dataset.whitelist=tracking.event`
+* Add a TBLPROPERTY named `dataset.descriptor` to the tracking.event Hive table to specify the compliance field to match in the table as an escaped JSON (since it has to be a valid string):
+`{\"complianceSpec\" : {\"identifierField\" : \"metadata.guestid\" }}`
+* Specify the JSON field path for the compliance field (that evaluates to metadata.guestId)
+`dataset.descriptor.fieldPath=complianceSpec.identifierField`
+* Specify the table containing the list of ids to purge
+`gobblin.compliance.complianceIdTable=u_purger.guestIds`
+* Specify the name of the id in the table to match against
+`gobblin.compliance.complianceId=guestId`
+
+With these properties in place, a Gobblin job can be set up to purge the table. The work unit for the purger is an individual table partition. Hence the purger will iterate over all the partitions in the table and purge each partition individually, processing as many partitions in parallel as specified by the property `gobblin.compliance.purger.maxWorkunits` (which defaults to 5).
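+
+A minimal sketch combining the job-side properties documented above (only the properties shown on this page are included):
+
+```
+gobblin.compliance.dataset.whitelist=tracking.event
+dataset.descriptor.fieldPath=complianceSpec.identifierField
+gobblin.compliance.complianceIdTable=u_purger.guestIds
+gobblin.compliance.complianceId=guestId
+gobblin.compliance.purger.maxWorkunits=5
+```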
+
+# Configuration
+---------------
+Configuration options for the Hive Purger include
+
+| Property      | Description |
+| ------------- |-------------|
+| gobblin.compliance.dataset.whitelist | The list of databases/tables to purge, comma-separated |
+| dataset.descriptor.fieldPath | The JSON field path specifying the compliance field |
+| gobblin.compliance.complianceIdTable | The table containing the list of ids whose data needs to be purged |
+| gobblin.compliance.complianceId | The name of the id column in the complianceIdTable to match against |
+| gobblin.compliance.purger.maxWorkunits | The number of partitions to purge in parallel |
+| gobblin.compliance.purger.policy.class | The policy class that specifies the criteria for purging, defaults to HivePurgerPolicy |
+| gobblin.compliance.purger.commit.policy.class | The policy class that specifies the criteria for committing a purged dataset, defaults to HivePurgerCommitPolicy |
+
+# Developer Guide
+
+The [Developer Guide](../developer-guide/Gobblin-Compliance-Design) further describes the design of the module.
+
diff --git a/gobblin-website/docs/user-guide/Gobblin-Deployment.md b/gobblin-website/docs/user-guide/Gobblin-Deployment.md
new file mode 100644
index 0000000..d959b26
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Gobblin-Deployment.md
@@ -0,0 +1,88 @@
+---
+title: Deployment
+sidebar_label: Deployment
+---
+
+Gobblin Execution Modes Overview <a name="gobblin-execution-modes-Overview"></a>
+--------------------------------------------------------------------------------
+One important feature of Gobblin is that it can be run on different platforms. Currently, Gobblin can run in standalone mode (which runs on a single machine) and in Hadoop MapReduce mode (which runs on a Hadoop cluster). This page summarizes the different deployment modes of Gobblin. It is important to understand the architecture of Gobblin in a specific deployment mode, so this page also describes the architecture of each deployment mode.
+
+Gobblin supports Java 7 and up, but can only run on Hadoop 2.x. By default, Gobblin builds against Hadoop 2.x when you run `./gradlew clean build`. More information on how to build Gobblin can be found [here](https://github.com/apache/gobblin/blob/master/README.md). All directories/paths referred to below are relative to `gobblin-dist`.
+
+To run Gobblin in any of the following execution modes using ```gobblin.sh```, refer to [Gobblin-CLI](/docs/user-guide/Gobblin-CLI) for the usage.
+
+
+Standalone Architecture <a name="Standalone-Architecture"></a>
+--------------------
+The following diagram illustrates the Gobblin standalone architecture. In the standalone mode, a Gobblin instance runs in a single JVM and tasks run in a thread pool, the size of which is configurable. The standalone mode is good for light-weight data sources such as small databases. The standalone mode is also the default mode for trying and testing Gobblin. 
+
+![Gobblin on Single Node](../../static/img/Gobblin-on-Single-Node.png)
+
+In the standalone deployment, the `JobScheduler` runs as a daemon process that schedules and runs jobs using the so-called `JobLauncher`s. The `JobScheduler` maintains a thread pool in which a new `JobLauncher` is started for each job run. Gobblin ships with two types of `JobLauncher`s, namely, the `LocalJobLauncher` and `MRJobLauncher` for launching and running Gobblin jobs on a single machine and on Hadoop MapReduce, respectively. Which `JobLauncher` to use can be configured on a per-j [...]
+
+Each `LocalJobLauncher` starts and manages a few components for executing tasks of a Gobblin job. Specifically, a `TaskExecutor` is responsible for executing tasks in a thread pool, whose size is configurable on a per-job basis. A `LocalTaskStateTracker` is responsible for keeping track of the state of running tasks, and particularly updating the task metrics. The `LocalJobLauncher` follows the steps below to launch and run a Gobblin job:
+
+1. Starting the `TaskExecutor` and `LocalTaskStateTracker`.
+2. Creating an instance of the `Source` class specified in the job configuration and getting the list of `WorkUnit`s to do.
+3. Creating a task for each `WorkUnit` in the list, registering the task with the `LocalTaskStateTracker`, and submitting the task to the `TaskExecutor` to run.
+4. Waiting for all the submitted tasks to finish.
+5. Upon completion of all the submitted tasks, collecting tasks states and persisting them to the state store, and publishing the extracted data.  
+
+
+MapReduce architecture <a name="MapReduce-Architecture"></a>
+--------------------
+The diagram below shows the architecture of Gobblin on Hadoop MapReduce. As the diagram shows, a Gobblin job runs as a mapper-only MapReduce job that runs tasks of the Gobblin job in the mappers. The basic idea here is to use the mappers purely as _containers_ to run Gobblin tasks. This design also makes it easier to integrate with Yarn. Unlike in the standalone mode, task retries are not handled by Gobblin itself in the Hadoop MapReduce mode. Instead, Gobblin relies on the task retry mec [...]
+
+![Gobblin on Hadoop MR](../../static/img/Gobblin-on-Hadoop-MR.png)
+
+In this mode, a `MRJobLauncher` is used to launch and run a Gobblin job on Hadoop MapReduce, following the steps below:
+
+1. Creating an instance of the `Source` class specified in the job configuration and getting the list of `WorkUnit`s to do.
+2. Serializing each `WorkUnit` into a file on HDFS that will be read later by a mapper.
+3. Creating a file that lists the paths of the files storing serialized `WorkUnit`s.
+4. Creating and configuring a mapper-only Hadoop MapReduce job that takes the file created in step 3 as input.
+5. Starting the MapReduce job to run on the cluster of choice and waiting for it to finish.
+6. Upon completion of the MapReduce job, collecting tasks states and persisting them to the state store, and publishing the extracted data. 
+
+A mapper in a Gobblin MapReduce job runs one or more tasks, depending on the number of `WorkUnit`s to do and the (optional) maximum number of mappers specified in the job configuration. If there is no maximum number of mappers specified in the job configuration, each `WorkUnit` corresponds to one task that is executed by one mapper and each mapper only runs one task. Otherwise, if a maximum number of mappers is specified and there are more `WorkUnit`s than the maximum number of mappers a [...]
+
+A mapper in a Gobblin MapReduce job follows the steps below to run tasks assigned to it:
+
+1. Starting the `TaskExecutor` that is responsible for executing tasks in a configurable-size thread pool and the `MRTaskStateTracker` that is responsible for keeping track of the state of running tasks in the mapper.
+2. Reading the next input record that is the path to the file storing a serialized `WorkUnit`.
+3. Deserializing the `WorkUnit` and adding it to the list of `WorkUnit`s to do. If the input is a `MultiWorkUnit`, the `WorkUnit`s it wraps are all added to the list. Steps 2 and 3 are repeated until all assigned `WorkUnit`s are deserialized and added to the list.
+4. For each `WorkUnit` on the list of `WorkUnit`s to do, creating a task for the `WorkUnit`, registering the task with the `MRTaskStateTracker`, and submitting the task to the `TaskExecutor` to run. Note that the tasks may run in parallel if the `TaskExecutor` is [configured](/docs/user-guide/Configuration-Properties-Glossary#taskexecutorthreadpoolsize) to have more than one thread in its thread pool.
+5. Waiting for all the submitted tasks to finish.
+6. Upon completion of all the submitted tasks, writing out the state of each task into a file that will be read by the `MRJobLauncher` when collecting task states.
+7. Going back to step 2 and reading the next input record if available.
+
+Master-Worker architecture
+----------------------------------
+
+
+
+
+
+AWS architecture
+---------------
+
+
+
+
+
+YARN architecture
+---------------
+
+
+
+
+
+
+Gobblin-As-A-Service  architecture
+----------------------------------
+
+
+
+
+
+
diff --git a/gobblin-website/docs/user-guide/Gobblin-Schedulers.md b/gobblin-website/docs/user-guide/Gobblin-Schedulers.md
new file mode 100644
index 0000000..85caa44
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Gobblin-Schedulers.md
@@ -0,0 +1,81 @@
+---
+title: Schedulers
+sidebar_label: Schedulers
+---
+
+# Introduction
+
+Gobblin jobs can be scheduled on a recurring basis using a few different tools. Gobblin ships with a built-in [Quartz Scheduler](https://quartz-scheduler.org/). Gobblin also integrates with a few other third-party tools.
+
+# Quartz
+
+Gobblin has a built in Quartz scheduler as part of the [`JobScheduler`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/scheduler/JobScheduler.java) class. This class integrates with the Gobblin [`SchedulerDaemon`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/scheduler/SchedulerDaemon.java), which can be run using the Gobblin [`bin/gobblin-standalone.sh](https://github.com/apache/gobblin/blob [...]
+
+So in order to take advantage of the Quartz scheduler two steps need to be taken:
+
+* Use the `bin/gobblin-standalone.sh` script
+* Add the property `job.schedule` to the `.pull` file
+    * The value for this property should be a [CRONTrigger](http://quartz-scheduler.org/api/2.2.0/org/quartz/CronTrigger.html)
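+
+For example, a `.pull` file excerpt with a hypothetical cron trigger that fires every 30 minutes might look like:
+
+```
+# Schedule handled by the built-in Quartz scheduler
+job.schedule=0 0/30 * * * ?
+```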
+
+# Azkaban
+
+Gobblin can be launched via [Azkaban](https://azkaban.github.io/), an open-source workflow manager for scheduling and launching Hadoop jobs. Gobblin's [`AzkabanJobLauncher`](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-azkaban/src/main/java/org/apache/gobblin/azkaban/AzkabanJobLauncher.java) can be used to launch a Gobblin job through Azkaban.
+
+One has to follow the typical setup to create a zip file that can be uploaded to Azkaban (it should include all dependent jars, which can be found in `gobblin-dist.tar.gz`). The `.job` file for the Azkaban Job should contain all configuration properties that would be put in a `.pull` file (for example, the [Wikipedia Example](https://github.com/apache/gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull) `.pull` file). All Gobblin system dependent properties (e.g. [`conf [...]
+
+In the Azkaban `.job` file, the `type` parameter should be set to `hadoopJava` (see [here](http://azkaban.github.io/azkaban/docs/latest/#hadoopjava-type) for more information about the `hadoopJava` Job Type). The `job.class` parameter should be set to `gobblin.azkaban.AzkabanJobLauncher`.
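+
+A minimal Azkaban `.job` file might therefore look like the following sketch (all the usual `.pull` properties would be appended below these two lines):
+
+```
+type=hadoopJava
+job.class=gobblin.azkaban.AzkabanJobLauncher
+```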
+
+# Oozie
+
+[Oozie](https://oozie.apache.org/) is a very popular scheduler for the Hadoop environment. It allows users to define complex workflows using XML files. A workflow can be composed of a series of actions, such as Java Jobs, Pig Jobs, Spark Jobs, etc. Gobblin has two integration points with Oozie. It can be run as a stand-alone Java process via Oozie's `<java>` tag, or it can be run as an Map Reduce job via Oozie.
+
+The following guides assume Oozie is already set up and running on some machine; if this is not the case, consult the Oozie documentation for getting everything set up.
+
+This tutorial only outlines how to launch a basic Oozie job that simply runs a Gobblin job a single time. For information on how to build more complex flows, and how to run jobs on a schedule, check out the Oozie documentation online.
+
+### Launching Gobblin in Local Mode
+
+This guide focuses on getting Gobblin to run as a standalone Java process. This means it will not launch a separate MR job to distribute its workload. It is important to understand how the current version of Oozie launches a Java process: it first starts a MapReduce job and runs Gobblin as a Java process inside a single map task. The Gobblin job will then ingest all data it is configured to pull and then shut down.
+
+#### Example Config Files
+
+[`gobblin-oozie/src/main/resources/local`](https://github.com/apache/gobblin/blob/master/gobblin-oozie/src/test/resources/local) contains sample configuration files for launching Gobblin Oozie. There are a number of important files in this directory:
+
+[`gobblin-oozie-example-system.properties`](https://github.com/apache/gobblin/blob/master/gobblin-oozie/src/test/resources/local/gobblin-oozie-example-system.properties) contains default system level properties for Gobblin. When launched with Oozie, Gobblin will run inside a map task; it is thus recommended to configure Gobblin to write directly to HDFS rather than the local file system. The property `fs.uri` in this file should be changed to point to the NameNode of the Hadoop File Syst [...]
+
+[`gobblin-oozie-example-workflow.properties`](https://github.com/apache/gobblin/blob/master/gobblin-oozie/src/test/resources/local/gobblin-oozie-example-workflow.properties) contains default Oozie properties for any job launched. It is also the entry point for launching an Oozie job (e.g. to launch an Oozie job from the command line you execute `oozie job -config gobblin-oozie-example-workflow.properties -run`). In this file one needs to update the `name.node` and `resource.manager` to t [...]
+
+[`gobblin-oozie-example-workflow.xml`](https://github.com/apache/gobblin/blob/master/gobblin-oozie/src/test/resources/local/gobblin-oozie-example-workflow.xml) contains an example Oozie workflow. This example simply launches a Java process that invokes the main method of the [`CliLocalJobLauncher`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/local/CliLocalJobLauncher.java). The main method of this class expects two file paths to  [...]
+).
+
+<!---Ying Do you think we can add some descriptions about launching through MR mode? The simplest way is to use the <shell> tag and invoke `gobblin-mapreduce.sh`. I've tested it before.-->
+
+#### Uploading Files to HDFS
+
+Oozie only reads a job properties file from the local file system (e.g. `gobblin-oozie-example-workflow.properties`); it expects all other configuration and dependent files to be uploaded to HDFS. Specifically, it looks for these files under the directory specified by `oozie.wf.application.path`. Make sure this is the case before trying to launch an Oozie job.
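+
+For example, assuming `oozie.wf.application.path` points to a hypothetical directory `/user/gobblin/oozie-example`, the files could be uploaded with:
+
+```bash
+hdfs dfs -mkdir -p /user/gobblin/oozie-example
+hdfs dfs -put gobblin-oozie-example-workflow.xml /user/gobblin/oozie-example/
+hdfs dfs -put gobblin-oozie-example-system.properties /user/gobblin/oozie-example/
+```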
+
+##### Adding Gobblin `jar` Dependencies
+
+Gobblin has a number of `jar` dependencies that need to be used when launching a Gobblin job. These dependencies can be taken from the `gobblin-dist.tar.gz` file that is created after building Gobblin. The tarball should contain a `lib` folder with the necessary dependencies. This folder should be placed into a `lib` folder under the same directory specified by `oozie.wf.application.path` in the `gobblin-oozie-example-workflow.properties` file.
+
+#### Launching the Job
+
+Assuming one has the [Oozie CLI](https://oozie.apache.org/docs/3.1.3-incubating/DG_CommandLineTool.html) installed, the job can be launched using the following command: `oozie job -config gobblin-oozie-example-workflow.properties -run`.
+
+### Launching Gobblin in MapReduce Mode
+
+Launching Gobblin in MapReduce mode works quite similarly to local mode. In this mode, the Oozie launcher action will spawn a second MapReduce process in which Gobblin processes its tasks in distributed mode across the cluster. Since each of the mappers needs access to the Gobblin libraries, the jars need to be provided via the `job.hdfs.jars` variable.
+
+#### Example Config Files
+
+[`gobblin-oozie/src/main/resources/mapreduce`](https://github.com/apache/gobblin/tree/master/gobblin-oozie/src/test/resources/mapreduce) contains sample configuration files for launching Gobblin Oozie in MapReduce mode. The main differences from launching Gobblin Oozie in local mode are a few extra MapReduce-related configuration variables in the sysconfig.properties file and launching `CliMRJobLauncher` instead of `CliLocalJobLauncher`.
+
+#### Further steps
+
+Everything else works the same way as in local mode (see above).
+
+### Debugging Tips
+
+Once the job has been launched, its status can be queried via the following command: `oozie job -info <oozie-job-id>` and the logs can be shown via the following command `oozie job -log <oozie-job-id>`.
+
+In order to see the standard output of Gobblin, one needs to check the logs of the Map task running the Gobblin process. `oozie job -info <oozie-job-id>` should show the Hadoop `job_id` of the Hadoop job launched to run the Gobblin process. Using this id one should be able to find the logs of the Map tasks through the UI or other command line tools (e.g. `yarn logs`).
diff --git a/gobblin-website/docs/user-guide/Gobblin-as-a-Library.md b/gobblin-website/docs/user-guide/Gobblin-as-a-Library.md
new file mode 100644
index 0000000..431553d
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Gobblin-as-a-Library.md
@@ -0,0 +1,75 @@
+---
+title: Gobblin as a Library
+sidebar_label: Gobblin as a Library
+---
+
+Using Gobblin as a Library
+-----------------------
+
+A Gobblin ingestion flow can be embedded into a java application using the `EmbeddedGobblin` class.
+
+The following code will run a Hello-World Gobblin job as an embedded application using a template. This will simply print "Hello World \<i\>!" to stdout a few times.
+```java
+EmbeddedGobblin embeddedGobblin = new EmbeddedGobblin("TestJob")
+        .setTemplate(ResourceBasedJobTemplate.forResourcePath("templates/hello-world.template"));
+JobExecutionResult result = embeddedGobblin.run();
+```
+
+Note: `EmbeddedGobblin` starts and destroys an embedded Gobblin instance every time `run()` is called. If an application needs to run a large number of Gobblin jobs, it should instantiate and manage its own Gobblin driver.
+
+Creating an Embedded Gobblin instance
+-----------------------------------
+
+The code snippet above creates an `EmbeddedGobblin` instance. This instance can run arbitrary Gobblin ingestion jobs, and allows the use of templates. However, the user needs to configure the job by using the exact key needed for each feature.
+
+An alternative is to use a subclass of `EmbeddedGobblin` which provides methods to more easily configure the job. For example, an easier way to run a Gobblin distcp job is to use `EmbeddedGobblinDistcp`:
+```java
+EmbeddedGobblinDistcp distcp = new EmbeddedGobblinDistcp(sourcePath, targetPath).delete();
+distcp.run();
+```
+This subclass automatically knows which template to use, the required configurations for the job (which are included as constructor parameters), and also provides convenience methods for the most common configurations (in the case above, the method `delete()` instructs the job to delete files that exist in the target but not the source).
+
+The following is a non-extensive list of available subclasses of `EmbeddedGobblin`:
+* `EmbeddedGobblinDistcp`: distributed copy between Hadoop compatible file systems.
+* `EmbeddedWikipediaExample`: a getting-started example that pulls page updates from Wikipedia.
+
+Configuring Embedded Gobblin
+---------------------------
+
+`EmbeddedGobblin` allows any configuration that a standalone Gobblin job would allow. `EmbeddedGobblin` itself provides a few convenience methods to alter the behavior of the Gobblin framework. Other methods allow users to set a job template to use or set job level configurations.
+
+|Method|Parameters|Description|
+|-------------|-------------|-------------|
+|`mrMode`| N/A | Gobblin should run on MR mode. |
+|`setTemplate`| Template object to use | Use a job template.|
+|`useStateStore` | State store directory | By default, embedded Gobblin is stateless and disables state store. This method enables the state store at the indicated location allowing using watermarks from previous jobs. |
+|`distributeJar` | Path to jar in local fs | Indicates that a specific jar is needed by Gobblin workers when running in distributed mode (e.g. MR mode). Gobblin will automatically add this jar to the classpath of the workers. |
+|`setConfiguration` | key - value pair | Sets a job level configuration. |
+|`setJobTimeout` | timeout and time unit, or ISO period | Sets the timeout for the Gobblin job. `run()` will throw a `TimeoutException` if the job is not done after this period. (Default: 10 days) |
+|`setLaunchTimeout` | timeout and time unit, or ISO period | Sets the timeout for launching Gobblin job. `run()` will throw a `TimeoutException` if the job has not started after this period. (Default: 10 seconds) |
+|`setShutdownTimeout` | timeout and time unit, or ISO period | Sets the timeout for shutting down embedded Gobblin after the job has finished. `run()` will throw a `TimeoutException` if the method has not returned within the timeout after the job finishes. Note that a `TimeoutException` may indicate that Gobblin could not release JVM resources, including threads. |
+
+In addition to the above, subclasses of `EmbeddedGobblin` may offer their own convenience methods.
+
+Running Embedded Gobblin
+-----------------------
+
+After `EmbeddedGobblin` has been configured it can be run with one of two methods:
+* `run()`: blocking call. Returns a `JobExecutionResult` after the job finishes and Gobblin shuts down.
+* `runAsync()`: asynchronous call. Returns a `JobExecutionDriver`, which implements `Future<JobExecutionResult>`.
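+
+For example, the asynchronous variant of the hello-world job above might be driven as follows (a sketch; exception handling is omitted as in the earlier snippet):
+
+```java
+EmbeddedGobblin embeddedGobblin = new EmbeddedGobblin("TestJobAsync")
+        .setTemplate(ResourceBasedJobTemplate.forResourcePath("templates/hello-world.template"));
+// runAsync() returns a JobExecutionDriver, which implements Future<JobExecutionResult>
+JobExecutionDriver driver = embeddedGobblin.runAsync();
+JobExecutionResult result = driver.get(); // blocks until the job finishes
+```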
+
+Extending Embedded Gobblin
+-------------------------
+Developers can extend `EmbeddedGobblin` to provide users with easier ways to launch a particular type of job. For an example see `EmbeddedGobblinDistcp`.
+
+Best practices:
+* Generally, a subclass of `EmbeddedGobblin` is based on a template. The template should be automatically loaded on construction and the constructor should call `setTemplate(myTemplate)`.
+* All required configurations for a job should be parsed from the constructor arguments. User should be able to run `new MyEmbeddedGobblinExtension(params...).run()` and get a sensible job run.
+* Convenience methods should be added for the most common configurations users would want to change. In general a convenience method will call a few other methods transparently to the user. For example:
+```java
+  public EmbeddedGobblinDistcp simulate() {
+    this.setConfiguration(CopySource.SIMULATE, Boolean.toString(true));
+    return this;
+  }
+```
+* If the job requires additional jars in the workers that are not part of the minimal Gobblin ingestion classpath (see `EmbeddedGobblin#getCoreGobblinJars` for this list), then the constructor should call `distributeJar(myJar)` for the additional jars.
diff --git a/gobblin-website/docs/user-guide/Gobblin-genericLoad.md b/gobblin-website/docs/user-guide/Gobblin-genericLoad.md
new file mode 100644
index 0000000..33f8e4a
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Gobblin-genericLoad.md
@@ -0,0 +1,19 @@
+---
+title: Generic Configuration Loading
+sidebar_label: Generic Configuration Loading
+---
+
+Overview
+--------------------
+Previously, job configuration files could only be loaded from and monitored in the local file system. Efforts have been made to remove this limitation, and now Gobblin can also load job configuration files from other file systems. Users can easily submit `.pull` files through their preferred file system and specify it in the system configuration accordingly.
+
+This page uses the Wikipedia example, with Gobblin standalone mode interacting with job configuration files in HDFS.
+
+
+How to submit `.pull` file through HDFS
+--------------------
+Here are the steps to change the system configuration: 
+- Set `fs.uri` to the HDFS uri that the `.pull` file will be submitted to.  
+- Use `jobconf.fullyQualifiedPath` to specify the fully qualified location where pull files should be searched for (this replaces the previously used key `jobconf.dir`)
+
+With all these changes to `gobblin-standalone.properties`, you can now submit the `.pull` file to the target file system path.
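+
+For example, with hypothetical HDFS locations, the relevant entries in `gobblin-standalone.properties` might look like:
+
+```
+fs.uri=hdfs://namenode.example.com:8020
+jobconf.fullyQualifiedPath=hdfs://namenode.example.com:8020/gobblin/jobconf
+```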
diff --git a/gobblin-website/docs/user-guide/Gobblin-on-Yarn.md b/gobblin-website/docs/user-guide/Gobblin-on-Yarn.md
new file mode 100644
index 0000000..122ab70
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Gobblin-on-Yarn.md
@@ -0,0 +1,309 @@
+---
+title: Gobblin on Yarn
+sidebar_label: Gobblin on Yarn
+---
+
+# Introduction
+
+Gobblin currently is capable of running in the standalone mode on a single machine or in the MapReduce (MR) mode as a MR job on a Hadoop cluster. A Gobblin job typically runs on a schedule through a scheduler, e.g., the built-in `JobScheduler`, Azkaban, or Oozie, and each job run ingests new data or data updated since the last run. So this is essentially a batch model for data ingestion, and how soon new data becomes available on Hadoop depends on the schedule of the job.
+
+On another aspect, for high data volume data sources such as Kafka, Gobblin typically runs in the MR mode with a considerable number of tasks running in the mappers of a MR job. This helps Gobblin to scale out for data sources with large volumes of data. The MR mode, however, suffers from problems such as large overhead mostly due to the overhead of submitting and launching a MR job and poor cluster resource usage. The MR mode is also fundamentally not appropriate for real-time data inge [...]
+
+* In the MR mode, every Gobblin job run starts a new MR job, which costs a considerable amount of time to allocate and start the containers for running the mapper/reducer tasks. This cost can be totally eliminated if the containers are already up and running.
+* Each Gobblin job running in the MR mode requests a new set of containers and releases them upon job completion. So it's impossible for two jobs to share the containers even though the containers are perfectly capable of running tasks of both jobs.
+* In the MR mode, all `WorkUnit`s are pre-assigned to the mappers before launching the MR job. The assignment is fixed by evenly distributing the `WorkUnit`s to the mappers so each mapper gets a fair share of the work in terms of the _number of `WorkUnits`_. However, an evenly distributed number of `WorkUnit`s per mapper does not always guarantee a fair share of the work in terms of the volume of data to pull. This, combined with the fact that the mappers that finish earlier cannot "stea [...]
+* A MR job can only hold its containers for a limited amount of time, beyond which the job may get killed. Real-time data ingestion, however, requires the ingestion tasks to be running all the time or alternatively dividing a continuous data stream into well-defined mini-batches (as in Spark Streaming) that can be promptly executed once created. Both require long-running containers, which are not supported in the MR mode.
+
+Those deficiencies motivated the work on making Gobblin run on Yarn as a native Yarn application. Running Gobblin as a native Yarn application allows much more control over container provisioning and lifecycle management so it's possible to keep the containers running continuously. It also makes it possible to dynamically change the number of containers at runtime depending on the load to further improve the resource efficiency, something that's impossible in the MR mode.         
+
+This wiki page documents the design and architecture of the native Gobblin Yarn application and some implementation details. It also covers the configuration system and properties for the application, as well as deployment settings on both unsecured and secured Yarn clusters. 
+
+# Architecture
+
+## Overview
+
+The architecture of Gobblin on Yarn is illustrated in the following diagram. In addition to Yarn, Gobblin on Yarn also leverages [Apache Helix](http://helix.apache.org/), whose role is discussed in [The Role of Apache Helix](#the-role-of-apache-helix). A Gobblin Yarn application consists of three components: the Yarn Application Launcher, the Yarn ApplicationMaster (serving as the Helix _controller_), and the Yarn WorkUnitRunner (serving as the Helix _participant_). The following section [...]
+
+![Gobblin on Yarn with Helix](../../static/img/Gobblin-on-Yarn-with-Helix.png)
+
+## The Role of Apache Helix
+
+[Apache Helix](http://helix.apache.org/) is mainly used for managing the cluster of containers and running the `WorkUnit`s through its [Distributed Task Execution Framework](http://helix.apache.org/0.7.1-docs/recipes/task_dag_execution.html). 
+
+The assignment of tasks to available containers (or participants in Helix's term) is handled by Helix through a finite state model named the `TaskStateModel`. Using this `TaskStateModel`, Helix is also able to do task rebalancing in case new containers get added or some existing containers die. Clients can also choose to force a task rebalancing if some tasks take much longer time than the others. 
+
+Helix also supports a way of doing messaging between different components of a cluster, e.g., between the controller to the participants, or between the client and the controller. The Gobblin Yarn application uses this messaging mechanism to implement graceful shutdown initiated by the client as well as delegation token renew notifications from the client to the ApplicationMaster and the WorkUnitRunner containers.
+
+Helix relies on ZooKeeper for its operations, and particularly for maintaining the state of the cluster and the resources (tasks in this case). Both the Helix controller and participants connect to ZooKeeper during their entire lifetime. The ApplicationMaster serves as the Helix controller and the worker containers serve as the Helix participants, respectively, as discussed in detail below.
+
+## Gobblin Yarn Application Launcher
+
+The Gobblin Yarn Application Launcher (implemented by class [`GobblinYarnAppLauncher`](https://github.com/apache/gobblin/blob/master/gobblin-yarn/src/main/java/org/apache/gobblin/yarn/GobblinYarnAppLauncher.java)) is the client/driver of a Gobblin Yarn application. The first thing the `GobblinYarnAppLauncher` does when it starts is to register itself with Helix as a _spectator_ and creates a new Helix cluster with name specified through the configuration property `gobblin.yarn.helix.clus [...]
+
+The `GobblinYarnAppLauncher` then sets up the Gobblin Yarn application and submits it to run on Yarn. Once the Yarn application successfully starts running, it starts an application state monitor that periodically checks the state of the Gobblin Yarn application. If the state is one of the exit states (`FINISHED`, `FAILED`, or `KILLED`), the `GobblinYarnAppLauncher` shuts down itself. 
+
+Upon successfully submitting the application to run on Yarn, the `GobblinYarnAppLauncher` also starts a `ServiceManager` that manages the following services that support the running of the application:
+
+### YarnAppSecurityManager
+
+The [`YarnAppSecurityManager`](https://github.com/apache/gobblin/blob/master/gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnAppSecurityManager.java) works with the [`YarnContainerSecurityManager`](https://github.com/apache/gobblin/blob/master/gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnContainerSecurityManager.java) running in the ApplicationMaster and the WorkUnitRunner for a complete solution for security and delegation token management. The `YarnAppSecurityManager`  [...]
+
+### LogCopier
+
+The service [`LogCopier`](https://github.com/apache/gobblin/blob/master/gobblin-utility/src/main/java/org/apache/gobblin/util/logs/LogCopier.java) in `GobblinYarnAppLauncher` streams the ApplicationMaster and WorkUnitRunner logs in near real-time from the central location on HDFS where the logs are streamed to from the ApplicationMaster and WorkUnitRunner containers, to the local directory specified through the configuration property `gobblin.yarn.logs.sink.root.dir` on the machine where [...]
+
+## Gobblin ApplicationMaster
+
+The ApplicationMaster process runs the [`GobblinApplicationMaster`](https://github.com/apache/gobblin/blob/master/gobblin-yarn/src/main/java/org/apache/gobblin/yarn/GobblinApplicationMaster.java), which uses a `ServiceManager` to manage the services supporting the operation of the ApplicationMaster process. The services running in `GobblinApplicationMaster` will be discussed later. When it starts, the first thing `GobblinApplicationMaster` does is to connect to ZooKeeper and register its [...]
+
+### YarnService
+
+The service [`YarnService`](https://github.com/apache/gobblin/blob/master/gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnService.java) handles all Yarn-related tasks, including the following:
+
+* Registering and un-registering the ApplicationMaster with the Yarn ResourceManager.
+* Requesting the initial set of containers from the Yarn ResourceManager.
+* Handling any container changes at runtime, e.g., adding more containers or shutting down containers no longer needed. This also includes stopping running containers when the application is asked to stop.
+
+This design makes it possible to switch to a different resource manager, e.g., Mesos, by replacing the `YarnService` with a service specific to that resource manager, e.g., a `MesosService`.
+
+### GobblinHelixJobScheduler
+
+[`GobblinApplicationMaster`](https://github.com/apache/gobblin/blob/master/gobblin-yarn/src/main/java/org/apache/gobblin/yarn/GobblinApplicationMaster.java) runs the [`GobblinHelixJobScheduler`](https://github.com/apache/gobblin/blob/master/gobblin-cluster/src/main/java/org/apache/gobblin/cluster/GobblinHelixJobScheduler.java) that schedules jobs to run through the Helix [Distributed Task Execution Framework](http://helix.apache.org/0.7.1-docs/recipes/task_dag_execution.html). For each G [...]
+
+Like the [`LocalJobLauncher`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/local/LocalJobLauncher.java) and [`MRJobLauncher`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/mapreduce/MRJobLauncher.java), the `GobblinHelixJobLauncher` handles output data commit and job state persistence.   
+
+### LogCopier
+
+The service [`LogCopier`](https://github.com/apache/gobblin/blob/master/gobblin-utility/src/main/java/org/apache/gobblin/util/logs/LogCopier.java) in `GobblinApplicationMaster` streams the ApplicationMaster logs in near real-time from the machine running the ApplicationMaster container to a central location on HDFS so the logs can be accessed at runtime. More details on this can be found in [Log Aggregation](#log-aggregation).
+
+### YarnContainerSecurityManager
+
+The [`YarnContainerSecurityManager`](https://github.com/apache/gobblin/blob/master/gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnContainerSecurityManager.java) runs in both the ApplicationMaster and the WorkUnitRunner. When it starts, it registers a message handler with the `HelixManager` for handling messages on refreshes of the delegation token. Once such a message is received, the `YarnContainerSecurityManager` gets the path to the token file on HDFS from the message, and upd [...]
+
+## Gobblin WorkUnitRunner
+
+The WorkUnitRunner process runs as part of [`GobblinTaskRunner`](https://github.com/apache/gobblin/blob/master/gobblin-cluster/src/main/java/org/apache/gobblin/cluster/GobblinTaskRunner.java), which uses a `ServiceManager` to manage the services supporting the operation of the WorkUnitRunner process. The services running in `GobblinWorkUnitRunner` will be discussed later. When it starts, the first thing `GobblinWorkUnitRunner` does is to connect to ZooKeeper and register itself as a Heli [...]
+
+### TaskExecutor
+
+The [`TaskExecutor`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/TaskExecutor.java) remains the same as in the standalone and MR modes, and is purely responsible for running tasks assigned to a WorkUnitRunner. 
+
+### GobblinHelixTaskStateTracker
+
+The [`GobblinHelixTaskStateTracker`](https://github.com/apache/gobblin/blob/master/gobblin-cluster/src/main/java/org/apache/gobblin/cluster/GobblinHelixTaskStateTracker.java) has a similar responsibility as the `LocalTaskStateTracker` and `MRTaskStateTracker`: keeping track of the state of running tasks including operational metrics, e.g., total records pulled, records pulled per second, total bytes pulled, bytes pulled per second, etc.
+
+### LogCopier
+
+The service [`LogCopier`](https://github.com/apache/gobblin/blob/master/gobblin-utility/src/main/java/org/apache/gobblin/util/logs/LogCopier.java) in `GobblinWorkUnitRunner` streams the WorkUnitRunner logs in near real-time from the machine running the WorkUnitRunner container to a central location on HDFS so the logs can be accessed at runtime. More details on this can be found in [Log Aggregation](#log-aggregation).
+
+### YarnContainerSecurityManager
+
+The [`YarnContainerSecurityManager`](https://github.com/apache/gobblin/blob/master/gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnContainerSecurityManager.java) in `GobblinWorkUnitRunner` works in the same way as it does in `GobblinApplicationMaster`. 
+
+## Failure Handling
+
+### ApplicationMaster Failure Handling
+
+Under normal operation, the Gobblin ApplicationMaster stays alive unless asked to stop through a message sent from the launcher (the `GobblinYarnAppLauncher`) as part of the orderly shutdown process. It may, however, fail or get killed by the Yarn ResourceManager for various reasons. For example, the container running the ApplicationMaster may fail and exit due to node failures, or get killed because of using more memory than claimed. When a shutdown of the ApplicationMaster is tri [...]
+
+### Container Failure Handling
+
+Under normal operation, a Gobblin Yarn container stays alive unless released and stopped by the Gobblin ApplicationMaster, in which case the exit status of the container is zero. However, a container may exit unexpectedly for various reasons. For example, a container may fail and exit due to node failures, or be killed because of using more memory than claimed. When a container exits abnormally with a non-zero exit code, Gobblin Yarn tries to restart the Hel [...]
+
+When requesting a new container to replace the one that completes and exits abnormally, the application has a choice of specifying the same host that runs the completed container as the preferred host, depending on the boolean value of configuration key `gobblin.yarn.container.affinity.enabled`. Note that for certain exit codes that indicate something wrong with the host, the value of `gobblin.yarn.container.affinity.enabled` is ignored and no preferred host gets specified, leaving Yarn  [...]
+
+### Handling Failures to get ApplicationReport
+
+As mentioned above, once the Gobblin Yarn application successfully starts running, the `GobblinYarnAppLauncher` starts an application state monitor that periodically checks the state of the Yarn application by getting an `ApplicationReport`. It may fail to do so and throw an exception, however, if the Yarn client is having some problem connecting and communicating with the Yarn cluster. For example, if the Yarn cluster is down for maintenance, the Yarn client will not be able to get an ` [...]
+
+# Log Aggregation
+
+Yarn provides both a Web UI and a command-line tool to access the logs of an application, and also does log aggregation so the logs of all the containers become available on the client side upon request. However, there are a few limitations that make it hard to access the logs of an application at runtime:
+
+* The command-line utility for downloading the aggregated logs can only do so after the application finishes, making it useless for getting access to the logs at application runtime.  
+* The Web UI does allow logs to be viewed at runtime, but only when the user that accesses the UI is the same as the user that launched the application. On a Yarn cluster where security is enabled, the user launching the Gobblin Yarn application is typically a headless account.
+
+Because Gobblin runs on Yarn as a long-running native Yarn application, getting access to the logs at runtime is critical to know what's going on in the application and to detect any issues in the application as early as possible. Unfortunately we cannot use the log facility provided by Yarn here due to the above limitations. Alternatively, Gobblin on Yarn has its own mechanism for doing log aggregation and providing access to the logs at runtime, described as follows.
+
+Both the Gobblin ApplicationMaster and WorkUnitRunner run a `LogCopier` that periodically copies new entries of both `stdout` and `stderr` logs of the corresponding processes from the containers to a central location on HDFS under the directory `${gobblin.yarn.work.dir}/_applogs` in the subdirectories named after the container IDs, one per container. The names of the log files on HDFS combine the container IDs and the original log file names so it's easy to tell which container generates [...]
+
+The Gobblin YarnApplicationLauncher also runs a `LogCopier` that periodically copies new log entries from log files under `${gobblin.yarn.work.dir}/_applogs` on HDFS to the local filesystem under the directory configured by the property `gobblin.yarn.logs.sink.root.dir`. By default, the `LogCopier` checks for new log entries every 60 seconds and will keep reading new log entries until it reaches the end of the log file. This setup enables the Gobblin Yarn application to stream container  [...]
+
+# Security and Delegation Token Management
+
+On a Yarn cluster with security enabled (e.g., Kerberos authentication is required to access HDFS), security and delegation token management is necessary to allow Gobblin to run as a long-running Yarn application. Specifically, Gobblin running on a secured Yarn cluster needs to get its delegation token for accessing HDFS renewed periodically, which also requires periodic keytab re-logins because a delegation token can only be renewed a limited number of times per login.
+
+The Gobblin Yarn application supports Kerberos-based authentication and login through a keytab file. The `YarnAppSecurityManager` running in the Yarn Application Launcher and the `YarnContainerSecurityManager` running in the ApplicationMaster and WorkUnitRunner work together to get every Yarn container updated whenever the delegation token gets updated on the client side by the `YarnAppSecurityManager`. More specifically, the `YarnAppSecurityManager` periodically logs in through the keyt [...]
+
+Both the interval between two Kerberos keytab logins and the interval between two delegation token refreshes are configurable, through the configuration properties `gobblin.yarn.login.interval.minutes` and `gobblin.yarn.token.renew.interval.minutes`, respectively.    
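+
+For illustration, the following sketch simply restates the default intervals using the property names from the table below (values here are illustrative, not recommendations):
+
+```properties
+# Re-login via the keytab once a day; renew the delegation token every 12 hours
+gobblin.yarn.login.interval.minutes=1440
+gobblin.yarn.token.renew.interval.minutes=720
+```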
+
+# Configuration
+
+## Configuration Properties
+
+In addition to the common Gobblin configuration properties, documented in [`Configuration Properties Glossary`](Configuration-Properties-Glossary), Gobblin on Yarn uses the following configuration properties. 
+
+|Property|Default Value|Description|
+|-------------|-------------|-------------|
+|`gobblin.yarn.app.name`|`GobblinYarn`|The Gobblin Yarn application name.|
+|`gobblin.yarn.app.queue`|`default`|The Yarn queue the Gobblin Yarn application will run in.|
+|`gobblin.yarn.work.dir`|`/gobblin`|The working directory (typically on HDFS) for the Gobblin Yarn application.|
+|`gobblin.yarn.app.report.interval.minutes`|5|The interval in minutes between two Gobblin Yarn application status reports.|
+|`gobblin.yarn.max.get.app.report.failures`|4|Maximum allowed number of consecutive failures to get a Yarn `ApplicationReport`.|
+|`gobblin.yarn.email.notification.on.shutdown`|`false`|Whether email notification is enabled or not on shutdown of the `GobblinYarnAppLauncher`. If this is set to `true`, the following configuration properties also need to be set for email notification to work: `email.host`, `email.smtp.port`, `email.user`, `email.password`, `email.from`, and `email.tos`. Refer to [Email Alert Properties](Configuration-Properties-Glossary#Email-Alert-Properties) for more information on those configuratio [...]
+|`gobblin.yarn.app.master.memory.mbs`|512|How much memory in MBs to request for the container running the Gobblin ApplicationMaster.|
+|`gobblin.yarn.app.master.cores`|1|The number of vcores to request for the container running the Gobblin ApplicationMaster.|
+|`gobblin.yarn.app.master.jars`||A comma-separated list of jars the Gobblin ApplicationMaster depends on but not in the `lib` directory.|
+|`gobblin.yarn.app.master.files.local`||A comma-separated list of files on the local filesystem the Gobblin ApplicationMaster depends on.|
+|`gobblin.yarn.app.master.files.remote`||A comma-separated list of files on a remote filesystem (typically HDFS) the Gobblin ApplicationMaster depends on.|
+|`gobblin.yarn.app.master.jvm.args`||Additional JVM arguments for the JVM process running the Gobblin ApplicationMaster, e.g., `-XX:ReservedCodeCacheSize=100M -XX:MaxMetaspaceSize=256m` `-XX:CompressedClassSpaceSize=256m -Dconfig.trace=loads`.|
+|`gobblin.yarn.initial.containers`|1|The number of containers to request initially when the application starts to run the WorkUnitRunner.|
+|`gobblin.yarn.container.memory.mbs`|512|How much memory in MBs to request for the container running the Gobblin WorkUnitRunner.|
+|`gobblin.yarn.container.cores`|1|The number of vcores to request for the container running the Gobblin WorkUnitRunner.|
+|`gobblin.yarn.container.jars`||A comma-separated list of jars the Gobblin WorkUnitRunner depends on but not in the `lib` directory.|
+|`gobblin.yarn.container.files.local`||A comma-separated list of files on the local filesystem the Gobblin WorkUnitRunner depends on.|
+|`gobblin.yarn.container.files.remote`||A comma-separated list of files on a remote filesystem (typically HDFS) the Gobblin WorkUnitRunner depends on.|
+|`gobblin.yarn.container.jvm.args`||Additional JVM arguments for the JVM process running the Gobblin WorkUnitRunner, e.g., `-XX:ReservedCodeCacheSize=100M -XX:MaxMetaspaceSize=256m` `-XX:CompressedClassSpaceSize=256m -Dconfig.trace=loads`.|
+|`gobblin.yarn.container.affinity.enabled`|`true`|Whether the same host should be used as the preferred host when requesting a replacement container for the one that exits.|
+|`gobblin.yarn.helix.cluster.name`|`GobblinYarn`|The name of the Helix cluster that will be registered with ZooKeeper.|
+|`gobblin.yarn.zk.connection.string`|`localhost:2181`|The ZooKeeper connection string used by Helix.|
+|`helix.instance.max.retries`|2|Maximum number of times the application tries to restart a failed Helix instance (corresponding to a Yarn container).|
+|`gobblin.yarn.lib.jars.dir`||The directory where library jars are stored, typically `gobblin-dist/lib`.|
+|`gobblin.yarn.job.conf.path`||The path to either a directory where Gobblin job configuration files are stored or a single job configuration file. Internally Gobblin Yarn will package the configuration files as a tarball so you don't need to.|
+|`gobblin.yarn.logs.sink.root.dir`||The directory on local filesystem on the driver/client side where the aggregated container logs of both the ApplicationMaster and WorkUnitRunner are stored.|
+|`gobblin.yarn.log.copier.max.file.size`|Unbounded|The maximum bytes per log file.  When this is exceeded a new log file will be created.|
+|`gobblin.yarn.log.copier.scheduler`|`ScheduledExecutorService`|The scheduler to use to copy the log files. Possible values: `ScheduledExecutorService`, `HashedWheelTimer`. The `HashedWheelTimer` scheduler is experimental but is expected to become the default after a sufficient burn-in period.|
+|`gobblin.yarn.keytab.file.path`||The path to the Kerberos keytab file used for keytab-based authentication/login.|
+|`gobblin.yarn.keytab.principal.name`||The principal name of the keytab.|
+|`gobblin.yarn.login.interval.minutes`|1440|The interval in minutes between two keytab logins.|
+|`gobblin.yarn.token.renew.interval.minutes`|720|The interval in minutes between two delegation token renews.|
+
+## Job Lock
+It is recommended to use ZooKeeper for maintaining job locks. See [ZookeeperBasedJobLock Properties](Configuration-Properties-Glossary#ZookeeperBasedJobLock-Properties) for the relevant configuration properties.
+
+## Configuration System
+
+The Gobblin Yarn application uses the [Typesafe Config](https://github.com/typesafehub/config) library to handle the application configuration. Following [Typesafe Config](https://github.com/typesafehub/config)'s model, the Gobblin Yarn application uses a single file named `application.conf` for all configuration properties and another file named `reference.conf` for default values. A sample `application.conf` is shown below: 
+
+```properties
+# Yarn/Helix configuration properties
+gobblin.yarn.helix.cluster.name=GobblinYarnTest
+gobblin.yarn.app.name=GobblinYarnTest
+gobblin.yarn.lib.jars.dir="/home/gobblin/gobblin-dist/lib/"
+gobblin.yarn.app.master.files.local="/home/gobblin/gobblin-dist/conf/log4j-yarn.properties,/home/gobblin/gobblin-dist/conf/application.conf,/home/gobblin/gobblin-dist/conf/reference.conf"
+gobblin.yarn.container.files.local=${gobblin.yarn.app.master.files.local}
+gobblin.yarn.job.conf.path="/home/gobblin/gobblin-dist/job-conf"
+gobblin.yarn.keytab.file.path="/home/gobblin/gobblin.headless.keytab"
+gobblin.yarn.keytab.principal.name=gobblin
+gobblin.yarn.app.master.jvm.args="-XX:ReservedCodeCacheSize=100M -XX:MaxMetaspaceSize=256m -XX:CompressedClassSpaceSize=256m"
+gobblin.yarn.container.jvm.args="-XX:ReservedCodeCacheSize=100M -XX:MaxMetaspaceSize=256m -XX:CompressedClassSpaceSize=256m"
+gobblin.yarn.logs.sink.root.dir=/home/gobblin/gobblin-dist/applogs
+
+# File system URIs
+writer.fs.uri=${fs.uri}
+state.store.fs.uri=${fs.uri}
+
+# Writer related configuration properties
+writer.destination.type=HDFS
+writer.output.format=AVRO
+writer.staging.dir=${gobblin.yarn.work.dir}/task-staging
+writer.output.dir=${gobblin.yarn.work.dir}/task-output
+
+# Data publisher related configuration properties
+data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisher
+data.publisher.final.dir=${gobblin.yarn.work.dir}/job-output
+data.publisher.replace.final.dir=false
+
+# Directory where job/task state files are stored
+state.store.dir=${gobblin.yarn.work.dir}/state-store
+
+# Directory where error files from the quality checkers are stored
+qualitychecker.row.err.file=${gobblin.yarn.work.dir}/err
+
+# Use zookeeper for maintaining the job lock
+job.lock.enabled=true
+job.lock.type=ZookeeperBasedJobLock
+
+# Directory where job locks are stored
+job.lock.dir=${gobblin.yarn.work.dir}/locks
+
+# Directory where metrics log files are stored
+metrics.log.dir=${gobblin.yarn.work.dir}/metrics
+```
+
+A sample `reference.conf` is shown below:
+
+```properties
+# Yarn/Helix configuration properties
+gobblin.yarn.app.queue=default
+gobblin.yarn.helix.cluster.name=GobblinYarn
+gobblin.yarn.app.name=GobblinYarn
+gobblin.yarn.app.master.memory.mbs=512
+gobblin.yarn.app.master.cores=1
+gobblin.yarn.app.report.interval.minutes=5
+gobblin.yarn.max.get.app.report.failures=4
+gobblin.yarn.email.notification.on.shutdown=false
+gobblin.yarn.initial.containers=1
+gobblin.yarn.container.memory.mbs=512
+gobblin.yarn.container.cores=1
+gobblin.yarn.container.affinity.enabled=true
+gobblin.yarn.helix.instance.max.retries=2
+gobblin.yarn.keytab.login.interval.minutes=1440
+gobblin.yarn.token.renew.interval.minutes=720
+gobblin.yarn.work.dir=/user/gobblin/gobblin-yarn
+gobblin.yarn.zk.connection.string=${zookeeper.connection.string}
+
+fs.uri="hdfs://localhost:9000"
+zookeeper.connection.string="localhost:2181"
+```
+# Deployment
+
+A standard deployment of Gobblin on Yarn requires a Yarn cluster running Hadoop 2.x (`2.3.0` and above recommended) and a ZooKeeper cluster. Make sure the client machine (typically the gateway of the Yarn cluster) is able to access the ZooKeeper instance. 
+
+## Deployment on an Unsecured Yarn Cluster
+
+To do a deployment of the Gobblin Yarn application, first build Gobblin using the following command from the root directory of the Gobblin project.
+
+```bash
+./gradlew clean build
+```
+
+To build Gobblin against a specific version of Hadoop 2.x, e.g., `2.7.0`, run the following command instead:
+
+```bash
+./gradlew clean build  -PhadoopVersion=2.7.0
+```
+ 
+After Gobblin is successfully built, a tarball named `gobblin-dist-[project-version].tar.gz` should have been created under the root directory of the project. To deploy the Gobblin Yarn application on an unsecured Yarn cluster, uncompress the tarball somewhere and run the following commands:  
+
+```bash
+cd gobblin-dist
+bin/gobblin-yarn.sh
+```
+
+Note that for the above commands to work, the Hadoop/Yarn configuration directory must be on the classpath and the configuration must point to the right Yarn cluster, or specifically the right ResourceManager and NameNode URLs. This is defined as follows in `gobblin-yarn.sh`:
+
+```bash
+CLASSPATH=${FWDIR_CONF}:${GOBBLIN_JARS}:${YARN_CONF_DIR}:${HADOOP_YARN_HOME}/lib
+```
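+
+As an illustration, one way to make sure the right configuration is picked up is to point `YARN_CONF_DIR` at the target cluster's configuration directory before launching (the path below is environment-specific):
+
+```bash
+# Point the launcher at the cluster's Hadoop/Yarn configuration, then start it
+export YARN_CONF_DIR=/etc/hadoop/conf
+cd gobblin-dist
+bin/gobblin-yarn.sh
+```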
+
+## Deployment on a Secured Yarn Cluster
+
+When deploying the Gobblin Yarn application on a secured Yarn cluster, make sure the keytab file path is correctly specified in `application.conf` and the correct principal for the keytab is used, as shown below. The rest of the deployment is the same as on an unsecured Yarn cluster.
+
+```properties
+gobblin.yarn.keytab.file.path="/home/gobblin/gobblin.headless.keytab"
+gobblin.yarn.keytab.principal.name=gobblin
+```
+
+## Supporting Existing Gobblin Jobs
+
+Gobblin on Yarn is backward compatible and supports existing Gobblin jobs running in the standalone and MR modes. To run existing Gobblin jobs, simply put the job configuration files into a directory on the local file system of the driver and set the configuration property `gobblin.yarn.job.conf.path` to point to that directory. When the Gobblin Yarn application starts, Yarn will package the configuration files as a tarball and make sure the tarball gets copied to the ApplicationMaste [...]
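+
+As a sketch, reusing existing job configurations can be as simple as copying the `.pull` files into the configured directory (the source path is a placeholder; the target directory matches the sample `application.conf` above):
+
+```bash
+# Collect existing standalone/MR job configs under the directory referenced by
+# gobblin.yarn.job.conf.path
+mkdir -p /home/gobblin/gobblin-dist/job-conf
+cp /path/to/existing-jobs/*.pull /home/gobblin/gobblin-dist/job-conf/
+```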
+
+# Monitoring
+
+Gobblin Yarn uses the [Gobblin Metrics](../metrics/Gobblin-Metrics) library for collecting and reporting metrics at the container, job, and task levels. Each `GobblinWorkUnitRunner` maintains a [`ContainerMetrics`](https://github.com/apache/gobblin/blob/master/gobblin-cluster/src/main/java/org/apache/gobblin/cluster/ContainerMetrics.java) that is the parent of the [`JobMetrics`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/util/Jo [...]
+
+Collected metrics can be reported to various sinks such as Kafka, files, and JMX, depending on the configuration. Specifically, `metrics.enabled` controls whether metrics collecting and reporting are enabled or not. `metrics.reporting.kafka.enabled`, `metrics.reporting.file.enabled`, and `metrics.reporting.jmx.enabled` control whether collected metrics should be reported or not to Kafka, files, and JMX, respectively. Please refer to [Metrics Properties](Configuration-Properties-Glossary# [...]
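+
+For illustration, a minimal sketch that enables metrics and reports them to files and JMX but not Kafka (the flag values here are arbitrary choices, not recommendations):
+
+```properties
+metrics.enabled=true
+metrics.reporting.file.enabled=true
+metrics.reporting.jmx.enabled=true
+metrics.reporting.kafka.enabled=false
+# Root directory for metric log files
+metrics.log.dir=${gobblin.yarn.work.dir}/metrics
+```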
+
+In addition to metric collecting and reporting, Gobblin Yarn also supports writing job execution information to a MySQL-backed job execution history store, which keeps track of job execution information. Please refer to the [DDL](https://github.com/apache/gobblin/tree/master/gobblin-metastore/src/main/resources/db/migration) for the relevant MySQL tables. Detailed information on the job execution history store including how to configure it can be found [here](Job-Execution-History-Store). 
diff --git a/gobblin-website/docs/user-guide/Gobblin-template.md b/gobblin-website/docs/user-guide/Gobblin-template.md
new file mode 100644
index 0000000..1d9122c
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Gobblin-template.md
@@ -0,0 +1,73 @@
+---
+title: Template
+sidebar_label: Template
+---
+
+Overview
+--------------------
+The job configuration template is meant to save effort for Gobblin users. For a specific type of job, e.g., Gobblin-Kafka data pulling, there is a fair amount of repetitive options to fill in. The aim is to move those repetitive options into a template for each specific type of job, exposing only a few essential configurable options for the user to specify. This does not sacrifice flexibility: users can still specify options that already exist in the template to override the defaul [...]
+
+Here is the `.pull` file for wikipedia example with template support:
+
+```properties
+job.template=templates/wikiSample.template
+source.page.titles=NASA,LinkedIn,Parris_Cues,Barbara_Corcoran
+
+```  
+
+How to Use Templates
+--------------------
+Users need only submit the `.pull` file above to the specified directory as described in the Wikipedia example. Although there are far fewer options, there are still some mandatory options to specify in the `.pull` file. 
+
+In general, to use a template:
+- Specify which template to use in the key `job.template`.
+- All the keys specified in `gobblin.template.required_attributes` must be provided.
+- As mentioned before, users can also override template defaults by specifying options that already exist in the template, as illustrated in the sketch below. 
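+
+For example, a minimal sketch of a `.pull` file that uses the Wikipedia template shown later on this page, supplies its required attribute, and overrides one of its defaults (the values are placeholders):
+
+```properties
+job.template=templates/wikiSample.template
+# Required attribute declared by the template
+source.page.titles=NASA,LinkedIn
+# Overrides the template default of 5
+source.revisions.cnt=10
+```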
+
+
+Available Templates 
+--------------------
+- wikiSample.template 
+- gobblin-kafka.template 
+
+
+The templates above are available in the Gobblin GitHub repo. 
+
+
+How to Create Your Own Template 
+--------------------
+To create a template, simply create a file with all the common configurations for that template (recommended to use `.template` extension). Place this file into Gobblin's classpath, and set `job.template` to the path to that file in the classpath.
+
+For reference, this is how the Wikipedia template looks:
+
+```properties
+job.name=PullFromWikipedia
+job.group=Wikipedia
+job.description=A getting started example for Gobblin
+
+source.class=org.apache.gobblin.example.wikipedia.WikipediaSource
+source.revisions.cnt=5
+
+wikipedia.api.rooturl=https://en.wikipedia.org/w/api.php?format=json&action=query&prop=revisions&rvprop=content|timestamp|user|userid|size
+wikipedia.avro.schema={"namespace": "example.wikipedia.avro","type": "record","name": "WikipediaArticle","fields": [{"name": "pageid", "type": ["double", "null"]},{"name": "title", "type": ["string", "null"]},{"name": "user", "type": ["string", "null"]},{"name": "anon", "type": ["string", "null"]},{"name": "userid",  "type": ["double", "null"]},{"name": "timestamp", "type": ["string", "null"]},{"name": "size",  "type": ["double", "null"]},{"name": "contentformat",  "type": ["string", "nu [...]
+
+converter.classes=org.apache.gobblin.example.wikipedia.WikipediaConverter
+
+extract.namespace=org.apache.gobblin.example.wikipedia
+
+writer.destination.type=HDFS
+writer.output.format=AVRO
+writer.partitioner.class=org.apache.gobblin.example.wikipedia.WikipediaPartitioner
+
+data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisher
+
+gobblin.template.required_attributes=source.page.titles
+
+```
+
+How does Template Work in Gobblin
+--------------------
+
+Currently Gobblin stores and loads existing templates as resources in the classpath. Gobblin will then resolve this template with the user-specified `.pull` file. Note that there is an option in the template named `gobblin.template.required_attributes` which lists all options that users are required to fill in. If any of the options in the required list is absent, Gobblin will detect the configuration as invalid and throw a runtime exception accordingly.
+
+Gobblin provides methods to retrieve all options inside a `.template` file as well as the resolved configuration option list. These interactive functions will be integrated soon.
diff --git a/gobblin-website/docs/user-guide/Hive-Registration.md b/gobblin-website/docs/user-guide/Hive-Registration.md
new file mode 100644
index 0000000..7e4cae9
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Hive-Registration.md
@@ -0,0 +1,79 @@
+---
+title: Hive Registration
+sidebar_label: Hive Registration
+---
+
+Gobblin has the ability to register the ingested/compacted data in Hive. This allows registering data in Hive immediately after data is published at the destination, offering much lower latency compared to doing data ingestion and Hive registration separately.
+
+## How Hive Registration Works in Gobblin
+
+Hive registration is done in [`HiveRegister`](https://github.com/apache/gobblin/blob/master/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/HiveRegister.java). After the data is published, the publisher or compaction runner will create an instance of `HiveRegister`, and for each path that should be registered in Hive, the publisher or compaction runner will use a specific [`HiveRegistrationPolicy`](https://github.com/apache/gobblin/blob/master/gobblin-hive-registration/sr [...]
+
+### `HiveSpec`
+
+A `HiveSpec` specifies how a path should be registered in Hive, i.e., the database, table, and partition the path should be registered to. An example is [`SimpleHiveSpec`](https://github.com/apache/gobblin/blob/master/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/spec/SimpleHiveSpec.java).
+
+### `HiveRegistrationPolicy`
+
+`HiveRegistrationPolicy` is responsible for generating `HiveSpec`s given a path. For example, if you want paths ending with a date (e.g., `/(something)/2016/05/22`) to be registered in the corresponding daily partition (e.g., `daily-2016-05-22`), you can create an implementation of `HiveRegistrationPolicy` that contains the logic of converting such a path into a Hive partition. 
+
+An example is [`HiveRegistrationPolicyBase`](https://github.com/apache/gobblin/blob/master/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/policy/HiveRegistrationPolicyBase.java), which provides a base implementation for getting database names and table names for a path:
+
+* A database/table name can be specified explicitly in `hive.database.name` or `hive.table.name`.
+* Alternatively, a database/table regex can be provided in `hive.database.regex` or `hive.table.regex`. The regex will be matched against the path to be registered, and if they match, the first group is considered the database/table name.
+* It is possible to register a path to multiple databases or tables by specifying `additional.hive.database.names` and `additional.hive.table.names`. If multiple databases and tables are specified, the path will be registered to the cross product.
+* If the provided/derived Hive database/table names are invalid, they are sanitized into a valid name. A database/table name is valid if it starts with an alphanumeric character, contains only alphanumeric characters and `_`, and is not composed of numbers only.
+
+One should in general extend `HiveRegistrationPolicyBase` when implementing a new `HiveRegistrationPolicy`.
+
+### `HiveSerDeManager`
+
+If the data to be registered is in a format other than plain text (CSV, TSV, etc.), you often need to use a SerDe and specify some SerDe properties including the type of SerDe, input format, output format, schema, etc. This is done in [`HiveSerDeManager`](https://github.com/apache/gobblin/blob/master/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/HiveSerDeManager.java), which is part of a [`HiveRegistrationUnit`](https://github.com/apache/gobblin/blob/master/gobblin-hive [...]
+
+### Predicate and Activity
+
+One or more `Predicate`s can be attached to a `HiveSpec`. If a `HiveSpec` contains `Predicate`s, unless `Predicate`s return `true`, the `HiveSpec` will not be registered. This is useful in cases where, for example, one only wants to register a path in Hive if a particular Hive table or partition doesn't already exist. An example is [`TableNotExistPredicate`](https://github.com/apache/gobblin/blob/master/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/spec/predicate/TableN [...]
+
+One or more [`Activity`](https://github.com/apache/gobblin/blob/master/gobblin-hive-registration/src/main/java/org/apache/gobblin/hive/spec/activity/Activity.java)s can be attached to a `HiveSpec`. There are two types of activities: pre-activities and post-activities, which will be executed before and after a `HiveSpec` is registered, respectively. This is useful, for example, when you need to drop/alter a table/partition before or after a path is registered. An example is [`DropTableAct [...]
+
+## How to Use Hive Registration in Your Gobblin Job
+
+First, implement a `HiveRegistrationPolicy` (or reuse an existing one), then specify its class name in the config property `hive.registration.policy`.
+
+Then, specify the appropriate table/partition properties in `hive.table.partition.props`, storage descriptor properties in 
+`hive.storage.props`, and SerDe properties in `hive.serde.props`. Some SerDe properties are usually dynamic (e.g., schema), which are added in the `HiveSerDeManager`.
+
+Example table/partition properties are "owner" and "retention"; example storage descriptor properties are "location", "compressed", and "numBuckets"; example SerDe properties are "serializationLib" and "avro.schema.url".
+
+If you are running a Gobblin ingestion job:
+
+* If data is published in the job (which is the default case), use a job-level data publisher that can perform Hive registration, such as [`BaseDataPublisherWithHiveRegistration`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/publisher/BaseDataPublisherWithHiveRegistration.java). If you need to do Hive registration with a different publisher than `BaseDataPublisher`, you will need to extend that publisher to do Hive registration, which will b [...]
+* If data is published in the tasks, use [`HiveRegistrationPublisher`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/publisher/HiveRegistrationPublisher.java) as the job-level data publisher. This publisher does not publish any data; it only does Hive registration.
+
+If you are running a Gobblin compaction job: add [`HiveRegistrationCompactorListener`](https://github.com/apache/gobblin/blob/master/gobblin-compaction/src/main/java/org/apache/gobblin/compaction/hive/registration/HiveRegistrationCompactorListener.java) to the list of compaction listeners by adding the class name to property `compaction.listeners`.
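+
+For illustration, a hypothetical snippet for an ingestion job that publishes Avro data and registers it in Hive, assuming the base policy fits your layout (database and table names are placeholders):
+
+```properties
+hive.registration.policy=org.apache.gobblin.hive.policy.HiveRegistrationPolicyBase
+hive.row.format=AVRO
+hive.database.name=my_database
+hive.table.name=my_events
+# Job-level publisher that also performs Hive registration
+data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisherWithHiveRegistration
+```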
+
+## Hive Registration Config Properties
+
+| Property Name  | Semantics  |
+|---|---|
+| `hive.registration.policy` | Class name which implements `HiveRegistrationPolicy` |
+| `hive.row.format` | Either `AVRO`, or the class name which implements `HiveSerDeManager`|
+| `hive.database.name` | Hive database name |
+| `hive.database.regex` | Hive database regex |
+| `hive.database.name.prefix` | Hive database name prefix |
+| `hive.database.name.suffix` | Hive database name suffix |
+| `additional.hive.database.names` | Additional Hive database names |
+| `hive.table.name` | Hive table name |
+| `hive.table.regex` | Hive table regex |
+| `hive.table.name.prefix` | Hive table name prefix |
+| `hive.table.name.suffix` | Hive table name suffix |
+| `additional.hive.table.names` | Additional Hive table names |
+| `hive.register.threads` | Thread pool size used for Hive registration |
+| `hive.db.root.dir` | The root dir of Hive db |
+| `hive.table.partition.props` | Table/partition properties |
+| `hive.storage.props` | Storage descriptor properties |
+| `hive.serde.props` | SerDe properties |
+| `hive.registration.fs.uri` | File system URI for Hive registration |
+| `hive.upstream.data.attr.names` | Attributes to describe upstream data source as Hive Metadata |
+
+
diff --git a/gobblin-website/docs/user-guide/Job-Execution-History-Store.md b/gobblin-website/docs/user-guide/Job-Execution-History-Store.md
new file mode 100644
index 0000000..82d0c5d
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Job-Execution-History-Store.md
@@ -0,0 +1,172 @@
+---
+title: Job Execution History
+sidebar_label: Job Execution History
+---
+
+Overview
+--------------------
+Gobblin provides users a way of keeping track of the executions of their jobs through the Job Execution History Store, which can be queried either directly (if the implementation supports direct queries) or through a Rest API. Note that using the Rest API requires the Job Execution History Server to be up and running. The Job Execution History Server will be discussed later. By default, writing to the Job Execution History Store is not enabled. To enable it, set configuration property `j [...]
+
+Information Recorded
+--------------------------------
+The Job Execution History Store stores various pieces of information of a job execution, including both job-level and task-level stats and measurements that are summarized below.
+
+Job Execution Information
+-------------------------------------------------
+The following table summarizes job-level execution information the Job Execution History Store stores. 
+
+|Information| Description|
+|---------------------------------|----------------------|
+|Job name|Gobblin job name.|
+|Job ID|Gobblin job ID.|
+|Start time|Start time in epoch time (of unit milliseconds) of the job in the local time zone.|
+|End time|End time in epoch time (of unit milliseconds) of the job in the local time zone.|
+|Duration|Duration of the job in milliseconds.|
+|Job state|Running state of the job. Possible values are `PENDING`, `RUNNING`, `SUCCESSFUL`, `COMMITTED`, `FAILED`, `CANCELLED`.|
+|Launched tasks|Number of launched tasks of the job.|
+|Completed tasks|Number of tasks of the job that completed.|
+|Launcher type|The type of the launcher used to launch and run the task.|
+|Job tracking URL|This will be set to the MapReduce job URL if the Gobblin job is running on Hadoop MapReduce. This may also be set to the Azkaban job execution tracking URL if the job is running through Azkaban but not on Hadoop MapReduce. Otherwise, this will be empty.|
+|Job-level metrics|Values of job-level metrics. Note that this data is not time-series based so the values will be overwritten on every update.|
+|Job configuration properties|Job configuration properties used at runtime for job execution. Note that it may include changes made at runtime by the job.|
+
+Task Execution Information
+-------------------------------------------------
+The following table summarizes task-level execution information the Job Execution History Store stores. 
+
+|Information| Description|
+|---------------------------------|----------------------|
+|Task ID|Gobblin task ID.|
+|Job ID|Gobblin job ID.|
+|Start time|Start time in epoch time (of unit milliseconds) of the task in the local time zone.|
+|End time|End time in epoch time (of unit milliseconds) of the task in the local time zone.|
+|Duration|Duration of the task in milliseconds.|
+|Task state|Running state of the task. Possible values are `PENDING`, `RUNNING`, `SUCCESSFUL`, `COMMITTED`, `FAILED`, `CANCELLED`.|
+|Task failure exception|Exception message in case of task failure.|
+|Low watermark|The low watermark of the task if available.|
+|High watermark|The high watermark of the task if available.|
+|Extract namespace|The namespace of the `Extract`. An `Extract` is a concept describing the ingestion work of a job. This stores the value specified through the configuration property `extract.namespace`.|
+|Extract name|The name of the `Extract`. This stores the value specified through the configuration property `extract.table.name`.|
+|Extract type|The type of the `Extract`. This stores the value specified through the configuration property `extract.table.type`.|
+|Task-level metrics|Values of task-level metrics. Note that this data is not time-series based so the values will be overwritten on every update.|
+|Task configuration properties|Task configuration properties used at runtime for task execution. Note that it may include changes made at runtime by the task.|
+
+
+Default Implementation
+--------------------------------
+The default implementation of the Job Execution History Store stores job execution information in a MySQL database across a few different tables. Specifically, the following tables are used and should be created before writing to the store is enabled. Check out the MySQL [DDLs](https://github.com/apache/gobblin/tree/master/gobblin-metastore/src/main/resources/db/migration) of the tables for the detailed columns of each table.
+
+* Table `gobblin_job_executions` stores basic information about a job execution including the start and end times, job running state, number of launched and completed tasks, etc. 
+* Table `gobblin_task_executions` stores basic information on task executions of a job, including the start and end times, task running state, task failure message if any, etc, of each task. 
+* Table `gobblin_job_metrics` stores values of job-level metrics collected through the `JobMetrics` class. Note that this data is not time-series based and values of metrics are overwritten on every update to the job execution information. 
+* Table `gobblin_task_metrics` stores values of task-level metrics collected through the `TaskMetrics` class. Again, this data is not time-series based and values of metrics are overwritten on updates.
+* Table `gobblin_job_properties` stores the job configuration properties used at runtime for the job execution, which may include changes made at runtime by the job.
+* Table `gobblin_task_properties` stores the task configuration properties used at runtime for task executions, which also may include changes made at runtime by the tasks.
+
+To enable writing to the MySQL-backed Job Execution History Store, the following configuration properties (with sample values) need to be set:
+
+```properties
+job.history.store.url=jdbc:mysql://localhost/gobblin
+job.history.store.jdbc.driver=com.mysql.jdbc.Driver
+job.history.store.user=gobblin
+job.history.store.password=gobblin
+``` 
+
+
+Rest Query API
+--------------------------------
+
+The Job Execution History Store Rest API supports three types of queries: query by job name, query by job ID, or query by extract name. The query type can be specified using the field `idType` in the query json object and can have one of the values `JOB_NAME`, `JOB_ID`, or `TABLE`. All three query types require the field `id` in the query json object, which should have a proper value as documented in the following table. 
+
+|Query type|Query ID|
+|---------------------------------|----------------------|
+|JOB_NAME|Gobblin job name.|
+|JOB_ID|Gobblin job ID.|
+|TABLE|A json object following the `TABLE` schema shown below.|
+
+```json
+{
+    "type": "record",
+    "name": "Table",
+    "namespace": "gobblin.rest",
+    "doc": "Gobblin table definition",
+    "fields": [
+      {
+          "name": "namespace",
+          "type": "string",
+          "optional": true,
+          "doc": "Table namespace"
+      },
+      {
+          "name": "name",
+          "type": "string",
+          "doc": "Table name"
+      },
+      {
+          "name": "type",
+          "type": {
+              "name": "TableTypeEnum",
+              "type": "enum",
+              "symbols" : [ "SNAPSHOT_ONLY", "SNAPSHOT_APPEND", "APPEND_ONLY" ]
+          },
+          "optional": true,
+          "doc": "Table type"
+      }
+    ]
+}
+```
+
+For each query type, there are also some optional fields that can be used to control the number of records returned and what should be included in the query result. The optional fields are summarized in the following table.
+
+|Optional field|Type|Description|
+|---------------------------------|----------------------|----------------------|
+|`limit`|`int`|Limit on the number of records returned.|
+|`timeRange`|`TimeRange`|The query time range. The schema of `TimeRange` is shown below.|
+|`jobProperties`|`boolean`|This controls whether the returned record should include the job configuration properties.|
+|`taskProperties`|`boolean`|This controls whether the returned record should include the task configuration properties.|
+
+```json
+{
+    "type": "record",
+    "name": "TimeRange",
+    "namespace": "gobblin.rest",
+    "doc": "Query time range",
+    "fields": [
+      {
+          "name": "startTime",
+          "type": "string",
+          "optional": true,
+          "doc": "Start time of the query range"
+      },
+      {
+          "name": "endTime",
+          "type": "string",
+          "optional": true,
+          "doc": "End time of the query range"
+      },
+      {
+          "name": "timeFormat",
+          "type": "string",
+          "doc": "Date/time format used to parse the start time and end time"
+      }
+    ]
+}
+```
+
+The API is built with [rest.li](http://www.rest.li), which generates documentation on compilation and can be found at `http://<hostname:port>/restli/docs`.
+
+### Example Queries
+*Fetch the 10 most recent job executions with a job name `TestJobName`*
+```bash
+curl "http://<hostname:port>/jobExecutions/idType=JOB_NAME&id.string=TestJobName&limit=10"
+```
+
+Job Execution History Server
+--------------------------------
+The Job Execution History Server is a Rest server for serving queries on the Job Execution History Store through the Rest API described above. The Rest endpoint URL is configurable through the following configuration properties (with their default values):
+```properties
+rest.server.host=localhost
+rest.server.port=8080
+```
+
+**Note:** This server is started in the standalone deployment if configuration property `job.execinfo.server.enabled` is set to `true`.
diff --git a/gobblin-website/docs/user-guide/Monitoring.md b/gobblin-website/docs/user-guide/Monitoring.md
new file mode 100644
index 0000000..fc4b741
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Monitoring.md
@@ -0,0 +1,83 @@
+---
+title: Monitoring
+sidebar_label: Monitoring
+
+---
+
+Overview
+--------------------
+Because Gobblin is a framework for ingesting potentially huge volumes of data from many different sources, it is critical to monitor the health and status of the system and of job executions. Gobblin employs a variety of approaches, introduced below, for this purpose. All of the approaches are optional and can be turned on and off in different combinations through the framework and job configurations. 
+
+Metrics Collecting and Reporting
+--------------------
+
+## Metrics Reporting
+
+Out of the box, Gobblin reports metrics through the following sinks (a minimal configuration sketch follows the list):
+
+* _JMX_ : used in the standalone deployment. Metrics reported to JMX can be checked using tools such as [VisualVM](http://visualvm.java.net/) or JConsole. 
+* _Metric log files_: Files are stored in a root directory defined by the property `metrics.log.dir`. Each Gobblin job has its own subdirectory under the root directory and each run of the job has its own metric log file named after the job ID as `${job_id}.metrics.log`.
+* _Hadoop counters_ : used for M/R deployments. Gobblin-specific metrics are reported in the "JOB" or "TASK" groups for job- and task- level metrics. By default, task-level metrics are not reported through Hadoop counters as doing so may cause the number of Hadoop counters to go beyond the system-wide limit. However, users can choose to turn on reporting task-level metrics as Hadoop counters by setting `mr.include.task.counters=true`. 
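+
+A minimal configuration sketch using the properties named above (the directory path and flag values are illustrative):
+
+```properties
+metrics.enabled=true
+metrics.log.dir=/var/log/gobblin/metrics
+# For MapReduce deployments only: also report task-level metrics as Hadoop counters
+mr.include.task.counters=true
+```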
+
+
+## Metrics collection
+### JVM Metrics
+The standalone deployment of Gobblin runs in a single JVM so it's important to monitor the health of the JVM, through a set of pre-defined JVM metrics in the following four categories. 
+
+* `jvm.gc`: this covers metrics related to garbage collection, e.g., counts and time spent on garbage collection.
+* `jvm.memory`: this covers metrics related to memory usage, e.g., detailed heap usage.  
+* `jvm.threads`: this covers metrics related to thread states, e.g., thread count and thread deadlocks.
+* `jvm.fileDescriptorRatio`: this measures the ratio of open file descriptors.
+
+All JVM metrics are reported via JMX and can be checked using tools such as [VisualVM](http://visualvm.java.net/) or JConsole. 
+
+### Pre-defined Job Execution Metrics
+Internally, Gobblin pre-defines a minimum set of metrics listed below in two metric groups: `JOB` and `TASK` for job-level metrics and task-level metrics, respectively. Those metrics are useful in keeping track of the progress and performance of job executions.
+
+* `${metric_group}.${id}.records`: this metric keeps track of the total number of data records extracted by the job or task depending on the `${metric_group}`. The `${id}` is either a job ID or a task ID depending on the `${metric_group}`. 
+* `${metric_group}.${id}.recordsPerSecond`: this metric keeps track of the rate of data extraction as data records extracted per second by the job or task depending on the `${metric_group}`.
+* `${metric_group}.${id}.bytes`: this metric keeps track of the total number of bytes extracted by the job or task depending on the `${metric_group}`.
+* `${metric_group}.${id}.bytesPerSecond`: this metric keeps track of the rate of data extraction as bytes extracted per second by the job or task depending on the `${metric_group}`.
+
+Among the above metrics, `${metric_group}.${id}.records` and `${metric_group}.${id}.bytes` are reported as Hadoop MapReduce counters for Gobblin jobs running on Hadoop.
+
+Job Execution History Store
+--------------------
+Gobblin also supports writing job execution information to a job execution history store backed by a database of choice. Gobblin uses MySQL by default and it ships with the SQL [DDLs](https://github.com/apache/gobblin/tree/master/gobblin-metastore/src/main/resources/db/migration) of the relevant MySQL tables, although  it still allows users to choose which database to use as long as the schema of the tables is compatible. Users can use the properties `job.history.store.url` and `job.hist [...]
+
+```properties
+job.history.store.url=jdbc:mysql://localhost/gobblin
+job.history.store.jdbc.driver=com.mysql.jdbc.Driver
+job.history.store.user=gobblin
+job.history.store.password=gobblin
+``` 
+
+Email Notifications 
+--------------------
+In addition to writing job execution information to the job execution history store, Gobblin also supports sending email notifications about job status. Job status notifications fall into two categories: alerts in case of job failures and normal notifications in case of successful job completions. Users can choose to enable or disable both categories using the properties `email.alert.enabled` and `email.notification.enabled`. 
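+
+For example, a hypothetical setup that sends alerts on failures but no notifications on successful completions (the SMTP connection itself is configured via the `email.*` properties described in the configuration glossary):
+
+```properties
+email.alert.enabled=true
+email.notification.enabled=false
+```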
+
+The main content of an email alert or notification is a job status report in Json format. Below is an example job status report:
+
+```json
+{
+	"job name": "Gobblin_Demo_Job",
+	"job id": "job_Gobblin_Demo_Job_1417487480842",
+	"job state": "COMMITTED",
+	"start time": 1417487480874,
+	"end time": 1417490858913,
+	"duration": 3378039,
+	"tasks": 1,
+	"completed tasks": 1,
+	"task states": [
+		{
+			"task id": "task_Gobblin_Demo_Job_1417487480842_0",
+			"task state": "COMMITTED",
+			"start time": 1417490795903,
+			"end time": 1417490858908,
+			"duration": 63005,
+			"high watermark": -1,
+			"exception": ""
+		}
+	]
+}
+``` 
diff --git a/gobblin-website/docs/user-guide/Partitioned-Writers.md b/gobblin-website/docs/user-guide/Partitioned-Writers.md
new file mode 100644
index 0000000..b70bed0
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Partitioned-Writers.md
@@ -0,0 +1,77 @@
+---
+title: Partitioned Writers
+sidebar_label: Partitioned Writers
+---
+
+Gobblin allows partitioning output data using a writer partitioner. This allows, for example, writing timestamped records to different files depending on the timestamp of each record.
+
+To partition output records, two things are needed:
+
+* Set `writer.builder.class` to a class that implements `PartitionAwareDataWriterBuilder`.
+* Set `writer.partitioner.class` to the class of the desired partitioner, which must be subclass of `WriterPartitioner`. The partitioner will get all Gobblin configuration options, so some partitioners may require additional configurations.
+
+If `writer.partitioner.class` is set but `writer.builder.class` is not a `PartitionAwareDataWriterBuilder`, Gobblin will throw an error. If `writer.builder.class` is a `PartitionAwareDataWriterBuilder` but no partitioner is set, Gobblin will still attempt to create the writer with no partition; however, the writer may not support unpartitioned data, in which case it will throw an error.
+
+`WriterPartitioner`s compute a partition key for each record. Some `PartitionAwareDataWriterBuilder`s are unable to handle certain partition keys (for example, a writer that can only partition by date would expect a partition schema that only contains date information). If the writer cannot handle the partition key, Gobblin will throw an error. For ease of use, the Javadoc of a partitioner should always document the schema it emits, and the writer Javadoc should state which schemas it accepts.
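+
+As a sketch, wiring up the Wikipedia example with the partition-aware writer builder and the sample partitioner listed below might look like this:
+
+```properties
+writer.builder.class=gobblin.writer.AvroDataWriterBuilder
+writer.partitioner.class=gobblin.example.wikipedia.WikipediaPartitioner
+```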
+
+Existing Partition Aware Writers
+--------------------------------
+* `gobblin.writer.AvroDataWriterBuilder`: If a partition is present, it creates a directory structure based on the partition. For example, if the partition is `{name="foo", type="bar"}`, the record will be written to a file in directory `/path/to/data/name=foo/type=bar/file.avro`.
+
+Existing Partitioners
+---------------------
+* `gobblin.example.wikipedia.WikipediaPartitioner`: Sample partitioner for the Wikipedia example. Partitions record by article title.
+
+Design
+------
+![Partitioned Writer Logic](../../static/img/Gobblin-Partitioned-Writer.png)
+
+Gobblin always instantiates a `PartitionedDataWriter` for each fork. On construction, the partitioned writer:
+
+ 1. checks whether a partitioner is present in the configuration. If no partitioner is present, then the instance of `PartitionedDataWriter` is simply a thin wrapper around a normal writer. 
+ 2. If a partitioner is present, the partitioned writer will check whether the class configured at `writer.builder.class` is an instance of `PartitionAwareDataWriterBuilder`, throwing an error if this is not the case.  
+ 3. The partitioned writer instantiates the partitioner, runs `partitionSchema()`, and then checks whether the partition aware writer builder accepts that schema using `validatePartitionSchema`. If this returns false, Gobblin will throw an error.
+
+Every time the partitioned writer gets a record, it uses the partitioner to get a partition key for that record. The partitioned writer keeps an internal map from partition key to instances of writers for each partition. If a writer is already created for this key, it will call write on that writer for the new record. If the writer is not present, the partitioned writer will instantiate a new writer with the computed partition, and then pass in the record.
+
+`WriterPartitioner` partitions records by returning a partition key for each record, which is of type `GenericRecord`. Each `WriterPartitioner` emits keys with a particular `Schema` which is available by using the method `WriterPartitioner#partitionSchema()`. Implementations of `PartitionAwareDataWriterBuilder` must check the partition schema to decide if they can understand and correctly handle that schema when the method `PartitionAwareDataWriterBuilder#validateSchema` is called (for e [...]
+
+Implementing a partitioner
+--------------------------
+
+The interface for a partitioner is
+
+```java
+/**
+ * Partitions records in the writer phase.
+ */
+public interface WriterPartitioner<D> {
+  /**
+   * @return The schema that {@link GenericRecord} returned by {@link #partitionForRecord} will have.
+   */
+  public Schema partitionSchema();
+
+  /**
+   * Returns the partition that the input record belongs to. If
+   * partitionForRecord(record1).equals(partitionForRecord(record2)), then record1 and record2
+   * belong to the same partition.
+   * @param record input to compute partition for.
+   * @return {@link GenericRecord} representing partition record belongs to.
+   */
+  public GenericRecord partitionForRecord(D record);
+}
+```
+
+For an example of a partitioner implementation see `gobblin.example.wikipedia.WikipediaPartitioner`.
+
+Each class that implements `WriterPartitioner` is required to have a public constructor with signature `(State state, int numBranches, int branchId)`.
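+
+For illustration, below is a minimal sketch of a partitioner that buckets records by the first character of their string form. It is not part of Gobblin; the import paths are assumptions based on the package names used elsewhere in this documentation, and the partition schema is a hypothetical single-field record.
+
+```java
+package org.apache.gobblin.example.sample;
+
+import org.apache.avro.Schema;
+import org.apache.avro.SchemaBuilder;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+
+import org.apache.gobblin.configuration.State;
+import org.apache.gobblin.writer.partitioner.WriterPartitioner;
+
+/**
+ * Hypothetical partitioner: records whose string representations start with the
+ * same character end up in the same partition.
+ */
+public class FirstCharacterPartitioner implements WriterPartitioner<Object> {
+
+  // Single-field record schema describing the partition key emitted below
+  private static final Schema SCHEMA = SchemaBuilder.record("FirstCharacterPartition")
+      .namespace("org.apache.gobblin.example.sample").fields()
+      .name("prefix").type().stringType().noDefault()
+      .endRecord();
+
+  // Required constructor signature for writer partitioners
+  public FirstCharacterPartitioner(State state, int numBranches, int branchId) {
+  }
+
+  @Override
+  public Schema partitionSchema() {
+    return SCHEMA;
+  }
+
+  @Override
+  public GenericRecord partitionForRecord(Object record) {
+    String str = record.toString();
+    GenericRecord partition = new GenericData.Record(SCHEMA);
+    partition.put("prefix", str.isEmpty() ? "" : str.substring(0, 1));
+    return partition;
+  }
+}
+```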
+
+Implementing a Partition Aware Writer Builder
+---------------------------------------------
+
+This is very similar to a regular `DataWriterBuilder`, with two differences:
+
+* You must implement the method `validatePartitionSchema(Schema)` that must return false unless the builder can handle that schema.
+* The field `partition` is available, which is a `GenericRecord` that contains the partition key for the built writer. For any two different keys, Gobblin may create a writer for each key, so it is important that writers for different keys do not collide (e.g. do not try to use the same path).
+
+For an example of a simple `PartitionAwareWriterBuilder` see `gobblin.writer.AvroDataWriterBuilder`.
diff --git a/gobblin-website/docs/user-guide/Source-schema-and-Converters.md b/gobblin-website/docs/user-guide/Source-schema-and-Converters.md
new file mode 100644
index 0000000..33e0387
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Source-schema-and-Converters.md
@@ -0,0 +1,372 @@
+---
+title: Source Schema and Converters
+sidebar_label: Source Schema and Converters
+---
+
+## Source schema
+A source schema has to be declared before extracting the data from the source. 
+To define the source schema, the `source.schema` property is available, which takes a JSON value describing the schema. 
+This schema is used by Converters to perform data type or data format conversions. 
+The Java class representation of a source schema can be found here: [Schema.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/schema/Schema.java).
+
+## Converters
+In the Gobblin library, a Converter is an interface for classes that implement data transformations, e.g., data type conversions,
+schema projections, data manipulations, data filtering, etc. This interface is responsible for 
+converting both schema and data records. Classes implementing this interface are composable and 
+can be chained together to achieve more complex data transformations.
+
+A converter basically needs four inputs:
+- Input schema
+- Output schema type
+- Input data
+- Output data type
+
+There are various built-in Converters available within gobblin-core. However, you can also implement your own converter 
+by extending the abstract class ```org.apache.gobblin.converter.Converter```. Below is an example of such a custom implementation 
+of a Gobblin Converter, which collapses runs of whitespace (including newlines) in JSON values into single spaces.
+
+```java
+
+package org.apache.gobblin.example.sample;
+import org.apache.gobblin.configuration.WorkUnitState;
+import org.apache.gobblin.converter.Converter;
+import org.apache.gobblin.converter.DataConversionException;
+import org.apache.gobblin.converter.SchemaConversionException;
+import org.apache.gobblin.converter.SingleRecordIterable;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+
+
+public class FilterSpacesConverter extends Converter<JsonArray, JsonArray, JsonObject, JsonObject> {
+  @Override
+  public JsonArray convertSchema(JsonArray inputSchema, WorkUnitState workUnit)
+      throws SchemaConversionException {
+    return inputSchema; //We are not doing any schema conversion
+  }
+
+  @Override
+  public Iterable<JsonObject> convertRecord(JsonArray outputSchema, JsonObject inputRecord, WorkUnitState workUnit)
+      throws DataConversionException {
+    String jsonStr = inputRecord.toString().replaceAll("\\s{2,}", " ");
+    return new SingleRecordIterable<>(new JsonParser().parse(jsonStr).getAsJsonObject());
+  }
+}
+```
+Converters can also be chained to perform sequential conversions on each input record. 
+To chain converters, use the property ```converter.classes``` and provide a comma-separated list of 
+fully qualified converter class names. The converters are executed in the order in which they 
+appear in the list. 
+
+For example, if you are reading data from a JSON source and want to write the data in Avro format, 
+you can chain two converters: one to convert the JSON string into the JSON intermediate format, and 
+one to convert that JSON into Avro. This is done with the following property in your .pull file:
+```converter.classes="org.apache.gobblin.converter.json.JsonStringToJsonIntermediateConverter,org.apache.gobblin.converter.avro.JsonIntermediateToAvroConverter"```
+
+## Converters available in Gobblin
+- [AvroFieldRetrieverConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/avro/AvroFieldRetrieverConverter.java)
+- [AvroRecordToAvroWritableConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/avro/AvroRecordToAvroWritableConverter.java)
+- [AvroToAvroCopyableConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/avro/AvroToAvroCopyableConverter.java)
+- [AvroToBytesConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/avro/AvroToBytesConverter.java)
+- [BytesToAvroConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/avro/BytesToAvroConverter.java)
+- [FlattenNestedKeyConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/avro/FlattenNestedKeyConverter.java)
+- [JsonIntermediateToAvroConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/avro/JsonIntermediateToAvroConverter.java)
+- [JsonRecordAvroSchemaToAvroConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/avro/JsonRecordAvroSchemaToAvroConverter.java)
+- [CsvToJsonConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/csv/CsvToJsonConverter.java)
+- [CsvToJsonConverterV2.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/csv/CsvToJsonConverterV2.java)
+- [AvroFieldsPickConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/filter/AvroFieldsPickConverter.java)
+- [AvroFilterConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/filter/AvroFilterConverter.java)
+- [AvroToRestJsonEntryConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/http/AvroToRestJsonEntryConverter.java)
+- [BytesToJsonConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/json/BytesToJsonConverter.java)
+- [JsonStringToJsonIntermediateConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/json/JsonStringToJsonIntermediateConverter.java)
+- [JsonToStringConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/json/JsonToStringConverter.java)
+- [ObjectStoreConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/objectstore/ObjectStoreConverter.java)
+- [ObjectStoreDeleteConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/objectstore/ObjectStoreDeleteConverter.java)
+- [HiveSerDeConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/HiveSerDeConverter.java)
+- [ObjectToStringConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/string/ObjectToStringConverter.java)
+- [StringFilterConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/string/StringFilterConverter.java)
+- [StringSplitterConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/string/StringSplitterConverter.java)
+- [StringSplitterToListConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/string/StringSplitterToListConverter.java)
+- [StringToBytesConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/string/StringToBytesConverter.java)
+- [TextToStringConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/string/TextToStringConverter.java)
+- [GobblinMetricsPinotFlattenerConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/GobblinMetricsPinotFlattenerConverter.java)
+
+
+## Schema specification
+The following section describes the specification for defining a source schema in JSON format.
+
+| Key Name     			| Value data type   	| Description                                             |
+|-----------------------|-----------------------|---------------------------------------------------------|
+| columnName			| String            	| The name of the JSON key which will contain the data.   |
+| isNullable			| Boolean				| Whether the data can be null.                         |
+| comment				| String				| Field description, for documentation purposes only. |
+| dataType				| JSON					| Provides more information about the data type.                   |
+| dataType.type			| String				| Type of the data to store, e.g., int, long, etc.                |
+| dataType.name			| String				| A name for your data type.                       |
+| dataType.items		| String/JSON			| Used by the array type to define the data type of the items contained by the array. If the data type of the array items is primitive, a String is used as the value; otherwise, for complex types, a dataType JSON object should be used as the value to provide further information on the complex array items.  |
+| dataType.values		| String/JSON/Array		| Used by the map and record types to define the data type of the values. For records it is always an Array defining the fields. For maps it can be a String or JSON, depending on whether a primitive or complex data type is involved.|
+| dataType.symbols		| Array&lt;String&gt;			| Array of strings defining the enum symbols. |
+| watermark				| Boolean				| Whether the key is used as a watermark. Alternatively, use the `extract.delta.fields` property to define a comma-separated list of watermark fields. |
+| unique			| Boolean				| Whether the key must be unique across the set of records. |
+| defaultValue			| Object				| The default value for the field. |
+
+## Supported data types by different converters
+Converters that perform data format conversions, such as CSV to JSON or JSON to Avro, also have to perform data type conversions. Below is a list of such converters and the data types they support.
+
+| Converter  | Data types  |
+|---|---|
+| [JsonIntermediateToAvroConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/converter/avro/JsonIntermediateToAvroConverter.java)  | <ul><li>DATE</li><li>TIMESTAMP</li><li>TIME</li><li>STRING</li><li>BYTES</li><li>INT</li><li>LONG</li><li>FLOAT</li><li>DOUBLE</li><li>BOOLEAN</li><li>ARRAY</li><li>MAP</li><li>ENUM</li></ul>|
+| [JsonIntermediateToParquetGroupConverter.java](https://github.com/apache/gobblin/blob/master/gobblin-modules/gobblin-parquet/src/main/java/org/apache/gobblin/converter/parquet/JsonIntermediateToParquetGroupConverter.java)  | <ul><li>DATE</li><li>TIMESTAMP</li><li>TIME</li><li>STRING</li><li>BYTES</li><li>INT</li><li>LONG</li><li>FLOAT</li><li>DOUBLE</li><li>BOOLEAN</li><li>ARRAY</li><li>MAP</li><li>ENUM</li></ul>|
+
+
+### Primitive types 
+ The following primitive types are available: int, float, string, double, long, null, boolean.
+ 
+**Sample data**
+
+```json
+{
+	"jobRoles": 42,
+	"peopleWeightAvg": 50.5,
+	"peopleOrg": "EvilCorp",
+	"peopleAvgSal": 342222.65,
+        "peopleCount": 8344242342,
+	"peopleBrain": null,
+	"public": false
+}
+```
+**Sample schema**
+```json
+[
+    {
+        "columnName": "jobRoles",
+        "isNullable": false,
+        "comment": "Number of roles in the org"
+        "dataType": {
+                "type": "int"
+            }
+    },
+    {
+        "columnName": "peopleWeightAvg",
+        "isNullable": false,
+        "comment": "Avg weight of people in org"
+        "dataType": {
+                "type": "float"
+            }
+    },
+    {
+        "columnName": "peopleOrg",
+        "isNullable": false,
+        "comment": "Name of org people works for"
+        "dataType": {
+                "type": "string"
+            }
+    },
+    {
+        "columnName": "peopleAvgSal",
+        "isNullable": false,
+        "comment": "Avg salary of people in org"
+        "dataType": {
+                "type": "double"
+            }
+    },
+    {
+        "columnName": "peopleCount",
+        "isNullable": false,
+        "comment": "Count of people in org"
+        "dataType": {
+                "type": "long"
+            }
+    },
+    {
+        "columnName": "peopleBrain",
+        "comment": "Brain obj of people"
+        "dataType": {
+                "type": "null"
+            }
+    },
+    {
+        "columnName": "public",
+        "isNullable": false,
+        "comment": "Is data public"
+        "dataType": {
+                "type": "boolean"
+            }
+    }
+]
+```
+
+
+### Complex types
+#### Array
+
+**Sample data**
+```json
+{
+	"arrayOfInts": [25, 50, 75]
+}
+```
+**Sample schema**
+```json
+[
+    {
+        "columnName": "arrayOfInts",
+        "isNullable": false,
+        "comment": "Items in array have same data type as defined in dataType."
+        "dataType": {
+                "type": "array",
+                "items": "int"
+            }
+    }
+]
+```
+#### Map
+Maps can contain any number of key-value pairs; keys are always strings, and all values must share the same data type.
+
+**Sample data**
+```json
+{
+	"bookDetails":{
+		"harry potter and the deathly hallows": 10245,
+		"harry potter and the cursed child": 20362
+	}
+}
+```
+
+**Sample schema**
+
+```json
+[
+    {
+        "columnName": "bookDetails",
+        "isNullable": false,
+        "comment": "Maps always have string as keys and all values have same type as defined in dataType"
+        "dataType": {
+                "type": "map",
+                "values": "long"
+            }
+    }
+]
+```
+
+#### Record
+Unlike a map, the values of a record are not bound to a single type. Each field's name and data type must be declared in the schema.
+
+**Sample data**
+```json
+{
+	"userDetails": {
+		"userName": "anonymous",
+		"userAge": 50
+	}
+}
+```
+**Sample schema**
+```json
+[
+    {
+        "columnName": "userDetails",
+        "isNullable": false,
+        "comment": "user detail",
+        "dataType": {
+                "type": "record",
+                "values": [
+                    {
+                        "columnName": "userName",
+                        "dataType":{
+                            "type":"string"
+                        }
+                    },
+                    {
+                        "columnName": "userAge",
+                        "dataType":{
+                            "type":"int"
+                        }
+                    }
+                ]
+            }
+    }
+]
+```
+
+#### Enum
+**Sample data**
+```json
+{
+	"userStatus": "ACTIVE"
+}
+```
+**Sample schema**
+```json
+[
+    {
+        "columnName": "userStatus",
+        "dataType":{
+            "type": "enum",
+            "symbols":[
+                "ACTIVE", "INACTIVE"
+            ]
+        }
+    }
+]
+```
+
+### Nesting types
+Complex types can be used to create nested schemas.
+**Array, Map and Record can have complex items instead of just primitive types.**
+
+A few examples showing how nested schemas are written:
+
+**Array with nested record**
+```json
+[
+  {
+    "columnName": "userName",
+    "dataType": {
+      "type": "string"
+    }
+  },
+  {
+    "columnName": "purchase",
+    "dataType": {
+      "type": "array",
+      "items": {
+        "dataType": {
+          "type": "record",
+          "values": [
+            {
+              "columnName": "ProductName",
+              "dataType": {
+                "type": "string"
+              }
+            },
+            {
+              "columnName": "ProductPrice",
+              "dataType": {
+                "type": "long"
+              }
+            }
+          ]
+        }
+      }
+    }
+  }
+]
+```
+**Map with nested array**
+```json
+[
+  {
+    "columnName": "persons",
+    "dataType": {
+      "type": "map",
+      "values": {
+        "dataType": {
+          "type": "array",
+          "items": "int"
+        }
+      }
+    }
+  }
+]
+```
diff --git a/gobblin-website/docs/user-guide/State-Management-and-Watermarks.md b/gobblin-website/docs/user-guide/State-Management-and-Watermarks.md
new file mode 100644
index 0000000..00ad557
--- /dev/null
+++ b/gobblin-website/docs/user-guide/State-Management-and-Watermarks.md
@@ -0,0 +1,84 @@
+---
+title: State Management and Watermarks
+sidebar_label: State Management and Watermarks
+---
+
+This page has two parts. Section 1 explains how to carry over checkpoints between two runs of a scheduled batch ingestion job, so that each run can start where the previous run left off. Section 2 is a deep dive into the different types of states in Gobblin and how they are used in a typical job run.
+
+## Managing Watermarks in a Job
+
+When scheduling a Gobblin job to run in batches and pull data incrementally, each run, upon finishing its tasks, should check in the state of its work into the state store, so that the next run can continue the work based on the previous run. This is done through a concept called Watermark.
+
+### Basics
+
+**low watermark and expected high watermark**
+
+When the `Source` creates `WorkUnit`s, each `WorkUnit` should generally contain a low watermark and an expected high watermark. They are the start and finish points for the corresponding task, and the task is expected to pull the data from the low watermark to the expected high watermark. 
+
+**actual high watermark**
+
+When a task finishes extracting data, it should write the actual high watermark into its `WorkUnitState`. To do so, the `Extractor` may maintain a `nextWatermark` field, and in `Extractor.close()`, call `this.workUnitState.setActualHighWatermark(this.nextWatermark)`. The actual high Watermark is normally the same as the expected high Watermark if the task completes successfully, and may be smaller than the expected high Watermark if the task failed or timed-out. In some cases, the expect [...]
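+
+A minimal sketch of this pattern is shown below. The class is illustrative; how the low watermark is obtained and how `nextWatermark` is advanced are source-specific, and the `LongWatermark`-based bookkeeping is just one option:
+
+```java
+import java.io.IOException;
+
+import org.apache.gobblin.configuration.WorkUnitState;
+import org.apache.gobblin.source.extractor.Extractor;
+import org.apache.gobblin.source.extractor.extract.LongWatermark;
+
+public abstract class IncrementalExtractor<S, D> implements Extractor<S, D> {
+
+  protected final WorkUnitState workUnitState;
+  // Advanced every time a record is successfully pulled, e.g. in readRecord().
+  protected LongWatermark nextWatermark;
+
+  public IncrementalExtractor(WorkUnitState workUnitState, long lowWatermarkValue) {
+    this.workUnitState = workUnitState;
+    // Start from the low watermark carried over from the previous run.
+    this.nextWatermark = new LongWatermark(lowWatermarkValue);
+  }
+
+  @Override
+  public void close() throws IOException {
+    // Check in how far this task actually got, so the next run can resume from here.
+    this.workUnitState.setActualHighWatermark(this.nextWatermark);
+  }
+}
+```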
+
+In the next run, the `Source` will call `SourceState.getPreviousWorkUnitStates()` which should contain the actual high watermarks the last run checked in, to be used as the low watermarks of the new run.
+
+**watermark type**
+
+A watermark can be of any custom type by implementing the [`Watermark`](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/extractor/Watermark.java) interface. For example, for Kafka-HDFS ingestion, if each `WorkUnit` is responsible for pulling a single Kafka topic partition, a watermark is a single `long` value representing a Kafka offset. If each `WorkUnit` is responsible for pulling multiple Kafka topic partitions, a watermark can be a li [...]
+
+### Task Failures
+
+A task may pull some data and then fail. If a task fails and job commit policy specified by configuration property `job.commit.policy` is set to `full`, the data it pulled won't be published. In this case, it doesn't matter what value `Extractor.nextWatermark` is, the actual high watermark will be automatically rolled back to the low watermark by Gobblin internally. On the other hand, if the commit policy is set to `partial`, the failed task may get committed and the data may get publish [...]
+
+### Multi-Dataset Jobs
+
+Currently the only state store implementation Gobblin provides is [`FsStateStore`](https://github.com/apache/gobblin/blob/master/gobblin-metastore/src/main/java/org/apache/gobblin/metastore/FsStateStore.java) which uses Hadoop SequenceFiles to store the states. By default, each job run reads the SequenceFile created by the previous run, and generates a new SequenceFile. This creates a pitfall when a job pulls data from multiple datasets: if a data set is skipped in a job run for whatever [...]
+
+**Example**: suppose we schedule a Gobblin job to pull a Kafka topic from a Kafka broker, which has 10 partitions. In this case each partition is a dataset. In one of the job runs, a partition is skipped due to either being blacklisted or some failure. If no `WorkUnit` is created for this partition, this partition's watermark will not be checked in to the state store, and will not be available for the next run.
+
+There are two solutions to the above problem (a third would be to implement a different state store that behaves differently and doesn't have this problem).
+
+**Solution 1**: make sure to create a `WorkUnit` for every dataset. Even if a dataset should be skipped, an empty `WorkUnit` should still be created for the dataset ('empty' means low watermark = expected high watermark).
+
+**Solution 2**: use Dataset URNs. When a job pulls multiple datasets, the `Source` class may define a URN for each dataset, e.g., we may use `PageViewEvent.5` as the URN of the 5th partition of topic `PageViewEvent`. When the `Source` creates the `WorkUnit` for this partition, it should set the property `dataset.urn` in this `WorkUnit` to the value `PageViewEvent.5`. This is the solution Gobblin currently uses to support jobs pulling data from multiple datasets.
+
+If different `WorkUnit`s have different values of `dataset.urn`, the job will create one state store SequenceFile for each `dataset.urn`. In the next run, instead of calling `SourceState.getPreviousWorkUnitStates()`, one should use `SourceState.getPreviousWorkUnitStatesByDatasetUrns()`. In this way, each run will look for the most recent state store SequenceFile for each dataset, and therefore, even if a dataset is not processed by a job run, its watermark won't be lost.
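+
+A minimal sketch of Solution 2 is shown below: one `WorkUnit` per dataset, each tagged with its `dataset.urn`. The class, the topic name, and the fixed partition count are illustrative only:
+
+```java
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+
+import org.apache.gobblin.configuration.SourceState;
+import org.apache.gobblin.source.Source;
+import org.apache.gobblin.source.workunit.WorkUnit;
+
+public abstract class MultiDatasetSource implements Source<Schema, GenericRecord> {
+
+  @Override
+  public List<WorkUnit> getWorkunits(SourceState state) {
+    List<WorkUnit> workUnits = new ArrayList<>();
+    for (int partition = 0; partition < 10; partition++) {
+      WorkUnit workUnit = WorkUnit.createEmpty();
+      // One dataset.urn per WorkUnit, so each partition's watermark is persisted independently.
+      workUnit.setProp("dataset.urn", "PageViewEvent." + partition);
+      // Derive the low / expected high watermarks for this partition here, e.g. from
+      // state.getPreviousWorkUnitStatesByDatasetUrns().
+      workUnits.add(workUnit);
+    }
+    return workUnits;
+  }
+}
+```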
+
+Note that when using Dataset URNs, **each `WorkUnit` can only have one `dataset.urn`**, which means, for example, in the Kafka ingestion case, each `WorkUnit` can only process one partition. This is usually not a big problem except that it may output too many small files (as explained in [Kafka HDFS ingestion](../case-studies/Kafka-HDFS-Ingestion), by having a `WorkUnit` pull multiple partitions of the same topic, these partitions can share output files). On the other hand, different `Wo [...]
+
+## Gobblin State Deep Dive
+
+Gobblin involves several types of states during a job run, such as `JobState`, `TaskState`, `WorkUnit`, etc. They all extend the [`State`](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/configuration/State.java) class, which is a wrapper around [`Properties`](https://docs.oracle.com/javase/8/docs/api/java/util/Properties.html) and provides some useful utility functions. 
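+
+As a minimal sketch of that convenience API (the property names below are arbitrary examples):
+
+```java
+import org.apache.gobblin.configuration.State;
+
+public class StateExample {
+  public static void main(String[] args) {
+    State state = new State();
+    state.setProp("writer.output.format", "AVRO");
+    state.setProp("task.maxretries", 3);
+
+    // Typed getters (with defaults) wrap the underlying Properties lookups.
+    String format = state.getProp("writer.output.format", "ORC");
+    int retries = state.getPropAsInt("task.maxretries", 1);
+    boolean scheduled = state.contains("job.schedule");
+    System.out.println(format + " " + retries + " " + scheduled);
+  }
+}
+```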
+
+### `State` class hierarchy
+
+![Gobblin-State-Hierarchy](../../static/img/Gobblin-State-Hierarchy.png)
+
+* **`SourceState`, `JobState` and `DatasetState`**: `SourceState` contains properties that define the current job run. It contains properties in the job config file, and the states the previous run persisted in the state store. It is passed to Source to create `WorkUnit`s.
+
+Compared to `SourceState`, a `JobState` also contains properties of a job run such as job ID, starting time, end time, etc., as well as the status of a job run, e.g., `PENDING`, `RUNNING`, `COMMITTED`, `FAILED`, etc.
+
+When the data pulled by a job is separated into different datasets (by using `dataset.urn` explained above), each dataset will have a `DatasetState` object in the JobState, and each dataset will persist its states separately.
+
+* **`WorkUnit` and `MultiWorkUnit`**: A `WorkUnit` defines a unit of work. It may contain properties such as which data set to be pulled, where to start (low watermark), where to finish (expected high watermark), among others. A `MultiWorkUnit` contains one or more `WorkUnit`s. All `WorkUnit`s in a `MultiWorkUnit` will be run by a single Task.
+
+The `MultiWorkUnit` is useful for finer-grained control and load balancing. Without `MultiWorkUnit`s, if the number of `WorkUnit`s exceeds the number of mappers in the MR mode, the job launcher can only balance the number of `WorkUnit`s in the mappers. If different `WorkUnit`s have very different workloads (e.g., some pull from very large partitions and others pull from small partitions), this may lead to mapper skew. With `MultiWorkUnit`, if the `Source` class knows or can estimate the  [...]
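+
+A minimal sketch of such packing is shown below. It uses simple round-robin assignment for brevity; a real `Source` would typically bin-pack by estimated workload, and the `createEmpty()`/`addWorkUnit()` helper names are assumptions:
+
+```java
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.gobblin.source.workunit.MultiWorkUnit;
+import org.apache.gobblin.source.workunit.WorkUnit;
+
+public class WorkUnitPacker {
+
+  /** Distributes WorkUnits round-robin across a fixed number of MultiWorkUnits. */
+  public static List<MultiWorkUnit> pack(List<WorkUnit> workUnits, int numContainers) {
+    List<MultiWorkUnit> packed = new ArrayList<>();
+    for (int i = 0; i < numContainers; i++) {
+      packed.add(MultiWorkUnit.createEmpty());
+    }
+    int next = 0;
+    for (WorkUnit workUnit : workUnits) {
+      // Each MultiWorkUnit is executed by a single Task, so this bounds the number of Tasks.
+      packed.get(next).addWorkUnit(workUnit);
+      next = (next + 1) % numContainers;
+    }
+    return packed;
+  }
+}
+```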
+
+* **`WorkUnitState` and `TaskState`**: A `WorkUnitState` contains the runtime properties of a `WorkUnit`, e.g., actual high watermark, as well as the status of a WorkUnit, e.g., `PENDING`, `RUNNING`, `COMMITTED`, `FAILED`, etc. A `TaskState` additionally contains properties of a Task that runs a `WorkUnit`, e.g., task ID, start time, end time, etc.
+
+* **`Extract`**: `Extract` is mainly used for ingesting from databases. It contains properties such as job type (snapshot-only, append-only, snapshot-append), primary keys, delta fields, etc.
+
+### How States are Used in a Gobblin Job
+
+* When a job run starts, the job launcher first creates a `JobState`, which contains (1) all properties specified in the job config file, and (2) the `JobState` / `DatasetState` of the previous run, which contains, among other properties, the actual high watermark the previous run checked in for each of its tasks / datasets.
+
+* The job launcher then passes the `JobState` (as a `SourceState` object) to the `Source`, based on which the `Source` will create a set of `WorkUnit`s. Note that when creating `WorkUnit`s, the `Source` should not add properties in `SourceState` into the `WorkUnit`s, which will be done when each `WorkUnit` is executed in a `Task`. The reason is that since the job launcher runs in a single JVM, creating a large number of `WorkUnit`s, each containing a copy of the `SourceState`, may cause OOM.
+
+* The job launcher prepares to run the `WorkUnit`s.
+ * In standalone mode, the job launcher will add properties in the `JobState` into each `WorkUnit` (if a property in `JobState` already exists in the `WorkUnit`, it will NOT be overwritten, i.e., the value in the `WorkUnit` takes precedence). Then for each `WorkUnit` it creates a `Task` to run the `WorkUnit`, and submits all these Tasks to a [`TaskExecutor`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/TaskExecutor.java) which wil [...]
+ * In MR mode, the job launcher will serialize the `JobState` and each `WorkUnit` into a file, which will be picked up by the mappers. It then creates, configures and submits a Hadoop job.
+
+After this step, the job launcher waits until all tasks finish.
+
+* Each `Task` corresponding to a `WorkUnit` contains a `TaskState`. The `TaskState` initially contains all properties in `JobState` and the corresponding `WorkUnit`, and during the Task run, more runtime properties can be added to `TaskState` by `Extractor`, `Converter` and `Writer`, such as the actual high watermark explained in Section 1.
+
+* After all `Task`s finish, `DatasetState`s will be created from all `TaskState`s based on the `dataset.urn` specified in the `WorkUnit`s. For each dataset whose data is committed, the job launcher will persist its `DatasetState`. If no `dataset.urn` is specified, there will be a single DatasetState, and thus the DatasetState will be persisted if either all `Task`s successfully committed, or some task failed but the commit policy is set to `partial`, in which case the watermarks of these [...]
diff --git a/gobblin-website/docs/user-guide/Troubleshooting.md b/gobblin-website/docs/user-guide/Troubleshooting.md
new file mode 100644
index 0000000..47e8fd8
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Troubleshooting.md
@@ -0,0 +1,92 @@
+---
+title: Troubleshooting
+sidebar_label: Troubleshooting
+---
+
+## Checking Job State
+When troubleshooting an issue with a Gobblin job, it is often helpful to check the state of the job persisted in the state store. Gobblin provides a tool, `gobblin-dist/bin/statestore-checker.sh`, for checking job states. The tool prints job state(s) as an easily readable JSON document. The usage of the tool is as follows:
+
+```text
+usage: statestore-checker.sh
+ -a,--all                                  Whether to convert all past job
+                                           states of the given job
+ -i,--id <gobblin job id>                  Gobblin job id
+ -kc,--keepConfig                          Whether to keep all
+                                           configuration properties
+ -n,--name <gobblin job name>              Gobblin job name
+ -u,--storeurl <gobblin state store URL>   Gobblin state store root path
+                                           URL
+``` 
+
+For example, assume that the state store is located at `file://gobblin/state-store/`, to check the job state of the most recent run of a job named "Foo", run the following command:
+
+```bash
+statestore-checker.sh -u file://gobblin/state-store/ -n Foo
+``` 
+
+To check the job state of a particular run (say, with job ID job_Foo_123456) of job "Foo", run the following command:
+
+```bash
+statestore-checker.sh -u file://gobblin/state-store/ -n Foo -i job_Foo_123456
+```
+
+To check the job states of all past runs of job "Foo", run the following command:
+
+```bash
+statestore-checker.sh -u file://gobblin/state-store/ -n Foo -a
+```
+
+To include job configuration in the output Json document, add option `-kc` or `--keepConfig` in the command.
+
+A sample output Json document is as follows:
+
+```json
+{
+	"job name": "GobblinMRTest",
+	"job id": "job_GobblinMRTest_1425622600239",
+	"job state": "COMMITTED",
+	"start time": 1425622600240,
+	"end time": 1425622601326,
+	"duration": 1086,
+	"tasks": 4,
+	"completed tasks": 4,
+	"task states": [
+		{
+			"task id": "task_GobblinMRTest_1425622600239_3",
+			"task state": "COMMITTED",
+			"start time": 1425622600383,
+			"end time": 1425622600395,
+			"duration": 12,
+			"high watermark": -1,
+			"retry count": 0
+		},
+		{
+			"task id": "task_GobblinMRTest_1425622600239_2",
+			"task state": "COMMITTED",
+			"start time": 1425622600354,
+			"end time": 1425622600374,
+			"duration": 20,
+			"high watermark": -1,
+			"retry count": 0
+		},
+		{
+			"task id": "task_GobblinMRTest_1425622600239_1",
+			"task state": "COMMITTED",
+			"start time": 1425622600325,
+			"end time": 1425622600344,
+			"duration": 19,
+			"high watermark": -1,
+			"retry count": 0
+		},
+		{
+			"task id": "task_GobblinMRTest_1425622600239_0",
+			"task state": "COMMITTED",
+			"start time": 1425622600405,
+			"end time": 1425622600421,
+			"duration": 16,
+			"high watermark": -1,
+			"retry count": 0
+		}
+	]
+}
+```
diff --git a/gobblin-website/docs/user-guide/Working-with-Job-Configuration-Files.md b/gobblin-website/docs/user-guide/Working-with-Job-Configuration-Files.md
new file mode 100644
index 0000000..ddb9031
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Working-with-Job-Configuration-Files.md
@@ -0,0 +1,97 @@
+---
+title: Job Configuration Files
+sidebar_label: Job Configuration Files
+---
+
+Job Configuration Basics
+--------------------
+A Job configuration file is a text file with extension `.pull` or `.job` that defines the job properties that can be loaded into a Java [Properties](http://docs.oracle.com/javase/7/docs/api/java/util/Properties.html) object. Gobblin uses [commons-configuration](http://commons.apache.org/proper/commons-configuration/) to allow variable substitutions in job configuration files. You can find some example Gobblin job configuration files [here](https://github.com/apache/gobblin/tree/master/go [...]
+
+A Job configuration file typically includes the following properties, in addition to any mandatory configuration properties required by the custom [Gobblin Constructs](/docs/Gobblin-Architecture#gobblin-constructs) classes. For a complete reference of all configuration properties supported by Gobblin, please refer to [Configuration Properties Glossary](Configuration-Properties-Glossary).
+
+* `job.name`: job name.
+* `job.group`: the group the job belongs to.
+* `source.class`: the `Source` class the job uses.
+* `converter.classes`: a comma-separated list of `Converter` classes to use in the job. This property is optional.
+* Quality checker related configuration properties: a Gobblin job typically has both row-level and task-level quality checkers specified. Please refer to [Quality Checker Properties](/docs/user-guide/Configuration-Properties-Glossary#Quality-Checker-Properties) for configuration properties related to quality checkers. 
+
+Hierarchical Structure of Job Configuration Files
+--------------------
+It is often the case that a Gobblin instance runs many jobs and manages the job configuration files corresponding to those jobs. The jobs may belong to different job groups and are for different data sources. It is also highly likely that jobs for the same data source share a lot of common properties. So it is very useful to support the following features:
+
+* Job configuration files can be grouped by the job groups they belong to and put into different subdirectories under the root job configuration file directory.
+* Common job properties shared among multiple jobs can be extracted out to a common properties file that will be applied into the job configurations of all these jobs. 
+
+Gobblin supports the above features using a hierarchical structure to organize job configuration files under the root job configuration file directory. The basic idea is that there can be arbitrarily deep nesting of subdirectories under the root job configuration file directory. Each directory, regardless of how deep it is, can have a single `.properties` file storing common properties that will be included when loading the job configuration files under the same directory or in any subdirecto [...]
+
+```text
+root_job_config_dir/
+  common.properties
+  foo/
+    foo1.job
+    foo2.job
+    foo.properties
+  bar/
+    bar1.job
+    bar2.job
+    bar.properties
+    baz/
+      baz1.pull
+      baz2.pull
+      baz.properties
+```
+
+In this example, `common.properties` will be included when loading `foo1.job`, `foo2.job`, `bar1.job`, `bar2.job`, `baz1.pull`, and `baz2.pull`. `foo.properties` will be included when loading `foo1.job` and `foo2.job` and properties set here are considered more special and will overwrite the same properties defined in `common.properties`. Similarly, `bar.properties` will be included when loading `bar1.job` and `bar2.job`, as well as `baz1.pull` and `baz2.pull`. `baz.properties` will be i [...]
+
+Password Encryption
+--------------------
+To avoid storing passwords in configuration files in plain text, Gobblin supports encryption of the password configuration properties. All such properties can be encrypted (and decrypted) using a master password. The master password is stored in a file available at runtime. The file can be on a local file system or HDFS and has restricted access.
+
+The URI of the master password file is controlled by the configuration option `encrypt.key.loc` . By default, Gobblin will use [org.jasypt.util.password.BasicPasswordEncryptor](http://www.jasypt.org/api/jasypt/1.8/org/jasypt/util/password/BasicPasswordEncryptor.html). If you have installed the [JCE Unlimited Strength Policy](http://www.oracle.com/technetwork/java/javase/downloads/jce-7-download-432124.html), you can set
+`encrypt.use.strong.encryptor=true` which will configure Gobblin to use [org.jasypt.util.password.StrongPasswordEncryptor](http://www.jasypt.org/api/jasypt/1.8/org/jasypt/util/password/StrongPasswordEncryptor.html).
+
+Encrypted passwords can be generated using the `CLIPasswordEncryptor` tool.
+```bash
+    $ gradle :gobblin-utility:assemble
+    $ cd build/gobblin-utility/distributions/
+    $ tar -zxf gobblin-utility.tar.gz
+    $ bin/gobblin_password_encryptor.sh 
+      usage:
+       -f <master password file>   file that contains the master password used
+                                   to encrypt the plain password
+       -h                          print this message
+       -m <master password>        master password used to encrypt the plain
+                                   password
+       -p <plain password>         plain password to be encrypted
+       -s                          use strong encryptor
+    $ bin/gobblin_password_encryptor.sh -m Hello -p Bye
+    ENC(AQWoQ2Ybe8KXDXwPOA1Ziw==)
+```
+
+If you are extending Gobblin and you want some of your configurations (e.g. the ones containing credentials) to support encryption, you can use `gobblin.password.PasswordManager.getInstance()` methods to get an instance of `PasswordManager`. You can then use `PasswordManager.readPassword(String)` which will transparently decrypt the value if needed, i.e. if it is in the form `ENC(...)` and a master password is provided.
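+
+A minimal sketch of this usage is shown below, assuming the `getInstance(State)` overload and the `org.apache.gobblin.password` package; the component class and property name are illustrative:
+
+```java
+import org.apache.gobblin.configuration.State;
+import org.apache.gobblin.password.PasswordManager;
+
+public class CredentialAwareComponent {
+
+  /** Returns the usable password, transparently decrypting ENC(...) values. */
+  public String resolvePassword(State state) {
+    String configured = state.getProp("my.extension.password");
+    // If "configured" is of the form ENC(...) and encrypt.key.loc points to the master
+    // password file, the value is decrypted; otherwise it is returned unchanged.
+    return PasswordManager.getInstance(state).readPassword(configured);
+  }
+}
+```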
+
+Adding or Changing Job Configuration Files
+--------------------
+The Gobblin job scheduler in the standalone deployment monitors any changes to the job configuration file directory and reloads any new or updated job configuration files when detected. This allows adding new job configuration files or making changes to existing ones without bringing down the standalone instance. Currently, the following types of changes are monitored and supported:
+
+* Adding a new job configuration file with a `.job` or `.pull` extension. The new job configuration file is loaded once it is detected. In the example hierarchical structure above, if a new job configuration file `baz3.pull` is added under `bar/baz`, it is loaded with properties included from `common.properties`, `bar.properties`, and `baz.properties` in that order.
+* Changing an existing job configuration file with a `.job` or `.pull` extension. The job configuration file is reloaded once the change is detected. In the example above, if a change is made to `foo2.job`, it is reloaded with properties included from `common.properties` and `foo.properties` in that order.
+* Changing an existing common properties file with a `.properties` extension. All job configuration files that include properties in the common properties file will be reloaded once the change is detected. In the example above, if `bar.properties` is updated, job configuration files `bar1.job`, `bar2.job`, `baz1.pull`, and `baz2.pull` will be reloaded. Properties from `bar.properties` will be included when loading `bar1.job` and `bar2.job`. Properties from `bar.properties` and `baz.prope [...]
+
+Note that this job configuration file change monitoring mechanism uses the `FileAlterationMonitor` of Apache's [commons-io](http://commons.apache.org/proper/commons-io/) with a custom `FileAlterationListener`. Regardless of how close two adjacent file system checks are, there is still a chance that more than one file is changed between two file system checks. If more than one file, including at least one common properties file, is changed between two adjacent checks, the reloading  [...]
+
+Scheduled Jobs
+--------------------
+Gobblin ships with a job scheduler backed by a [Quartz](http://quartz-scheduler.org/) scheduler and supports Quartz's [cron triggers](http://quartz-scheduler.org/generated/2.2.1/html/qs-all/#page/Quartz_Scheduler_Documentation_Set%2Fco-trg_crontriggers.html%23). A job that is to be scheduled should have a cron schedule defined using the property `job.schedule`. Here is an example cron schedule that triggers every two minutes:
+
+```properties
+job.schedule=0 0/2 * * * ?
+```
+
+One Time Jobs
+--------------------
+Some Gobblin jobs may only need to be run once. A job without a cron schedule in the job configuration is considered a run-once job and will not be scheduled but run immediately after being loaded. A job with a cron schedule but also the property `job.runonce=true` specified in the job configuration is also treated as a run-once job and will only be run the first time the cron schedule is triggered.
+
+Disabled Jobs
+--------------------
+A Gobblin job can be disabled by setting the property `job.disabled` to `true`. A disabled job will not be loaded nor scheduled to run.
diff --git a/gobblin-website/docs/user-guide/Working-with-the-ForkOperator.md b/gobblin-website/docs/user-guide/Working-with-the-ForkOperator.md
new file mode 100644
index 0000000..74aa6cf
--- /dev/null
+++ b/gobblin-website/docs/user-guide/Working-with-the-ForkOperator.md
@@ -0,0 +1,200 @@
+---
+title: Fork Operator
+sidebar_label: Fork Operator
+---
+
+Overview of the ForkOperator
+--------------------
+
+The [`ForkOperator`](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/fork/ForkOperator.java) is a type of control operator that allows a task flow to branch into multiple streams (or forked branches), as represented by a [`Fork`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/fork/Fork.java), each of which goes to a separately configured sink with its own data writer. The `ForkOperator` gives [...]
+
+![Gobblin Task Flow](../../static/img/Gobblin-Task-Flow.png)
+Gobblin Task Flow
+
+Using the ForkOperator
+--------------------
+
+### Basics of Usage
+
+The [`ForkOperator`](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/fork/ForkOperator.java), like most other operators in a Gobblin task flow, is pluggable through the configuration, or more specifically, the configuration property `fork.operator.class`, which points to a class that implements the `ForkOperator` interface. For instance:
+
+```properties
+fork.operator.class=org.apache.gobblin.fork.IdentityForkOperator
+```
+
+By default, if no `ForkOperator` class is specified, internally Gobblin uses the default implementation [`IdentityForkOperator`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/fork/IdentityForkOperator.java) with a single forked branch (although it does support multiple forked branches). The `IdentityForkOperator` simply unconditionally forwards the schema and ingested data records to all the forked branches, the number of which is specified  [...]
+
+The _expected_ number of forked branches is given by the method `getBranches` of the `ForkOperator`. This number must match the size of the list of `Boolean`s returned by `forkSchema` as well as the size of the list of `Boolean`s returned by `forkDataRecords`. Otherwise, `ForkBranchMismatchException` will be thrown. Note that the `ForkOperator` itself _is not making and returning a copy_ for the input schema and data records, but rather just providing a `Boolean` for each forked branch t [...]
+
+The use of the `ForkOperator` with _the possibility that the schema and/or data records may be forwarded to more than one forked branch_ places some special requirements on the input schema and data records to the `ForkOperator`. Specifically, because the same schema or data records may be forwarded to more than one branch that may alter the schema or data records in place, it is necessary for the Gobblin task flow to make a copy of the input schema or data records for each forked branch so  [...]
+
+To guarantee that it is always able to make a copy in such a case, Gobblin requires the input schema and data records to be of type `Copyable` when there are more than one forked branch. [`Copyable`](https://github.com/apache/gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/fork/Copyable.java) is an interface that defines a method `copy` for making a copy of an instance of a given type. The Gobblin task flow will check if the input schema and data records are instances of [...]
+
+Gobblin ships with some built-in `Copyable` implementations, e.g., [`CopyableSchema`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/fork/CopyableSchema.java) and [`CopyableGenericRecord`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/fork/CopyableGenericRecord.java) for Avro's `Schema` and `GenericRecord`.   
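+
+As a minimal sketch of a record type that satisfies this requirement (the `EventRecord` class is hypothetical):
+
+```java
+import org.apache.gobblin.fork.Copyable;
+
+public class EventRecord implements Copyable<EventRecord> {
+
+  private final String payload;
+
+  public EventRecord(String payload) {
+    this.payload = payload;
+  }
+
+  @Override
+  public EventRecord copy() {
+    // Each forked branch gets an independent instance it can safely alter in place.
+    return new EventRecord(this.payload);
+  }
+}
+```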
+
+### Per-Fork Configuration
+
+Since each forked branch may have its own converters, quality checkers, and writers, in addition to the ones in the pre-fork stream (which does not itself have a writer), there must be a way to tell the converter, quality checker, and writer classes of one branch from another and from the pre-fork stream. Gobblin uses a straightforward approach: if a configuration property is used to specify something for a branch in a multi-branch use case, _the branch index should be append [...]
+
+* Converter configuration properties: configuration properties whose names start with `converter`.
+* Quality checker configuration properties: configuration properties whose names start with `qualitychecker`.
+* Writer configuration properties: configuration properties whose names start with `writer`.
+
+### Failure Semantics
+
+In a normal task flow where the default `IdentityForkOperator` with a single branch is used, the failure of the single branch also means the failure of the task flow. When there is more than one forked branch, however, the failure semantics are more involved. Gobblin uses the following failure semantics in this case: 
+
+* The failure of any forked branch means the failure of the whole task flow, i.e., the task succeeds if and only if all the forked branches succeed.
+* A forked branch stops processing any outstanding incoming data records in the queue if it fails in the middle of processing the data.   
+* The failure and subsequent stop/completion of any forked branch does not prevent other branches from processing their copies of the ingested data records. The task will wait until all the branches finish, regardless of whether they succeed or fail.   
+* The commit of output data of forks is determined by the job commit policy (see [`JobCommitPolicy`](https://github.com/apache/gobblin/blob/master/gobblin-core-base/src/main/java/org/apache/gobblin/source/extractor/JobCommitPolicy.java)) specified. If `JobCommitPolicy.COMMIT_ON_FULL_SUCCESS` (or `full` in short) is used, the output data of the entire job will be discarded if any forked branch fails, which will fail the task and consequently the job. If instead `JobCommitPolicy.COMMIT_SUC [...]
+  
+### Performance Tuning
+
+Internally, each forked branch as represented by a [`Fork`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/fork/Fork.java) maintains a bounded record queue (implemented by [`BoundedBlockingRecordQueue`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/BoundedBlockingRecordQueue.java)), which serves as a buffer between the pre-fork stream and the forked stream of the particular br [...]
+
+In terms of the number of forked branches, we have seen use cases with a half dozen forked branches, and we are anticipating use cases with much larger numbers. Again, when using a large number of forked branches, the size of the record queues and the timeout time need to be carefully tuned. 
+
+The [`BoundedBlockingRecordQueue`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/BoundedBlockingRecordQueue.java) in each [`Fork`](https://github.com/apache/gobblin/blob/master/gobblin-runtime/src/main/java/org/apache/gobblin/runtime/fork/Fork.java) keeps track of the following queue statistics that can be output to the logs if the `DEBUG` logging level is turned on. Those statistics provide good indications of the performance of t [...]
+
+* Queue size, i.e., the number of records in queue.
+* Queue fill ratio, i.e., a ratio of the number of records in queue over the queue capacity.
+* Put attempt rate (per second).
+* Total put attempt count.
+* Get attempt rate (per second).
+* Total get attempt count. 
+
+### Comparison with PartitionedDataWriter
+
+Gobblin ships with a special type of `DataWriter`s called [`PartitionedDataWriter`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/writer/PartitionedDataWriter.java) that allow ingested records to be written in a partitioned fashion using a `WriterPartitioner` into different locations in the same sink. The `WriterPartitioner` determines the specific partition for each data record. So there's certain overlap in terms of functionality between th [...]
+
+* The `ForkOperator` requires the number of forked branches to be known and returned through `getBranches` before the task starts, whereas the `PartitionedDataWriter` does not have this requirement.
+* The `PartitionedDataWriter` writes each data record to a single partition, whereas the `ForkOperator` allows data records to be forwarded to any number of forked branches.
+* The `ForkOperator` allows the use of additional converters and quality checkers in any forked branches before data gets written out. The `PartitionedDataWriter` is the last operator in a task flow.
+* Use of the `ForkOperator` allows data records to be written to different sinks, whereas the `PartitionedDataWriter` is not capable of doing this.
+* The `PartitionedDataWriter` writes data records sequentially in a single thread, whereas use of the `ForkOperator` allows forked branches to write independently in parallel since `Fork`s are executed in a thread pool.  
+
+Writing your Own ForkOperator
+--------------------
+
+Since the built-in default implementation [`IdentityForkOperator`](https://github.com/apache/gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/fork/IdentityForkOperator.java) simply forks the input schema and data records to every branch, it is often necessary to have a custom implementation of the `ForkOperator` interface for more fine-grained control over the actual branching. Check out the interface [`ForkOperator`](https://github.com/apache/gobblin/blob/master/g [...]
+
+Best Practices
+--------------------
+
+The `ForkOperator` can have many potential use cases and we have seen the following common ones:
+
+* Using a `ForkOperator` to write the same ingested data to multiple sinks, e.g., HDFS and S3, possibly in different formats. This kind of use case is often referred to as "dual writes", which is _generally NOT recommended_ as "dual writes" may lead to data inconsistency between the sinks in case of write failures. However, with the failure semantics discussed above, data inconsistency generally should not happen with the job commit policy `JobCommitPolicy.COMMIT_ON_FULL_SUCCESS` or `J [...]
+* Using a `ForkOperator` to process ingested data records in different ways conditionally. For example, a `ForkOperator` may be used to classify and write ingested data records to different places on HDFS depending on some field in the data that serves as a classifier.
+* Using a `ForkOperator` to group ingested data records of a certain schema type in case the incoming stream mixes data records of different schema types. For example, we have seen a use case in which a single Kafka topic is used for records of various schema types and when data gets ingested to HDFS, the records need to be written to different paths according to their schema types.
+
+Generally, a common use case of the `ForkOperator` is to route ingested data records so they get written to different output locations _conditionally_. The `ForkOperator` also finds common usage for "dual writes" to different sinks potentially in different formats if the job commit policy `JobCommitPolicy.COMMIT_ON_FULL_SUCCESS` (or `full` in short) or `JobCommitPolicy.COMMIT_SUCCESSFUL_TASKS` (or `successful` in short) is used, as explained above. 
+
+Troubleshooting
+--------------------
+
+1) When using Forks with jobs defined as Hocon, you may encounter an error like:
+```text
+    com.typesafe.config.ConfigException$BugOrBroken: In the map, path 'converter.classes' occurs as both the parent object of a value and as a value. Because Map has no defined ordering, this is a broken situation.
+    at com.typesafe.config.impl.PropertiesParser.fromPathMap(PropertiesParser.java:115)
+    at com.typesafe.config.impl.PropertiesParser.fromPathMap(PropertiesParser.java:82)
+    at com.typesafe.config.impl.ConfigImpl.fromAnyRef(ConfigImpl.java:260)
+    at com.typesafe.config.impl.ConfigImpl.fromPathMap(ConfigImpl.java:200)
+    at com.typesafe.config.ConfigFactory.parseMap(ConfigFactory.java:855)
+    at com.typesafe.config.ConfigFactory.parseMap(ConfigFactory.java:866)
+    at gobblin.runtime.embedded.EmbeddedGobblin.getSysConfig(EmbeddedGobblin.java:497)
+    at gobblin.runtime.embedded.EmbeddedGobblin.runAsync(EmbeddedGobblin.java:442)
+```
+
+This is because in Hocon a key can have only a single type (see: https://github.com/lightbend/config/blob/master/HOCON.md#java-properties-mapping).
+
+To solve this, try writing your config like: 
+   
+ 
+```text
+    converter.classes.ROOT_VALUE="..."
+    ...
+    converter.classes.0="..."
+    ...
+    converter.classes.1="..."
+```
+
+
+Example
+--------------------
+
+Let's take a look at one example that shows how to work with the `ForkOperator` for a real use case. Say you have a Gobblin job that ingests Avro data from a data source that may have some sensitive data in some of the fields that need to be purged. Depending on if data records have sensitive data, they need to be written to different locations on the same sink, which we assume is HDFS. So essentially the tasks of the job need a mechanism to conditionally write ingested data records to d [...]
+
+In this particular use case, we need a `ForkOperator` implementation with two branches that forwards the schema to both branches but each data record to only one of the two branches. The default `IdentityForkOperator` cannot be used since it simply forwards every data record to every branch. So we need a custom implementation of the `ForkOperator`; let's simply call it `SensitiveDataAwareForkOperator` under the package `org.apache.gobblin.example.fork`. Let's also assume that branch 0 is for da [...]
+
+```java
+package org.apache.gobblin.example.fork;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.gobblin.configuration.WorkUnitState;
+import org.apache.gobblin.fork.ForkOperator;
+
+public class SensitiveDataAwareForkOperator implements ForkOperator<Schema, GenericRecord> {
+  
+  private static final int NUM_BRANCHES = 2;
+
+  @Override
+  public void init(WorkUnitState workUnitState) {
+  }
+
+  @Override
+  public int getBranches(WorkUnitState workUnitState) {
+    return NUM_BRANCHES;
+  }
+
+  @Override
+  public List<Boolean> forkSchema(WorkUnitState workUnitState, Schema schema) {
+    // The schema goes to both branches.
+    return ImmutableList.of(Boolean.TRUE, Boolean.TRUE);
+  }
+
+  @Override
+  public List<Boolean> forkDataRecord(WorkUnitState workUnitState, GenericRecord record) {
+    // Data records only go to one of the two branches depending on if they have sensitive data.
+    // Branch 0 is for data records with sensitive data and branch 1 is for data records without.
+    // hasSensitiveData checks the record and returns true if the record has sensitive data and false otherwise.
+    if (hasSensitiveData(record)) {
+      return ImmutableList.of(Boolean.TRUE, Boolean.FALSE);
+    }
+
+    return ImmutableList.of(Boolean.FALSE, Boolean.TRUE);
+  }
+
+  // Illustrative check; a real implementation would inspect the record's sensitive fields.
+  private boolean hasSensitiveData(GenericRecord record) {
+    return record.get("sensitiveField") != null;
+  }
+
+  @Override
+  public void close() throws IOException {
+  }
+}
+```
+
+To make the example more concrete, let's assume that the job uses some converters and quality checkers before the schema and data records reach the `SensitiveDataAwareForkOperator`, and it also uses a converter to purge the sensitive fields and a quality checker that makes sure some mandatory fields exist for purged data records in branch 0. Both branches will be written to the same HDFS but into different locations.
+
+```properties
+fork.operator.class=org.apache.gobblin.example.fork.SensitiveDataAwareForkOperator
+
+# Pre-fork or non-fork-specific configuration properties
+converter.classes=<Converter classes used in the task flow prior to OutlierAwareForkOperator>
+qualitychecker.task.policies=org.apache.gobblin.policies.count.RowCountPolicy,org.apache.gobblin.policies.schema.SchemaCompatibilityPolicy
+qualitychecker.task.policy.types=OPTIONAL,OPTIONAL
+data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisher
+
+# Configuration properties for branch 0
+converter.classes.0=org.apache.gobblin.example.converter.PurgingConverter
+qualitychecker.task.policies.0=org.apache.gobblin.example.policies.MandatoryFieldExistencePolicy
+qualitychecker.task.policy.types.0=FAILED
+writer.fs.uri.0=hdfs://<namenode host>:<namenode port>/
+writer.destination.type.0=HDFS
+writer.output.format.0=AVRO
+writer.staging.dir.0=/gobblin/example/task-staging/purged
+writer.output.dir.0=/gobblin/example/task-output/purged
+data.publisher.final.dir.0=/gobblin/example/job-output/purged
+
+# Configuration properties for branch 1
+writer.fs.uri.1=hdfs://<namenode host>:<namenode port>/
+writer.destination.type.1=HDFS
+writer.output.format.1=AVRO
+writer.staging.dir.1=/gobblin/example/task-staging/normal
+writer.output.dir.1=/gobblin/example/task-output/normal
+data.publisher.final.dir.1=/gobblin/example/job-output/normal
+```
+
+
diff --git a/gobblin-website/docusaurus.config.js b/gobblin-website/docusaurus.config.js
new file mode 100644
index 0000000..f81c4fd
--- /dev/null
+++ b/gobblin-website/docusaurus.config.js
@@ -0,0 +1,114 @@
+module.exports = {
+  title: 'Apache Gobblin',
+  tagline: 'A distributed data integration framework that simplifies common aspects of big data integration such as data ingestion, replication, organization and lifecycle management for both streaming and batch data ecosystems.',
+  url: 'https://gobblin.apache.org',
+  baseUrl: '/',
+  onBrokenLinks: 'throw',
+  onBrokenMarkdownLinks: 'warn',
+  favicon: 'img/favicon.ico',
+  organizationName: 'apache', // Usually your GitHub org/user name.
+  projectName: 'gobblin', // Usually your repo name.
+  themeConfig: {
+    hideableSidebar: true,
+    sidebarCollapsible: true,
+    prism: {
+      additionalLanguages: ['java', 'properties'],
+    },
+    navbar: {
+      title: 'Apache Gobblin',
+      logo: {
+        alt: 'Apache Gobblin logo',
+        src: 'img/gobblin-logo.png',
+      },
+      items: [
+        {
+          to: 'docs/',
+          activeBasePath: 'docs',
+          label: 'Docs',
+          position: 'left'
+        },
+        {
+          href: 'https://communityinviter.com/apps/apache-gobblin/apache-gobblin',
+          label: 'Slack',
+          position: 'right'
+        },
+        {
+          href: 'https://github.com/apache/gobblin',
+          label: 'GitHub',
+          position: 'right',
+        },
+        {
+          to: 'downloads/',
+          activeBasePath: 'downloads',
+          label: 'Downloads',
+          position: 'right',
+        },
+      ],
+    },
+    footer: {
+      style: 'dark',
+      links: [
+        {
+          title: 'Community',
+          items: [
+            {
+              label: 'Slack',
+              href: 'https://communityinviter.com/apps/apache-gobblin/apache-gobblin',
+            },
+            {
+              label: 'Stack Overflow',
+              href: 'https://stackoverflow.com/questions/tagged/gobblin',
+            },
+          ],
+        },
+        {
+          title: 'Apache',
+          items: [
+            {
+              label: 'Foundation',
+              href: 'https://www.apache.org/',
+            },
+            {
+              label: 'License',
+              href: 'https://www.apache.org/licenses',
+            },
+            {
+              label: 'Events',
+              href: 'https://www.apache.org/events/current-event',
+            },
+            {
+              label: 'Security',
+              href: 'https://www.apache.org/security',
+            },
+            {
+              label: 'Sponsorship',
+              href: 'https://www.apache.org/foundation/sponsorship.html',
+            },
+            {
+              label: 'Thanks',
+              href: 'https://www.apache.org/foundation/thanks.html',
+            }
+          ],
+        },
+      ],
+      copyright: `Copyright © ${new Date().getFullYear()} <a href="https://www.apache.org/">The Apache Software Foundation</a><br/>Apache, Apache Gobblin, the Apache feather and the Gobblin logo are trademarks of The Apache Software Foundation
+`,
+    },
+  },
+  presets: [
+    [
+      '@docusaurus/preset-classic',
+      {
+        docs: {
+          sidebarPath: require.resolve('./sidebars.js'),
+          // Assumes the docs live under gobblin-website/ on the master branch of apache/gobblin.
+          editUrl:
+            'https://github.com/apache/gobblin/edit/master/gobblin-website/',
+        },
+        theme: {
+          customCss: require.resolve('./src/css/custom.css'),
+        },
+      },
+    ],
+  ],
+};
diff --git a/gobblin-website/download/index.md b/gobblin-website/download/index.md
new file mode 100644
index 0000000..197d7d7
--- /dev/null
+++ b/gobblin-website/download/index.md
@@ -0,0 +1,23 @@
+# Downloads
+
+Be sure to verify your downloads by these [procedures](https://www.apache.org/info/verification) using these [KEYS](https://www.apache.org/dist/gobblin/KEYS) for any Apache release.
+
+## Current Releases
+
+### 0.15.0
+
+Official source release [ SHA512 ] [ ASC ]
+
+CHANGELOG
+
+### 0.14.0
+
+Official source release [ SHA512 ] [ ASC ]
+
+CHANGELOG
+
+### 0.13.0
+
+Official source release [ SHA512 ] [ ASC ]
+
+CHANGELOG
+
+### 0.12.0
+
+Official source release [ SHA512 ] [ ASC ]
+
+CHANGELOG
diff --git a/gobblin-website/package.json b/gobblin-website/package.json
new file mode 100644
index 0000000..dcf9f28
--- /dev/null
+++ b/gobblin-website/package.json
@@ -0,0 +1,47 @@
+{
+  "name": "gobblin-docs",
+  "version": "0.0.0",
+  "private": true,
+  "scripts": {
+    "docusaurus": "docusaurus",
+    "start": "docusaurus start",
+    "build": "docusaurus build",
+    "swizzle": "docusaurus swizzle",
+    "deploy": "docusaurus deploy",
+    "serve": "docusaurus serve",
+    "clear": "docusaurus clear",
+    "lint": "prettier --write src",
+    "lint-check": "prettier --check 'src/**/*'"
+  },
+  "dependencies": {
+    "@docusaurus/core": "2.0.0-beta.0",
+    "@docusaurus/preset-classic": "2.0.0-beta.0",
+    "@mdx-js/react": "^1.6.21",
+    "clsx": "^1.1.1",
+    "react": "^16.8.4",
+    "react-dom": "^16.8.4"
+  },
+  "devDependencies": {
+    "@docusaurus/module-type-aliases": "2.0.0-alpha.70",
+    "@types/react": "^16.8.4",
+    "add": "2.0.6",
+    "prettier": "2.0.5",
+    "typescript": "^4.0.2",
+    "yarn": "^1.22.5"
+  },
+  "prettier": {
+    "tabWidth": 4
+  },
+  "browserslist": {
+    "production": [
+      ">0.5%",
+      "not dead",
+      "not op_mini all"
+    ],
+    "development": [
+      "last 1 chrome version",
+      "last 1 firefox version",
+      "last 1 safari version"
+    ]
+  }
+}
diff --git a/gobblin-website/sidebars.js b/gobblin-website/sidebars.js
new file mode 100644
index 0000000..d841997
--- /dev/null
+++ b/gobblin-website/sidebars.js
@@ -0,0 +1,117 @@
+module.exports = {
+  docs: [
+    {
+      type: 'doc',
+      id: 'index',
+    },
+    {
+      type: 'doc',
+      id: 'Getting-Started',
+    },
+    {
+      type: 'doc',
+      id: 'Gobblin-Architecture',
+    },
+    {
+      type: 'category',
+      label: 'User Guide',
+      items: [
+        {
+          type: 'autogenerated',
+          dirName: 'user-guide', // generate this sidebar slice from docs/user-guide
+        },
+      ]
+    },
+    {
+      type: 'category',
+      label: 'Sources',
+      items: [
+        {
+          type: 'autogenerated',
+          dirName: 'sources', // generate this sidebar slice from docs/sources
+        },
+      ]
+    },
+    {
+      type: 'category',
+      label: 'Gobblin Adaptors',
+      items: [
+        'adaptors/Gobblin-Distcp',
+        'adaptors/Hive-Avro-To-ORC-Converter',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Case Studies',
+      items: [
+        'case-studies/Kafka-HDFS-Ingestion',
+        'case-studies/Publishing-Data-to-S3',
+        'case-studies/Writing-ORC-Data',
+        'case-studies/Hive-Distcp',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Gobblin Data Management',
+      items: [
+        'data-management/Gobblin-Retention',
+        'data-management/DistcpNgEvents',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Gobblin Metrics',
+      items: [
+        'metrics/Gobblin-Metrics',
+        'metrics/Existing-Reporters',
+        'metrics/Metrics-for-Gobblin-ETL',
+        'metrics/Gobblin-Metrics-Architecture',
+        'metrics/Implementing-New-Reporters',
+        'metrics/Gobblin-Metrics-Performance',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Developer Guide',
+      items: [
+        'developer-guide/Customization-for-New-Source',
+        'developer-guide/Customization-for-Converter-and-Operator',
+        'developer-guide/CodingStyle',
+        'developer-guide/Gobblin-Compliance-Design',
+        'developer-guide/IDE-setup',
+        'developer-guide/Monitoring-Design',
+        'developer-guide/Documentation-Architecture',
+        'developer-guide/Contributing',
+        'developer-guide/GobblinModules',
+        'developer-guide/HighLevelConsumer',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Project',
+      items: [
+        'project/Feature-List',
+        'project/Talks-and-Tech-Blogs',
+        'project/Posts',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Miscellaneous',
+      items: [
+        'miscellaneous/Camus-to-Gobblin-Migration',
+        'miscellaneous/Exactly-Once-Support',
+      ],
+    },
+
+  ]
+};
diff --git a/gobblin-website/src/css/custom.css b/gobblin-website/src/css/custom.css
new file mode 100644
index 0000000..8f6fd9e
--- /dev/null
+++ b/gobblin-website/src/css/custom.css
@@ -0,0 +1,24 @@
+/* stylelint-disable docusaurus/copyright-header */
+/**
+ * Any CSS included here will be global. The classic template
+ * bundles Infima by default. Infima is a CSS framework designed to
+ * work well for content-centric websites.
+ */
+
+/* You can override the default Infima variables here. */
+:root {
+    --ifm-color-primary: #dc8d32;
+    --ifm-color-primary-dark: #d08023;
+    --ifm-color-primary-darker: #c47821;
+    --ifm-color-primary-darkest: #a1631c;
+    --ifm-color-primary-light: #e09a49;
+    --ifm-color-primary-lighter: #e2a055;
+    --ifm-color-primary-lightest: #e8b377;
+}
+
+.docusaurus-highlight-code-line {
+    background-color: rgb(72, 77, 91);
+    display: block;
+    margin: 0 calc(-1 * var(--ifm-pre-padding));
+    padding: 0 var(--ifm-pre-padding);
+}
diff --git a/gobblin-website/src/pages/downloads.js b/gobblin-website/src/pages/downloads.js
new file mode 100644
index 0000000..abde25e
--- /dev/null
+++ b/gobblin-website/src/pages/downloads.js
@@ -0,0 +1,68 @@
+import React from "react";
+import Layout from "@theme/Layout";
+import clsx from "clsx";
+import styles from "./styles.module.css";
+
+const releases = [
+    { version: "0.15.0" },
+    { version: "0.14.0" },
+    { version: "0.13.0" },
+    { version: "0.12.0" },
+];
+
+function Release({ version }) {
+    return (
+        <div className={clsx("col col--12", styles.releases)}>
+            <h3>{version}</h3>
+            <p>
+                Official{" "}
+                <a
+                    href={`https://www.apache.org/dyn/closer.lua/gobblin/apache-gobblin-incubating-${version}/apache-gobblin-incubating-sources-${version}.tgz`}
+                >
+                    source release
+                </a>{" "}
+                <a
+                    href={`https://www.apache.org/dyn/closer.lua/gobblin/apache-gobblin-incubating-${version}/apache-gobblin-incubating-sources-${version}.tgz.sha512`}
+                >
+                    [SHA512]
+                </a>{" "}
+                <a
+                    href={`https://www.apache.org/dyn/closer.lua/gobblin/apache-gobblin-incubating-${version}/apache-gobblin-incubating-sources-${version}.tgz.asc`}
+                >
+                    [ASC]
+                </a>
+            </p>
+            <a
+                href={`https://github.com/apache/gobblin/blob/release-${version}/CHANGELOG.md`}
+            >
+                CHANGELOG
+            </a>
+        </div>
+    );
+}
+
+function Download() {
+    return (
+        <Layout title="Downloads">
+          <section className={styles.executionMode}>
+                <div className="container">
+                  <h1> Downloads </h1>
+                  <p>Be sure to verify your downloads by these <a href="https://www.apache.org/info/verification" title="Verify" target="_blank">procedures</a> using these <a href="https://www.apache.org/dist/gobblin/KEYS" title="KEYS" target="_blank">KEYS</a> for any Apache release.</p>
+                  <div
+                        className={clsx(
+                            "row row--no-gutters",
+                            styles.releaseRow
+                        )}
+                    >
+                    <h2> Current Releases </h2>
+                    {releases.map((props, idx) => (
+                            <Release key={idx} {...props} />
+                        ))}
+                    </div>
+                </div>
+            </section>
+        </Layout>
+    );
+}
+
+export default Download;
diff --git a/gobblin-website/src/pages/index.js b/gobblin-website/src/pages/index.js
new file mode 100644
index 0000000..45ac666
--- /dev/null
+++ b/gobblin-website/src/pages/index.js
@@ -0,0 +1,129 @@
+import React from "react";
+import clsx from "clsx";
+import Layout from "@theme/Layout";
+import Link from "@docusaurus/Link";
+import useDocusaurusContext from "@docusaurus/useDocusaurusContext";
+import useBaseUrl from "@docusaurus/useBaseUrl";
+import styles from "./styles.module.css";
+
+const executionModes = [
+    {
+        title: "Standalone",
+        imageUrl: "img/threads.gif",
+        description: (
+            <>
+                Runs as a standalone application on a single box. Also
+                supports embedded mode.
+        ),
+    },
+    {
+        title: "Mapreduce Mode",
+        imageUrl: "img/hadoop.png",
+        description: (
+            <>
+                Runs as a MapReduce application on multiple Hadoop versions.
+                Also supports <a href="https://azkaban.github.io/">Azkaban</a>{" "}
+                for launching MapReduce jobs.
+            </>
+        ),
+    },
+    {
+        title: "Cluster / Yarn",
... 10282 lines suppressed ...