You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by zh...@apache.org on 2023/03/30 08:31:46 UTC
[singa-doc] branch master updated: 23-3-28 v4 webpage singa doc

This is an automated email from the ASF dual-hosted git repository.

zhaojing pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/singa-doc.git


The following commit(s) were added to refs/heads/master by this push:
     new 54a1e00  23-3-28 v4 webpage singa doc
     new 30d9aac  Merge pull request #67 from lzjpaul/23-3-28-v4-doc
54a1e00 is described below

commit 54a1e00ca9ffeec8cd69fca5cc8cf9d3a26a05d8
Author: zhaojing <zh...@comp.nus.edu.sg>
AuthorDate: Thu Mar 30 16:21:22 2023 +0800

    23-3-28 v4 webpage singa doc
---
 docs-site/docs/installation.md                     |   4 +-
 docs-site/docs/releases/RELEASE_NOTES_4.0.0.md     |  43 ++
 docs-site/docs/wheel-cpu.md                        |   6 +
 docs-site/docs/wheel-gpu.md                        |   6 +
 .../versioned_docs/version-4.0.0}/installation.md  |   7 +-
 .../version-4.0.0/releases/RELEASE_NOTES_4.0.0.md  |  44 ++
 .../versioned_docs/version-4.0.0}/wheel-cpu.md     |   9 +-
 .../versioned_docs/version-4.0.0}/wheel-gpu.md     |   9 +-
 .../version-4.0.0_Chinese/autograd.md              | 241 ++++++++
 .../version-4.0.0_Chinese/benchmark-train.md       |  17 +
 .../versioned_docs/version-4.0.0_Chinese/build.md  | 431 +++++++++++++
 .../version-4.0.0_Chinese/contribute-code.md       | 109 ++++
 .../version-4.0.0_Chinese/contribute-docs.md       |  83 +++
 .../versioned_docs/version-4.0.0_Chinese/device.md |  30 +
 .../version-4.0.0_Chinese/dist-train.md            | 379 ++++++++++++
 .../version-4.0.0_Chinese/download.md              | 159 +++++
 .../version-4.0.0_Chinese/examples.md              |  63 ++
 .../version-4.0.0_Chinese/git-workflow.md          |  86 +++
 .../versioned_docs/version-4.0.0_Chinese/graph.md  | 465 ++++++++++++++
 .../version-4.0.0_Chinese/half-precision.md        |  99 +++
 .../version-4.0.0_Chinese/history-singa.md         |  38 ++
 .../version-4.0.0_Chinese/how-to-release.md        | 173 ++++++
 .../version-4.0.0_Chinese/install-win.md           | 360 +++++++++++
 .../version-4.0.0_Chinese}/installation.md         |  94 ++-
 .../version-4.0.0_Chinese/issue-tracking.md        |  11 +
 .../version-4.0.0_Chinese/mail-lists.md            |  15 +
 .../versioned_docs/version-4.0.0_Chinese/onnx.md   | 674 +++++++++++++++++++++
 .../version-4.0.0_Chinese/optimizer.md             | 123 ++++
 .../version-4.0.0_Chinese/security.md              |   9 +
 .../version-4.0.0_Chinese/software-stack.md        |  85 +++
 .../version-4.0.0_Chinese/source-repository.md     |  24 +
 .../version-4.0.0_Chinese/team-list.md             |  57 ++
 .../versioned_docs/version-4.0.0_Chinese/tensor.md | 245 ++++++++
 .../version-4.0.0_Chinese/time-profiling.md        | 154 +++++
 .../version-4.0.0_Chinese/wheel-cpu-dev.md         |  13 +
 .../version-4.0.0_Chinese}/wheel-cpu.md            |  15 +-
 .../version-4.0.0_Chinese/wheel-gpu-dev.md         |  13 +
 .../version-4.0.0_Chinese/wheel-gpu.md             |  22 +
 .../versioned_docs/version-4.0.0_Viet/autograd.md  | 282 +++++++++
 .../version-4.0.0_Viet/benchmark-train.md          |  29 +
 .../versioned_docs/version-4.0.0_Viet/build.md     | 523 ++++++++++++++++
 .../version-4.0.0_Viet/contribute-code.md          | 128 ++++
 .../version-4.0.0_Viet/contribute-docs.md          | 106 ++++
 .../versioned_docs/version-4.0.0_Viet/device.md    |  33 +
 .../version-4.0.0_Viet/dist-train.md               | 447 ++++++++++++++
 .../versioned_docs/version-4.0.0_Viet/download.md  | 206 +++++++
 .../versioned_docs/version-4.0.0_Viet/examples.md  |  69 +++
 .../version-4.0.0_Viet/git-workflow.md             | 130 ++++
 .../versioned_docs/version-4.0.0_Viet/graph.md     | 525 ++++++++++++++++
 .../version-4.0.0_Viet/history-singa.md            |  43 ++
 .../version-4.0.0_Viet/how-to-release.md           | 207 +++++++
 .../version-4.0.0_Viet/install-win.md              | 396 ++++++++++++
 .../version-4.0.0_Viet/installation.md             | 142 +++++
 .../version-4.0.0_Viet/issue-tracking.md           |  13 +
 .../version-4.0.0_Viet/mail-lists.md               |  17 +
 .../versioned_docs/version-4.0.0_Viet/onnx.md      | 410 +++++++++++++
 .../versioned_docs/version-4.0.0_Viet/optimizer.md | 128 ++++
 .../versioned_docs/version-4.0.0_Viet/security.md  |  10 +
 .../version-4.0.0_Viet/software-stack.md           | 146 +++++
 .../version-4.0.0_Viet/source-repository.md        |  24 +
 .../versioned_docs/version-4.0.0_Viet/team-list.md |  60 ++
 .../versioned_docs/version-4.0.0_Viet/tensor.md    | 283 +++++++++
 .../version-4.0.0_Viet/time-profiling.md           | 164 +++++
 .../version-4.0.0_Viet/wheel-cpu-dev.md            |  13 +
 .../version-4.0.0_Viet}/wheel-cpu.md               |  17 +-
 .../version-4.0.0_Viet/wheel-gpu-dev.md            |  13 +
 .../versioned_docs/version-4.0.0_Viet/wheel-gpu.md |  22 +
 .../version-4.0.0_Chinese-sidebars.json            |  44 ++
 .../version-4.0.0_Viet-sidebars.json               |  43 ++
 docs-site/website/versions.json                    |   5 +-
 docs-site/website/versions_otherlang.json          |   4 +-
 71 files changed, 8999 insertions(+), 98 deletions(-)

diff --git a/docs-site/docs/installation.md b/docs-site/docs/installation.md
index bc5fa57..b59a665 100644
--- a/docs-site/docs/installation.md
+++ b/docs-site/docs/installation.md
@@ -21,7 +21,7 @@ pip install singa -f http://singa.apache.org/docs/next/wheel-cpu.html --trusted-
 ```
 
 You can install a specific version of SINGA via `singa==<version>`, where the
-`<version>` field should be replaced, e.g., `3.3.0`. The available SINGA
+`<version>` field should be replaced, e.g., `4.0.0`. The available SINGA
 versions are listed at the link.
 
 2. GPU With CUDA and cuDNN
@@ -32,7 +32,7 @@ pip install singa -f http://singa.apache.org/docs/next/wheel-gpu.html --trusted-
 ```
 
 You can also configure SINGA version and the CUDA version, like
-`singa==3.3.0+cuda10.2`. The available combinations of SINGA version and CUDA
+`singa==4.0.0+cuda10.2`. The available combinations of SINGA version and CUDA
 version are listed at the link.
 
 Note: the Python version of your local Python environment will be used to find
diff --git a/docs-site/docs/releases/RELEASE_NOTES_4.0.0.md b/docs-site/docs/releases/RELEASE_NOTES_4.0.0.md
new file mode 100644
index 0000000..bf7dad0
--- /dev/null
+++ b/docs-site/docs/releases/RELEASE_NOTES_4.0.0.md
@@ -0,0 +1,43 @@
+---
+id: RELEASE_NOTES_4.0.0
+title: Apache SINGA-4.0.0 Release Notes
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA is a distributed deep learning library.
+
+This release includes the following changes:
+
+- Enhance distributed training
+
+  - Add support for configuration of number of GPUs to be used.
+  - Increase max epoch for better convergence.
+  - Print intermediate mini-batch information.
+  - Add support for switching between CPU and GPU devices.
+
+- Enhance example code
+
+  - Update the args of normalize forward function in the transforms of the BloodMnist example.
+  - Update the xceptionnet in the cnn example.
+  - Add arguments for weight decay, momentum and learning rates in the cnn example.
+  - Add training scripts for more datasets and model types in the cnn example.
+  - Add resnet dist version for the large dataset cnn example.
+  - Add cifar 10 multi process for the large dataset cnn example.
+  - Add sparsification implementation for mnist in the large dataset cnn example.
+  - Update the cifar datasets downloading to local directories.
+  - Extend the cifar datasets load function for customized directories.
+
+- Enhance the webpage
+
+  - Update online documentation for distributed training.
+
+- Promote code quality
+
+  - Update inline comments for preprocessing and data loading.
+
+- Update the PIL image module
+
+- Update the runtime Dockerfile
+  
+- Update the conda files
diff --git a/docs-site/docs/wheel-cpu.md b/docs-site/docs/wheel-cpu.md
index 295fced..4094f1f 100644
--- a/docs-site/docs/wheel-cpu.md
+++ b/docs-site/docs/wheel-cpu.md
@@ -5,6 +5,12 @@ title: CPU only
 
 <!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
 
+## 4.0.0
+
+- [Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0-cp36-cp36m-manylinux2014_x86_64.whl)
+- [Python 3.7](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0-cp37-cp37m-manylinux2014_x86_64.whl)
+- [Python 3.8](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0-cp38-cp38-manylinux2014_x86_64.whl)
+
 ## 3.3.0
 
 - [Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0-cp36-cp36m-manylinux2014_x86_64.whl)
diff --git a/docs-site/docs/wheel-gpu.md b/docs-site/docs/wheel-gpu.md
index 1f488a8..f69c783 100644
--- a/docs-site/docs/wheel-gpu.md
+++ b/docs-site/docs/wheel-gpu.md
@@ -5,6 +5,12 @@ title: CUDA enabled
 
 <!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
 
+## 4.0.0
+
+- [CUDA10.2, cuDNN 7.6.5, Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.7](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0%2Bcuda10.2-cp37-cp37m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.8](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0%2Bcuda10.2-cp38-cp38-manylinux2014_x86_64.whl)
+
 ## 3.3.0
 
 - [CUDA10.2, cuDNN 7.6.5, Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
diff --git a/docs-site/docs/installation.md b/docs-site/website/versioned_docs/version-4.0.0/installation.md
similarity index 97%
copy from docs-site/docs/installation.md
copy to docs-site/website/versioned_docs/version-4.0.0/installation.md
index bc5fa57..4baea07 100644
--- a/docs-site/docs/installation.md
+++ b/docs-site/website/versioned_docs/version-4.0.0/installation.md
@@ -1,6 +1,7 @@
 ---
-id: installation
+id: version-4.0.0-installation
 title: Installation
+original_id: installation
 ---
 
 <!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
@@ -21,7 +22,7 @@ pip install singa -f http://singa.apache.org/docs/next/wheel-cpu.html --trusted-
 ```
 
 You can install a specific version of SINGA via `singa==<version>`, where the
-`<version>` field should be replaced, e.g., `3.3.0`. The available SINGA
+`<version>` field should be replaced, e.g., `4.0.0`. The available SINGA
 versions are listed at the link.
 
 2. GPU With CUDA and cuDNN
@@ -32,7 +33,7 @@ pip install singa -f http://singa.apache.org/docs/next/wheel-gpu.html --trusted-
 ```
 
 You can also configure SINGA version and the CUDA version, like
-`singa==3.3.0+cuda10.2`. The available combinations of SINGA version and CUDA
+`singa==4.0.0+cuda10.2`. The available combinations of SINGA version and CUDA
 version are listed at the link.
 
 Note: the Python version of your local Python environment will be used to find
diff --git a/docs-site/website/versioned_docs/version-4.0.0/releases/RELEASE_NOTES_4.0.0.md b/docs-site/website/versioned_docs/version-4.0.0/releases/RELEASE_NOTES_4.0.0.md
new file mode 100644
index 0000000..d5237b6
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0/releases/RELEASE_NOTES_4.0.0.md
@@ -0,0 +1,44 @@
+---
+id: version-4.0.0-RELEASE_NOTES_4.0.0
+title: Apache SINGA-4.0.0 Release Notes
+original_id: RELEASE_NOTES_4.0.0
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA is a distributed deep learning library.
+
+This release includes the following changes:
+
+- Enhance distributed training
+
+  - Add support for configuration of number of GPUs to be used.
+  - Increase max epoch for better convergence.
+  - Print intermediate mini-batch information.
+  - Add support for switching between CPU and GPU devices.
+
+- Enhance example code
+
+  - Update the args of normalize forward function in the transforms of the BloodMnist example.
+  - Update the xceptionnet in the cnn example.
+  - Add arguments for weight decay, momentum and learning rates in the cnn example.
+  - Add training scripts for more datasets and model types in the cnn example.
+  - Add resnet dist version for the large dataset cnn example.
+  - Add cifar 10 multi process for the large dataset cnn example.
+  - Add sparsification implementation for mnist in the large dataset cnn example.
+  - Update the cifar datasets downloading to local directories.
+  - Extend the cifar datasets load function for customized directories.
+
+- Enhance the webpage
+
+  - Update online documentation for distributed training.
+
+- Promote code quality
+
+  - Update inline comments for preprocessing and data loading.
+
+- Update the PIL image module
+
+- Update the runtime Dockerfile
+  
+- Update the conda files
diff --git a/docs-site/docs/wheel-cpu.md b/docs-site/website/versioned_docs/version-4.0.0/wheel-cpu.md
similarity index 84%
copy from docs-site/docs/wheel-cpu.md
copy to docs-site/website/versioned_docs/version-4.0.0/wheel-cpu.md
index 295fced..e20773d 100644
--- a/docs-site/docs/wheel-cpu.md
+++ b/docs-site/website/versioned_docs/version-4.0.0/wheel-cpu.md
@@ -1,10 +1,17 @@
 ---
-id: wheel-cpu
+id: version-4.0.0-wheel-cpu
 title: CPU only
+original_id: wheel-cpu
 ---
 
 <!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
 
+## 4.0.0
+
+- [Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0-cp36-cp36m-manylinux2014_x86_64.whl)
+- [Python 3.7](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0-cp37-cp37m-manylinux2014_x86_64.whl)
+- [Python 3.8](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0-cp38-cp38-manylinux2014_x86_64.whl)
+
 ## 3.3.0
 
 - [Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0-cp36-cp36m-manylinux2014_x86_64.whl)
diff --git a/docs-site/docs/wheel-gpu.md b/docs-site/website/versioned_docs/version-4.0.0/wheel-gpu.md
similarity index 83%
copy from docs-site/docs/wheel-gpu.md
copy to docs-site/website/versioned_docs/version-4.0.0/wheel-gpu.md
index 1f488a8..eb1946e 100644
--- a/docs-site/docs/wheel-gpu.md
+++ b/docs-site/website/versioned_docs/version-4.0.0/wheel-gpu.md
@@ -1,10 +1,17 @@
 ---
-id: wheel-gpu
+id: version-4.0.0-wheel-gpu
 title: CUDA enabled
+original_id: wheel-gpu
 ---
 
 <!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
 
+## 4.0.0
+
+- [CUDA10.2, cuDNN 7.6.5, Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.7](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0%2Bcuda10.2-cp37-cp37m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.8](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-4.0.0%2Bcuda10.2-cp38-cp38-manylinux2014_x86_64.whl)
+
 ## 3.3.0
 
 - [CUDA10.2, cuDNN 7.6.5, Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/autograd.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/autograd.md
new file mode 100644
index 0000000..84d2153
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/autograd.md
@@ -0,0 +1,241 @@
+---
+id: version-4.0.0_Chinese-autograd
+title: Autograd
+original_id: autograd
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+实现autograd有两种典型的方式，一种是通过如[Theano](http://deeplearning.net/software/theano/index.html)的符号微分（symbolic differentiation），另一种是通过如[Pytorch](https://pytorch.org/docs/stable/notes/autograd.html)的反向微分（reverse differentiation）。SINGA遵循Pytorch方式，即通过记录计算图，并在正向传播后自动应用反向传播。自动传播算法的详细解释请参阅[这里](https://pytorch.org/docs/stable/notes/autograd.html)。我们接下来对SINGA中的相关模块进行解释，并举例说明其使用方法。
+
+## 相关模块
+
+在autograd中涉及三个类，分别是`singa.tensor.Tensor`，`singa.autograd.Operation`和`singa.autograd.Layer`。在本篇的后续部分中，我们使用Tensor、Operation和Layer来指代这三个类。
+
+### Tensor
+
+Tensor的三个属性被autograd使用：
+
+- `.creator`是一个`Operation`实例。它记录了产生Tensor实例的这个操作。
+- `.requires_grad`是一个布尔变量。它用于指示autograd算法是否需要计算张量的梯度。例如，在反向传播的过程中，线性层的权重矩阵和卷积层（非底层）的特征图的张量梯度应该被计算。
+- `.stores_grad`是一个布尔变量。它用于指示张量的梯度是否应该被存储并由后向函数输出。例如，特征图的梯度是在反向传播过程中计算出来的，但不包括在反向函数的输出中。
+
+开发者可以改变Tensor实例的`requires_grad`和`stores_grad`。例如，如果将后者设置为True，那么相应的梯度就会被包含在后向函数的输出。需要注意的是，如果`stores_grad`是True，那么`requires_grad`一定是True，反之则不然。
+
+
+### Operation
+
+它将一个或多个`Tensor`实例作为输入，然后输出一个或多个`Tensor`实例。例如，ReLU可以作为一个具体的Operation子类来实现。当一个`Operation`实例被调用时（实例化后），会执行以下两个步骤。
+
+1. 记录源操作，即输入张量的`creator`。
+2. 通过调用成员函数`.forward()`进行计算。
+
+有两个成员函数用于前向和反向传播，即`.forward()`和`.backward()`。它们以`Tensor.data`作为输入（类型为`CTensor`），并输出`CTensor`s。要添加一个特定的操作，`Operation`的子类应该实现自己的`.forward()`和`.backward()`函数。在后向传播过程中，autograd的`backward()`函数会自动调用该操作的`.backward()`函数来计算输入的梯度（根据`requires_grad`字段的参数和约束）。
+
+### Layer
+
+对于那些需要参数的Operation，我们把它们封装成一个新的类，`Layer`。例如，卷积操作被封装到卷积层(Convolution layer)中。`Layer`管理（存储）参数，并调用相应的`Operation`来实现变换。
+
+## 样例
+
+在[example folder](https://github.com/apache/singa/tree/master/examples/autograd)中提供了很多样例。在这里我们分析两个最具代表性的例子。
+
+### 只使用Operation
+
+下一段代码展示了一个只使用`Operation`的多层感知机（MLP）模型：
+
+#### 调用依赖包
+
+```python
+from singa.tensor import Tensor
+from singa import autograd
+from singa import opt
+```
+
+#### 创建权重矩阵和偏置向量
+
+在将`requires_grad`和`stores_grad`都设置为`True`的情况下，创建参数张量。
+
+```python
+w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
+w0.gaussian(0.0, 0.1)
+b0 = Tensor(shape=(1, 3), requires_grad=True, stores_grad=True)
+b0.set_value(0.0)
+
+w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
+w1.gaussian(0.0, 0.1)
+b1 = Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
+b1.set_value(0.0)
+```
+
+#### 训练
+
+```python
+inputs = Tensor(data=data)  # data matrix
+target = Tensor(data=label) # label vector
+autograd.training = True    # for training
+sgd = opt.SGD(0.05)   # optimizer
+
+for i in range(10):
+    x = autograd.matmul(inputs, w0) # matrix multiplication
+    x = autograd.add_bias(x, b0)    # add the bias vector
+    x = autograd.relu(x)            # ReLU activation operation
+
+    x = autograd.matmul(x, w1)
+    x = autograd.add_bias(x, b1)
+
+    loss = autograd.softmax_cross_entropy(x, target)
+
+    for p, g in autograd.backward(loss):
+        sgd.update(p, g)
+```
+
+### 使用Operation和Layer
+
+下面的[例子](https://github.com/apache/singa/blob/master/examples/autograd/mnist_cnn.py)使用autograd模块提供的层实现了一个CNN模型。
+
+#### 创建层
+
+```python
+conv1 = autograd.Conv2d(1, 32, 3, padding=1, bias=False)
+bn1 = autograd.BatchNorm2d(32)
+pooling1 = autograd.MaxPool2d(3, 1, padding=1)
+conv21 = autograd.Conv2d(32, 16, 3, padding=1)
+conv22 = autograd.Conv2d(32, 16, 3, padding=1)
+bn2 = autograd.BatchNorm2d(32)
+linear = autograd.Linear(32 * 28 * 28, 10)
+pooling2 = autograd.AvgPool2d(3, 1, padding=1)
+```
+
+#### 定义正向传播函数
+
+在正向传播中的operations会被自动记录，用于反向传播。
+
+```python
+def forward(x, t):
+    # x is the input data (a batch of images)
+    # t is the label vector (a batch of integers)
+    y = conv1(x)           # Conv layer
+    y = autograd.relu(y)   # ReLU operation
+    y = bn1(y)             # BN layer
+    y = pooling1(y)        # Pooling Layer
+
+    # two parallel convolution layers
+    y1 = conv21(y)
+    y2 = conv22(y)
+    y = autograd.cat((y1, y2), 1)  # cat operation
+    y = autograd.relu(y)           # ReLU operation
+    y = bn2(y)
+    y = pooling2(y)
+
+    y = autograd.flatten(y)        # flatten operation
+    y = linear(y)                  # Linear layer
+    loss = autograd.softmax_cross_entropy(y, t)  # operation
+    return loss, y
+```
+
+#### 训练
+
+```python
+autograd.training = True
+for epoch in range(epochs):
+    for i in range(batch_number):
+        inputs = tensor.Tensor(device=dev, data=x_train[
+                               i * batch_sz:(1 + i) * batch_sz], stores_grad=False)
+        targets = tensor.Tensor(device=dev, data=y_train[
+                                i * batch_sz:(1 + i) * batch_sz], requires_grad=False, stores_grad=False)
+
+        loss, y = forward(inputs, targets) # forward the net
+
+        for p, gp in autograd.backward(loss):  # auto backward
+            sgd.update(p, gp)
+```
+
+### Using the Model API
+
+下面的[样例](https://github.com/apache/singa/blob/master/examples/cnn/model/cnn.py)使用[Model API](./graph)实现了一个CNN模型。
+
+#### 定义Model的子类
+
+定义模型类，它应该是Model的子类。只有这样，在训练阶段使用的所有操作才会形成一个计算图以便进行分析。图中的操作将被按时序规划并有效执行，模型类中也可以包含层。
+
+```python
+class MLP(model.Model):  # the model is a subclass of Model
+
+    def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
+        super(MLP, self).__init__()
+
+        # init the operators, layers and other objects
+        self.relu = layer.ReLU()
+        self.linear1 = layer.Linear(perceptron_size)
+        self.linear2 = layer.Linear(num_classes)
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, inputs):  # define the forward function
+        y = self.linear1(inputs)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+        self.optimizer(loss)
+        return out, loss
+
+    def set_optimizer(self, optimizer):  # attach an optimizer
+        self.optimizer = optimizer
+```
+
+#### 训练
+
+```python
+# create a model instance
+model = MLP()
+# initialize optimizer and attach it to the model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+# input and target placeholders for the model
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+# compile the model before training
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
+
+# train the model iteratively
+for b in range(num_train_batch):
+    # generate the next mini-batch
+    x, y = ...
+
+    # Copy the data into input tensors
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    # Training with one batch
+    out, loss = model(tx, ty)
+```
+
+#### 保存模型checkpoint
+
+```python
+# define the path to save the checkpoint
+checkpointpath="checkpoint.zip"
+
+# save a checkpoint
+model.save_states(fpath=checkpointpath)
+```
+
+#### 加载模型checkpoint
+
+```python
+# define the path to load the checkpoint
+checkpointpath="checkpoint.zip"
+
+# load a checkpoint
+import os
+if os.path.exists(checkpointpath):
+    model.load_states(fpath=checkpointpath)
+```
+
+### Python API
+
+关于Python API的更多细节，请参考[这里](https://singa.readthedocs.io/en/latest/autograd.html#module-singa.autograd)。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/benchmark-train.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/benchmark-train.md
new file mode 100644
index 0000000..e6a3b7b
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/benchmark-train.md
@@ -0,0 +1,17 @@
+---
+id: version-4.0.0_Chinese-benchmark-train
+title: Benchmark for Distributed Training
+original_id: benchmark-train
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+
+项目：我们使用深度卷积神经网络[ResNet-50](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)。它有50个卷积层，用于图像分类。它需要3.8个GFLOPs来通过网络处理一张图像（尺寸为224x224）。输入的图像大小为224x224。
+
+
+硬件方面：我们使用的是AWS的p2.8xlarge实例，每个实例有8个Nvidia Tesla K80 GPU，共96GB GPU内存，32个vCPU，488GB主内存，10Gbps网络带宽。
+
+衡量标准：我们衡量不同数量worker的每次迭代时间，以评估SINGA的可扩展性。Batch-size固定为每个GPU32个。采用同步训练方案。因此，有效的batch-size是`32N`，其中N是GPU的数量。我们与一个流行的开源系统进行比较，该系统采用参数服务器拓扑结构。选择第一个GPU作为服务器。
+
+![Benchmark Experiments](assets/benchmark.png) <br/> **可扩展性测试。条形为吞吐量，折线形为通信成本。**
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/build.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/build.md
new file mode 100644
index 0000000..da0ec2e
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/build.md
@@ -0,0 +1,431 @@
+---
+id: version-4.0.0_Chinese-build
+title: Build SINGA from Source
+original_id: build
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+源文件可以通过[tar.gz文件](https://dist.apache.org/repos/dist/dev/singa/)或git repo的形式下载：
+
+```shell
+$ git clone https://github.com/apache/singa.git
+$ cd singa/
+```
+
+如果您想为SINGA贡献代码，请参考[贡献代码](contribute-code.md)页面的步骤和要求。
+
+## 使用Conda构筑SINGA
+
+Conda-build 是一个构建工具，它从anaconda cloud安装依赖的库并执行构建脚本。
+
+安装conda-build(需要先安装conda)：
+
+```shell
+conda install conda-build
+```
+
+### 构建CPU版本
+
+构建SINGA的CPU版本：
+
+```shell
+conda build tool/conda/singa/
+```
+
+以上命令已经在Ubuntu（14.04，16.04和18.04）和macOS 10.11上测试过。更多信息请参考[Travis-CI page](https://travis-ci.org/apache/singa)页面。
+
+### 构建GPU版本
+
+要构建GPU版的SINGA，计算机必须装备有Nvidia GPU，而且需要安装CUDA driver(>=384.81)、CUDA toolkit(>=9)和cuDNN(>=7)。以下两个Docker镜像提供了构建环境：
+
+1. apache/singa:conda-cuda9.0
+2. apache/singa:conda-cuda10.0
+
+构建环境准备好后，需要先导出CUDA版本，然后运行conda命令构建SINGA：
+
+```shell
+export CUDA=x.y (e.g. 9.0)
+conda build tool/conda/singa/
+```
+
+### 后处理
+
+生成的包文件的位置(`.tar.gz`)将打印在终端上，生成的包可以直接安装：
+
+```shell
+conda install -c conda-forge --use-local <path to the package file>
+```
+
+若要上传到anaconda云端供他人下载安装，需要在 anaconda 上注册一个账号，才能[上传包](https://docs.anaconda.com/anaconda-cloud/user-guide/getting-started/)：
+
+```shell
+conda install anaconda-client
+anaconda login
+anaconda upload -l main <path to the package file>
+```
+
+将包上传到云端后，您可以在[Anaconda Cloud](https://anaconda.org/)上看到，也可以通过以下命令查看：
+
+```shell
+conda search -c <anaconda username> singa
+```
+
+每个特定的SINGA软件包都由版本和构建字符串来标识。要安装一个特定的SINGA包，需要提供所有信息，例如：
+
+```shell
+conda install -c <anaconda username> -c conda-forge singa=2.1.0.dev=cpu_py36
+```
+
+为了使安装命令简单化，您可以创建以下附加包，这些包依赖于最新的CPU和GPU SINGA包：
+
+```console
+# for singa-cpu
+conda build tool/conda/cpu/  --python=3.6
+conda build tool/conda/cpu/  --python=3.7
+# for singa-gpu
+conda build tool/conda/gpu/  --python=3.6
+conda build tool/conda/gpu/  --python=3.7
+```
+
+因此，当您运行：
+
+```shell
+conda install -c <anaconda username> -c conda-forge singa-xpu
+```
+
+时(`xpu`表示'cpu' or 'gpu'), 相应的真正的SINGA包将作为依赖库被安装。
+
+## 使用本地工具在Ubuntu上构建SINGA
+
+请参阅 SINGA [Dockerfiles](https://github.com/apache/singa/blob/master/tool/docker/devel/ubuntu/cuda9/Dockerfile#L30)，了解在 Ubuntu 16.04 上安装依赖库的说明。您也可以使用 devel 映像创建一个 Docker 容器，并在容器中构建 SINGA。要使用GPU、DNNL、Python和单元测试来构建SINGA，请运行以下命令：
+
+```shell
+mkdir build    # at the root of singa folder
+cd build
+cmake -DENABLE_TEST=ON -DUSE_CUDA=ON -DUSE_DNNL=ON -DUSE_PYTHON3=ON ..
+make
+cd python
+pip install .
+```
+CMake选项的详细内容在本页最后一节解释，上面最后一条命令是安装Python包。你也可以运行`pip install -e .`，它可以创建符号链接，而不是将 Python 文件复制到 site-package 文件夹中。
+
+如果SINGA在ENABLE_TEST=ON的情况下编译，您可以通过以下方式运行单元测试:
+
+```shell
+$ ./bin/test_singa
+```
+
+您可以看到所有的测试案例与测试结果。如果SINGA通过了所有测试，那么您就成功安装了SINGA。
+
+## 使用本地工具在Centos7上构建SINGA
+
+由于Centos7的软件包名称不同，因此从源码开始构建会有所不同。
+
+### 安装依赖项
+
+基础包和库文件：
+
+```shell
+sudo yum install freetype-devel libXft-devel ncurses-devel openblas-devel blas-devel lapack-devel atlas-devel kernel-headers unzip wget pkgconfig zip zlib-devel libcurl-devel cmake curl unzip dh-autoreconf git python-devel glog-devel protobuf-devel
+```
+
+构建必需的包：
+
+```shell
+sudo yum group install "Development Tools"
+```
+
+若要安装swig：
+
+```shell
+sudo yum install pcre-devel
+wget http://prdownloads.sourceforge.net/swig/swig-3.0.10.tar.gz
+tar xvzf swig-3.0.10.tar.gz
+cd swig-3.0.10
+./configure --prefix=${RUN}
+make
+make install
+```
+
+安装gfortran：
+
+```shell
+sudo yum install centos-release-scl-rh
+sudo yum --enablerepo=centos-sclo-rh-testing install devtoolset-7-gcc-gfortran
+```
+
+安装pip和其他包：
+
+```shell
+sudo yum install epel-release
+sudo yum install python-pip
+pip install matplotlib numpy pandas scikit-learn pydot
+```
+
+### 安装SINGA
+
+按照《使用本地工具在Ubuntu上构建SINGA》的步骤1-5进行操作
+
+### 测试
+
+您可以通过如下方式运行单元测试：
+
+```shell
+$ ./bin/test_singa
+```
+
+您可以看到所有的测试案例与测试结果。如果SINGA通过了所有测试，即表示安装成功。
+
+## 在Windows中编译SINGA
+在Windows上使用Python支持构建SINGA的说明可以在[install-win页面](install-win.md)找到。
+
+## 关于编译选项的更多细节
+
+### USE_MODULES (已过期废弃)
+
+如果没有安装protobuf和openblas，你可以用它们一起编译SINGA
+
+```shell
+# In SINGA ROOT folder
+$ mkdir build
+$ cd build
+$ cmake -DUSE_MODULES=ON ..
+$ make
+```
+cmake 会下载 OpenBlas 和 Protobuf (2.6.1) 并与 SINGA 一起编译。
+
+您可以使用`ccmake ..`来配置编译选项。如果一些依赖的库不在系统默认路径中，则您需要手动导出以下环境变量：
+
+```shell
+export CMAKE_INCLUDE_PATH=<path to the header file folder>
+export CMAKE_LIBRARY_PATH=<path to the lib file folder>
+```
+
+### USE_PYTHON
+
+编译SINGA的Python封装器选项：
+
+```shell
+$ cmake -DUSE_PYTHON=ON ..
+$ make
+$ cd python
+$ pip install .
+```
+
+### USE_CUDA
+我们推荐用户安装CUDA和[cuDNN](https://developer.nvidia.com/cudnn)，以便在GPU上运行SINGA，以获得更好的性能。
+
+SINGA已经在CUDA 9/10和cuDNN 7上进行了测试。如果cuDNN安装在非系统文件夹中，例如 /home/bob/local/cudnn/，则需要执行以下命令来让cmake在编译时能够找到它们：
+
+```shell
+$ export CMAKE_INCLUDE_PATH=/home/bob/local/cudnn/include:$CMAKE_INCLUDE_PATH
+$ export CMAKE_LIBRARY_PATH=/home/bob/local/cudnn/lib64:$CMAKE_LIBRARY_PATH
+$ export LD_LIBRARY_PATH=/home/bob/local/cudnn/lib64:$LD_LIBRARY_PATH
+```
+
+CUDA和cuDNN的cmake选项应该设置成“ON”：
+
+```shell
+# Dependent libs are installed already
+$ cmake -DUSE_CUDA=ON ..
+$ make
+```
+
+### USE_DNNL
+
+用户可以启用DNNL来提高CPU的计算性能，DNNL的安装指南可以在[这里](https://github.com/intel/mkl-dnn#installation)找到：
+
+
+SINGA在DNNL v1.1环境下已经进行过测试并通过，
+
+若要启用DNNL支持来编译SINGA:
+
+```shell
+# Dependent libs are installed already
+$ cmake -DUSE_DNNL=ON ..
+$ make
+```
+
+### USE_OPENCL
+
+SINGA使用opencl-headers和viennacl（版本1.7.1及以上）来支持OpenCL，它们可以通过如下方式安装：
+
+```shell
+# On Ubuntu 16.04
+$ sudo apt-get install opencl-headers libviennacl-dev
+# On Fedora
+$ sudo yum install opencl-headers viennacl
+```
+
+此外，你需要在你想运行OpenCL的平台安装OpenCL Installable Client Driver（ICD）。
+
+- 对于AMD和Nvidia的GPU，驱动包也应该安装与之匹配的OpenCL ICD。
+- 对于Intel的CPU和/或GPU，请从[Intel官方网站](https://software.intel.com/en-us/articles/opencl-drivers)上获取驱动程序。请注意，该网站上提供的驱动程序只支持最新的CPU和Iris GPU。
+- 对于旧的Intel CPU，你可以使用beignet-opencl-icd包。
+
+请注意，目前不建议在CPU上运行OpenCL，因为运行速度会很慢。内存传输是以整数秒为单位的（直观来说，CPU上是1000毫秒，而GPU上是1毫秒）。
+
+更多关于建立OpenCL工作环境的信息可以在[这里](https://wiki.tiker.net/OpenCLHowTo)找到。
+
+如果ViennaCL的软件包版本不是至少1.7.1，则需要从源码构建它：
+
+从[这个git repo](https://github.com/viennacl/viennacl-dev)clone版本库，切换（checkout）到release-1.7.1分支，然后构建它，并把项目路径添加到PATH，再把构建的库文件添加到LD_LIBRARY_PATH。
+
+构建支持OpenCL的SINGA（在SINGA 1.1上测试）：
+
+```shell
+$ cmake -DUSE_OPENCL=ON ..
+$ make
+```
+
+### PACKAGE
+
+这个设置是用来构建 Debian 软件包的。设置PACKAGE=ON，然后用make命令来编译软件包，如下所示：
+
+```shell
+$ cmake -DPACKAGE=ON
+$ make package
+```
+
+## FAQ
+
+- Q: 'import singa'阶段报错
+
+  A: 请检查`python -c "from singa import _singa_wrap"`中的详细错误。有时是由依赖库引起的，比如protobuf有多个版本，cudnn缺失，numpy版本不匹配等问题。下面展示了不同情况下的解决方案：
+
+  1. 检查cudnn和cuda。如果cudnn缺失或与wheel包的版本不一致，你可以下载正确的cudnn版本到~/local/cudnn/，然后：
+
+     ```shell
+     $ echo "export LD_LIBRARY_PATH=/home/<yourname>/local/cudnn/lib64:$LD_LIBRARY_PATH" >> ~/.bashrc
+     ```
+
+  2. 如果是protobuf的问题。你可以将protobuf (3.6.1)从源码安装到本地文件夹，比如 ~/local/，解压tar文件，然后：
+
+     ```shell
+     $ ./configure --prefix=/home/<yourname>/local
+     $ make && make install
+     $ echo "export LD_LIBRARY_PATH=/home/<yourname>/local/lib:$LD_LIBRARY_PATH" >> ~/.bashrc
+     $ source ~/.bashrc
+     ```
+
+  3. 如果找不到包括python在内的其他类库，则使用`pip`或`conda`创建虚拟环境.
+
+  4. 如果不是上述原因造成的，请到`_singa_wrap.so`文件夹中查看：
+     ```shell
+     $ python
+     >> import importlib
+     >> importlib.import_module('_singa_wrap')
+     ```
+
+     来检查错误信息。例如，如果是numpy的版本不匹配，错误信息会是：
+
+     ```shell
+     RuntimeError: module compiled against API version 0xb but this version of numpy is 0xa
+     ```
+
+     那么你就需要更新numpy到更高版本。
+
+* Q: 运行`cmake ..`时出错，找不到依赖库。
+
+  A: 如果你还没有安装这些依赖库，请先安装它们。如果你在系统文件夹之外的文件夹中安装了库，例如/usr/local，那您需要手动导出以下变量:
+
+  ```shell
+  $ export CMAKE_INCLUDE_PATH=<path to your header file folder>
+  $ export CMAKE_LIBRARY_PATH=<path to your lib file folder>
+  ```
+
+- Q: 来自`make`的错误，例如linking阶段的错误.
+
+  A: 如果您的库文件在系统默认路径以外的其他文件夹中，则需要手动导出以下变量。
+
+  ```shell
+  $ export LIBRARY_PATH=<path to your lib file folder>
+  $ export LD_LIBRARY_PATH=<path to your lib file folder>
+  ```
+
+* Q: 来自头文件的错误，例如'cblas.h文件不存在'
+
+  A: 您需要手动将cblas.h的文件夹包含在CPLUS_INCLUDE_PATH中，例如：
+
+  ```shell
+  $ export CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$CPLUS_INCLUDE_PATH
+  ```
+
+* Q: 在编译SINGA时，我收到错误信息`SSE2 instruction set not enabled`
+
+  A:您可以尝试如下指令:
+
+  ```shell
+  $ make CFLAGS='-msse2' CXXFLAGS='-msse2'
+  ```
+
+* Q:当试图导入.py文件时，从google.protobuf.internal收到`ImportError: cannot import name enum_type_wrapper`。
+
+  A: 您需要安装protobuf的Python绑定包，它可以通过如下方式安装：
+
+  ```shell
+  $ sudo apt-get install protobuf
+  ```
+
+  或从源文件编译：
+
+  ```shell
+  $ cd /PROTOBUF/SOURCE/FOLDER
+  $ cd python
+  $ python setup.py build
+  $ python setup.py install
+  ```
+
+* Q: 当我从源码构建OpenBLAS时，被告知需要一个Fortran编译器。
+
+  A: 您可以通过如下方式编译OpenBLAS：
+
+  ```shell
+  $ make ONLY_CBLAS=1
+  ```
+  或者通过如下方式安装：
+
+  ```shell
+  $ sudo apt-get install libopenblas-dev
+  ```
+
+* Q: 当我构建协议缓冲区时，它报告说在`/usr/lib64/libstdc++.so.6`中没有找到`GLIBCXX_3.4.20`？
+
+  A: 这意味着链接器找到了libstdc++.so.6，但该库属于旧的GCC（用于编译和链接程序的GCC）版本。此程序依赖于新的libstdc++中定义的代码，而该代码属于较新版本的GCC，所以必须告诉链接器如何找到较新的libstdc++共享库。最简单的解决方法是找到正确的libstdc++，并将其导出到LD_LIBRARY_PATH。例如，如果下面命令的输出中列出了GLIBCXX_3.4.20：
+
+        $ strings /usr/local/lib64/libstdc++.so.6|grep GLIBCXX
+
+  那么接下来需要设置环境变量为：
+
+        $ export LD_LIBRARY_PATH=/usr/local/lib64:$LD_LIBRARY_PATH
+
+* Q: 当构建glog时，报告说 "src/logging_unittest.cc:83:20: error: 'gflags' is not a namespace-name"。
+
+  A: 可能是由于安装的gflags用了不同的命名空间，比如 "google"，所以glog找不到'gflags'的命名空间。实际上建立glog并不需要gflags，所以你可以修改configure.ac文件来忽略gflags。
+
+        1. cd to glog src directory
+        2. change line 125 of configure.ac  to "AC_CHECK_LIB(gflags, main, ac_cv_have_libgflags=0, ac_cv_have_libgflags=0)"
+        3. autoreconf
+
+  执行上述命令后，就可以重新构建glog了。
+
+* Q: 在使用虚拟环境时，每次运行pip install，都会重新安装numpy。但是，当运行`import numpy`时，numpy并没有被调用。
+
+  A: 可能是由于`PYTHONPATH`造成的，在使用虚拟环境时，应将`PYTHONPATH`设置为空，以避免与虚拟环境本身的路径冲突。
+
+* Q: 当从源代码编译PySINGA时，由于缺少<numpy/arrayobject.h>，出现编译错误。
+
+  A: 请安装numpy并导出numpy头文件的路径为
+
+        $ export CPLUS_INCLUDE_PATH=`python -c "import numpy; print(numpy.get_include())"`:$CPLUS_INCLUDE_PATH
+
+* Q: 当我在Mac OS X中运行SINGA时，报错 "Fatal Python error:
+  PyThreadState_Get: no current thread Abort trap: 6"
+
+  A: 这个错误通常发生在系统上有多个版本的Python，并且是通过pip安装的SINGA (通过conda安装时不会出现这个问题)，例如，操作系统自带的版本和Homebrew安装的版本。PySINGA所链接的Python必须与Python解释器（interpreter）相同。 您可以通过 `which python` 检查您的解释器路径并通过`otool -L <path to _singa_wrap.so>`检查PySINGA链接的Python路径。要解决这个问题，请用正确的 Python 版本编译 SINGA。需要注意的是，如果您从源码编译 PySINGA，您需要在调用[cmake](http://stackoverflow.com/questions/15291500/i-have-2-versions-of-python-installed-but-cmake-is-using-older-version-how-do)时指定路径：
+  
+
+        $ cmake -DPYTHON_LIBRARY=`python-config --prefix`/lib/libpython2.7.dylib -DPYTHON_INCLUDE_DIR=`python-config --prefix`/include/python2.7/ ..
+
+  如果从二进制包中安装PySINGA，例如debian或wheel，那么你需要改变python解释器的路径，例如，重新设置$PATH，并把Python的正确路径放在前面的位置。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/contribute-code.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/contribute-code.md
new file mode 100644
index 0000000..a49fd35
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/contribute-code.md
@@ -0,0 +1,109 @@
+---
+id: version-4.0.0_Chinese-contribute-code
+title: How to Contribute Code
+original_id: contribute-code
+---
+
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed [...]
+
+## 代码风格
+
+SINGA代码库在[CPP](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml)和[Python](http://google.github.io/styleguide/pyguide.html)代码中都遵循Google风格。
+
+强制执行Google编码风格的一个简单方法是使用Visual Studio Code编辑器中的linting和格式化工具:
+
+- [C/C++扩展](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cpptools)
+- [Python扩展](https://marketplace.visualstudio.com/items?itemName=ms-python.python)
+- [cpplint扩展](https://marketplace.visualstudio.com/items?itemName=mine.cpplint)
+- [Clang-Format](https://marketplace.visualstudio.com/items?itemName=xaver.clang-format)
+
+安装扩展后，编辑`settings.json`文件：
+
+```json
+{
+  "[cpp]": {
+    "editor.defaultFormatter": "xaver.clang-format"
+  },
+  "cpplint.cpplintPath": "path/to/cpplint",
+
+  "editor.formatOnSave": true,
+  "python.formatting.provider": "yapf",
+  "python.linting.enabled": true,
+  "python.linting.lintOnSave": true,
+  "clang-format.language.cpp.style": "google",
+  "python.formatting.yapfArgs": ["--style", "{based_on_style: google}"]
+}
+```
+
+根据您的操作系统，用户设置文件位于以下位置：
+
+1. Windows %APPDATA%\Code\User\settings.json
+2. macOS "\$HOME/Library/Application Support/Code/User/settings.json"
+3. Linux "\$HOME/.config/Code/User/settings.json"
+
+配置是在相应的配置文件中指定的。而这些工具会自动查找项目根目录下的配置文件，比如`.pylintrc`。
+
+#### 安装必要工具
+
+最理想的情况是所有贡献者都使用相同版本的代码格式化工具（clang-format 9.0.0和yapf 0.29.0），这样在不同PR中的代码格式化就会完全相同，从而摆脱github pull request冲突。
+
+首先，安装LLVM 9.0，它提供了clang-format 9.0.0版本，LLVM的下载页面如下:
+
+- [LLVM](http://releases.llvm.org/download.html#9.0.0)
+
+  - Ubuntu系统：
+
+    ```sh
+    sudo apt-get install clang-format-9
+    ```
+
+  - Windows系统，下载预编译包并安装。
+
+然后，安装cpplint, pylint和yapf
+
+- Ubuntu或OSX:
+
+  ```
+  $ sudo pip install cpplint
+  $ which cpplint
+  /path/to/cpplint
+
+  $ pip install yapf==0.29.0
+  $ pip install pylint
+  ```
+
+- Windows: 安装Anaconda进行包管理
+
+  ```
+  $ pip install cpplint
+  $ where cpplint
+  C:/path/to/cpplint.exe
+
+  $ pip install yapf==0.29.0
+  $ pip install pylint
+  ```
+
+#### 使用
+
+- 配置后，在编辑源代码文件时，linting会自动启用。错误和警告会在Visual Studio Code `PROBLEMS`面板中列出。
+- 代码格式化可以通过调出Command Palette(Windows中为`Shift+Ctrl+P`，OS X中为`Shift+Command+P`)并输入`Format Document`来完成。
+
+#### 提交
+
+修正格式错误以后就可以提交pull request了。
+
+## 开发环境
+
+推荐使用Visual Studio Code作为编辑器。可以安装Python、C/C++、代码拼写检查器、autoDocstring、vim、远程开发等扩展。这些扩展的参考配置（即`settings.json`）可以在[这里](https://gist.github.com/nudles/3d23cfb6ffb30ca7636c45fe60278c55)查看。
+
+如果更新CPP代码，需要从[源文件](./build.md)重新编译SINGA。建议使用`*-devel Docker`镜像中的原生构建工具或使用`conda build`。
+
+如果要只更新Python代码，您可以安装一次SINGA，然后复制更新后的Python文件来替换Python安装文件夹中的文件。
+
+```shell
+cp python/singa/xx.py  <path to conda>/lib/python3.7/site-packages/singa/
+```
+
+## 工作流程
+
+请参阅[git工作流程页面](./git-workflow.md).
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/contribute-docs.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/contribute-docs.md
new file mode 100644
index 0000000..1862b42
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/contribute-docs.md
@@ -0,0 +1,83 @@
+---
+id: version-4.0.0_Chinese-contribute-docs
+title: How to Contribute to Documentation
+original_id: contribute-docs
+---
+
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed [...]
+
+文档有两种类型，即markdown文件和API使用参考。本指南介绍了一些工具，并指导如何准备md文件和API注释。
+
+md文件将通过[Docusaurus](https://docusaurus.io/)构建成HTML页面；API注释（来自源代码）将用于使用Sphinx（对应Python）和Doxygen（对应CPP）生成API参考页面。
+
+## Markdown文件
+
+请尽量遵循[Google Documentation style](https://developers.google.com/style)。例如：
+
+1. 删除指令中的"please"。如：'Please click...' VS 'Click...'。
+2. 遵循标准的[大小写规则](https://owl.purdue.edu/owl/general_writing/mechanics/help_with_capitals.html)。
+3. 在说明中用"you"代替"we"。
+4. 使用现在时态，避免使用`will`。
+5. 尽量使用主动语态而不是被动。
+
+此外，为了使文件一致：
+
+1. 句子不宜过长，例如，长度<=80
+2. 尽量使用相对路径，假设我们在repo的根目录下，例如，`docs-site/docs`指的是`singa-doc/docs-site/docs`。
+3. 使用反引号将命令、路径、类函数和变量标示出来，例如，`Tensor`, `singa-doc/docs-site/docs`。
+4. 为了突出其他术语/概念，使用 _斜体_ 或 **加粗**
+
+
+本项目使用的[prettier tool](https://prettier.io/)会在我们进行git提交时，根据[配置](https://github.com/apache/singa-doc/blob/master/docs-site/.prettierrc)自动格式化代码。例如，它会将markdown文件中的文本最多折叠成80个字符（注释行除外）。
+
+在介绍一个概念（如`Tensor`类）时，要提供概述（目的和与其他概念的关系）、API和例子，还可以用Google colab来演示其用法。
+
+详细的编辑md文件和建立网站的方法请参考[本页面](https://github.com/apache/singa-doc/tree/master/docs-site)。
+
+## API References
+
+### CPP API
+
+请遵循[Google CPP注释风格](https://google.github.io/styleguide/cppguide.html#Comments).
+
+要生成文档，请从doc文件夹中运行 "doxygen"（推荐使用Doxygen >= 1.8）。
+
+### Python API
+
+请遵循[Google Python DocString风格](http://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings).
+
+## Visual Studio Code (vscode)
+
+如果你使用vscode作为编辑器，我们推荐使用以下插件。
+
+### Docstring Snippet
+
+[autoDocstring](https://marketplace.visualstudio.com/items?itemName=njpwerner.autodocstring)生成函数、类等的docstring，要注意选择使用`google`的docString格式。
+
+### Spell Check
+
+[Code Spell Checker](https://marketplace.visualstudio.com/items?itemName=streetsidesoftware.code-spell-checker)可以用来检查代码的注释，或`.md`和`.rst`文件。
+
+要只对Python代码的注释进行拼写检查，可以在`File - Preferences - User Snippets - python.json`中添加以下代码段：
+
+    "cspell check" : {
+    "prefix": "cspell",
+    "body": [
+        "# Directives for doing spell check only for python and c/cpp comments",
+        "# cSpell:includeRegExp #.* ",
+        "# cSpell:includeRegExp (\"\"\"|''')[^\1]*\1",
+        "# cSpell: CStyleComment",
+    ],
+    "description": "# spell check only for python comments"
+    }
+
+如果要只对Cpp代码的注释进行拼写检查，可以在`File - Preferences - User Snippets - cpp.json`中添加以下代码段：
+
+    "cspell check" : {
+    "prefix": "cspell",
+    "body": [
+        "// Directive for doing spell check only for cpp comments",
+        "// cSpell:includeRegExp CStyleComment",
+    ],
+    "description": "# spell check only for cpp comments"
+    }
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/device.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/device.md
new file mode 100644
index 0000000..8f06205
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/device.md
@@ -0,0 +1,30 @@
+---
+id: version-4.0.0_Chinese-device
+title: Device
+original_id: device
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Device代表一个有内存和计算单元的硬件设备。所有的[Tensor操作](./tensor)都由其所在的设备调度执行，Tensor的内存也由设备的内存管理器管理。因此，需要在Device类中实现内存和执行的优化。
+
+
+## 特定设备
+
+目前，SINGA支持三种设备：
+
+1.  CudaGPU：用于运行Cuda代码的Nvidia GPU。
+2.  CppCPU：用于运行Cpp代码的CPU。
+3.  OpenclGPU：用于运行OpenCL代码的GPU卡。
+
+## 用法示例
+
+下面的代码提供了创建设备的例子：
+
+```python
+from singa import device
+cuda = device.create_cuda_gpu_on(0)  # use GPU card of ID 0
+host = device.get_default_device()  # get the default host device (a CppCPU)
+ary1 = device.create_cuda_gpus(2)  # create 2 devices, starting from ID 0
+ary2 = device.create_cuda_gpus([0,2])  # create 2 devices on ID 0 and 2
+```
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/dist-train.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/dist-train.md
new file mode 100644
index 0000000..edd752a
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/dist-train.md
@@ -0,0 +1,379 @@
+---
+id: version-4.0.0_Chinese-dist-train
+title: Distributed Training
+original_id: dist-train
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA支持跨多个GPU的数据并行训练（在单个节点上或跨不同节点）。下图说明了数据并行训练的情况：
+
+![MPI.png](assets/MPI.png)
+
+在分布式训练中，每个进程(称为worker)在单个GPU上运行一个训练脚本，每个进程都有一个单独的通信rank，训练数据被分发给各个worker，模型在每个worker上被复制。在每次迭代中，worker从其分区中读取数据的一个mini-batch（例如，256张图像），并运行BackPropagation算法来计算权重的梯度，通过all-reduce（由[NCCL](https://developer.nvidia.com/nccl)提供）进行平均，按照随机梯度下降算法（SGD）进行权重更新。
+
+NCCL的all-reduce操作可以用来归约（reduce）并同步不同GPU的梯度。假设我们使用4个GPU进行训练，如下图所示。一旦计算出4个GPU的梯度，all-reduce将返回GPU的梯度之和，并使其在每个GPU上可用，然后就可以轻松计算出平均梯度。
+
+![AllReduce.png](assets/AllReduce.png)
+
+## 使用
+
+SINGA提供了一个名为`DistOpt`（`Opt`的一个子类）的模块，用于分布式训练。它封装了一个普通的SGD优化器，并调用`Communicator`进行梯度同步。下面的例子说明了如何使用`DistOpt`在MNIST数据集上训练一个CNN模型。源代码在[这里](https://github.com/apache/singa/blob/master/examples/cnn/)，或者可以使用[Colab notebook]()。
+
+### 代码示例
+
+1. 定义神经网络模型：
+
+```python
+class CNN(model.Model):
+
+    def __init__(self, num_classes=10, num_channels=1):
+        super(CNN, self).__init__()
+        self.conv1 = layer.Conv2d(num_channels, 20, 5, padding=0, activation="RELU")
+        self.conv2 = layer.Conv2d(20, 50, 5, padding=0, activation="RELU")
+        self.linear1 = layer.Linear(500)
+        self.linear2 = layer.Linear(num_classes)
+        self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+        self.relu = layer.ReLU()
+        self.flatten = layer.Flatten()
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.pooling1(y)
+        y = self.conv2(y)
+        y = self.pooling2(y)
+        y = self.flatten(y)
+        y = self.linear1(y)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y, dist_option='fp32', spars=0):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+
+        # Allow different options for distributed training
+        # See the section "Optimizations for Distributed Training"
+        if dist_option == 'fp32':
+            self.optimizer(loss)
+        elif dist_option == 'fp16':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option == 'sparseTopK':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=True,
+                                                      spars=spars)
+        elif dist_option == 'sparseThreshold':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=False,
+                                                      spars=spars)
+        return out, loss
+
+# create model
+model = CNN()
+```
+
+2. 创建`DistOpt`实例并将其应用到创建的模型上：
+
+```python
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+sgd = opt.DistOpt(sgd)
+model.set_optimizer(sgd)
+dev = device.create_cuda_gpu_on(sgd.local_rank)
+```
+
+下面是关于代码中一些变量的解释：
+
+(i) `dev`
+
+dev代表`Device`实例，在设备中加载数据并运行CNN模型。
+
+(ii)`local_rank`
+
+Local rank表示当前进程在同一节点中使用的GPU编号。例如，如果你使用的节点有2个GPU，`local_rank=0`表示这个进程使用的是第一个GPU，而`local_rank=1`表示使用的是第二个GPU。使用MPI或多进程，你能够运行相同的训练脚本，唯一的区别是`local_rank`的值不同。
+
+(iii)`global_rank`
+
+global中的rank代表了你使用的所有节点中所有进程的全局排名。让我们考虑这样的情况：你有3个节点，每个节点有两个GPU， `global_rank=0`表示使用第1个节点的第1个GPU的进程， `global_rank=2`表示使用第2个节点的第1个GPU的进程， `global_rank=4`表示使用第3个节点的第1个GPU的进程。
+
+3. 加载和分割训练/验证数据：
+
+```python
+def data_partition(dataset_x, dataset_y, global_rank, world_size):
+    data_per_rank = dataset_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    return dataset_x[idx_start:idx_end], dataset_y[idx_start:idx_end]
+
+train_x, train_y, test_x, test_y = load_dataset()
+train_x, train_y = data_partition(train_x, train_y,
+                                  sgd.global_rank, sgd.world_size)
+test_x, test_y = data_partition(test_x, test_y,
+                                sgd.global_rank, sgd.world_size)
+```
+
+该函数为当前`dev`返回数据集的一个分区。
+
+
+这里，`world_size`代表你用于分布式训练的所有节点中的进程总数。
+
+4. 初始化并同步所有worker的模型参数:
+
+```python
+#Synchronize the initial parameter
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+model.compile([tx], is_train=True, use_graph=graph, sequential=True)
+...
+#Use the same random seed for different ranks
+seed = 0
+dev.SetRandSeed(seed)
+np.random.seed(seed)
+```
+
+5. 运行BackPropagation和分布式SGD
+
+```python
+for epoch in range(max_epoch):
+    for b in range(num_train_batch):
+        x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
+        y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
+        tx.copy_from_numpy(x)
+        ty.copy_from_numpy(y)
+        # Train the model
+        out, loss = model(tx, ty)
+```
+
+### 执行命令
+
+有两种方式可以启动训练，MPI或Python multiprocessing。
+
+#### Python multiprocessing
+
+它可以在一个节点上使用多个GPU，其中，每个GPU是一个worker。
+
+1. 将上述训练用的代码打包进一个函数：
+
+```python
+def train_mnist_cnn(nccl_id=None, local_rank=None, world_size=None):
+    ...
+```
+
+2. 创建`mnist_multiprocess.py`。
+
+```python
+if __name__ == '__main__':
+    # Generate a NCCL ID to be used for collective communication
+    nccl_id = singa.NcclIdHolder()
+
+    # Define the number of GPUs to be used in the training process
+    world_size = int(sys.argv[1])
+
+    # Define and launch the multi-processing
+    import multiprocessing
+    process = []
+    for local_rank in range(0, world_size):
+        process.append(multiprocessing.Process(target=train_mnist_cnn,
+                       args=(nccl_id, local_rank, world_size)))
+
+    for p in process:
+        p.start()
+```
+
+下面是关于上面所创建的变量的一些说明：
+
+(i) `nccl_id`
+
+需要注意的是，我们在这里需要生成一个NCCL ID，用于集体通信，然后将其传递给所有进程。NCCL ID就像一个门票，只有拥有这个ID的进程才能加入到all-reduce操作中。(如果我们接下来使用MPI，NCCL ID的传递就没有必要了，因为在我们的代码中，这个ID会由MPI自动广播。)
+
+(ii) `world_size`
+
+world_size是您想用于训练的GPU数量。
+
+(iii) `local_rank`
+
+local_rank决定分布式训练的本地顺序，以及在训练过程中使用哪个gpu。在上面的代码中，我们使用for循环来运行训练函数，其中参数local_rank从0迭代到world_size-1。在这种情况下，不同的进程可以使用不同的GPU进行训练。
+
+创建`DistOpt`实例的参数应按照如下方式更新：
+
+```python
+sgd = opt.DistOpt(sgd, nccl_id=nccl_id, local_rank=local_rank, world_size=world_size)
+```
+
+3. 运行`mnist_multiprocess.py`：
+
+```sh
+python mnist_multiprocess.py 2
+```
+
+与单GPU训练相比，它最主要的意义是速度提升：
+
+```
+Starting Epoch 0:
+Training loss = 408.909790, training accuracy = 0.880475
+Evaluation accuracy = 0.956430
+Starting Epoch 1:
+Training loss = 102.396790, training accuracy = 0.967415
+Evaluation accuracy = 0.977564
+Starting Epoch 2:
+Training loss = 69.217010, training accuracy = 0.977915
+Evaluation accuracy = 0.981370
+Starting Epoch 3:
+Training loss = 54.248390, training accuracy = 0.982823
+Evaluation accuracy = 0.984075
+Starting Epoch 4:
+Training loss = 45.213406, training accuracy = 0.985560
+Evaluation accuracy = 0.985276
+Starting Epoch 5:
+Training loss = 38.868435, training accuracy = 0.987764
+Evaluation accuracy = 0.986278
+Starting Epoch 6:
+Training loss = 34.078186, training accuracy = 0.989149
+Evaluation accuracy = 0.987881
+Starting Epoch 7:
+Training loss = 30.138697, training accuracy = 0.990451
+Evaluation accuracy = 0.988181
+Starting Epoch 8:
+Training loss = 26.854443, training accuracy = 0.991520
+Evaluation accuracy = 0.988682
+Starting Epoch 9:
+Training loss = 24.039650, training accuracy = 0.992405
+Evaluation accuracy = 0.989083
+```
+
+#### MPI
+
+只要有多个GPU，MPI既适用于单节点，也适用于多节点。
+
+1. 创建`mnist_dist.py`。
+
+```python
+if __name__ == '__main__':
+    train_mnist_cnn()
+```
+
+2. 为MPI生成一个hostfile，例如下面的hostfile在一个节点上使用了2个进程（即2个GPU）：
+
+```txt
+localhost:2
+```
+
+3. 通过`mpiexec`启动训练： 
+
+```sh
+mpiexec --hostfile host_file python mnist_dist.py
+```
+
+与单GPU训练相比，它同样可以带来速度的提升：
+```
+Starting Epoch 0:
+Training loss = 383.969543, training accuracy = 0.886402
+Evaluation accuracy = 0.954327
+Starting Epoch 1:
+Training loss = 97.531479, training accuracy = 0.969451
+Evaluation accuracy = 0.977163
+Starting Epoch 2:
+Training loss = 67.166870, training accuracy = 0.978516
+Evaluation accuracy = 0.980769
+Starting Epoch 3:
+Training loss = 53.369656, training accuracy = 0.983040
+Evaluation accuracy = 0.983974
+Starting Epoch 4:
+Training loss = 45.100403, training accuracy = 0.985777
+Evaluation accuracy = 0.986078
+Starting Epoch 5:
+Training loss = 39.330826, training accuracy = 0.987447
+Evaluation accuracy = 0.987179
+Starting Epoch 6:
+Training loss = 34.655270, training accuracy = 0.988799
+Evaluation accuracy = 0.987780
+Starting Epoch 7:
+Training loss = 30.749735, training accuracy = 0.989984
+Evaluation accuracy = 0.988281
+Starting Epoch 8:
+Training loss = 27.422146, training accuracy = 0.991319
+Evaluation accuracy = 0.988582
+Starting Epoch 9:
+Training loss = 24.548153, training accuracy = 0.992171
+Evaluation accuracy = 0.988682
+```
+
+## 针对分布式训练的优化
+
+SINGA为分布式训练提供了多种优化策略，以降低模块间的通信成本。参考`DistOpt`的API，了解每种策略的配置。
+
+
+当我们使用`model.Model`建立模型时，我们需要在`train_one_batch`方法中启用分布式训练的选项，请参考本页顶部的示例代码。我们也可以直接复制这些选项的代码，然后在其他模型中使用。
+
+有了定义的选项，我们可以在使用`model(tx, ty, dist_option, spars)`开始训练时，设置对应的参数`dist_option`和`spars`。
+
+### 不采取优化手段
+
+```python
+out, loss = model(tx, ty)
+```
+
+`loss`是损失函数的输出张量，例如分类任务中的交叉熵。
+
+### 半精度梯度（Half-precision Gradients）
+
+```python
+out, loss = model(tx, ty, dist_option = 'fp16')
+```
+
+在调用all-reduce之前，它将每个梯度值转换为16-bit表示（即半精度）。
+
+### 部分同步（Partial Synchronization）
+
+```python
+out, loss = model(tx, ty, dist_option = 'partialUpdate')
+```
+
+在每一次迭代中，每个rank都做本地SGD更新。然后，只对一个部分的参数进行平均同步，从而节省了通信成本。分块大小是在创建`DistOpt`实例时配置的。
+
+### 梯度稀疏化（Gradient Sparsification）
+
+该策略应用稀疏化方案来选择梯度的子集进行all-reduce，有两种方式：
+
+- 选择前k大的元素，spars是被选择的元素所占的比例（在0到1之间）。
+
+```python
+out, loss = model(tx, ty, dist_option = 'sparseTopK', spars = spars)
+```
+
+- 所有绝对值大于预定义阈值的梯度都会被选中。
+
+```python
+out, loss = model(tx, ty, dist_option = 'sparseThreshold', spars = spars)
+```
+超参数在创建`DistOpt`实例时配置。
+
+## 实现
+
+本节主要是让开发者了解分布训练模块的代码是如何实现的。
+
+### NCCL communicator的C接口
+
+
+首先，通信层是用C语言[communicator.cc](https://github.com/apache/singa/blob/master/src/io/communicator.cc)编写的，它调用NCCL库进行集体通信。
+
+communicator有两个构造器，一个是MPI的，另一个是Multiprocess的。
+
+(i) MPI构造器
+
+构造器首先获取全局rank和world_size，计算出本地rank，然后由rank 0生成NCCL ID并广播给每个rank。之后，它调用setup函数来初始化NCCL communicator、cuda流和缓冲区。
+
+(ii) Python multiprocess构造器
+
+构造器首先从输入参数中获取rank、world_size和NCCL ID。之后，调用setup函数来初始化NCCL communicator、cuda流和缓冲区。
+
+在初始化之后，它提供了all-reduce功能来同步模型参数或梯度。例如，synch接收一个输入张量，通过NCCL例程进行all-reduce，在我们调用synch之后，需要调用wait函数来等待all-reduce操作的完成。
+
+### DistOpt的Python接口
+
+然后，python接口提供了一个[DistOpt](https://github.com/apache/singa/blob/master/python/singa/opt.py)类来封装一个[optimizer](https://github.com/apache/singa/blob/master/python/singa/opt.py)对象，以执行基于MPI或Multiprocess的分布式训练。在初始化过程中，它创建了一个NCCL communicator对象（来自于上面小节提到的C接口），然后，`DistOpt`中的每一次all-reduce操作都会用到这个communicator对象。
+
+在MPI或Multiprocess中，每个进程都有一个独立的rank，它给出了各个进程使用的GPU的信息。训练数据是被拆分的，因此每个进程可以根据一部分训练数据来评估子梯度。一旦每个进程的子梯度被计算出来，就可以将所有进程计算出的子梯度做all-reduce，得到总体随机梯度。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/download.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/download.md
new file mode 100644
index 0000000..95bafdb
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/download.md
@@ -0,0 +1,159 @@
+---
+id: version-4.0.0_Chinese-downloads
+title: Download SINGA
+original_id: downloads
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## Verify
+
+要验证下载的tar.gz文件，下载[KEYS](https://www.apache.org/dist/singa/KEYS)和ASC文件，然后执行以下命令:
+
+```shell
+% gpg --import KEYS
+% gpg --verify downloaded_file.asc downloaded_file
+```
+
+你也可以检查SHA512或MD5值判断下载是否完成。
+
+## V3.0.0 (18 April 2020):
+
+- [Apache SINGA 3.0.0](http://www.apache.org/dyn/closer.cgi/singa/3.0.0/apache-singa-3.0.0.tar.gz)
+  [\[SHA512\]](https://www.apache.org/dist/singa/3.0.0/apache-singa-3.0.0.tar.gz.sha512)
+  [\[ASC\]](https://www.apache.org/dist/singa/3.0.0/apache-singa-3.0.0.tar.gz.asc)
+- [Release Notes 3.0.0](releases/RELEASE_NOTES_3.0.0)
+- 新特性及重要更新：
+  - 增强了ONNX。在SINGA中测试了多种ONNX模型。
+  - 使用 MPI 和 NCCL Communication进行分布式训练，通过梯度稀疏化和压缩以及分块传输进行了优化。
+  - 计算图的构建，利用图优化了速度和内存。
+  - 新的文档网站（singa.apache.org）和API参考网站（apache-singa.rtfd.io）。
+  - 使用CI实现代码质量检查。
+  - 将MKLDNN替换为DNNL。
+  - 更新Tensor API以支持广播操作。
+  - 实现了支持ONNX模型的新的autograd操作符。
+
+## 孵化版本（incubating） v2.0.0 (20 April 2019):
+
+- [Apache SINGA 2.0.0 (incubating)](http://www.apache.org/dyn/closer.cgi/incubator/singa/2.0.0/apache-singa-incubating-2.0.0.tar.gz)
+  [\[SHA512\]](https://www.apache.org/dist/incubator/singa/2.0.0/apache-singa-incubating-2.0.0.tar.gz.sha512)
+  [\[ASC\]](https://www.apache.org/dist/incubator/singa/2.0.0/apache-singa-incubating-2.0.0.tar.gz.asc)
+- [Release Notes 2.0.0 (incubating)](releases/RELEASE_NOTES_2.0.0.html)
+- 新特性及重要更新：
+  - 增强了autograd功能(适用于卷积网络和循环网络)。
+  - 支持ONNX。
+  - 通过英特尔MKL DNN lib改进CPP操作。
+  - 实现tensor广播。
+  - 将Docker镜像移至Apache用户名下。
+  - 更新conda-build配置中依赖的lib版本。
+
+## 孵化版本（incubating） v1.2.0 (6 June 2018):
+
+- [Apache SINGA 1.2.0 (incubating)](https://archive.apache.org/dist/incubator/singa/1.2.0/apache-singa-incubating-1.2.0.tar.gz)
+  [\[SHA512\]](https://archive.apache.org/dist/incubator/singa/1.2.0/apache-singa-incubating-1.2.0.tar.gz.sha512)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/1.2.0/apache-singa-incubating-1.2.0.tar.gz.asc)
+- [Release Notes 1.2.0 (incubating)](releases/RELEASE_NOTES_1.2.0.html)
+- 新特性及重要更新：
+  - 实现了autograd（目前支持MLP模式）。
+  - 升级PySinga以支持Python 3
+  - 改进Tensor类，增加stride字段。
+  - 将cuDNN从V5升级到V7。
+  - 增加VGG、Inception V4、ResNet和DenseNet进行ImageNet分类。
+  - 为conda包创建别名
+  - 完整的中文文档
+  - 添加在Windows上运行Singa的说明
+  - 更新编译，CI
+  - 修复一些错误
+
+## 孵化版本（incubating） v1.1.0 (12 February 2017):
+
+- [Apache SINGA 1.1.0 (incubating)](https://archive.apache.org/dist/incubator/singa/1.1.0/apache-singa-incubating-1.1.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/1.1.0/apache-singa-incubating-1.1.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/1.1.0/apache-singa-incubating-1.1.0.tar.gz.asc)
+- [Release Notes 1.1.0 (incubating)](releases/RELEASE_NOTES_1.1.0.html)
+- 新特性和重要更新：
+  - 创建Docker镜像(CPU和GPU版本)
+  - 为SINGA创建Amazon AMI（CPU版）。
+  - 集成Jenkins，自动生成Wheel和Debian包（用于安装），并更新网站。
+  - 增强FeedForwardNet，例如，多输入和用于调试的verbose模式。
+  - 添加Concat和Slice层。
+  - 优化CrossEntropyLoss以接受带有多个标签的实例。
+  - 添加包含图像增强方法的image_tool.py。
+  - 支持通过快照API加载和保存模型。
+  - 在Windows上编译SINGA源代码。
+  - 将必要依赖库与SINGA代码一起编译。
+  - 为SINGA启用Java binding（基本）。
+  - 在checkpoint文件中添加版本ID。
+  - 增加Rafiki工具包以提供RESTFul APIs。
+  - 添加了从Caffe预训练的例子，包括GoogleNet。
+
+## 孵化版本（incubating） v1.0.0 (8 September 2016):
+
+- [Apache SINGA 1.0.0 (incubating)](https://archive.apache.org/dist/incubator/singa/1.0.0/apache-singa-incubating-1.0.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/1.0.0/apache-singa-incubating-1.0.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/1.0.0/apache-singa-incubating-1.0.0.tar.gz.asc)
+- [Release Notes 1.0.0 (incubating)](releases/RELEASE_NOTES_1.0.0.html)
+- 新特性和重要更新：
+  - 创建Tensor概念，用于支持更多的机器学习模型。
+  - 创建Device概念，用于运行在不同的硬件设备上，包括CPU，(Nvidia/AMD) GPU 和 FPGA (将在以后的版本中测试)。
+  - 用 cmake 取代 GNU autotool 进行编译。
+  - 支持 Mac OS。
+  - 改进Python binding，包括安装和编程。
+  - 更多的深度学习模型，包括VGG和ResNet。
+  - 更多的IO类用于读取/写入文件和编码/解码数据。
+  - 直接基于Socket的新网络通信组件。
+  - Cudnn V5，包含Dropout和RNN层。
+  - 将网页构建工具从maven更换为Sphinx。
+  - 整合Travis-CI。
+
+## 孵化版本（incubating） v0.3.0 (20 April 2016):
+
+- [Apache SINGA 0.3.0 (incubating)](https://archive.apache.org/dist/incubator/singa/0.3.0/apache-singa-incubating-0.3.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/0.3.0/apache-singa-incubating-0.3.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/0.3.0/apache-singa-incubating-0.3.0.tar.gz.asc)
+- [Release Notes 0.3.0 (incubating)](releases/RELEASE_NOTES_0.3.0.html)
+- 新特性和重要更新：
+  - 在GPU集群上进行训练，可以在GPU集群上进行深度学习模型的训练。
+  - 改进Python wrapper简化配置工作，包括神经网络和SGD算法。
+  - 新增SGD更新器，包括Adam、AdaDelta和AdaMax。
+  - 安装时减少了单节点训练的依赖库。
+  - 使用CPU和GPU进行异构训练。
+  - 支持 cuDNN V4。
+  - 数据预取。
+  - 修复一些bug。
+
+## 孵化版本（incubating） v0.2.0 (14 January 2016):
+
+- [Apache SINGA 0.2.0 (incubating)](https://archive.apache.org/dist/incubator/singa/0.2.0/apache-singa-incubating-0.2.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/0.2.0/apache-singa-incubating-0.2.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/0.2.0/apache-singa-incubating-0.2.0.tar.gz.asc)
+- [Release Notes 0.2.0 (incubating)](releases/RELEASE_NOTES_0.2.0.html)
+- 新特性和重要更新：
+  - 在GPU上进行训练，可以在一个节点上用多个GPU卡训练复杂的模型。
+  - 混合神经网分区支持数据和模型同时并行。
+  - Python wrapper简化配置，包括神经网络和SGD算法。
+  - 实现了RNN模型和BPTT算法，支持基于RNN模型的应用，如GRU。
+  - 云软件集成，包括Mesos、Docker和HDFS。
+  - 可视化神经网结构和层信息，以便优化调试。
+  - 针对Blobs和原始数据指针的线性代数函数和随机函数。
+  - 添加新的层，包括SoftmaxLayer、ArgSortLayer、DummyLayer、RNN层和cuDNN层。
+  - 更新Layer类以携带多个数据/梯度Blobs。
+  - 通过加载之前训练的模型参数，提取特征并测试新数据的性能。
+  - 为IO操作添加Store类。
+
+## 孵化版本（incubating） v0.1.0 (8 October 2015):
+
+- [Apache SINGA 0.1.0 (incubating)](https://archive.apache.org/dist/incubator/singa/apache-singa-incubating-0.1.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/apache-singa-incubating-0.1.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/apache-singa-incubating-0.1.0.tar.gz.asc)
+- [Amazon EC2 image](https://console.aws.amazon.com/ec2/v2/home?region=ap-southeast-1#LaunchInstanceWizard:ami=ami-b41001e6)
+- [Release Notes 0.1.0 (incubating)](releases/RELEASE_NOTES_0.1.0.html)
+- 新特性和重要更新：
+  - 允许使用GNU构建工具进行安装。
+  - 实现用zookeeper进行工作管理的脚本。
+  - 实现基于NeuralNet和Layer的编程模型。
+  - 实现基于Worker、Server和Stub的系统架构。
+  - 训练模型来自三种不同的模型类别，即前馈模型、能量模型和RNN模型。
+  - 使用CPU的同步和异步分布式训练框架。
+  - checkpoint文件生成和恢复。
+  - 使用gtest进行单元测试。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/examples.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/examples.md
new file mode 100644
index 0000000..4c2e495
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/examples.md
@@ -0,0 +1,63 @@
+---
+id: version-4.0.0_Chinese-examples
+title: Examples
+original_id: examples
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+本页列出了一些使用SINGA的深度学习任务示例。源代码维护在 [Github](https://github.com/apache/singa/tree/master/examples) 上的 SINGA repo 内。对于使用SINGA Python API在CPU或单GPU上运行的例子，它们也可以在[Google Colab](https://colab.research.google.com/)上获得。你可以直接在谷歌云上运行它们，而无需在本地设置环境。下面给出了每个例子的链接。
+
+## 图像分类
+
+| 网络模型       | 数据集                          | 链接                                                                                                   |
+| ----------- | --------------------------------- | ------------------------------------------------------------------------------------------------------- |
+| Simple CNN  | MNIST, CIFAR10, CIFAR100          | [Colab](https://colab.research.google.com/drive/1fbGUs1AsoX6bU5F745RwQpohP4bHTktq)                      |
+| AlexNet     | ImageNet                          | [Cpp]()                                                                                                 |
+| VGG         | ImageNet                          | [Cpp](), [Python](), [Colab](https://colab.research.google.com/drive/14kxgRKtbjPCKKsDJVNi3AvTev81Gp_Ds) |
+| XceptionNet | MNIST, CIFAR10, CIFAR100          | [Python]()                                                                                              |
+| ResNet      | MNIST, CIFAR10, CIFAR100, CIFAR10 | [Python](), [Colab](https://colab.research.google.com/drive/1u1RYefSsVbiP4I-5wiBKHjsT9L0FxLm9)          |
+| MobileNet   | ImageNet                          | [Colab](https://colab.research.google.com/drive/1HsixqJMIpKyEPhkbB8jy7NwNEFEAUWAf)                      |
+
+## 目标检测
+
+| 网络模型       | 数据集    | 链接                                                                             |
+| ----------- | ---------- | ---------------------------------------------------------------------------------- |
+| Tiny YOLOv2 | Pascal VOC | [Colab](https://colab.research.google.com/drive/11V4I6cRjIJNUv5ZGsEGwqHuoQEie6b1T) |
+
+## 面部及表情识别
+
+| 模型           | 数据集                                                                                                                                                | 链接                                                                              |
+| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------- |
+| ArcFace         | Refined MS-Celeb-1M                                                                                                                                    | [Colab](https://colab.research.google.com/drive/1qanaqUKGIDtifdzEzJOHjEj4kYzA9uJC) |
+| Emotion FerPlus | [Facial Expression Recognition Challenge](https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data) | [Colab](https://colab.research.google.com/drive/1XHtBQGRhe58PDi4LGYJzYueWBeWbO23r) |
+
+## 图像生成
+
+| 模型 | 数据集 | 链接                                                                             |
+| ----- | ------- | ---------------------------------------------------------------------------------- |
+| GAN   | MNIST   | [Colab](https://colab.research.google.com/drive/1f86MNDW47DJqHoIqWD1tOxcyx2MWys8L) |
+| LSGAN | MNIST   | [Colab](https://colab.research.google.com/drive/1C6jNRf28vnFOI9JVM4lpkJPqxsnhxdol) |
+
+## 机器理解
+
+| 模型     | 数据集                                                                  | 链接                                                                             |
+| ---------- | ------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- |
+| Bert-Squad | [SQuAD v1.1](https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/) | [Colab](https://colab.research.google.com/drive/1kud-lUPjS_u-TkDAzihBTw0Vqr0FjCE-) |
+
+## 文本分类
+
+| 模型       | 数据集 | 链接      |
+| ----------- | ------- | ---------- |
+| Simple LSTM | IMDB    | [python]() |
+
+## 文本排序
+
+| 模型  | 数据集    | 链接      |
+| ------ | ----------- | ---------- |
+| BiLSTM | InsuranceQA | [python]() |
+
+## 其他
+
+- MNIST数据集的受限玻尔兹曼机, [source](),
+  [Colab](https://colab.research.google.com/drive/19996noGu9JyHHkVmp4edBGu7PJSRQKsd).
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/git-workflow.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/git-workflow.md
new file mode 100644
index 0000000..6868554
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/git-workflow.md
@@ -0,0 +1,86 @@
+---
+id: version-4.0.0_Chinese-git-workflow
+title: Git Workflow
+original_id: git-workflow
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## 对于开发者
+
+1. 将[SINGA Github repository](https://github.com/apache/singa) fork到你自己的Github账户。
+
+2. 从你自己的git仓库中clone **repo** (short for repository):
+
+   ```shell
+   git clone https://github.com/<Github account>/singa.git
+   git remote add upstream https://github.com/apache/singa.git
+   ```
+
+3. 创建一个新的分支（例如 `feature-foo` 或 `fixbug-foo`），在这个分支上工作并提交你的代码:
+
+   ```shell
+   git checkout dev
+   git checkout -b feature-foo
+   # write your code
+   git add <created/updated files>
+   git commit
+   ```
+
+   commit信息应包括：
+
+   - 一个概括性的标题。
+   - 详细的描述。如果提交是为了修复bug，描述中最好包括问题的简短复现；如果是新功能，可以描述新功能的动机/目的。
+
+   如果您的分支有很多小的commit，您需要通过:
+
+   ```shell
+   git rebase -i <commit id>
+   ```
+   来[压缩（squash）和改写（reword）](https://help.github.com/en/articles/about-git-rebase)这些提交。
+
+4. 当你在写代码的时候，SINGA的`dev`分支可能已经被别人更新了；在这种情况下，你需要拉取最新的`dev`分支：
+
+   ```shell
+   git checkout dev
+   git pull upstream dev:dev
+   ```
+
+5. 将`feature-foo` [rebase](https://git-scm.com/book/en/v2/Git-Branching-Rebasing)到`dev`分支上，并将提交的内容推送到自己的Github账户（你刚刚创建的新分支），rebase操作是为了清理提交历史。提交当前工作后，应执行以下 git 指令：
+
+   ```shell
+   git checkout feature-foo
+   git rebase dev
+   git push origin feature-foo:feature-foo
+   ```
+
+   Rebase命令的[操作步骤](https://git-scm.com/book/en/v2/Git-Branching-Rebasing)如下: "这个操作的工作原理是进入到两个分支（你所在的分支和你要rebase的分支）的共同祖先 -> 获取你所在分支的每次commit所引入的差异 -> 将这些差异保存到临时文件中 -> 将当前分支重置为与你要rebase的分支相同的commit -> 最后依次应用每个差异。"
+   
+    因此，执行后，你还是在特性分支上，但你自己的提交ID/hash被改变了，因为diffs是在rebase时提交的；而且你的分支现在有来自`dev`分支和你自己分支的最新代码。
+
+6. 在 Github 网站上创建一个针对 apache/singa `dev`分支的pull request（PR）。如果你想通知其他在相同文件上工作过的贡献者，你可以在Github上找到文件，然后点击 "Blame"，就可以看到最后修改代码的人的逐行注释。然后，你可以在PR描述中加上@username，就可以立即ping到他们。请说明该贡献是你的原创作品，并且你在项目的开源许可下将该作品授权给项目。你的新分支的进一步提交（例如，bug修复）将由Github自动添加到这个pull request中。
+
+7. 接下来等待committer审核PR。在这段时间里，SINGA的`dev`可能已经被其他人更新了，这时你需要[合并](https://docs.fast.ai/dev/git.html#how-to-keep-your-feature-branch-up-to-date)最新的`dev`来解决冲突。有些人将PR重新[rebase到最新的dev](https://github.com/edx/edx-platform/wiki/How-to-Rebase-a-Pull-Request)上，而不是合并。但是，如果其他开发者获取这个PR来添加新的功能，然后再发送PR，那么rebase操作会在未来的PR中引入**重复的提交**（不同的哈希）。关于何时避免使用rebase的细节，请参见[The Golden Rule of Rebasing](https://www.atlassian.com/git/tutorials/merging-vs-rebasing)。另一种简单的更新PR的方法（修复冲突或提交错误）是，从Apache SINGAS repo的最新开发分支中che [...]
+
+## 对于Committers
+
+Committer可以将PR合并到上游 repo 的 dev 分支。在合并每一个PR之前，提交者应该做到：
+
+- 检查commit信息(内容和格式)
+- 检查对现有代码的修改，API的变化应该被记录下来
+- 检查Travis测试结果，检查代码/文档格式和单元测试。
+
+合并PR的方式有两种:
+
+- 在Github上，按照[说明](https://gitbox.apache.org/setup/)将你的Apache账户与Github账户链接，之后你就可以直接在GitHub上合并PR了。
+- 通过命令行合并pull request（例如 https://github.com/apache/singa/pull/xxx ），应执行以下指令：
+
+  ```shell
+  git clone https://github.com/apache/singa.git
+  git remote add asf https://gitbox.apache.org/repos/asf/singa.git
+  git fetch origin pull/xxx/head:prxxx
+  git checkout dev
+  git merge --no-ff prxxx
+  git push asf dev:dev
+  ```
+  不要使用rebase来合并PR，并禁用fast forward。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/graph.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/graph.md
new file mode 100644
index 0000000..6c7e2a9
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/graph.md
@@ -0,0 +1,465 @@
+---
+id: version-4.0.0_Chinese-graph
+title: Model
+original_id: graph
+---
+
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed [...]
+
+神经网络中的前向和反向传播可以用一组操作来表示，比如卷积和池化。每个操作都需要一些输入的[tensors](./tensor)，并应用一个[operator](./autograd)来生成输出的张量。通过将每个运算符表示为一个节点，将每个张量表示为一条边，所有的运算就形成了一个计算图。有了计算图，可以通过调度运算的执行和内存的智能分配/释放来进行速度和内存优化。在SINGA中，用户只需要使用[Model](https://github.com/apache/singa/blob/master/python/singa/model.py) API定义神经网络模型，计算图则会在C++后台自动构建和优化。
+
+
+这样，一方面，用户使用[Model](./graph) API按照PyTorch那样的命令式编程风格实现网络。而与PyTorch在每次迭代中重新创建操作不同的是，SINGA在第一次迭代后就会缓冲操作，隐式地创建计算图（当该功能被启用时）。因此，另一方面，SINGA的计算图与使用声明式编程的库（如TensorFlow）创建的计算图类似，因而它可以享受在图上进行的优化。
+
+## 样例
+
+下面的代码说明了`Model`API的用法：
+
+1. 将新模型实现为Model类的子类：
+
+```Python
+class CNN(model.Model):
+
+    def __init__(self, num_classes=10, num_channels=1):
+        super(CNN, self).__init__()
+        self.conv1 = layer.Conv2d(num_channels, 20, 5, padding=0, activation="RELU")
+        self.conv2 = layer.Conv2d(20, 50, 5, padding=0, activation="RELU")
+        self.linear1 = layer.Linear(500)
+        self.linear2 = layer.Linear(num_classes)
+        self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+        self.relu = layer.ReLU()
+        self.flatten = layer.Flatten()
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.pooling1(y)
+        y = self.conv2(y)
+        y = self.pooling2(y)
+        y = self.flatten(y)
+        y = self.linear1(y)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+        self.optimizer(loss)
+        return out, loss
+```
+
+2. 创建model、optimizer、device等的实例。编译模型：
+
+```python
+model = CNN()
+
+# initialize optimizer and attach it to the model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+
+# initialize device
+dev = device.create_cuda_gpu()
+
+# input and target placeholders for the model
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+
+# compile the model before training
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
+```
+
+3. 迭代训练：
+
+```python
+for b in range(num_train_batch):
+    # generate the next mini-batch
+    x, y = ...
+
+    # Copy the data into input tensors
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    # Training with one batch
+    out, loss = model(tx, ty)
+```
+
+这个例子的Google Colab notebook可以在[这里](https://colab.research.google.com/drive/1fbGUs1AsoX6bU5F745RwQpohP4bHTktq)找到。
+
+
+更多例子：
+
+- [MLP](https://github.com/apache/singa/blob/master/examples/mlp/model.py)
+- [CNN](https://github.com/apache/singa/blob/master/examples/cnn/model/cnn.py)
+- [ResNet](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
+
+## 实现
+
+### 图的构建
+
+SINGA分三步构建计算图：
+
+1. 将操作保存在缓冲区。
+2. 分析操作的依赖性。
+3. 根据依赖关系创建节点和边。
+
+以MLP模型的dense层的矩阵乘法运算为例，该操作会在[MLP model](https://github.com/apache/singa/blob/master/examples/mlp/model.py)的前向函数中被调用：
+
+```python
+class MLP(model.Model):
+
+    def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
+        super(MLP, self).__init__()
+        self.linear1 = layer.Linear(perceptron_size)
+        ...
+
+    def forward(self, inputs):
+        y = self.linear1(inputs)
+        ...
+```
+
+`Linear`层由`matmul`运算符组成，`autograd`通过SWIG调用CPP中提供的`Mult`函数来实现`matmul`运算符。
+
+```python
+# implementation of matmul()
+singa.Mult(inputs, w)
+```
+
+在后端，`Mult`函数是通过调用CBLAS函数`GEMV`来实现的。但`Mult`没有直接调用`GEMV`，而是将`GEMV`和参数提交给设备，具体如下。
+
+```c++
+// implementation of Mult()
+C->device()->Exec(
+    [a, A, b, B, CRef](Context *ctx) mutable {
+        GEMV<DType, Lang>(a, A, B, b, &CRef, ctx);
+    },
+    read_blocks, {C->block()});
+```
+
+`Device`的`Exec`函数对函数及其参数进行缓冲。此外，它还拥有这个函数要读写的块的信息（块是指张量的内存块）。
+
+一旦`Model.forward()`被执行一次，所有的操作就会被`Device`缓冲。接下来，对所有操作的读写信息进行分析，用来建立计算图。例如，如果一个块`b`被一个操作O1写入，之后又被另一个操作O2读出，我们就会知道O2依赖于O1并且有一条从O1到O2的有向边，它代表了块`b`（或其张量）。之后我们就构建了一个有向无环图，如下图所示。该图只会构建一次。
+
+![The computational graph of MLP](assets/GraphOfMLP.png)
+
+<br/>**Figure 1 - MLP例子的计算图**
+
+### 优化
+
+目前，基于计算图进行了以下优化：
+
+**惰性分配** 当创建张量/块时，设备不会立即为它们分配内存。相反，是在第一次访问块时，才会分配内存。
+
+**自动回收**  每个张量/块的引用计数是根据图计算出来的。在执行操作之前，引用计数是读取这个块的操作的个数。在执行过程中，一旦执行了一个操作，每一个输入块的引用计数就会减1，如果一个块的引用计数达到了0，就意味着这个块在剩下的操作中不会再被读取。因此，它的内存可以被安全释放。此外，SINGA还会跟踪图外的块的使用情况。如果一个块被Python代码使用（而不是被autograd操作符使用），它将不会被回收。
+
+**内存共享**  SINGA使用内存池，如[CnMem](https://github.com/NVIDIA/cnmem)来管理CUDA内存。有了自动回收和内存池，SINGA就可以在张量之间共享内存。考虑两个操作`c=a+b`和`d=2xc`。在执行第二个操作之前，根据惰性分配原则，应该分配`d`的内存。假设`a`在其余操作中没有使用。根据自动回收，`a`的块将在第一次操作后被释放。因此，SINGA会向CUDA流提交四个操作：加法、释放`a`、分配`d`和乘法。这样，内存池就可以将`a`释放的内存与`d`共享，而不是要求GPU为`d`做真正的malloc。
+
+其他的优化技术也可以应用，例如来自编译器领域的公共子表达式消除，以及在不同CUDA流上并行执行操作。
+
+## 新的操作符
+
+`autograd`模块中定义的每个运算符都实现了两个功能：前向和反向，通过在后台调用运算符来实现。如果要在`autograd`中添加一个新的运算符，需要在后台添加多个运算符。
+
+以[Conv2d](https://github.com/apache/singa/blob/master/python/singa/autograd.py)运算符为例，在Python端，根据设备类型，从后台调用运算符来实现前向和反向功能：
+
+```python
+class _Conv2d(Operation):
+
+    def forward(self, x, W, b=None):
+        ......
+        if training:
+            if self.handle.bias_term:
+                self.inputs = (x, W, b) # record x, W, b
+            else:
+                self.inputs = (x, W)
+
+        if (type(self.handle) != singa.ConvHandle):
+            return singa.GpuConvForward(x, W, b, self.handle)
+        else:
+            return singa.CpuConvForward(x, W, b, self.handle)
+
+    def backward(self, dy):
+        if (type(self.handle) != singa.ConvHandle):
+            dx = singa.GpuConvBackwardx(dy, self.inputs[1], self.inputs[0],
+                                        self.handle)
+            dW = singa.GpuConvBackwardW(dy, self.inputs[0], self.inputs[1],
+                                        self.handle)
+            db = singa.GpuConvBackwardb(
+                dy, self.inputs[2],
+                self.handle) if self.handle.bias_term else None
+        else:
+            dx = singa.CpuConvBackwardx(dy, self.inputs[1], self.inputs[0],
+                                        self.handle)
+            dW = singa.CpuConvBackwardW(dy, self.inputs[0], self.inputs[1],
+                                        self.handle)
+            db = singa.CpuConvBackwardb(
+                dy, self.inputs[2],
+                self.handle) if self.handle.bias_term else None
+        if db:
+            return dx, dW, db
+        else:
+            return dx, dW
+```
+
+对于后台的每一个操作符，应按以下方式实现：
+
+- 假设操作符是`foo()`，它的真正实现应该包装在另一个函数中，例如`_foo()`。`foo()`将`_foo`和参数一起作为lambda函数传递给`Device`的`Exec`函数进行缓冲，要读和写的块也同时被传递给`Exec`。
+
+- lambda表达式中使用的所有参数都需要根据以下规则获取：
+
+  - `值捕获`: 如果参数变量是一个局部变量，或者将被立刻释放（例如，中间张量）。否则，一旦`foo()`退出，这些变量将被销毁。
+  - `引用捕获`：如果变量是记录在python端或者是一个持久变量（例如Conv2d类中的参数W和ConvHandle）。
+
+  - `可变捕获`: 如果在`_foo()`中修改了由值捕获的变量，则lambda表达式应带有mutable（可变）标签。
+
+下面是一个在后台实现的操作的[例子](https://github.com/apache/singa/blob/master/src/model/operation/convolution.cc)：
+
+```c++
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x,
+                        const CudnnConvHandle &cch) {
+  CHECK_EQ(dy.device()->lang(), kCuda);
+
+  Tensor dx;
+  dx.ResetLike(x);
+
+  dy.device()->Exec(
+      /*
+       * dx is a local variable so it's captured by value
+       * dy is an intermediate tensor and isn't recorded on the python side
+       * W is an intermediate tensor but it's recorded on the python side
+       * cch is a variable and it's recorded on the python side
+       */
+      [dx, dy, &W, &cch](Context *ctx) mutable {
+        Block *wblock = W.block(), *dyblock = dy.block(), *dxblock = dx.block();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardData(
+            ctx->cudnn_handle, &alpha, cch.filter_desc, wblock->data(),
+            cch.y_desc, dyblock->data(), cch.conv_desc, cch.bp_data_alg,
+            cch.workspace.block()->mutable_data(),
+            cch.workspace_count * sizeof(float), &beta, cch.x_desc,
+            dxblock->mutable_data());
+      },
+      {dy.block(), W.block()}, {dx.block(), cch.workspace.block()});
+      /* the lambda expression reads the blocks of tensor dy and W
+       * and writes the blocks of tensor dx and cch.workspace
+       */
+
+  return dx;
+}
+```
+
+## Benchmark
+
+### 单节点
+
+- 实验设定
+  - 模型：
+    - 使用层: ResNet50 in
+      [resnet.py](https://github.com/apache/singa/blob/master/examples/cnn/autograd/resnet_cifar10.py)
+    - 使用模型: ResNet50 in
+      [resnet.py](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
+  - GPU: NVIDIA RTX 2080Ti
+- 注释：
+  - `s` ：second，秒
+  - `it` ： iteration，迭代次数
+  - `Mem`：peak memory usage of single GPU，单GPU显存峰值
+  - `Throughput`：number of images processed per second，每秒处理的图像数
+  - `Time`：total time，总时间
+  - `Speed`：iterations per second，每秒迭代次数
+  - `Reduction`：the memory usage reduction rate compared with that using layer，与使用层的内存使用率相比，内存使用率降低了多少
+  - `Speedup`: speedup ratio compared with dev branch，与dev分支相比的加速率
+- 结果：
+  <table style="text-align: center">
+      <tr>
+          <th style="text-align: center">Batchsize</th>
+          <th style="text-align: center">Cases</th>
+          <th style="text-align: center">Mem(MB)</th>
+          <th style="text-align: center">Time(s)</th>
+          <th style="text-align: center">Speed(it/s)</th>
+          <th style="text-align: center">Throughput</th>
+          <th style="text-align: center">Reduction</th>
+          <th style="text-align: center">Speedup</th>
+      </tr>
+      <tr>
+          <td rowspan="4">16</td>
+          <td nowrap>layer</td>
+          <td>4975</td>
+          <td>14.1952</td>
+          <td>14.0893</td>
+          <td>225.4285</td>
+          <td>0.00%</td>
+          <td>1.0000</td>
+      </tr>
+      <tr>
+          <td nowrap>model:disable graph</td>
+          <td>4995</td>
+          <td>14.1264</td>
+          <td>14.1579</td>
+          <td>226.5261</td>
+          <td>-0.40%</td>
+          <td>1.0049</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, bfs</td>
+          <td>3283</td>
+          <td>13.7438</td>
+          <td>14.5520</td>
+          <td>232.8318</td>
+          <td>34.01%</td>
+          <td>1.0328</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, serial</td>
+          <td>3265</td>
+          <td>13.7420</td>
+          <td>14.5540</td>
+          <td>232.8635</td>
+          <td>34.37%</td>
+          <td>1.0330</td>
+      </tr>
+      <tr>
+          <td rowspan="4">32</td>
+          <td nowrap>layer</td>
+          <td>10119</td>
+          <td>13.4587</td>
+          <td>7.4302</td>
+          <td>237.7649</td>
+          <td>0.00%</td>
+          <td>1.0000</td>
+      </tr>
+      <tr>
+          <td nowrap>model:disable graph</td>
+          <td>10109</td>
+          <td>13.2952</td>
+          <td>7.5315</td>
+          <td>240.6875</td>
+          <td>0.10%</td>
+          <td>1.0123</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, bfs</td>
+          <td>6839</td>
+          <td>13.1059</td>
+          <td>7.6302</td>
+          <td>244.1648</td>
+          <td>32.41%</td>
+          <td>1.0269</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, serial</td>
+          <td>6845</td>
+          <td>13.0489</td>
+          <td>7.6635</td>
+          <td>245.2312</td>
+          <td>32.35%</td>
+          <td>1.0314</td>
+      </tr>
+  </table>
+
+### 多进程
+
+- 实验设置：
+  - API：
+    - 使用层: ResNet50 in
+      [resnet_dist.py](https://github.com/apache/singa/blob/master/examples/cnn/autograd/resnet_dist.py)
+    - 使用模型: ResNet50 in
+      [resnet.py](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
+  - GPU: NVIDIA RTX 2080Ti \* 2
+  - MPI: 在同一节点上的两个MPI processes
+- 注释: 与上面相同
+- 结果：
+  <table style="text-align: center">
+      <tr>
+          <th style="text-align: center">Batchsize</th>
+          <th style="text-align: center">Cases</th>
+          <th style="text-align: center">Mem(MB)</th>
+          <th style="text-align: center">Time(s)</th>
+          <th style="text-align: center">Speed(it/s)</th>
+          <th style="text-align: center">Throughput</th>
+          <th style="text-align: center">Reduction</th>
+          <th style="text-align: center">Speedup</th>
+      </tr>
+      <tr>
+          <td rowspan="4">16</td>
+          <td nowrap>layer</td>
+          <td>5439</td>
+          <td>17.3323</td>
+          <td>11.5391</td>
+          <td>369.2522</td>
+          <td>0.00%</td>
+          <td>1.0000</td>
+      </tr>
+      <tr>
+          <td nowrap>model:disable graph</td>
+          <td>5427</td>
+          <td>17.8232</td>
+          <td>11.2213</td>
+          <td>359.0831</td>
+          <td>0.22%</td>
+          <td>0.9725</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, bfs</td>
+          <td>3389</td>
+          <td>18.2310</td>
+          <td>10.9703</td>
+          <td>351.0504</td>
+          <td>37.69%</td>
+          <td>0.9507</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, serial</td>
+          <td>3437</td>
+          <td>17.0389</td>
+          <td>11.7378</td>
+          <td>375.6103</td>
+          <td>36.81%</td>
+          <td>1.0172</td>
+      </tr>
+      <tr>
+          <td rowspan="4">32</td>
+          <td nowrap>layer</td>
+          <td>10547</td>
+          <td>14.8635</td>
+          <td>6.7279</td>
+          <td>430.5858</td>
+          <td>0.00%</td>
+          <td>1.0000</td>
+      </tr>
+      <tr>
+          <td nowrap>model:disable graph</td>
+          <td>10503</td>
+          <td>14.7746</td>
+          <td>6.7684</td>
+          <td>433.1748</td>
+          <td>0.42%</td>
+          <td>1.0060</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, bfs</td>
+          <td>6935</td>
+          <td>14.8553</td>
+          <td>6.7316</td>
+          <td>430.8231</td>
+          <td>34.25%</td>
+          <td>1.0006</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, serial</td>
+          <td>7027</td>
+          <td>14.3271</td>
+          <td>6.9798</td>
+          <td>446.7074</td>
+          <td>33.37%</td>
+          <td>1.0374</td>
+      </tr>
+  </table>
+
+### 结论
+
+- 在启用计算图的情况下进行训练，可以显著减少内存占用。
+- 目前，在速度上有一点改进。在效率方面还可以做更多的优化。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/half-precision.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/half-precision.md
new file mode 100644
index 0000000..b5f811e
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/half-precision.md
@@ -0,0 +1,99 @@
+---
+id: version-4.0.0_Chinese-half-precision
+title: Half Precision
+original_id: half-precision
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Half precision training 优点:
+- GPU内存占用低，可支持更大的网络。
+- 训练速度快。
+
+## Half data type
+
+### Half data type 定义
+在 IEEE 754 标准中明确binary16有如下格式：
+ [format](https://en.wikipedia.org/wiki/Half-precision_floating-point_format):
+Sign bit: 1 bit
+Exponent width: 5 bits
+Significand precision: 11 bits (10 explicitly stored)
+
+### Half data type 运算
+以fp32形式加载数据，快速转换成fp16。
+```python
+>>> from singa import tensor, device
+>>> dev = device.create_cuda_gpu()
+>>> x = tensor.random((2,3),dev)
+>>> x
+[[0.7703407  0.42764223 0.5872884 ]
+ [0.78362167 0.70469785 0.64975065]], float32
+>>> y = x.as_type(tensor.float16)
+>>> y
+[[0.7705 0.4277 0.5874]
+ [0.7837 0.7046 0.65  ]], float16
+```
+
+初级运算支持fp16格式。 
+```python
+>>> y+y
+[[1.541  0.8555 1.175 ]
+ [1.567  1.409  1.3   ]], float16
+```
+
+## Training in Half
+
+### Training in Half 三个步骤
+半精度训练只需要如下三个步骤:
+1. 加载数据并且转换成半精度数据
+2. 设置优化器的数据类型
+3. 启动训练模型
+``` python
+# cast input data to fp16
+x = load_data()
+x = x.astype(np.float16)
+tx = tensor.from_numpy(x)
+
+# load model
+model = build_model()
+# set optimizer dtype to fp16
+sgd = opt.SGD(lr=0.1, dtype=tensor.float16)
+
+# train as usual
+out, loss = model(tx, ty)
+```
+
+### 示例
+提供示例脚本`train_cnn.py`，可执行下面的命令语句开始半精度模型训练。
+```shell
+python examples/cnn/train_cnn.py cnn mnist -pfloat16
+```
+
+## 实现
+
+### Half Type 依赖性
+半精度的实现是作为通用的半精度类型支持，整合在C++后端中的。
+
+在GPU上运行时，`__half`可在Cuda math API中使用。为了支持`__half`数学运算，需要针对Nvidia compute arch > 6.0(Pascal)进行编译。
+
+
+### Nvidia Hardware Acceleration: Tensor Core
+Nvidia发布的Tensor Core进一步加速了半精度运算，并成倍提升了GEMM(CuBlas)和卷积(CuDNN)等运算的吞吐量。要启用Tensor Core运算，需要满足一些限制条件，例如GEMM的维度、卷积通道大小、Cuda版本和GPU版本（图灵或更高版本）等等。
+
+### Implement Operations
+半精度运算主要在`tensor_math_cuda.h`中实现：通过对运算模板进行半精度类型的特化，并实现相应的底层计算。
+
+示例, GEMM 运算实现如下:
+```c++
+template <>
+void GEMM<half_float::half, lang::Cuda>(const half_float::half alpha,
+                                        const Tensor& A, const Tensor& B,
+                                        const half_float::half beta, Tensor* C,
+                                        Context* ctx) {
+  // ...
+  CUBLAS_CHECK(cublasGemmEx(handle, transb, transa, ncolB, nrowA, ncolA,
+                           alphaPtr, BPtr, Btype, ldb, APtr, Atype, lda,
+                           betaPtr, CPtr, Ctype, ldc, computeType, algo));
+  // ...
+}
+```
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/history-singa.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/history-singa.md
new file mode 100644
index 0000000..c1b37bb
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/history-singa.md
@@ -0,0 +1,38 @@
+---
+id: version-4.0.0_Chinese-history-singa
+title: History of SINGA
+original_id: history-singa
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## 历史
+
+SINGA由新加坡国立大学DB System Group于2014年发起，与浙江大学数据库组合作。如果您在研究中使用SINGA，请引用以下两篇论文。
+
+- B.C. Ooi, K.-L. Tan, S. Wang, W. Wang, Q. Cai, G. Chen, J. Gao, Z. Luo, A. K.
+  H. Tung, Y. Wang, Z. Xie, M. Zhang, and K. Zheng.
+  [SINGA: A distributed deep learning platform](http://www.comp.nus.edu.sg/~ooibc/singaopen-mm15.pdf).
+  ACM Multimedia (Open Source Software Competition) 2015
+
+- W. Wang, G. Chen, T. T. A. Dinh, B. C. Ooi, K.-L.Tan, J. Gao, and S. Wang.
+  [SINGA: putting deep learning in the hands of multimedia users](http://www.comp.nus.edu.sg/~ooibc/singa-mm15.pdf).
+  ACM Multimedia 2015.
+
+Rafiki是SINGA的一个子模块，如果您在研究中使用Rafiki，请引用以下论文。
+
+- Wei Wang, Jinyang Gao, Meihui Zhang, Sheng Wang, Gang Chen, Teck Khim Ng, Beng
+  Chin Ooi, Jie Shao, Moaz Reyad.
+  [Rafiki: Machine Learning as an Analytics Service System](http://www.vldb.org/pvldb/vol12/p128-wang.pdf).
+  [VLDB 2019](http://vldb.org/2019/)
+  ([BibTex](https://dblp.org/rec/bib2/journals/pvldb/WangWGZCNOS18.bib)).
+
+[NetEase](http://tech.163.com/17/0602/17/CLUL016I00098GJ5.html),
+[yzBigData](http://www.yzbigdata.com/en/index.html),
+[Shentilium](https://shentilium.com/), [Foodlg](http://www.foodlg.com/) 以及
+[Medilot](https://medilot.com/technologies) 等公司在他们的工作和应用中使用了SINGA。
+
+## License
+
+SINGA以[Apache License Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
+版本发布。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/how-to-release.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/how-to-release.md
new file mode 100644
index 0000000..cbe962b
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/how-to-release.md
@@ -0,0 +1,173 @@
+---
+id: version-4.0.0_Chinese-how-to-release
+title: How to Prepare a Release
+original_id: how-to-release
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+这是SINGA的[发布准备流程](http://www.apache.org/dev/release-publishing.html)指南。
+
+1. 选择一个发布管理者。发布管理者（RM）是发布过程的协调者，他的签名文件（.asc）将会与发布一起上传。RM 生成 KEY (RSA 4096 位)并将其上传到公钥服务器，首先需要得到其他Apache用户对他的密钥的认可（签名），才能连接到信任网，RM需要先求助其他项目管理者帮忙认证他的密钥。[如何生成密钥？](http://www.apache.org/dev/release-signing.html)
+
+2. 检查license。 [FAQ](https://www.apache.org/legal/src-headers.html#faq-docs);
+   [SINGA Issue](https://issues.apache.org/jira/projects/SINGA/issues/SINGA-447)
+
+   - 代码库不能包含与APL不兼容的第三方代码。
+   - 依赖项与APL兼容，GNU类license不兼容。
+   - 我们编写的所有源文件都必须包含Apache license头：http://www.apache.org/legal/src-headers.html.
+   链接中有一个脚本可以帮助将这个头同步到所有文件。
+   - 更新LICENSE文件。如果我们在发行包中包含了任何非APL的第三方代码，必须要在NOTICE文件的最后注明。
+
+3. 复查版本。检查代码和文档。
+
+   - 编译过程无错误。
+   - (尽可能地)包含进单元测试。
+   - Conda包运行无误。
+   - Apache网站上的在线文档是最新的。
+
+4. 准备好RELEASE_NOTES文件。包括以下项目：介绍，特性，错误（链接到JIRA或Github PR），变更，依赖列表，不兼容问题。可以按照这个[例子](http://commons.apache.org/proper/commons-digester/commons-digester-3.0/RELEASE-NOTES.txt)来写。
+
+5. 打包候选版本。该版本应该打包成：apache-singa-VERSION.tar.gz。这个版本不应该包含任何二进制文件，包括git文件。但是CMake的编译依赖于git标签来获取版本号；要删除这个依赖，你需要手动更新CMakeLists.txt文件来设置版本号：
+
+   ```
+   # remove the following lines
+   include(GetGitRevisionDescription)
+   git_describe(VERSION --tags --dirty=-d)
+   string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" VERSION_MAJOR "${VERSION}")
+   string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" VERSION_MINOR "${VERSION}")
+   string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" VERSION_PATCH "${VERSION}")
+
+   # set the numbers manually
+   SET(PACKAGE_VERSION 3.0.0)
+   SET(VERSION 3.0.0)
+   SET(SINGA_MAJOR_VERSION 3)  # 0 -
+   SET(SINGA_MINOR_VERSION 0)  # 0 - 9
+   SET(SINGA_PATCH_VERSION 0)  # 0 - 99
+   ```
+
+   将软件包上传到[stage repo](https://dist.apache.org/repos/dist/dev/singa/)。应包括tar文件、签名、KEY和SHA256校验和文件。MD5不再使用，详细规则在[这里](http://www.apache.org/dev/release-distribution#sigs-and-sums)。阶段文件夹应该包括：
+
+   - apache-singa-VERSION.tar.gz
+   - apache-singa-VERSION.asc
+   - apache-singa-VERSION.SHA256
+
+   创建这些文件并上传到stage svn repo的命令如下：
+
+   ```sh
+   # in singa repo
+   rm -rf .git
+   rm -rf rafiki/*
+   cd ..
+   tar -czvf apache-singa-VERSION.tar.gz  singa/
+
+   mkdir stage
+   cd stage
+   svn co https://dist.apache.org/repos/dist/dev/singa/
+   cd singa
+   # copy the KEYS file from singa repo to this folder if it is not here
+   cp ../../singa/KEYS .
+   mkdir VERSION
+   # copy the tar.gz file
+   mv ../../apache-singa-VERSION.tar.gz VERSION/
+   cd VERSION
+   sha512sum apache-singa-VERSION.tar.gz > apache-singa-VERSION.tar.gz.sha512
+   gpg --armor --output apache-singa-VERSION.tar.gz.asc --detach-sig apache-singa-VERSION.tar.gz
+   cd ..
+   svn add VERSION
+   svn commit
+   ```
+
+6) 通过发送电子邮件的方式进行投票。现举例如下：
+
+```
+   To: dev@singa.apache.org
+   Subject: [VOTE] Release apache-singa-X.Y.Z (release candidate N)
+
+   Hi all,
+
+   I have created a build for Apache SINGA 3.1.0, release candidate 2.
+
+   The release note is at
+   https://github.com/apache/singa/blob/master/RELEASE_NOTES.
+
+   The artifacts to be voted on are located here:
+   https://dist.apache.org/repos/dist/dev/singa/3.1.0.rc2/apache-singa-3.1.0.rc2.tar.gz
+    
+   The hashes of the artifacts are as follows:
+   SHA512: 84545499ad36da108c6a599edd1d853f82d331bc03273b5278515554866f0c698e881f956b2eabcb6b29c07fa9fa4ff1add5a777b58db8a6a2362cf383b5c04d 
+
+   Release artifacts are signed with the following key:
+   https://dist.apache.org/repos/dist/dev/singa/KEYS
+
+   The signature file is:
+   https://dist.apache.org/repos/dist/dev/singa/3.1.0.rc2/apache-singa-3.1.0.rc2.tar.gz.asc
+
+   The Github tag is at:
+   https://github.com/apache/singa/releases/tag/3.1.0.rc2
+
+   The documentation website is at
+   http://singa.apache.org/docs/next/installation/
+
+   Some examples are available for testing:
+   https://github.com/apache/singa/tree/master/examples
+   
+
+
+   Please vote on releasing this package. The vote is open for at least 72 hours and passes if a majority of at least three +1 votes are cast.
+
+   [ ] +1 Release this package as Apache SINGA X.Y.Z 
+
+   [ ] 0 I don't feel strongly about it, but I'm okay with the release 
+
+   [ ] -1 Do not release this package because...
+
+   Here is my vote: +1
+
+```
+
+7) 等待至少72小时的测试回复。任何PMC、提交者或贡献者都可以测试发布的功能，以及反馈。大家在投票+1之前应该检查这些。如果投票通过，则发送如下的结果邮件，否则，从头开始重复刚刚的步骤。
+
+```
+
+To: dev@singa.apache.org
+Subject: [RESULT][vote] Release apache-singa-X.Y.Z (release candidate N)
+
+Thanks to everyone who has voted and given their comments. The tally is as
+follows.
+
+N binding +1s: <names>
+
+N non-binding +1s: <names>
+
+No 0s or -1s.
+
+I am delighted to announce that the proposal to release Apache SINGA X.Y.Z has
+passed.
+
+```
+
+8) 将软件包上传至 https://dist.apache.org/repos/dist/release/singa/，以便[distribution](http://www.apache.org/dev/release-publishing.html#distribution)。
+
+9) 更新SINGA网站的下载页面。tar.gz 文件必须从镜像下载，使用 closer.cgi 脚本；其他工件必须从 Apache 主站点下载。更多细节请看[这里](http://www.apache.org/dev/release-download-pages.html)。我们在之前的版本中得到的一些反馈。“下载页面必须只链接到正式发布的版本，所以不能包含到GitHub的链接”，“链接到KEYS, sig和Hash的链接不能使用dist.apache.org而应该使用 https://www.apache.org/dist/singa/...”“而且你只需要一个KEYS链接，而且应该描述如何使用KEYS+sig或Hash来验证下载。”
+
+10) 删除RC标签并编译conda包。
+
+11) 发布release信息：
+
+ ```
+ To: announce@apache.org, dev@singa.apache.org
+ Subject: [ANNOUNCE] Apache SINGA X.Y.Z released
+
+ We are pleased to announce that SINGA X.Y.Z is released.
+
+ SINGA is a general distributed deep learning platform
+ for training big deep learning models over large datasets.
+ The release is available at: http://singa.apache.org/downloads.html
+ The main features of this release include XXX
+ We look forward to hearing your feedback, suggestions,
+ and contributions to the project.
+
+ On behalf of the SINGA team, {SINGA Team Member Name}
+ ```
+
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/install-win.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/install-win.md
new file mode 100644
index 0000000..ee92c73
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/install-win.md
@@ -0,0 +1,360 @@
+---
+id: version-4.0.0_Chinese-install-win
+title: Build SINGA on Windows
+original_id: install-win
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+在Microsoft Windows上从源码构建SINGA的过程有四个部分：安装依赖关系、构建SINGA源码、（可选择）安装python模块和（可选择）运行单元测试。
+
+## 安装依赖项
+
+你可以创建一个文件夹来构建依赖关系。
+
+使用到的依赖项有：
+
+- 编译器和IDE：
+  - Visual Studio，社区版是免费的，可以用来构建SINGA。
+    https://www.visualstudio.com/
+- CMake
+  - 可以从 http://cmake.org/ 下载
+  - 确保 cmake 可执行文件的路径在系统路径中，或者在调用 cmake 时使用完整路径。
+- SWIG
+  - 可以从 http://swig.org/ 下载
+  - 确保swig可执行文件的路径在系统路径中，或者在调用swig时使用完整路径。请使用最新的版本，如3.0.12。
+
+- Protocol Buffers
+  - 下载一个合适的版本，如2.6.1:
+    https://github.com/google/protobuf/releases/tag/v2.6.1 。
+  - 下载 protobuf-2.6.1.zip 和 protoc-2.6.1-win32.zip。
+  - 将这两个文件解压到dependencies文件夹中，将protoc可执行文件的路径添加到系统路径中，或者在调用它时使用完整路径。
+  - 打开Visual Studio solution，它可以在vsproject文件夹中找到。
+  - 将build settings改为Release和x64。
+  - 构建libprotobuf项目。
+
+- Openblas
+  - 从 http://www.openblas.net 下载合适的源码，如0.2.20。
+  - 将源码解压到dependencies文件夹中。
+  - 如果你没有安装Perl，请下载一个perl环境，如Strawberry Perl (http://strawberryperl.com/)。
+  - 在源文件夹中运行此命令来构建Visual Studio解决方案：
+
+  ```bash
+  cmake -G "Visual Studio 15 2017 Win64"
+  ```
+
+  - 打开Visual Studio解决方案并将build settings更改为Release和x64。
+  - 构建libopenblas项目。
+
+- Google glog
+  - 下载一个合适的版本，如0.3.5:
+    https://github.com/google/glog/releases
+  - 将源码解压到dependencies文件夹中。
+  - 打开Visual Studio solution.
+  - 将build settings改为Release and x64.
+  - 构建libglog项目。
+
+## 构建SINGA源代码
+
+- 下载SINGA源代码
+- 编译protobuf文件:
+
+  - 在src/proto目录中：
+
+  ```shell
+  mkdir python_out
+  protoc.exe *.proto --python_out python_out
+  ```
+
+- 为C++和Python生成swig接口：在src/api目录中：
+
+  ```shell
+  swig -python -c++ singa.i
+  ```
+
+- 生成SINGA的Visual Studio解决方案：在SINGA源码根目录中：
+
+  ```shell
+  mkdir build
+  cd build
+  ```
+
+- 调用 cmake 并添加系统路径，类似于下面的例子:
+
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64" ^
+    -DGLOG_INCLUDE_DIR="D:/WinSinga/dependencies/glog-0.3.5/src/windows" ^
+    -DGLOG_LIBRARIES="D:/WinSinga/dependencies/glog-0.3.5/x64/Release" ^
+    -DCBLAS_INCLUDE_DIR="D:/WinSinga/dependencies/openblas-0.2.20/lapack-netlib/CBLAS/include" ^
+    -DCBLAS_LIBRARIES="D:/WinSinga/dependencies/openblas-0.2.20/lib/RELEASE" ^
+    -DProtobuf_INCLUDE_DIR="D:/WinSinga/dependencies/protobuf-2.6.1/src" ^
+    -DProtobuf_LIBRARIES="D:/WinSinga/dependencies/protobuf-2.6.1/vsprojects/x64/Release" ^
+    -DProtobuf_PROTOC_EXECUTABLE="D:/WinSinga/dependencies/protoc-2.6.1-win32/protoc.exe" ^
+    ..
+  ```
+
+- 在Visual Studio中打开生成的解决方案。
+- 将构建设置改为Release和x64。
+- 将src/api中的singa_wrap.cxx文件添加到singa_objects项目中。
+- 在 singa_objects 项目中，打开 Additional Include Directories。
+- 添加Python的include路径。
+- 添加numpy的include路径。
+- 添加protobuf的include路径。
+- 在 singa_objects 项目的预处理程序定义中， 添加 USE_GLOG。
+- 构建singa_objects项目。
+
+- 在singa项目中:
+  - 将singa_wrap.obj添加到对象库。
+  - 将目标名称改为"_singa_wrap"。
+  - 将目标扩展名改为.pyd。
+  - 将配置类型改为动态库(.dll)。
+  - 进入Additional Library Directories，添加路径到python、openblas、protobuf和glog库。
+  - 在Additional Dependencies中添加libopenblas.lib、libglog.lib和libprotobuf.lib。
+
+- 构建singa项目
+
+## 安装python模块
+
+
+- 将build/python/setup.py中的`_singa_wrap.so`改为`_singa_wrap.pyd`。
+- 将`src/proto/python_out`中的文件复制到`build/python/singa/proto`中。
+
+- （可选）创建并激活一个虚拟环境：
+  ```shell
+  mkdir SingaEnv
+  virtualenv SingaEnv
+  SingaEnv\Scripts\activate
+  ```
+
+- 进入build/python文件夹，运行:
+
+  ```shell
+  python setup.py install
+  ```
+
+- 将 _singa_wrap.pyd、libglog.dll 和 libopenblas.dll 添加到路径中，或者将它们复制到 python site-packages 中的 singa package 文件夹中。
+
+
+- 通过运行如下命令，来验证SINGA是否安装成功：
+
+  ```shell
+  python -c "from singa import tensor"
+  ```
+
+构建过程的视频教程可以在这里找到：
+
+[![youtube video](https://img.youtube.com/vi/cteER7WeiGk/0.jpg)](https://www.youtube.com/watch?v=cteER7WeiGk)
+
+## 运行单元测试
+
+- 在测试文件夹中，生成Visual Studio解决方案：
+
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64"
+  ```
+
+- 在Visual Studio中打开生成的解决方案。
+
+- 更改build settings为Release和x64。
+
+- 构建glog项目。
+
+- 在test_singa项目中:
+  - 将 USE_GLOG 添加到Preprocessor Definitions中。
+  - 在 Additional Include Directories 中， 添加上面第 2 步中使用的 GLOG_INCLUDE_DIR、 CBLAS_INCLUDE_DIR 和 Protobuf_INCLUDE_DIR 的路径。同时添加build和build/include文件夹。
+  - 转到Additional Library Directories，添加openblas、protobuf和glog库的路径。同时添加 build/src/singa_objects.dir/Release。
+  - 转到 Additional Dependencies 并添加 libopenblas.lib、libglog.lib 和 libprotobuf.lib。修改两个库的名字：gtest.lib和singa_objects.lib。
+
+- 构建test_singa项目。
+
+- 将libglog.dll和libopenblas.dll添加到路径中，或者将它们复制到test/release文件夹中，使其可用。
+
+- 单元测试可以通过如下方式执行：
+
+  - 从命令行:
+
+  ```shell
+  test_singa.exe
+  ```
+
+  - 从Visual Studio:
+    - 右键点击test_singa项目，选择 "Set as StartUp Project"。
+    - 在Debug菜单中，选择'Start Without Debugging'。
+
+单元测试的视频教程可以在这里找到:
+
+[![youtube video](https://img.youtube.com/vi/393gPtzMN1k/0.jpg)](https://www.youtube.com/watch?v=393gPtzMN1k)
+
+## 构建包含cuda的GPU支持
+
+在本节中，我们将扩展前面的步骤来启用GPU。
+
+### 安装依赖项
+
+除了上面第1节的依赖关系外，我们还需要以下内容：
+
+- CUDA
+
+  从 https://developer.nvidia.com/cuda-downloads 下载一个合适的版本，如9.1。确保已经安装了Visual Studio集成模块。
+
+
+- cuDNN
+
+  从 https://developer.nvidia.com/cudnn 下载一个合适的版本，如7.1。
+
+- cnmem:
+
+  - 从 https://github.com/NVIDIA/cnmem 下载最新版本。
+  - 构建Visual Studio解决方案：
+
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64"
+  ```
+
+  - 在Visual Studio中打开生成的解决方案。
+  - 将build settings改为Release和x64。
+  - 构建cnmem项目。
+
+### 构建SINGA源代码
+
+- 调用 cmake 并添加系统路径，类似于下面的例子：
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64" ^
+    -DGLOG_INCLUDE_DIR="D:/WinSinga/dependencies/glog-0.3.5/src/windows" ^
+    -DGLOG_LIBRARIES="D:/WinSinga/dependencies/glog-0.3.5/x64/Release" ^
+    -DCBLAS_INCLUDE_DIR="D:/WinSinga/dependencies/openblas-0.2.20/lapack-netlib/CBLAS/include" ^
+    -DCBLAS_LIBRARIES="D:/WinSinga/dependencies/openblas-0.2.20/lib/RELEASE" ^
+    -DProtobuf_INCLUDE_DIR="D:/WinSinga/dependencies/protobuf-2.6.1/src" ^
+    -DProtobuf_LIBRARIES="D:/WinSinga/dependencies/protobuf-2.6.1/vsprojects/x64/Release" ^
+    -DProtobuf_PROTOC_EXECUTABLE="D:/WinSinga/dependencies/protoc-2.6.1-win32/protoc.exe" ^
+    -DCUDNN_INCLUDE_DIR=D:\WinSinga\dependencies\cudnn-9.1-windows10-x64-v7.1\cuda\include ^
+    -DCUDNN_LIBRARIES=D:\WinSinga\dependencies\cudnn-9.1-windows10-x64-v7.1\cuda\lib\x64 ^
+    -DSWIG_DIR=D:\WinSinga\dependencies\swigwin-3.0.12 ^
+    -DSWIG_EXECUTABLE=D:\WinSinga\dependencies\swigwin-3.0.12\swig.exe ^
+    -DUSE_CUDA=YES ^
+    -DCUDNN_VERSION=7 ^
+    ..
+  ```
+
+* 为C++和Python生成swig接口。在src/api目录中：
+
+  ```shell
+  swig -python -c++ singa.i
+  ```
+
+* 在Visual Studio中打开生成的解决方案
+
+* 将build settings改为Release和x64
+
+#### 构建singa_objects
+
+- 将src/api中的singa_wrap.cxx文件添加到singa_objects项目中。
+- 在 singa_objects 项目中，打开 Additional Include Directories。
+- 添加Python的include路径
+- 添加numpy include路径
+- 添加protobuf的include路径
+- 增加CUDA、cuDNN和cnmem的包含路径。
+- 在 singa_objects 项目的预处理程序定义中， 加入 USE_GLOG、 USE_CUDA 和 USE_CUDNN。删除 DISABLE_WARNINGS。
+- 建立 singa_objects 项目
+
+#### 构建singa-kernel
+
+
+- 创建一个新的Visual Studio项目，类型为 "CUDA 9.1 Runtime"。给它起个名字，比如singa-kernel。
+- 该项目自带一个名为kernel.cu的初始文件，从项目中删除这个文件。
+- 添加这个文件：src/core/tensor/math_kernel.cu。
+- 在项目设置中。
+
+  - 将平台工具集设置为 "Visual Studio 2015 (v140)"
+  - 将 "配置类型 "设置为 "静态库(.lib)"
+  - 在include目录中，添加build/include。
+
+- 建立singa-kernel项目
+
+#### 构建singa
+
+- 在singa项目中：
+
+  - 将singa_wrap.obj添加到对象库中。
+  - 将目标名称改为"_singa_wrap"。
+  - 将目标扩展名改为.pyd。
+  - 将配置类型改为动态库(.dll)。
+  - 到Additional Library Directories中添加python、openblas、protobuf和glog库的路径。
+  - 同时添加singa-kernel、cnmem、cuda和cudnn的library path。
+  - 到Additional Dependencies，并添加libopenblas.lib、libglog.lib和 libprotobuf.lib。
+  - 另外还要添加：singa-kernel.lib、cnmem.lib、cudnn.lib、cuda.lib、cublas.lib、curand.lib和cudart.lib。
+
+- 构建singa项目。
+
+### 安装python模块
+
+- 将 build/python/setup.py 中的 _singa_wrap.so 改为 _singa_wrap.pyd。
+
+- 将 src/proto/python_out 中的文件复制到 build/python/singa/proto 中。
+
+- （可选） 创建并激活虚拟环境:
+
+  ```shell
+  mkdir SingaEnv
+  virtualenv SingaEnv
+  SingaEnv\Scripts\activate
+  ```
+
+- 进入build/python文件夹，运行:
+
+  ```shell
+  python setup.py install
+  ```
+
+- 将 _singa_wrap.pyd, libglog.dll, libopenblas.dll, cnmem.dll, CUDA Runtime (例如 cudart64_91.dll) 和 cuDNN (例如 cudnn64_7.dll) 添加到路径中，或者将它们复制到 python site-packages 中的 singa package 文件夹中。
+
+- 通过运行如下命令来验证SINGA是否已经安装：
+
+  ```shell
+  python -c "from singa import device; dev = device.create_cuda_gpu()"
+  ```
+
+这个部分的视频教程可以在这里找到：
+
+[![youtube video](https://img.youtube.com/vi/YasKVjRtuDs/0.jpg)](https://www.youtube.com/watch?v=YasKVjRtuDs)
+
+### 运行单元测试
+
+- 在测试文件夹中，生成Visual Studio解决方案：
+
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64"
+  ```
+
+- 在Visual Studio中打开生成的解决方案，或者将项目添加到步骤5.2中创建的singa解决方案中。
+
+- 将build settings改为Release和x64。
+
+- 构建 glog 项目。
+
+- 在test_singa项目中:
+
+  - 将 USE_GLOG; USE_CUDA; USE_CUDNN 添加到Preprocessor Definitions中。
+  - 在 Additional Include Directories 中， 添加上面 5.2 中使用的 GLOG_INCLUDE_DIR、 CBLAS_INCLUDE_DIR 和 Protobuf_INCLUDE_DIR 的路径。同时添加build、build/include、CUDA和cuDNN的include文件夹。
+  - 转到Additional Library Directories，添加openblas、protobuf和glog库的路径。同时添加 build/src/singa_objects.dir/Release、singa-kernel、cnmem、CUDA 和 cuDNN 库的路径。
+  - 在Additional Dependencies中添加libopenblas.lib; libglog.lib; libprotobuf.lib; cnmem.lib; cudnn.lib; cuda.lib; cublas.lib; curand.lib; cudart.lib; singa-kernel.lib。修正两个库的名字：gtest.lib和singa_objects.lib。
+
+* 构建.
+
+* 将libglog.dll、libopenblas.dll、cnmem.dll、cudart64_91.dll和cudnn64_7.dll添加到路径中，或将它们复制到test/release文件夹中，使其可用。
+
+* 单元测试可以通过如下方式执行：
+
+  - 从命令行:
+
+    ```shell
+    test_singa.exe
+    ```
+
+  - 从 Visual Studio:
+    - 右键点击test_singa项目，选择 'Set as StartUp Project'.
+    - 从Debug菜单，选择 'Start Without Debugging'
+
+运行单元测试的视频教程可以在这里找到：
+
+[![youtube video](https://img.youtube.com/vi/YOjwtrvTPn4/0.jpg)](https://www.youtube.com/watch?v=YOjwtrvTPn4)
diff --git a/docs-site/docs/installation.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/installation.md
similarity index 53%
copy from docs-site/docs/installation.md
copy to docs-site/website/versioned_docs/version-4.0.0_Chinese/installation.md
index bc5fa57..7e7a1a8 100644
--- a/docs-site/docs/installation.md
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/installation.md
@@ -1,81 +1,68 @@
 ---
-id: installation
+id: version-4.0.0_Chinese-installation
 title: Installation
+original_id: installation
 ---
 
 <!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
 
-## Using Pip
+## 使用pip
 
-[Miniconda3](https://conda.io/miniconda.html) is recommended to use with SINGA.
-After installing miniconda, execute the one of the following commands to install
-SINGA.
+推荐使用[Miniconda3](https://conda.io/miniconda.html)来配合SINGA使用，安装miniconda后，执行以下命令之一安装SINGA。
 
-**SINGA works with python 3.6, 3.7 and 3.8.**
-
-1. CPU only
+1. 只使用CPU
    [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/17RA056Brwk0vBQTFaZ-l9EbqwADO0NA9?usp=sharing)
 
 ```bash
 pip install singa -f http://singa.apache.org/docs/next/wheel-cpu.html --trusted-host singa.apache.org
 ```
 
-You can install a specific version of SINGA via `singa==<version>`, where the
-`<version>` field should be replaced, e.g., `3.3.0`. The available SINGA
-versions are listed at the link.
+您可以通过`singa==<version>`安装特定版本的SINGA，其中`<version>`字段应被替换，例如`4.0.0`。可用的SINGA版本在链接中列出。
+
+要安装最新的开发版本，请将链接替换为
+http://singa.apache.org/docs/next/wheel-cpu-dev.html
 
-2. GPU With CUDA and cuDNN
+2. 使用CUDA和cuDNN的GPU
    [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1W30IPCqj5fG8ADAQsFqclaCLyIclVcJL?usp=sharing)
 
 ```bash
 pip install singa -f http://singa.apache.org/docs/next/wheel-gpu.html --trusted-host singa.apache.org
 ```
 
-You can also configure SINGA version and the CUDA version, like
-`singa==3.3.0+cuda10.2`. The available combinations of SINGA version and CUDA
-version are listed at the link.
+您也可以配置SINGA版本和CUDA版本，比如`singa==4.0.0+cuda10.2`，SINGA版本和CUDA版本的可用组合在链接中列出。
 
-Note: the Python version of your local Python environment will be used to find
-the corresponding wheel package. For example, if your local Python is 3.6, then
-the wheel package compiled on Python 3.6 will be selected by pip and installed.
-In fact, the wheel file's name include SINGA version, CUDA version and Python
-version. Therefore, `pip` knows which wheel file to download and install.
+要安装最新的开发版本，请将链接替换为
+http://singa.apache.org/docs/next/wheel-gpu-dev.html
 
-Refer to the comments at the top of the `setup.py` file for how to build the
-wheel packages.
+注意：你本地Python环境的Python版本将被用来寻找相应的wheel包。例如，如果你本地的Python是3.6，那么就会通过pip选择在Python 3.6上编译的wheel包并安装。事实上，wheel文件的名称包括SINGA版本、CUDA版本和Python版本。因此，`pip`知道要下载和安装哪个wheel文件。
 
-If there is no error message from
+参考setup.py文件顶部的注释，了解如何构建wheel包。
 
+如果运行以下命令没有报错：
 ```shell
 $ python -c "from singa import tensor"
 ```
 
-then SINGA is installed successfully.
+那么SINGA就安装成功了。
 
-## Using Docker
+## 使用Docker
 
-Install Docker on your local host machine following the
-[instructions](https://docs.docker.com/install/). Add your user into the
-[docker group](https://docs.docker.com/install/linux/linux-postinstall/) to run
-docker commands without `sudo`.
+按照[说明](https://docs.docker.com/install/)在你的本地主机上安装Docker。将您的用户添加到[docker组](https://docs.docker.com/install/linux/linux-postinstall/)中，以便在没有`sudo`的情况下运行docker命令。
 
-1. CPU-only.
+1. 仅使用CPU
 
 ```shell
 $ docker run -it apache/singa:X.Y.Z-cpu-ubuntu16.04 /bin/bash
 ```
 
-2. With GPU enabled. Install
-   [Nvidia-Docker](https://github.com/NVIDIA/nvidia-docker) after install
-   Docker.
+2. 要使用GPU，在安装Docker后安装
+   [Nvidia-Docker](https://github.com/NVIDIA/nvidia-docker) 
 
 ```shell
 $ nvidia-docker run -it apache/singa:X.Y.Z-cuda9.0-cudnn7.4.2-ubuntu16.04 /bin/bash
 ```
 
-3. For the complete list of SINGA Docker images (tags), visit the
-   [docker hub site](https://hub.docker.com/r/apache/singa/). For each docker
-   image, the tag is named as
+3. 关于SINGA Docker镜像（标签）的完整列表，请访问[docker hub site](https://hub.docker.com/r/apache/singa/)。对于每个docker镜像，标签的命名为：
 
 ```shell
 version-(cpu|gpu)[-devel]
@@ -89,16 +76,14 @@ version-(cpu|gpu)[-devel]
 | `devel`   | indicator for development        | if absent, SINGA Python package is installed for runtime only; if present, the building environment is also created, you can recompile SINGA from source at '/root/singa' |
 | `OS`      | indicate OS version number       | 'ubuntu16.04', 'ubuntu18.04'                                                                                                                                              |
 
-## From source
-
-You can [build and install SINGA](build.md) from the source code using native
-building tools or conda-build, on local host OS or in a Docker container.
+## 从源码编译
+您可以使用本地构建工具或conda-build在本地主机操作系统上或在Docker容器中从源代码[构建和安装SINGA](build.md)。
 
 ## FAQ
 
-- Q: Error from `from singa import tensor`
+- Q: `from singa import tensor`错误
 
-  A: Check the detailed error from
+  A: 执行下面的命令，检查详细的错误：
 
   ```shell
   python -c  "from singa import _singa_wrap"
@@ -108,12 +93,8 @@ building tools or conda-build, on local host OS or in a Docker container.
   >> import importlib
   >> importlib.import_module('_singa_wrap')
   ```
-
-  The folder of `_singa_wrap.so` is like
-  `~/miniconda3/lib/python3.7/site-packages/singa`. Normally, the error is
-  caused by the mismatch or missing of dependent libraries, e.g. cuDNN or
-  protobuf. The solution is to create a new virtual environment and install
-  SINGA in that environment, e.g.,
+  `_singa_wrap.so` 的文件夹是 `~/miniconda3/lib/python3.7/site-packages/singa`。通常情况下，这个错误是由于依赖的库不匹配或缺失造成的，例如 cuDNN 或 protobuf。解决方法是创建一个新的虚拟环境，并在该环境中安装SINGA，例如：
+  
 
   ```shell
   conda create -n singa
@@ -121,19 +102,12 @@ building tools or conda-build, on local host OS or in a Docker container.
   conda install -c nusdbsystem -c conda-forge singa-cpu
   ```
 
-- Q: When using virtual environment, every time I install SINGA, numpy would be
-  reinstalled. However, the numpy is not used when I run `import numpy`
+- Q: 使用虚拟环境时，每次安装SINGA时，都会重新安装numpy。但是，当我运行`import numpy`时，numpy没有被使用。
 
-  A: It could be caused by the `PYTHONPATH` environment variable which should be
-  set to empty when you are using virtual environment to avoid the conflicts
-  with the path of the virtual environment.
+  A: 
+  这可能是由`PYTHONPATH`环境变量引起的，在使用虚拟环境时，应将其设置为空，以避免与虚拟环境的路径冲突。
 
-- Q: When I run SINGA in Mac OS X, I got the error "Fatal Python error:
+- Q: 当我在Mac OS X中运行SINGA时，得到如下错误 "Fatal Python error:
   PyThreadState_Get: no current thread Abort trap: 6"
 
-  A: This error happens typically when you have multiple versions of Python in
-  your system, e.g, the one comes with the OS and the one installed by Homebrew.
-  The Python linked by SINGA must be the same as the Python interpreter. You can
-  check your interpreter by `which python` and check the Python linked by SINGA
-  via `otool -L <path to _singa_wrap.so>`. This problem should be resolved if
-  SINGA is installed via conda.
+  A: 这个错误通常发生在系统中有多个 Python 版本的时候，例如，操作系统自带的版本和 Homebrew 安装的版本。SINGA链接的Python必须与Python解释器相同。您可以通过`which python`来检查解释器python版本并通过`otool -L <path to _singa_wrap.so>` 检查 SINGA 链接的 Python，如果通过conda安装SINGA，这个问题应该可以解决。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/issue-tracking.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/issue-tracking.md
new file mode 100644
index 0000000..465271f
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/issue-tracking.md
@@ -0,0 +1,11 @@
+---
+id: version-4.0.0_Chinese-issue-tracking
+title: Issue Tracking
+original_id: issue-tracking
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA使用[JIRA](https://issues.apache.org/jira/browse/singa)来管理问题，包括bug、新功能和讨论。
+
+我们现在正在转移到[Github Issues](https://github.com/apache/singa/issues)。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/mail-lists.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/mail-lists.md
new file mode 100644
index 0000000..1a14d5f
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/mail-lists.md
@@ -0,0 +1,15 @@
+---
+id: version-4.0.0_Chinese-mail-lists
+title: Project Mailing Lists
+original_id: mail-lists
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+这些是为这个项目建立的邮件列表。每个列表都有一个订阅、退订和归档链接：
+
+| Name        | Post                                 | Subscribe                                                        | Unsubscribe                                                          | Archive                                                                             |
+| ----------- | ------------------------------------ | ---------------------------------------------------------------- | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------- |
+| Development | <de...@singa.incubator.apache.org>     | [Subscribe](mailto:dev-subscribe@singa.incubator.apache.org)     | [Unsubscribe](mailto:dev-unsubscribe@singa.incubator.apache.org)     | [mail-archives.apache.org](http://mail-archives.apache.org/mod_mbox/singa-dev/)     |
+| Commits     | <co...@singa.incubator.apache.org> | [Subscribe](mailto:commits-subscribe@singa.incubator.apache.org) | [Unsubscribe](mailto:commits-unsubscribe@singa.incubator.apache.org) | [mail-archives.apache.org](http://mail-archives.apache.org/mod_mbox/singa-commits/) |
+| Security    | <se...@singa.apache.org>          | private                                                          | private                                                              | private                                                                             |
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/onnx.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/onnx.md
new file mode 100644
index 0000000..6e9f330
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/onnx.md
@@ -0,0 +1,674 @@
+---
+id: version-4.0.0_Chinese-onnx
+title: ONNX
+original_id: onnx
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+[ONNX](https://onnx.ai/) 是机器学习模型的开放表示格式，它使AI开发人员能够在不同的库和工具中使用模型。SINGA支持加载ONNX格式模型用于训练和inference，并将使用SINGA API（如[Module](./module)）定义的模型保存为ONNX格式。
+
+SINGA在以下[版本](https://github.com/onnx/onnx/blob/master/docs/Versioning.md)中的ONNX中测试过。
+
+| ONNX version | File format version | Opset version ai.onnx | Opset version ai.onnx.ml | Opset version ai.onnx.training |
+| ------------ | ------------------- | --------------------- | ------------------------ | ------------------------------ |
+| 1.6.0        | 6                   | 11                    | 2                        | -                              |
+
+## 通常用法
+
+### 从ONNX中读取一个Model到SINGA
+
+在通过 `onnx.load` 从磁盘加载 ONNX 模型后，您需要更新模型的batch_size，因为对于大多数模型来说，它们使用一个占位符来表示其批处理量。我们在这里举一个例子，若要 `update_batch_size`，你只需要更新输入和输出的 batch_size，内部的 tensors 的形状会自动推断出来。
+
+
+然后，您可以使用 `sonnx.prepare` 来准备 SINGA 模型。该函数将 ONNX 模型图中的所有节点迭代并翻译成 SINGA 运算符，加载所有存储的权重并推断每个中间张量的形状。
+
+```python3
+import onnx
+from singa import device
+from singa import sonnx
+
+# if the input has multiple tensors? can put this function inside prepare()?
+def update_batch_size(onnx_model, batch_size):
+    model_input = onnx_model.graph.input[0]
+    model_input.type.tensor_type.shape.dim[0].dim_value = batch_size
+    model_output = onnx_model.graph.output[0]
+    model_output.type.tensor_type.shape.dim[0].dim_value = batch_size
+    return onnx_model
+
+
+model_path = "PATH/To/ONNX/MODEL"
+onnx_model = onnx.load(model_path)
+
+# set batch size
+onnx_model = update_batch_size(onnx_model, 1)
+
+# convert onnx graph nodes into SINGA operators
+dev = device.create_cuda_gpu()
+sg_ir = sonnx.prepare(onnx_model, device=dev)
+```
+
+### Inference SINGA模型
+
+一旦创建了模型，就可以通过调用`sg_ir.run`进行inference。输入和输出必须是SINGA Tensor实例，由于SINGA模型以列表形式返回输出，如果只有一个输出，你只需要从输出中取第一个元素即可。
+
+```python3
+# can wrap the following code in prepare()
+# and provide a flag training=True/False?
+
+class Infer:
+
+
+    def __init__(self, sg_ir):
+        self.sg_ir = sg_ir
+
+    def forward(self, x):
+        return sg_ir.run([x])[0]
+
+
+data = get_dataset()
+x = tensor.Tensor(device=dev, data=data)
+
+model = Infer(sg_ir)
+y = model.forward(x)
+```
+
+### 将SINGA模型保存成ONNX格式
+
+给定输入张量以及由模型中的运算符生成的输出张量，就可以追溯所有内部操作。因此，一个SINGA模型是由输入和输出张量定义的，要将 SINGA 模型导出为 ONNX 格式，您只需提供输入和输出张量列表。
+
+```python3
+# x is the input tensor, y is the output tensor
+sonnx.to_onnx([x], [y])
+```
+
+### 在ONNX模型上重新训练
+
+要使用 SINGA 训练（或改进）ONNX 模型，您需要设置内部的张量为可训练状态：
+
+```python3
+class Infer:
+
+    def __init__(self, sg_ir):
+        self.sg_ir = sg_ir
+        ## can wrap these codes in sonnx?
+        for idx, tens in sg_ir.tensor_map.items():
+            # allow the tensors to be updated
+            tens.requires_grad = True
+            tens.stores_grad = True
+
+    def forward(self, x):
+        return sg_ir.run([x])[0]
+
+autograd.training = False
+model = Infer(sg_ir)
+
+autograd.training = True
+# then you train the model like normal
+# give more details??
+```
+
+### 在ONNX模型上做迁移学习
+
+您也可以在ONNX模型的最后附加一些层来进行迁移学习。`last_layers` 意味着您从 [0, last_layers] 切断 ONNX 层。然后您可以通过普通的SINGA模型附加更多的层。
+
+```python3
+class Trans:
+
+    def __init__(self, sg_ir, last_layers):
+        self.sg_ir = sg_ir
+        self.last_layers = last_layers
+        self.append_linear1 = autograd.Linear(500, 128, bias=False)
+        self.append_linear2 = autograd.Linear(128, 32, bias=False)
+        self.append_linear3 = autograd.Linear(32, 10, bias=False)
+
+    def forward(self, x):
+        y = sg_ir.run([x], last_layers=self.last_layers)[0]
+        y = self.append_linear1(y)
+        y = autograd.relu(y)
+        y = self.append_linear2(y)
+        y = autograd.relu(y)
+        y = self.append_linear3(y)
+        y = autograd.relu(y)
+        return y
+
+autograd.training = False
+model = Trans(sg_ir, -1)
+
+# then you train the model like normal
+```
+
+## 一个完整示例
+
+本部分以mnist为例，介绍SINGA ONNX的使用方法。在这部分，将展示如何导出、加载、inference、再训练和迁移学习 mnist 模型的例子。您可以在[这里](https://colab.research.google.com/drive/1-YOfQqqw3HNhS8WpB8xjDQYutRdUdmCq)试用这部分内容。
+
+### 读取数据集
+
+首先，你需要导入一些必要的库，并定义一些辅助函数来下载和预处理数据集：
+
+```python
+import os
+import urllib.request
+import gzip
+import numpy as np
+import codecs
+
+from singa import device
+from singa import tensor
+from singa import opt
+from singa import autograd
+from singa import sonnx
+import onnx
+
+
+def load_dataset():
+    train_x_url = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'
+    train_y_url = 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz'
+    valid_x_url = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz'
+    valid_y_url = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
+    train_x = read_image_file(check_exist_or_download(train_x_url)).astype(
+        np.float32)
+    train_y = read_label_file(check_exist_or_download(train_y_url)).astype(
+        np.float32)
+    valid_x = read_image_file(check_exist_or_download(valid_x_url)).astype(
+        np.float32)
+    valid_y = read_label_file(check_exist_or_download(valid_y_url)).astype(
+        np.float32)
+    return train_x, train_y, valid_x, valid_y
+
+
+def check_exist_or_download(url):
+
+    download_dir = '/tmp/'
+
+    name = url.rsplit('/', 1)[-1]
+    filename = os.path.join(download_dir, name)
+    if not os.path.isfile(filename):
+        print("Downloading %s" % url)
+        urllib.request.urlretrieve(url, filename)
+    return filename
+
+
+def read_label_file(path):
+    with gzip.open(path, 'rb') as f:
+        data = f.read()
+        assert get_int(data[:4]) == 2049
+        length = get_int(data[4:8])
+        parsed = np.frombuffer(data, dtype=np.uint8, offset=8).reshape(
+            (length))
+        return parsed
+
+
+def get_int(b):
+    return int(codecs.encode(b, 'hex'), 16)
+
+
+def read_image_file(path):
+    with gzip.open(path, 'rb') as f:
+        data = f.read()
+        assert get_int(data[:4]) == 2051
+        length = get_int(data[4:8])
+        num_rows = get_int(data[8:12])
+        num_cols = get_int(data[12:16])
+        parsed = np.frombuffer(data, dtype=np.uint8, offset=16).reshape(
+            (length, 1, num_rows, num_cols))
+        return parsed
+
+
+def to_categorical(y, num_classes):
+    y = np.array(y, dtype="int")
+    n = y.shape[0]
+    categorical = np.zeros((n, num_classes))
+    categorical[np.arange(n), y] = 1
+    categorical = categorical.astype(np.float32)
+    return categorical
+```
+
+### MNIST模型
+
+然后你可以定义一个叫做**CNN**的类来构造mnist模型，这个模型由几个卷积层、池化层、全连接层和relu层组成。你也可以定义一个函数来计算我们结果的**准确性**。最后，你可以定义一个**训练函数**和一个**测试函数**来处理训练和预测的过程。
+
+```python
+class CNN:
+    def __init__(self):
+        self.conv1 = autograd.Conv2d(1, 20, 5, padding=0)
+        self.conv2 = autograd.Conv2d(20, 50, 5, padding=0)
+        self.linear1 = autograd.Linear(4 * 4 * 50, 500, bias=False)
+        self.linear2 = autograd.Linear(500, 10, bias=False)
+        self.pooling1 = autograd.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = autograd.MaxPool2d(2, 2, padding=0)
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = autograd.relu(y)
+        y = self.pooling1(y)
+        y = self.conv2(y)
+        y = autograd.relu(y)
+        y = self.pooling2(y)
+        y = autograd.flatten(y)
+        y = self.linear1(y)
+        y = autograd.relu(y)
+        y = self.linear2(y)
+        return y
+
+
+def accuracy(pred, target):
+    y = np.argmax(pred, axis=1)
+    t = np.argmax(target, axis=1)
+    a = y == t
+    return np.array(a, "int").sum() / float(len(t))
+
+
+def train(model,
+          x,
+          y,
+          epochs=1,
+          batch_size=64,
+          dev=device.get_default_device()):
+    batch_number = x.shape[0] // batch_size
+
+    for i in range(epochs):
+        for b in range(batch_number):
+            l_idx = b * batch_size
+            r_idx = (b + 1) * batch_size
+
+            x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
+            target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])
+
+            output_batch = model.forward(x_batch)
+            # onnx_model = sonnx.to_onnx([x_batch], [y])
+            # print('The model is:\n{}'.format(onnx_model))
+
+            loss = autograd.softmax_cross_entropy(output_batch, target_batch)
+            accuracy_rate = accuracy(tensor.to_numpy(output_batch),
+                                     tensor.to_numpy(target_batch))
+
+            sgd = opt.SGD(lr=0.001)
+            for p, gp in autograd.backward(loss):
+                sgd.update(p, gp)
+            sgd.step()
+
+            if b % 1e2 == 0:
+                print("acc %6.2f loss, %6.2f" %
+                      (accuracy_rate, tensor.to_numpy(loss)[0]))
+    print("training completed")
+    return x_batch, output_batch
+
+def test(model, x, y, batch_size=64, dev=device.get_default_device()):
+    batch_number = x.shape[0] // batch_size
+
+    result = 0
+    for b in range(batch_number):
+        l_idx = b * batch_size
+        r_idx = (b + 1) * batch_size
+
+        x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
+        target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])
+
+        output_batch = model.forward(x_batch)
+        result += accuracy(tensor.to_numpy(output_batch),
+                           tensor.to_numpy(target_batch))
+
+    print("testing acc %6.2f" % (result / batch_number))
+```
+
+### 训练mnist模型并将其导出到onnx
+
+现在，你可以通过调用 **sonnx.to_onnx** 函数来训练 mnist 模型并导出其 onnx 模型。
+
+```python
+def make_onnx(x, y):
+    return sonnx.to_onnx([x], [y])
+
+# create device
+dev = device.create_cuda_gpu()
+#dev = device.get_default_device()
+# create model
+model = CNN()
+# load data
+train_x, train_y, valid_x, valid_y = load_dataset()
+# normalization
+train_x = train_x / 255
+valid_x = valid_x / 255
+train_y = to_categorical(train_y, 10)
+valid_y = to_categorical(valid_y, 10)
+# do training
+autograd.training = True
+x, y = train(model, train_x, train_y, dev=dev)
+onnx_model = make_onnx(x, y)
+# print('The model is:\n{}'.format(onnx_model))
+
+# Save the ONNX model
+model_path = os.path.join('/', 'tmp', 'mnist.onnx')
+onnx.save(onnx_model, model_path)
+print('The model is saved.')
+```
+
+### Inference
+
+导出onnx模型后，可以在'/tmp'目录下找到一个名为**mnist.onnx**的文件，这个模型可以被其他库导入。现在，如果你想把这个onnx模型再次导入到singa中，并使用验证数据集进行推理，你可以定义一个叫做**Infer**的类，Infer的前向函数将被测试函数调用，对验证数据集进行推理。此外，你应该把训练标志（`autograd.training`）设置为**False**，以固定autograd算子的梯度。
+
+在导入onnx模型时，需要先调用**onnx.load**来加载onnx模型。然后将onnx模型输入到 **sonnx.prepare**中进行解析，并初始化为一个singa模型(代码中的**sg_ir**)。sg_ir里面包含了一个singa图，然后就可以通过向它的run函数提供输入来运行一步推理。
+
+```python
+class Infer:
+    def __init__(self, sg_ir):
+        self.sg_ir = sg_ir
+        for idx, tens in sg_ir.tensor_map.items():
+            # allow the tensors to be updated
+            tens.requires_grad = True
+            tens.stores_grad= True
+            sg_ir.tensor_map[idx] = tens
+
+    def forward(self, x):
+        return sg_ir.run([x])[0] # we can run one step of inference by feeding input
+
+# load the ONNX model
+onnx_model = onnx.load(model_path)
+sg_ir = sonnx.prepare(onnx_model, device=dev) # parse and initiate to a singa model
+
+# inference
+autograd.training = False
+print('The inference result is:')
+test(Infer(sg_ir), valid_x, valid_y, dev=dev)
+```
+
+### 重训练
+
+假设导入模型后，想再次对模型进行重新训练，我们可以定义一个名为**re_train**的函数。在调用这个re_train函数之前，我们应该将训练标志（`autograd.training`）设置为**True**，以使autograd运算符更新其梯度。而在完成训练后，我们再将其设置为**False**，以调用做推理的测试函数。
+
+```python
+def re_train(sg_ir,
+             x,
+             y,
+             epochs=1,
+             batch_size=64,
+             dev=device.get_default_device()):
+    batch_number = x.shape[0] // batch_size
+
+    new_model = Infer(sg_ir)
+
+    for i in range(epochs):
+        for b in range(batch_number):
+            l_idx = b * batch_size
+            r_idx = (b + 1) * batch_size
+
+            x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
+            target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])
+
+            output_batch = new_model.forward(x_batch)
+
+            loss = autograd.softmax_cross_entropy(output_batch, target_batch)
+            accuracy_rate = accuracy(tensor.to_numpy(output_batch),
+                                     tensor.to_numpy(target_batch))
+
+            sgd = opt.SGD(lr=0.01)
+            for p, gp in autograd.backward(loss):
+                sgd.update(p, gp)
+            sgd.step()
+
+            if b % 1e2 == 0:
+                print("acc %6.2f loss, %6.2f" %
+                      (accuracy_rate, tensor.to_numpy(loss)[0]))
+    print("re-training completed")
+    return new_model
+
+# load the ONNX model
+onnx_model = onnx.load(model_path)
+sg_ir = sonnx.prepare(onnx_model, device=dev)
+
+# re-training
+autograd.training = True
+new_model = re_train(sg_ir, train_x, train_y, dev=dev)
+autograd.training = False
+test(new_model, valid_x, valid_y, dev=dev)
+```
+
+### 迁移学习
+
+最后，如果我们想做迁移学习，我们可以定义一个名为**Trans**的类，在onnx模型后追加一些层。为了演示，代码中只在onnx模型后追加了几个线性（全连接）层和relu。可以定义一个transfer_learning函数来处理迁移学习模型的训练过程，而训练标志的设置和前面重训练时一样。
+
+```python
+class Trans:
+    def __init__(self, sg_ir, last_layers):
+        self.sg_ir = sg_ir
+        self.last_layers = last_layers
+        self.append_linear1 = autograd.Linear(500, 128, bias=False)
+        self.append_linear2 = autograd.Linear(128, 32, bias=False)
+        self.append_linear3 = autograd.Linear(32, 10, bias=False)
+
+    def forward(self, x):
+        y = sg_ir.run([x], last_layers=self.last_layers)[0]
+        y = self.append_linear1(y)
+        y = autograd.relu(y)
+        y = self.append_linear2(y)
+        y = autograd.relu(y)
+        y = self.append_linear3(y)
+        y = autograd.relu(y)
+        return y
+
+def transfer_learning(sg_ir,
+             x,
+             y,
+             epochs=1,
+             batch_size=64,
+             dev=device.get_default_device()):
+    batch_number = x.shape[0] // batch_size
+
+    trans_model = Trans(sg_ir, -1)
+
+    for i in range(epochs):
+        for b in range(batch_number):
+            l_idx = b * batch_size
+            r_idx = (b + 1) * batch_size
+
+            x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
+            target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])
+            output_batch = trans_model.forward(x_batch)
+
+            loss = autograd.softmax_cross_entropy(output_batch, target_batch)
+            accuracy_rate = accuracy(tensor.to_numpy(output_batch),
+                                     tensor.to_numpy(target_batch))
+
+            sgd = opt.SGD(lr=0.07)
+            for p, gp in autograd.backward(loss):
+                sgd.update(p, gp)
+            sgd.step()
+
+            if b % 1e2 == 0:
+                print("acc %6.2f loss, %6.2f" %
+                      (accuracy_rate, tensor.to_numpy(loss)[0]))
+    print("transfer-learning completed")
+    return trans_model
+
+# load the ONNX model
+onnx_model = onnx.load(model_path)
+sg_ir = sonnx.prepare(onnx_model, device=dev)
+
+# transfer-learning
+autograd.training = True
+new_model = transfer_learning(sg_ir, train_x, train_y, dev=dev)
+autograd.training = False
+test(new_model, valid_x, valid_y, dev=dev)
+```
+
+## ONNX模型库
+
+[ONNX 模型库](https://github.com/onnx/models)是由社区成员贡献的 ONNX 格式的预先训练的最先进模型的集合。SINGA 现在已经支持了几个 CV 和 NLP 模型。将来会支持更多模型。
+
+### 图像分类
+
+这套模型以图像作为输入，然后将图像中的主要物体分为1000个物体类别，如键盘、鼠标、铅笔和许多动物。
+
+| Model Class                                                                                         | Reference                                               | Description                                                                                                                                                                                                                               | Link                                                                                           [...]
+| --------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- [...]
+| <b>[MobileNet](https://github.com/onnx/models/tree/master/vision/classification/mobilenet)</b>      | [Sandler et al.](https://arxiv.org/abs/1801.04381)      | 最适合移动和嵌入式视觉应用的轻量级深度神经网络。 <br>Top-5 error from paper - ~10%                                                                                                               | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1HsixqJMIpKyEPhkbB8jy7NwNEFEAUWAf) |
+| <b>[ResNet18](https://github.com/onnx/models/tree/master/vision/classification/resnet)</b>          | [He et al.](https://arxiv.org/abs/1512.03385)           | 一个CNN模型（多达152层），在对图像进行分类时，使用shortcut来实现更高的准确性。 <br> Top-5 error from paper - ~3.6%                                                                                         | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1u1RYefSsVbiP4I-5wiBKHjsT9L0FxLm9) |
+| <b>[VGG16](https://github.com/onnx/models/tree/master/vision/classification/vgg)</b>                | [Simonyan et al.](https://arxiv.org/abs/1409.1556)      | 深度CNN模型（多达19层）。类似于AlexNet，但使用多个较小的内核大小的滤波器，在分类图像时提供更高的准确性。 <br>Top-5 error from paper - ~8%                                                  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14kxgRKtbjPCKKsDJVNi3AvTev81Gp_Ds) |
+| <b>[ShuffleNet_V2](https://github.com/onnx/models/tree/master/vision/classification/shufflenet)</b> | [Simonyan et al.](https://arxiv.org/pdf/1707.01083.pdf) | 专门为移动设备设计的计算效率极高的CNN模型。这种网络架构设计考虑了速度等直接指标，而不是FLOP等间接指标。 Top-1 error from paper - ~30.6% | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19HfRu3YHP_H2z3BcZujVFRp23_J5XsuA?usp=sharing)                                                |
+
+### 目标检测
+
+目标检测模型可以检测图像中是否存在多个对象，并将图像中检测到对象的区域分割出来。
+
+| Model Class                                                                                                       | Reference                                             | Description                                                                                                                        | Link                                                                                                                                                    |
+| ----------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <b>[Tiny YOLOv2](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/tiny_yolov2)</b> | [Redmon et al.](https://arxiv.org/pdf/1612.08242.pdf) | 一个用于目标检测的实时CNN，可以检测20个不同的类。一个更复杂的完整YOLOv2网络的小版本。 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/11V4I6cRjIJNUv5ZGsEGwqHuoQEie6b1T) |
+
+### 面部识别
+
+人脸检测模型可以检测和/或识别给定图像中的人脸和情绪。
+
+| Model Class                                                                                               | Reference                                          | Description                                                                                                                         | Link                                                                                                                                                    |
+| --------------------------------------------------------------------------------------------------------- | -------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <b>[ArcFace](https://github.com/onnx/models/tree/master/vision/body_analysis/arcface)</b>                 | [Deng et al.](https://arxiv.org/abs/1801.07698)    | 一种基于CNN的人脸识别模型，它可以学习人脸的判别特征，并对输入的人脸图像进行分析。 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qanaqUKGIDtifdzEzJOHjEj4kYzA9uJC) |
+| <b>[Emotion FerPlus](https://github.com/onnx/models/tree/master/vision/body_analysis/emotion_ferplus)</b> | [Barsoum et al.](https://arxiv.org/abs/1608.01041) | 基于人脸图像训练的情感识别深度CNN。                                                                        | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1XHtBQGRhe58PDi4LGYJzYueWBeWbO23r) |
+
+### 机器理解
+
+这个自然语言处理模型的子集，可以回答关于给定上下文段落的问题。
+
+| Model Class                                                                                           | Reference                                                                                                                           | Description                                                                                                       | Link                                                                                                                                     [...]
+| ----------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- [...]
+| <b>[BERT-Squad](https://github.com/onnx/models/tree/master/text/machine_comprehension/bert-squad)</b> | [Devlin et al.](https://arxiv.org/pdf/1810.04805.pdf)                                                                               | 该模型根据给定输入段落的上下文回答问题。                                   | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kud-lUPjS_u-TkDAzihBTw0Vqr0FjCE-)             |
+| <b>[RoBERTa](https://github.com/onnx/models/tree/master/text/machine_comprehension/roberta)</b>       | [Liu et al.](https://arxiv.org/pdf/1907.11692.pdf)                                                                                  | 一个基于大型变换器的模型，根据给定的输入文本预测情感。                                | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1F-c4LJSx3Cb2jW6tP7f8nAZDigyLH6iN?usp=sharing) |
+| <b>[GPT-2](https://github.com/onnx/models/tree/master/text/machine_comprehension/gpt-2)</b>           | [Devlin et al.](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) | 一个基于大型变换器的语言模型，给定一些文本中的单词序列，预测下一个单词。 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ZlXLSIMppPch6HgzKRillJiUcWn3PiK7?usp=sharing)                                                   [...]
+
+## 支持的操作符
+
+SINGA支持下列ONNX运算符:
+
+- Acos
+- Acosh
+- Add
+- And
+- Asin
+- Asinh
+- Atan
+- Atanh
+- AveragePool
+- BatchNormalization
+- Cast
+- Ceil
+- Clip
+- Concat
+- ConstantOfShape
+- Conv
+- Cos
+- Cosh
+- Div
+- Dropout
+- Elu
+- Equal
+- Erf
+- Expand
+- Flatten
+- Gather
+- Gemm
+- GlobalAveragePool
+- Greater
+- HardSigmoid
+- Identity
+- LeakyRelu
+- Less
+- Log
+- MatMul
+- Max
+- MaxPool
+- Mean
+- Min
+- Mul
+- Neg
+- NonZero
+- Not
+- OneHot
+- Or
+- Pad
+- Pow
+- PRelu
+- Reciprocal
+- ReduceMean
+- ReduceSum
+- Relu
+- Reshape
+- ScatterElements
+- Selu
+- Shape
+- Sigmoid
+- Sign
+- Sin
+- Sinh
+- Slice
+- Softmax
+- Softplus
+- Softsign
+- Split
+- Sqrt
+- Squeeze
+- Sub
+- Sum
+- Tan
+- Tanh
+- Tile
+- Transpose
+- Unsqueeze
+- Upsample
+- Where
+- Xor
+
+### 对ONNX后端的特别说明
+
+- Conv, MaxPool 以及 AveragePool
+
+  输入必须是1d`(N*C*H)`或2d`(N*C*H*W)`的形状，`dilation`必须是1。
+
+- BatchNormalization
+
+  `epsilon` 设定为1e-05，不能改变
+
+- Cast
+
+  只支持float32和int32，其他类型都会转向这两种类型。
+
+- Squeeze and Unsqueeze
+
+  如果你在`Tensor`和Scalar之间`Squeeze`或`Unsqueeze`时遇到错误，请向我们报告。
+
+- Empty tensor 
+
+  空张量在SINGA是非法的。
+
+## 实现
+
+SINGA ONNX的代码在`python/singa/sonnx.py`中，主要有三个类，`SingaFrontend`、`SingaBackend`和`SingaRep`。`SingaFrontend`将SINGA模型翻译成ONNX模型；`SingaBackend`将ONNX模型翻译成`SingaRep`对象，其中存储了所有的SINGA运算符和张量（本文档中的张量指SINGA Tensor）；`SingaRep`可以像SINGA模型一样运行。
+
+### SingaFrontend
+
+`SingaFrontend`的入口函数是`singa_to_onnx_model`，它也被称为`to_onnx`，`singa_to_onnx_model`创建了ONNX模型，它还通过`singa_to_onnx_graph`创建了一个ONNX图。
+
+
+`singa_to_onnx_graph`接受模型的输出，并从输出中递归迭代SINGA模型的图，得到所有的运算符，形成一个队列。SINGA模型的输入和中间张量，即可训练的权重，同时被获取。输入存储在`onnx_model.graph.input`中；输出存储在`onnx_model.graph.output`中；可训练权重存储在`onnx_model.graph.initializer`中。
+
+然后将队列中的SINGA运算符逐一翻译成ONNX运算符。`_rename_operators` 定义了 SINGA 和 ONNX 之间的运算符名称映射。`_special_operators` 定义了翻译运算符时要使用的函数。
+
+此外，SINGA 中的某些运算符与 ONNX 的定义不同，即 ONNX 将 SINGA 运算符的某些属性视为输入，因此 `_unhandled_operators` 定义了处理特殊运算符的函数。
+
+由于SINGA中的布尔类型被视为int32，所以`_bool_operators`定义了要改变的操作符为布尔类型。
+
+### SingaBackend
+
+`SingaBackend`的入口函数是`prepare`，它检查ONNX模型的版本，然后调用`_onnx_model_to_singa_net`。
+
+`_onnx_model_to_singa_net`的目的是获取SINGA的张量和运算符。tensors在ONNX中以其名称存储在字典中，而操作符则以`namedtuple('SingaOps', ['name', 'op', 'handle', 'forward'])`的形式存储在队列中。对于每个运算符，`name`是它的ONNX节点名称；`op`是ONNX节点；`forward`是SINGA运算符的前向函数；`handle`是为一些特殊的运算符准备的，如Conv和Pooling，它们有`handle`对象。
+
+`_onnx_model_to_singa_net`的第一步是调用`_init_graph_parameter`来获取模型内的所有tensors。对于可训练的权重，可以从`onnx_model.graph.initializer`中初始化`SINGA Tensor`。请注意，权重也可能存储在图的输入或称为`Constant`的ONNX节点中，SINGA也可以处理这些。
+
+虽然所有的权重都存储在ONNX模型中，但模型的输入是未知的，只有它的形状和类型。所以SINGA支持两种方式来初始化输入，1、根据其形状和类型生成随机张量，2、允许用户分配输入。第一种方法对大多数模型都很好，但是对于一些模型，比如BERT，矩阵的索引不能随机生成，否则会产生错误。
+
+然后，`_onnx_model_to_singa_net`迭代ONNX图中的所有节点，将其翻译成SINGA运算符。另外，`_rename_operators` 定义了 SINGA 和 ONNX 之间的运算符名称映射。`_special_operators` 定义翻译运算符时要使用的函数。`_run_node`通过输入张量来运行生成的 SINGA 模型，并存储其输出张量，供以后的运算符使用。
+
+该类最后返回一个`SingaRep`对象，并在其中存储所有SINGA张量和运算符。
+
+### SingaRep
+
+`SingaRep`存储所有的SINGA tensors和运算符。`run`接受模型的输入，并按照运算符队列逐个运行SINGA运算符。用户可以使用`last_layers`来决定是否将模型运行到最后几层。将 `all_outputs` 设置为 `False` 表示只得到最后的输出，设置为 `True` 表示也得到所有的中间输出。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/optimizer.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/optimizer.md
new file mode 100644
index 0000000..8bab55d
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/optimizer.md
@@ -0,0 +1,123 @@
+---
+id: version-4.0.0_Chinese-optimizer
+title: Optimizer
+original_id: optimizer
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA支持各种流行的优化器，包括动量随机梯度下降、Adam、RMSProp和AdaGrad等。对于每一种优化器，它都支持使用衰减调度器来安排不同时间段的学习率。优化器和衰减调度器包含在`singa/opt.py`中。
+
+## 创建一个优化器
+
+1. 带动量的SGD
+
+```python
+# define hyperparameter learning rate
+lr = 0.001
+# define hyperparameter momentum
+momentum = 0.9
+# define hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+sgd = opt.SGD(lr=lr, momentum=momentum, weight_decay=weight_decay)
+```
+
+2. RMSProp
+
+```python
+# define hyperparameter learning rate
+lr = 0.001
+# define hyperparameter rho
+rho = 0.9
+# define hyperparameter epsilon
+epsilon = 1e-8
+# define hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+sgd = opt.RMSProp(lr=lr, rho=rho, epsilon=epsilon, weight_decay=weight_decay)
+```
+
+3. AdaGrad
+
+```python
+# define hyperparameter learning rate
+lr = 0.001
+# define hyperparameter epsilon
+epsilon = 1e-8
+# define hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+sgd = opt.AdaGrad(lr=lr, epsilon=epsilon, weight_decay=weight_decay)
+```
+
+4. Adam
+
+```python
+# define hyperparameter learning rate
+lr = 0.001
+# define hyperparameter beta 1
+beta_1= 0.9
+# define hyperparameter beta 2
+beta_2 = 0.999
+# define hyperparameter epsilon
+epsilon = 1e-8
+# define hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+sgd = opt.Adam(lr=lr, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, weight_decay=weight_decay)
+```
+
+## 创建一个衰减调度器
+
+```python
+from singa import opt
+
+# define initial learning rate
+lr_init = 0.001
+# define the rate of decay in the decay scheduler
+decay_rate = 0.95
+# define whether the learning rate schedule is a staircase shape
+staircase=True
+# define the decay step of the decay scheduler (in this example the lr is decreased at every 2 steps)
+decay_steps = 2
+
+# create the decay scheduler, the schedule of lr becomes lr_init * (decay_rate ^ (step // decay_steps) )
+lr = opt.ExponentialDecay(lr_init, decay_steps, decay_rate, staircase)
+# Use the lr to create an optimizer
+sgd = opt.SGD(lr=lr, momentum=0.9, weight_decay=0.0001)
+```
+
+## 使用模型API中的优化器
+
+当我们创建模型时，我们需要将优化器附加到模型上：
+
+```python
+# create a CNN using the Model API
+model = CNN()
+
+# initialize optimizer and attach it to the model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+```
+
+然后，当我们调用模型时，它会运行利用优化器的 `train_one_batch` 方法。
+
+因此，一个迭代循环优化模型的例子是：
+
+```python
+for b in range(num_train_batch):
+    # generate the next mini-batch
+    x, y = ...
+
+    # Copy the data into input tensors
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    # Training with one batch
+    out, loss = model(tx, ty)
+```
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/security.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/security.md
new file mode 100644
index 0000000..e387ce9
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/security.md
@@ -0,0 +1,9 @@
+---
+id: version-4.0.0_Chinese-security
+title: Security
+original_id: security
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+用户可以向[SINGA安全团队邮件列表](mailto:security@singa.apache.org)报告安全漏洞。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/software-stack.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/software-stack.md
new file mode 100644
index 0000000..cf0dc69
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/software-stack.md
@@ -0,0 +1,85 @@
+---
+id: version-4.0.0_Chinese-software-stack
+title: Software Stack
+original_id: software-stack
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA的软件栈包括两个主要层次，底层后端和Python接口层。图1展示了它们与硬件的关系。后端组件为深度学习模型提供基本的数据结构，为调度和执行操作提供硬件抽象，为分布式训练提供通信组件。Python接口封装了部分CPP数据结构，并为神经网络训练提供了额外的高级类，方便实现复杂的神经网络模型。
+
+接下来，我们以自下而上的方式介绍软件栈。
+
+![SINGA V3 software stack](assets/singav3.1-sw.png) <br/> **Figure 1 - SINGA V3
+软件栈结构**
+
+## 底层后端
+
+### Device
+
+每个`Device`实例，即一个设备，是针对一个硬件设备创建的，例如GPU或CPU。`Device`管理数据结构的内存，并调度操作的执行，例如，在CUDA流或CPU线程上。根据硬件及其编程语言，SINGA实现了以下特定的设备类。
+
+- **CudaGPU** 代表Nvidia GPU卡，执行单元是CUDA流。
+- **CppCPU** 代表一个普通的CPU，执行单元是CPU线程。
+- **OpenclGPU** 代表Nvidia和AMD的普通GPU卡，执行单元是CommandQueue。鉴于OpenCL与许多硬件设备（如FPGA和ARM）兼容，OpenclGPU可以扩展到其他设备。
+
+### Tensor
+
+`Tensor`类表示一个多维数组，它存储模型变量，例如，来自卷积层的输入图像和特征图。每个`Tensor`实例（即张量）都分配在一个设备上，它管理张量的内存，并针对张量调度（计算）操作。大多数机器学习算法可以使用（密集或稀疏）张量抽象及其操作来表达。因此，SINGA将能够运行广泛的模型，包括深度学习模型和其他传统的机器学习模型。
+
+### Operator
+
+针对张量的运算符有两类，一类是线性代数运算符，如矩阵乘法，另一类是神经网络特有的运算符，如卷积和池化。线性代数运算符以`Tensor`函数的形式提供，并针对不同的硬件设备分别实现。
+
+- CppMath (tensor_math_cpp.h) 为CppCPU使用Cpp实现张量运算。
+- CudaMath (tensor_math_cuda.h) 为CudaGPU使用CUDA实现张量运算。
+- OpenclMath(tensor_math_opencl.h) 使用OpenCL实现了OpenclGPU的张量运算。
+
+神经网络的具体运算符也是单独实现的，例如：
+
+- GpuConvForward (convolution.h) 在Nvidia GPU上通过CuDNN实现卷积的前向函数。
+- CpuConvForward (convolution.h) 在CPU上使用CPP实现卷积的前向函数。
+
+通常情况下，用户会创建一个`Device`实例，并使用它来创建多个`Tensor`实例。当用户调用Tensor函数或神经网络操作时，会调用常驻设备的相应实现。换句话说，操作符的实现对用户是透明的。
+
+Tensor和Device抽象是可扩展的，以支持使用不同编程语言的各种硬件设备。一个新的硬件设备将通过添加一个新的Device子类和相应的操作符的实现来支持。
+
+在速度和内存方面的优化是由`Device`的`Scheduler`和`MemPool`完成的。例如，`Scheduler`根据运算符的依赖性创建一个[计算图](./graph)，然后它可以优化运算符的执行顺序，以实现并行性和内存共享。
+
+### Communicator
+
+`Communicator`是为了支持[分布式训练](./dist-train)。它使用套接字、MPI和NCCL实现通信协议。通常情况下，用户只需要调用`put()`和`get()`等高级API来发送和接收tensors。针对拓扑结构、消息大小等的通信优化在内部完成。
+
+## Python 接口层
+
+所有的后端组件都通过SWIG作为Python模块暴露出来。此外，还添加了以下类来支持复杂神经网络的实现：
+
+### Opt
+
+`Opt`及其子类实现了使用参数梯度更新模型参数值的方法（如SGD）。子类[DistOpt](./dist-train)通过调用`Communicator`中的方法来同步分布式训练的worker之间的梯度。
+
+### Operator
+
+`Operator`包装了多个使用后端Tensor或神经网络运算符实现的函数。例如，前向函数和后向函数`ReLU`组成`ReLU`运算符。
+
+### Layer
+
+`Layer`和它的子类用参数来包装运算符。例如，卷积和线性运算符有权重和偏置参数。 这些参数由相应的`Layer`类维护。
+
+### Autograd
+
+
+`Autograd`通过记录运算符的前向函数的执行情况，自动按照相反的顺序调用后向函数，实现了[反向模式的自动微分](https://rufflewind.com/2016-12-30/reverse-mode-automatic-differentiation)。所有的函数都可以通过`Scheduler`进行缓冲，建立[计算图](./graph)，以提高效率和内存优化。
+
+### Model
+
+
+[Model](./graph)提供了一个简单的接口来实现新的网络模型。你只需要继承`Model`，通过创建和调用层或运算符来定义模型的前向传播。当训练数据输入到`Model`中时，`Model`会自动进行autograd并通过`Opt`自动更新参数。通过`Model` API，SINGA充分利用了命令式编程和声明式编程的优势。用户使用[Model](./graph) API按照PyTorch那样的命令式编程风格来实现一个网络，PyTorch在每次迭代中都会重新创建操作，而SINGA则不同，它在第一次迭代后就会缓冲操作，隐式地创建一个计算图（当该功能被启用时）。该图类似于使用声明式编程的库所创建的图，例如TensorFlow。因此，SINGA可以在计算图上应用内存和速度优化技术。
+
+### ONNX
+
+
+为了支持ONNX，SINGA实现了一个[sonnx](./onnx)模块，其中包括：
+
+- SingaFrontend用于将SINGA模型保存为onnx格式。
+- SingaBackend 用于将 onnx 格式的模型加载到 SINGA 中进行训练和inference。
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/source-repository.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/source-repository.md
new file mode 100644
index 0000000..60193fd
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/source-repository.md
@@ -0,0 +1,24 @@
+---
+id: version-4.0.0_Chinese-source-repository
+title: Source Repository
+original_id: source-repository
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+
+本项目使用[Git](http://git-scm.com/)来管理其源代码。关于Git的使用说明可以在 http://git-scm.com/documentation 找到。
+
+## Repository
+
+以下是在线资源库的链接：
+
+- https://gitbox.apache.org/repos/asf?p=singa.git
+
+Github镜像在：
+
+- https://github.com/apache/singa
+
+代码可以从任何一个repo中克隆，例如：
+
+    git clone https://github.com/apache/singa.git
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/team-list.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/team-list.md
new file mode 100644
index 0000000..69d20be
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/team-list.md
@@ -0,0 +1,57 @@
+---
+id: version-4.0.0_Chinese-team-list
+title: The SINGA Team
+original_id: team-list
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+一个成功的项目需要许多人扮演许多角色，有些成员编写代码或文档，而其他成员则作为测试人员，提交补丁和建议。
+
+SINGA社区的开发者主要来自新加坡国立大学、浙江大学、网易、大阪大学、yzBigData等。
+
+## PMC
+
+| Name          | Email                   | Organization                                  |
+| ------------- | ----------------------- | --------------------------------------------- |
+| Gang Chen     | cg@apache.org           | Zhejiang University                           |
+| Anh Dinh      | dinhtta@apache.org      | Singapore University of Technology and Design |
+| Ted Dunning   | tdunning@apache.org     | Apache Software Foundation                    |
+| Jinyang Gao   | jinyang@apache.org      | DAMO Academy, Alibaba Group                   |
+| Alan Gates    | gates@apache.org        | Apache Software Foundation                    |
+| Zhaojing Luo  | zhaojing@apache.org     | National University of Singapore              |
+| Thejas Nair   | thejas@apache.org       | Apache Software Foundation                    |
+| Beng Chin Ooi | ooibc@apache.org        | National University of Singapore              |
+| Moaz Reyad    | moaz@apache.org         | Université Grenoble Alpes                     |
+| Kian-Lee Tan  | tankianlee@apache.org   | National University of Singapore              |
+| Sheng Wang    | wangsh@apache.org       | DAMO Academy, Alibaba Group                   |
+| Wei Wang      | wangwei@apache.org      | National University of Singapore              |
+| Zhongle Xie   | zhongle@apache.org      | National University of Singapore              |
+| Sai Ho Yeung  | chrishkchris@apache.org | National University of Singapore              |
+| Meihui Zhang  | meihuizhang@apache.org  | Beijing Institute of Technology               |
+| Kaiping Zheng | kaiping@apache.org      | National University of Singapore              |
+
+## Committers
+
+| Name         | Email                  | Organization                                  |
+| ------------ | ---------------------- | --------------------------------------------- |
+| Xiangrui Cai | caixr@apache.org       | Nankai University                             |
+| Chonho Lee   | chonho@apache.org      | Osaka University                              |
+| Shicong Lin  | shicong@apache.org     | National University of Singapore              |
+| Rulin Xing   | rulin@apache.org       | Huazhong University of Science and Technology |
+| Wanqi Xue    | xuewanqi@apache.org    | Nanyang Technological University              |
+| Joddiy Zhang | joddiyzhang@apache.org | National University of Singapore              |
+
+## Contributors
+
+| Name               | Email                        | Organization                     |
+| ------------------ | ---------------------------- | -------------------------------- |
+| Haibo Chen         | hzchenhaibo@corp.netease.com | NetEase                          |
+| Shicheng Chen      | chengsc@comp.nus.edu.sg      | National University of Singapore |
+| Xin Ji             | vincent.j.xin@gmail.com      | Visenze, Singapore               |
+| Anthony K. H. Tung | atung@comp.nus.edu.sg        | National University of Singapore |
+| Ji Wang            | wangji@mzhtechnologies.com   | Hangzhou MZH Technologies        |
+| Yuan Wang          | wangyuan@corp.netease.com    | NetEase                          |
+| Wenfeng Wu         | dcswuw@gmail.com             | Freelancer, China                |
+| Kaiyuan Yang       | yangky@comp.nus.edu.sg       | National University of Singapore |
+| Chang Yao          | yaochang2009@gmail.com       | Hangzhou MZH Technologies        |
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/tensor.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/tensor.md
new file mode 100644
index 0000000..23f3e54
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/tensor.md
@@ -0,0 +1,245 @@
+---
+id: version-4.0.0_Chinese-tensor
+title: Tensor
+original_id: tensor
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+每个Tensor实例都是分配在特定设备实例上的多维数组。Tensor实例可以存储变量，并在不同类型的硬件设备上提供线性代数运算，而无需用户察觉。需要注意的是，除了复制函数外，用户需要确保张量操作数分配在同一个设备上。
+
+## Tensor用法
+
+### 创建Tensor
+
+```python
+>>> import numpy as np
+>>> from singa import tensor
+>>> tensor.from_numpy( np.asarray([[1, 0, 0], [0, 1, 0]], dtype=np.float32) )
+[[1. 0. 0.]
+ [0. 1. 0.]]
+```
+
+### 转换到numpy
+
+```python
+>>> a = np.asarray([[1, 0, 0], [0, 1, 0]], dtype=np.float32)
+>>> tensor.from_numpy(a)
+[[1. 0. 0.]
+ [0. 1. 0.]]
+>>> tensor.to_numpy(tensor.from_numpy(a))
+array([[1., 0., 0.],
+       [0., 1., 0.]], dtype=float32)
+```
+
+### Tensor方法
+
+```python
+>>> t = tensor.from_numpy(a)
+>>> t.transpose([1,0])
+[[1. 0.]
+ [0. 1.]
+ [0. 0.]]
+```
+
+`Tensor`变换，最多支持6维。
+
+```python
+>>> a = tensor.random((2,3,4,5,6,7))
+>>> a.shape
+(2, 3, 4, 5, 6, 7)
+>>> a.reshape((2,3,4,5,7,6)).transpose((3,2,1,0,4,5)).shape
+(5, 4, 3, 2, 7, 6)
+```
+
+### Tensor算术方法
+
+`tensor`是实时计算的：
+
+```python
+>>> t + 1
+[[2. 1. 1.]
+ [1. 2. 1.]]
+>>> t / 5
+[[0.2 0.  0. ]
+ [0.  0.2 0. ]]
+```
+
+`tensor` broadcasting运算:
+
+```python
+>>> a
+[[1. 2. 3.]
+ [4. 5. 6.]]
+>>> b
+[[1. 2. 3.]]
+>>> a + b
+[[2. 4. 6.]
+ [5. 7. 9.]]
+>>> a * b
+[[ 1.  4.  9.]
+ [ 4. 10. 18.]]
+>>> a / b
+[[1.  1.  1. ]
+ [4.  2.5 2. ]]
+>>> a/=b # inplace operation
+>>> a
+[[1.  1.  1. ]
+ [4.  2.5 2. ]]
+```
+
+`tensor` broadcasting矩阵乘法：
+
+```python
+>>> from singa import tensor
+>>> a = tensor.random((2,2,2,3))
+>>> b = tensor.random((2,3,4))
+>>> tensor.mult(a,b).shape
+(2, 2, 2, 4)
+```
+
+### Tensor函数
+
+`singa.tensor`模块中的函数在应用函数中定义的变换后返回新的`Tensor`对象。
+
+```python
+>>> tensor.log(t+1)
+[[0.6931472 0.        0.       ]
+ [0.        0.6931472 0.       ]]
+```
+
+### Tensor在不同Devices上
+
+`tensor`默认在主机(CPU)上创建；也可以通过指定设备在不同的硬件`device`上创建。`tensor`可以通过`to_device()`函数在`device`之间移动。
+
+```python
+>>> from singa import device
+>>> x = tensor.Tensor((2, 3), device.create_cuda_gpu())
+>>> x.gaussian(1,1)
+>>> x
+[[1.531889   1.0128608  0.12691343]
+ [2.1674204  3.083676   2.7421203 ]]
+>>> # move to host
+>>> x.to_device(device.get_default_device())
+```
+
+### 使用Tensor训练MLP
+
+```python
+
+"""
+  code snippet from examples/mlp/module.py
+"""
+
+label = get_label()
+data = get_data()
+
+dev = device.create_cuda_gpu_on(0)
+sgd = opt.SGD(0.05)
+
+# define tensor for input data and label
+tx = tensor.Tensor((400, 2), dev, tensor.float32)
+ty = tensor.Tensor((400,), dev, tensor.int32)
+model = MLP(data_size=2, perceptron_size=3, num_classes=2)
+
+# attached model to graph
+model.set_optimizer(sgd)
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
+model.train()
+
+for i in range(1001):
+    tx.copy_from_numpy(data)
+    ty.copy_from_numpy(label)
+    out, loss = model(tx, ty, 'fp32', spars=None)
+
+    if i % 100 == 0:
+        print("training loss = ", tensor.to_numpy(loss)[0])
+```
+
+输出:
+
+```bash
+$ python3 examples/mlp/module.py
+training loss =  0.6158037
+training loss =  0.52852553
+training loss =  0.4571422
+training loss =  0.37274635
+training loss =  0.30146334
+training loss =  0.24906921
+training loss =  0.21128304
+training loss =  0.18390492
+training loss =  0.16362564
+training loss =  0.148164
+training loss =  0.13589878
+```
+
+## Tensor实现
+
+上一节介绍了`Tensor`的一般用法，下面将介绍其底层的实现。首先，将介绍Python和C++ tensors的设计。后面会讲到前端（Python）和后端（C++）如何连接，如何扩展。
+
+### Python Tensor
+
+Python类`Tensor`，定义在`python/singa/tensor.py`中，提供了高层的张量操作，用于实现深度学习操作（通过[autograd](./autograd)），以及终端用户的数据管理。
+
+它主要是通过简单地封装C++张量方法来工作，包括算术方法（如`sum`）和非算术方法（如`reshape`）。一些高级的算术运算以后会引入，并使用纯Python的张量API来实现，如`tensordot`。Python Tensor API可以利用灵活的方法轻松实现复杂的神经网络操作。
+
+### C++ Tensor
+
+C++类`Tensor`，定义在`include/singa/core/tensor.h`中，主要是管理存放数据的内存，并提供低级的API用于张量操作。同时，它还通过封装不同的后端（CUDA、BLAS、cuBLAS等）提供各种算术方法（如`matmul`）。
+
+#### 执行背景和内存块
+
+Tensor的两个重要概念或者说数据结构是执行背景`device`，和内存块`Block`。
+
+每个`Tensor`物理上存储在一个硬件设备上，并由硬件设备管理，代表执行背景（CPU、GPU），Tensor的数学计算是在设备上执行的。
+
+Tensor数据在`Block`实例中，定义在`include/singa/core/common.h`中。`Block`拥有底层数据，而tensors则在描述tensor的元数据上拥有所有权，比如`shape`、`stride`。
+
+#### Tensor数学后端
+
+为了利用不同后端硬件设备提供的高效数学库，SINGA为每个支持的后端设备提供了一套Tensor函数的实现。
+
+- 'tensor_math_cpp.h'为CppCPU设备使用Cpp（带CBLAS）实现操作。
+- 'tensor_math_cuda.h'为CudaGPU设备实现了使用Cuda(带cuBLAS)的操作。
+- 'tensor_math_opencl.h'为OpenclGPU设备实现了使用OpenCL的操作。
+
+### 将C++ APIs暴露给Python
+
+
+SWIG(http://www.swig.org/)是一个可以自动将C++ API转换为Python API的工具。SINGA使用SWIG将C++ APIs公开到Python中。SWIG会生成几个文件，包括`python/singa/singa_wrap.py`。Python模块(如`tensor`、`device`和`autograd`)导入这个模块来调用C++ API来实现Python类和函数。
+
+```python
+import tensor
+
+t = tensor.Tensor(shape=(2, 3))
+```
+
+例如，当按上面的方法创建Python `Tensor`实例时，`Tensor`类的实现会创建一个在`singa_wrap.py`中定义的Tensor类的实例，它对应于C++ `Tensor`类。为了清楚起见，`singa_wrap.py`中的`Tensor`类在`tensor.py`中被称为`CTensor`。
+
+```python
+# in tensor.py
+from . import singa_wrap as singa
+
+CTensor = singa.Tensor
+```
+
+### 创建新的Tensor函数
+
+
+有了前面的描述所奠定的基础，扩展张量函数可以用自下而上的方式轻松完成。对于数学运算，其步骤是：
+
+- 在`tensor.h`中声明新的API。
+- 使用 `tensor.cc` 中预定义的宏生成代码，参考 `GenUnaryTensorFn(Abs);` 。
+- 在`tensor_math.h`中声明template 方法/函数。
+- 至少在CPU(`tensor_math_cpp.h`)和GPU(`tensor_math_cuda.h`)中进行真正的实现。
+- 将 API 加入到 `src/api/core_tensor.i` 中，通过 SWIG 公开 API。
+- 通过调用 `singa_wrap.py` 中自动生成的函数，在 `tensor.py` 中定义 Python Tensor API。
+- 在适当的地方编写单元测试。
+
+## Python API
+
+_进行中_
+
+## CPP API
+
+_进行中_
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/time-profiling.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/time-profiling.md
new file mode 100644
index 0000000..f8cf61d
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/time-profiling.md
@@ -0,0 +1,154 @@
+---
+id: version-4.0.0_Chinese-time-profiling
+title: Time Profiling
+original_id: time-profiling
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA支持对图中缓冲的每个运算符进行时间分析。为了利用时间分析功能，我们首先调用`device.SetVerbosity`方法来设置时间分析器的verbosity，然后调用`device.PrintTimeProfiling`来打印出时间分析的结果。
+
+## 设置时间分析Verbosity
+
+要使用时间分析功能，我们需要设置verbosity。有三个级别的verbosity。在默认值`verbosity == 0`的情况下，它不会进行任何时间分析。当我们设置`verbosity == 1`时，它将对前向和后向传播时间进行分析。当`verbosity == 2`时，它将对图中每个缓冲操作所花费的时间进行分析。
+
+以下是设置时间分析功能的示例代码：
+
+```python
+# create a device
+from singa import device
+dev = device.create_cuda_gpu()
+# set the verbosity
+verbosity = 2
+dev.SetVerbosity(verbosity)
+# optional: skip the first 5 iterations when profiling the time
+dev.SetSkipIteration(5)
+```
+
+那么，当我们在程序的最后完成训练后，我们就可以通过调用`device.PrintTimeProfiling`方法来打印时间分析结果。
+
+```python
+dev.PrintTimeProfiling()
+```
+
+## 不同Verbosity的输出示例
+
+我们可以运行ResNet的[示例](https://github.com/apache/singa/blob/master/examples/cnn/benchmark.py)，看看不同的verbosity设置的输出。
+
+1. `verbosity == 1`
+
+```
+Time Profiling:
+Forward Propagation Time : 0.0409127 sec
+Backward Propagation Time : 0.114813 sec
+```
+
+2. `verbosity == 2`
+
+```
+Time Profiling:
+OP_ID0. SetValue : 1.73722e-05 sec
+OP_ID1. cudnnConvForward : 0.000612724 sec
+OP_ID2. GpuBatchNormForwardTraining : 0.000559449 sec
+OP_ID3. ReLU : 0.000375004 sec
+OP_ID4. GpuPoolingForward : 0.000240041 sec
+OP_ID5. SetValue : 3.4176e-06 sec
+OP_ID6. cudnnConvForward : 0.000115619 sec
+OP_ID7. GpuBatchNormForwardTraining : 0.000150415 sec
+OP_ID8. ReLU : 9.95494e-05 sec
+OP_ID9. SetValue : 3.22432e-06 sec
+OP_ID10. cudnnConvForward : 0.000648668 sec
+OP_ID11. GpuBatchNormForwardTraining : 0.000149793 sec
+OP_ID12. ReLU : 9.92118e-05 sec
+OP_ID13. SetValue : 3.37728e-06 sec
+OP_ID14. cudnnConvForward : 0.000400953 sec
+OP_ID15. GpuBatchNormForwardTraining : 0.000572181 sec
+OP_ID16. SetValue : 3.21312e-06 sec
+OP_ID17. cudnnConvForward : 0.000398698 sec
+OP_ID18. GpuBatchNormForwardTraining : 0.00056836 sec
+OP_ID19. Add : 0.000542246 sec
+OP_ID20. ReLU : 0.000372783 sec
+OP_ID21. SetValue : 3.25312e-06 sec
+OP_ID22. cudnnConvForward : 0.000260731 sec
+OP_ID23. GpuBatchNormForwardTraining : 0.000149041 sec
+OP_ID24. ReLU : 9.9072e-05 sec
+OP_ID25. SetValue : 3.10592e-06 sec
+OP_ID26. cudnnConvForward : 0.000637481 sec
+OP_ID27. GpuBatchNormForwardTraining : 0.000152577 sec
+OP_ID28. ReLU : 9.90518e-05 sec
+OP_ID29. SetValue : 3.28224e-06 sec
+OP_ID30. cudnnConvForward : 0.000404586 sec
+OP_ID31. GpuBatchNormForwardTraining : 0.000569679 sec
+OP_ID32. Add : 0.000542291 sec
+OP_ID33. ReLU : 0.00037211 sec
+OP_ID34. SetValue : 3.13696e-06 sec
+OP_ID35. cudnnConvForward : 0.000261219 sec
+OP_ID36. GpuBatchNormForwardTraining : 0.000148281 sec
+OP_ID37. ReLU : 9.89299e-05 sec
+OP_ID38. SetValue : 3.25216e-06 sec
+OP_ID39. cudnnConvForward : 0.000633644 sec
+OP_ID40. GpuBatchNormForwardTraining : 0.000150711 sec
+OP_ID41. ReLU : 9.84902e-05 sec
+OP_ID42. SetValue : 3.18176e-06 sec
+OP_ID43. cudnnConvForward : 0.000402752 sec
+OP_ID44. GpuBatchNormForwardTraining : 0.000571523 sec
+OP_ID45. Add : 0.000542435 sec
+OP_ID46. ReLU : 0.000372539 sec
+OP_ID47. SetValue : 3.24672e-06 sec
+OP_ID48. cudnnConvForward : 0.000493054 sec
+OP_ID49. GpuBatchNormForwardTraining : 0.000293142 sec
+OP_ID50. ReLU : 0.000190047 sec
+OP_ID51. SetValue : 3.14784e-06 sec
+OP_ID52. cudnnConvForward : 0.00148837 sec
+OP_ID53. GpuBatchNormForwardTraining : 8.34794e-05 sec
+OP_ID54. ReLU : 5.23254e-05 sec
+OP_ID55. SetValue : 3.40096e-06 sec
+OP_ID56. cudnnConvForward : 0.000292971 sec
+OP_ID57. GpuBatchNormForwardTraining : 0.00029174 sec
+OP_ID58. SetValue : 3.3248e-06 sec
+OP_ID59. cudnnConvForward : 0.000590154 sec
+OP_ID60. GpuBatchNormForwardTraining : 0.000294149 sec
+OP_ID61. Add : 0.000275119 sec
+OP_ID62. ReLU : 0.000189268 sec
+OP_ID63. SetValue : 3.2704e-06 sec
+OP_ID64. cudnnConvForward : 0.000341232 sec
+OP_ID65. GpuBatchNormForwardTraining : 8.3304e-05 sec
+OP_ID66. ReLU : 5.23667e-05 sec
+OP_ID67. SetValue : 3.19936e-06 sec
+OP_ID68. cudnnConvForward : 0.000542484 sec
+OP_ID69. GpuBatchNormForwardTraining : 8.60537e-05 sec
+OP_ID70. ReLU : 5.2479e-05 sec
+OP_ID71. SetValue : 3.41824e-06 sec
+OP_ID72. cudnnConvForward : 0.000291295 sec
+OP_ID73. GpuBatchNormForwardTraining : 0.000292795 sec
+OP_ID74. Add : 0.000274438 sec
+OP_ID75. ReLU : 0.000189689 sec
+OP_ID76. SetValue : 3.21984e-06 sec
+OP_ID77. cudnnConvForward : 0.000338776 sec
+OP_ID78. GpuBatchNormForwardTraining : 8.484e-05 sec
+OP_ID79. ReLU : 5.29408e-05 sec
+OP_ID80. SetValue : 3.18208e-06 sec
+OP_ID81. cudnnConvForward : 0.000545542 sec
+OP_ID82. GpuBatchNormForwardTraining : 8.40976e-05 sec
+OP_ID83. ReLU : 5.2256e-05 sec
+OP_ID84. SetValue : 3.36256e-06 sec
+OP_ID85. cudnnConvForward : 0.000293003 sec
+OP_ID86. GpuBatchNormForwardTraining : 0.0002989 sec
+OP_ID87. Add : 0.000275041 sec
+OP_ID88. ReLU : 0.000189867 sec
+OP_ID89. SetValue : 3.1184e-06 sec
+OP_ID90. cudnnConvForward : 0.000340417 sec
+OP_ID91. GpuBatchNormForwardTraining : 8.39395e-05 sec
+OP_ID92. ReLU : 5.26544e-05 sec
+OP_ID93. SetValue : 3.2336e-06 sec
+OP_ID94. cudnnConvForward : 0.000539787 sec
+OP_ID95. GpuBatchNormForwardTraining : 8.2753e-05 sec
+OP_ID96. ReLU : 4.86758e-05 sec
+OP_ID97. SetValue : 3.24384e-06 sec
+OP_ID98. cudnnConvForward : 0.000287108 sec
+OP_ID99. GpuBatchNormForwardTraining : 0.000293127 sec
+OP_ID100. Add : 0.000269478 sec
+.
+.
+.
+```
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-cpu-dev.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-cpu-dev.md
new file mode 100644
index 0000000..343da14
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-cpu-dev.md
@@ -0,0 +1,13 @@
+---
+id: version-4.0.0_Chinese-wheel-cpu-dev
+title: CPU only (dev version)
+original_id: wheel-cpu-dev
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## 3.0.0.dev200720
+
+- [Python 3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720-cp36-cp36m-manylinux2014_x86_64.whl)
+- [Python 3.7](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720-cp37-cp37m-manylinux2014_x86_64.whl)
+- [Python 3.8](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720-cp38-cp38-manylinux2014_x86_64.whl)
diff --git a/docs-site/docs/wheel-cpu.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-cpu.md
similarity index 67%
copy from docs-site/docs/wheel-cpu.md
copy to docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-cpu.md
index 295fced..4993cc7 100644
--- a/docs-site/docs/wheel-cpu.md
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-cpu.md
@@ -1,22 +1,11 @@
 ---
-id: wheel-cpu
+id: version-4.0.0_Chinese-wheel-cpu
 title: CPU only
+original_id: wheel-cpu
 ---
 
 <!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
 
-## 3.3.0
-
-- [Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0-cp36-cp36m-manylinux2014_x86_64.whl)
-- [Python 3.7](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0-cp37-cp37m-manylinux2014_x86_64.whl)
-- [Python 3.8](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0-cp38-cp38-manylinux2014_x86_64.whl)
-
-## 3.2.0
-
-- [Python 3.6](https://www.comp.nus.edu.sg/~dbsystem/wheelhouse/singa-3.2.0-cp36-cp36m-manylinux2014_x86_64.whl)
-- [Python 3.7](https://www.comp.nus.edu.sg/~dbsystem/wheelhouse/singa-3.2.0-cp37-cp37m-manylinux2014_x86_64.whl)
-- [Python 3.8](https://www.comp.nus.edu.sg/~dbsystem/wheelhouse/singa-3.2.0-cp38-cp38-manylinux2014_x86_64.whl)
-
 ## 3.1.0
 
 - [Python 3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.1.0-cp36-cp36m-manylinux2014_x86_64.whl)
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-gpu-dev.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-gpu-dev.md
new file mode 100644
index 0000000..cb78bff
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-gpu-dev.md
@@ -0,0 +1,13 @@
+---
+id: version-4.0.0_Chinese-wheel-gpu-dev
+title: CUDA enabled (dev version)
+original_id: wheel-gpu-dev
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## 3.0.0.dev200720
+
+- [CUDA10.2, cuDNN 7.6.5, Python 3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.7](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720%2Bcuda10.2-cp37-cp37m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.8](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720%2Bcuda10.2-cp38-cp38-manylinux2014_x86_64.whl)
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-gpu.md b/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-gpu.md
new file mode 100644
index 0000000..efdb201
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Chinese/wheel-gpu.md
@@ -0,0 +1,22 @@
+---
+id: version-4.0.0_Chinese-wheel-gpu
+title: CUDA enabled
+original_id: wheel-gpu
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## 3.1.0
+
+- [CUDA10.2, cuDNN 7.6.5, Python
+  3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.1.0%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python
+  3.7](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.1.0%2Bcuda10.2-cp37-cp37m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python
+  3.8](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.1.0%2Bcuda10.2-cp38-cp38-manylinux2014_x86_64.whl)
+
+## 3.0.0
+
+- [CUDA10.2, cuDNN 7.6.5, Python 3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.7](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0%2Bcuda10.2-cp37-cp37m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.8](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0%2Bcuda10.2-cp38-cp38-manylinux2014_x86_64.whl)
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/autograd.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/autograd.md
new file mode 100644
index 0000000..237633e
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/autograd.md
@@ -0,0 +1,282 @@
+---
+id: version-4.0.0_Viet-autograd
+title: Autograd
+original_id: autograd
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Có hai cách thường dùng để sử dụng autograd, qua symbolic differentiation như là
+[Theano](http://deeplearning.net/software/theano/index.html) hoặc reverse
+differentiation như là
+[Pytorch](https://pytorch.org/docs/stable/notes/autograd.html). SINGA dùng cách
+Pytorch, lưu trữ computation graph rồi áp dụng backward propagation tự động sau
+khi forward propagation. Thuật toán autograd được giải thích cụ thể ở
+[đây](https://pytorch.org/docs/stable/notes/autograd.html). Chúng tôi giải thích
+các modules liên quan trong Singa và đưa ra ví dụ để minh hoạ cách sử dụng.
+
+## Các Module liên quan
+
+Autograd gồm ba classes với tên gọi `singa.tensor.Tensor`,
+`singa.autograd.Operation`, và `singa.autograd.Layer`. Trong phần tiếp theo của
+văn kiện này, chúng tôi dùng tensor, operation và layer để chỉ một chương trình
+(instance) trong class tương ứng.
+
+### Tensor
+
+Ba tính năng của Tensor được sử dụng bởi autograd,
+
+- `.creator` là một chương trình `Operation`. Chương trình này lưu trữ tác vụ
+  tạo ra Tensor instance.
+- `.requires_grad` là một biến kiểu bool. Biến được sử dụng để chỉ rằng thuật
+  toán autograd cần tính ra độ dốc (gradient) của tensor (tức là owner). Ví dụ,
+  khi chạy backpropagation, thì cần phải tính ra độ dốc của tensor cho ma trận
+  trọng lượng (weight matrix) của lớp tuyến tính (linear layer) và bản đồ tính
+  năng (feature map) của convolution layer (không phải lớp cuối).
+- `.stores_grad` là một biến kiểu bool. Biến được sử dụng để chỉ rằng độ dốc của
+  owner tensor cần được lưu và tạo ra bởi hàm backward. Ví dụ, độ dốc của
+  feature maps được tính thông qua backpropagation, nhưng không được bao gồm
+  trong kết quả của hàm backward.
+
+Lập trình viên có thể thay đổi `requires_grad` và `stores_grad` của chương trình
+Tensor. Ví dụ nếu hàm sau để là True, độ dốc tương ứng sẽ được bao gồm trong kết
+quả của hàm backward. Cần lưu ý rằng nếu `stores_grad` để là True, thì
+`requires_grad` cũng phải là True, nhưng điều ngược lại thì không bắt buộc.
+
+### Operation
+
+Hàm chạy một hoặc một vài chương trình `Tensor` instances ở đầu vào, sau đó đầu
+ra là một hoặc một vài chương trình `Tensor` instances. Ví dụ, hàm ReLU có thể
+được sử dụng như một subclass của một hàm Operation cụ thể. Khi gọi một chương
+trình `Operation` (sau cài đặt), cần thực hiện hai bước sau:
+
+1. Ghi lại hàm operations nguồn, vd. biến `creator` của tensor đầu vào.
+2. làm tính toán bằng cách gọi hàm thành viên `.forward()`
+
+Có hai hàm thành viên cho forwarding và backwarding, vd. `.forward()` và
+`.backward()`. Đầu vào là `Tensor.data` (thuộc loại `CTensor`), và đầu ra là
+`CTensor`. Nếu muốn thêm một hàm operation thì subclass `operation` cần chạy
+riêng `.forward()` và `.backward()`. Hàm `backward()` được tự động gọi bởi hàm
+`backward()` của autograd trong quá trình chạy backward để thực hiện độ dốc của
+đầu vào (theo mục `requires_grad`).
+
+### Layer
+
+Với các hàm yêu cầu tham số (parameter), chúng tôi gói chúng lại thành một class
+mới, `Layer`. Ví dụ hàm convolution operation thì được nhóm vào trong
+convolution layer. `Layer` quản lý (hoặc lưu trữ) các tham số và sẽ gọi các hàm
+`Operation` tương ứng để thực hiện việc chuyển đổi.
+
+## Ví dụ
+
+Chúng tôi cung cấp nhiều ví dụ trong
+[mục ví dụ](https://github.com/apache/singa/tree/master/examples/autograd).
+Chúng tôi đưa ra giải thích cụ thể trong hai ví dụ tiêu biểu ở đây.
+
+### Dùng hàm Operation
+
+Code dưới đây áp dụng model MLP, chỉ dùng hàm Operation (không dùng hàm Layer).
+
+#### Thêm packages
+
+```python
+from singa.tensor import Tensor
+from singa import autograd
+from singa import opt
+```
+
+#### Tạo ma trận trọng lượng (weight matrix) và bias vector
+
+Tham số tensors được tạo bởi cả `requires_grad` và `stores_grad` ở giá trị
+`True`.
+
+```python
+w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
+w0.gaussian(0.0, 0.1)
+b0 = Tensor(shape=(1, 3), requires_grad=True, stores_grad=True)
+b0.set_value(0.0)
+
+w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
+w1.gaussian(0.0, 0.1)
+b1 = Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
+b1.set_value(0.0)
+```
+
+#### Training
+
+```python
+inputs = Tensor(data=data)  # data matrix
+target = Tensor(data=label) # label vector
+autograd.training = True    # cho training
+sgd = opt.SGD(0.05)   # optimizer
+
+for i in range(10):
+    x = autograd.matmul(inputs, w0) # matrix multiplication
+    x = autograd.add_bias(x, b0)    # add the bias vector
+    x = autograd.relu(x)            # ReLU activation operation
+
+    x = autograd.matmul(x, w1)
+    x = autograd.add_bias(x, b1)
+
+    loss = autograd.softmax_cross_entropy(x, target)
+
+    for p, g in autograd.backward(loss):
+        sgd.update(p, g)
+```
+
+### Hàm Operation + Layer
+
+[Ví dụ](https://github.com/apache/singa/blob/master/examples/autograd/mnist_cnn.py)
+sau đây áp dụng CNN model sử dụng các lớp (layers) tạo từ autograd module.
+
+#### Tạo layers
+
+```python
+conv1 = autograd.Conv2d(1, 32, 3, padding=1, bias=False)
+bn1 = autograd.BatchNorm2d(32)
+pooling1 = autograd.MaxPool2d(3, 1, padding=1)
+conv21 = autograd.Conv2d(32, 16, 3, padding=1)
+conv22 = autograd.Conv2d(32, 16, 3, padding=1)
+bn2 = autograd.BatchNorm2d(32)
+linear = autograd.Linear(32 * 28 * 28, 10)
+pooling2 = autograd.AvgPool2d(3, 1, padding=1)
+```
+
+#### Định nghĩa hàm forward
+
+Hàm trong forward pass sẽ được tự động lưu cho backward propagation.
+
+```python
+def forward(x, t):
+    # x là input data (batch hình ảnh)
+    # t là label vector (batch số nguyên)
+    y = conv1(x)           # Conv layer
+    y = autograd.relu(y)   # ReLU operation
+    y = bn1(y)             # BN layer
+    y = pooling1(y)        # Pooling Layer
+
+    # hai convolution layers song song
+    y1 = conv21(y)
+    y2 = conv22(y)
+    y = autograd.cat((y1, y2), 1)  # cat operation
+    y = autograd.relu(y)           # ReLU operation
+    y = bn2(y)
+    y = pooling2(y)
+
+    y = autograd.flatten(y)        # flatten operation
+    y = linear(y)                  # Linear layer
+    loss = autograd.softmax_cross_entropy(y, t)  # operation
+    return loss, y
+```
+
+#### Training
+
+```python
+autograd.training = True
+for epoch in range(epochs):
+    for i in range(batch_number):
+        inputs = tensor.Tensor(device=dev, data=x_train[
+                               i * batch_sz:(1 + i) * batch_sz], stores_grad=False)
+        targets = tensor.Tensor(device=dev, data=y_train[
+                                i * batch_sz:(1 + i) * batch_sz], requires_grad=False, stores_grad=False)
+
+        loss, y = forward(inputs, targets) # forward the net
+
+        for p, gp in autograd.backward(loss):  # auto backward
+            sgd.update(p, gp)
+```
+
+### Sử dụng Model API
+
+[Ví dụ](https://github.com/apache/singa/blob/master/examples/cnn/model/cnn.py)
+sau áp dụng CNN model sử dụng [Model API](./graph).
+
+#### Định nghĩa subclass của Model
+
+Model class được định nghĩa là subclass của Model. Theo đó, tất cả các hàm
+operations được sử dụng trong bước training sẽ tạo thành một computational graph
+và được phân tích. Hàm operation trong graph sẽ được lên lịch trình và thực hiện
+hiệu quả. Layers cũng có thể được bao gồm trong model class.
+
+```python
+class MLP(model.Model):  # model là subclass của Model
+
+    def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
+        super(MLP, self).__init__()
+
+        # tạo operators, layers và các object khác
+        self.relu = layer.ReLU()
+        self.linear1 = layer.Linear(perceptron_size)
+        self.linear2 = layer.Linear(num_classes)
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, inputs):  # định nghĩa forward function
+        y = self.linear1(inputs)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+        self.optimizer(loss)
+        return out, loss
+
+    def set_optimizer(self, optimizer):  # đính kèm optimizer
+        self.optimizer = optimizer
+```
+
+#### Training
+
+```python
+# tạo hàm model instance
+model = MLP()
+# tạo optimizer và đính vào model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+# input và target placeholders cho model
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+# tổng hợp model trước khi training
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
+
+# train model theo bước lặp (iterative)
+for b in range(num_train_batch):
+    # generate the next mini-batch
+    x, y = ...
+
+    # Copy the data into input tensors
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    # Training with one batch
+    out, loss = model(tx, ty)
+```
+
+#### Lưu model checkpoint
+
+```python
+# xác định đường dẫn để lưu checkpoint
+checkpointpath="checkpoint.zip"
+
+# lưu checkpoint
+model.save_states(fpath=checkpointpath)
+```
+
+#### Tải model checkpoint
+
+```python
+# xác định đường dẫn của checkpoint cần tải
+checkpointpath="checkpoint.zip"
+
+# tải checkpoint
+import os
+if os.path.exists(checkpointpath):
+    model.load_states(fpath=checkpointpath)
+```
+
+### Python API
+
+Xem
+[tại đây](https://singa.readthedocs.io/en/latest/autograd.html#module-singa.autograd)
+để thêm thông tin chi tiết về Python API.
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/benchmark-train.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/benchmark-train.md
new file mode 100644
index 0000000..a8826c0
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/benchmark-train.md
@@ -0,0 +1,29 @@
+---
+id: version-4.0.0_Viet-benchmark-train
+title: Benchmark cho Distributed Training
+original_id: benchmark-train
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Tải công việc: chúng tôi sử dụng Mạng nơ-ron tích chập sâu (deep convolutional
+neural network),
+[ResNet-50](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
+làm ứng dụng. ResNet-50 có 50 lớp tích chập (convolution layers) để phân loại
+hình ảnh. Nó đòi hỏi 3.8 GFLOPs để đưa vào một hình ảnh (kích thước ảnh 224x224)
+qua mạng lưới. Kích thước ảnh đầu vào là 224x224.
+
+Phần cứng: chúng tôi sử dụng máy p2.8xlarge từ AWS, mỗi máy gồm 8 Nvidia Tesla
+K80 GPUs, bộ nhớ tổng cộng 96 GB GPU, 32 vCPU, 488 GB main memory, 10 Gbps
+network bandwidth.
+
+Metric: chúng tôi tính thời gian mỗi bước cho mỗi workers để đánh giá khả năng
+mở rộng của SINGA. Kích thước của mỗi nhóm được cố định ở 32 mỗi GPU. Phương
+thức training đồng bộ (Synchronous training scheme) được áp dụng. Vì thế, kích
+thước nhóm hiệu quả là $32N$, trong đó N là số máy GPUs. Chúng tôi so sánh với
+một hệ thống mở được dùng phổ biến có sử dụng tham số server cấu trúc liên kết.
+Máy GPU đầu tiên được chọn làm server.
+
+![Thí nghiệm Benchmark](assets/benchmark.png) <br/> **Kiểm tra khả năng mở rộng.
+Bars được dùng cho thông lượng (throughput); lines dùng cho lượng kết nối
+(communication cost).**
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/build.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/build.md
new file mode 100644
index 0000000..4a23480
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/build.md
@@ -0,0 +1,523 @@
+---
+id: version-4.0.0_Viet-build
+title: Cài đặt SINGA từ Nguồn (Source)
+original_id: build
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Các tệp nguồn có thể được tải dưới dạng
+[tar.gz file](https://dist.apache.org/repos/dist/dev/singa/), hoặc git repo
+
+```shell
+$ git clone https://github.com/apache/singa.git
+$ cd singa/
+```
+
+Nếu bạn muốn tham gia đóng góp code cho SINGA, tham khảo
+[mục contribute-code](contribute-code.md) với các bước làm và yêu cầu kĩ thuật.
+
+## Sử dụng Conda để cài SINGA
+
+Conda-build là phần mềm giúp cài đặt thư viện chương trình từ dữ liệu đám mây
+anaconda và thực hiện các tập lệnh tạo chương trình.
+
+Để cài đặt conda-build (sau khi cài conda)
+
+```shell
+conda install conda-build
+```
+
+### Tạo phiên bản CPU
+
+Để tạo phiên bản CPU cho SINGA
+
+```shell
+conda build tool/conda/singa/
+```
+
+Lệnh trên đã được kiểm tra trên Ubuntu (14.04, 16.04 và 18.04) và macOS 10.11.
+Tham khảo [trang Travis-CI](https://travis-ci.org/apache/singa) để biết thêm chi
+tiết.
+
+### Tạo phiên bản GPU
+
+Để tạo phiên bản GPU cho SINGA, máy để cài phải có Nvidia GPU, và CUDA driver (>=
+384.81), phải được cài đặt CUDA toolkit (>=9) và cuDNN (>=7). Hai Docker images
+dưới đây cung cấp environment để chạy:
+
+1. apache/singa:conda-cuda9.0
+2. apache/singa:conda-cuda10.0
+
+Sau khi environment để chạy đã sẵn sàng, bạn cần phải export phiên bản CUDA
+trước, sau đó chạy lệnh conda để cài SINGA:
+
+```shell
+export CUDA=x.y (e.g. 9.0)
+conda build tool/conda/singa/
+```
+
+### Sau khi chạy chương trình
+
+Vị trí đặt tệp tin của gói chương trình được tạo (`.tar.gz`) hiển thị trên màn
+hình. Gói chương trình được tạo có thể được cài đặt trực tiếp,
+
+```shell
+conda install -c conda-forge --use-local <path to the package file>
+```
+
+hoặc tải lên dữ liệu đám mây anaconda cloud để người dùng khác có thể tải và cài
+đặt. Bạn cần phải đăng kí một tài khoản trên anaconda để có thể
+[tải lên gói chương trình](https://docs.anaconda.com/anaconda-cloud/user-guide/getting-started/).
+
+```shell
+conda install anaconda-client
+anaconda login
+anaconda upload -l main <path to the package file>
+```
+
+Sau khi tải gói chương trình lên dữ liệu đám mây, bạn có thể tìm thấy gói trên
+website của [Anaconda Cloud](https://anaconda.org/) hoặc qua lệnh
+
+```shell
+conda search -c <anaconda username> singa
+```
+
+Mỗi gói chương trình của SINGA được nhận diện theo phiên bản và chuỗi build
+(build string). Để cài một gói chương trình SINGA cụ thể, bạn cần phải cung cấp
+toàn bộ thông tin, vd.
+
+```shell
+conda install -c <anaconda username> -c conda-forge singa=2.1.0.dev=cpu_py36
+```
+
+Để cho lệnh cài đặt không phức tạp, bạn có thể tạo các gói chương trình bổ sung
+sau dựa trên các gói chương trình cho SINGA CPU và GPU mới nhất.
+
+```console
+# for singa-cpu
+conda build tool/conda/cpu/  --python=3.6
+conda build tool/conda/cpu/  --python=3.7
+# for singa-gpu
+conda build tool/conda/gpu/  --python=3.6
+conda build tool/conda/gpu/  --python=3.7
+```
+
+Bởi vậy, khi bạn chạy
+
+```shell
+conda install -c <anaconda username> -c conda-forge singa-xpu
+```
+
+(`xpu` nghĩa là hoặc 'cpu' hoặc 'gpu'), gói SINGA tương ứng thực sự được cài đặt
+như một library phụ thuộc.
+
+## Sử dụng các phương tiện cơ bản để cài đặt SINGA trên Ubuntu
+
+Tham khảo
+[Dockerfiles](https://github.com/apache/singa/blob/master/tool/docker/devel/ubuntu/cuda9/Dockerfile#L30)
+của SINGA để xem hướng dẫn cài đặt các chương trình library phụ thuộc trên
+Ubuntu 16.04. Bạn có thể tạo một Docker container sử dụng [devel images]() và
+cài SINGA trong container. Để cài SINGA với GPU, DNNL, Python và unit tests,
+chạy lệnh theo hướng dẫn sau
+
+```shell
+mkdir build    # tại thư mục nguồn của singa
+cd build
+cmake -DENABLE_TEST=ON -DUSE_CUDA=ON -DUSE_DNNL=ON -DUSE_PYTHON3=ON ..
+make
+cd python
+pip install .
+```
+
+Chi tiết các lựa chọn CMake được giải thích ở phần cuối cùng của trang này. Câu
+lệnh cuối cùng để cài gói Python. Bạn cũng có thể chạy `pip install -e .`, để
+tạo symlinks thay vì copy các tập tin Python vào mục site-package.
+
+Nếu SINGA được compile với ENABLE_TEST=ON, bạn có thể chạy unit test bằng cách
+
+```shell
+$ ./bin/test_singa
+```
+
+Bạn sẽ thấy tất cả các trường hợp test kèm theo kết quả test. Nếu SINGA thông
+qua tất cả các test, bạn đã cài đặt SINGA thành công.
+
+## Sử dụng các phương tiện cơ bản để cài đặt SINGA trên Centos7
+
+Tạo từ nguồn sẽ khác trên Centos7 bởi tên của gói chương trình là khác nhau. Làm
+theo hướng dẫn dưới đây
+
+### Cài các chương trình phụ thuộc (dependent libraries)
+
+Gói/chương trình cơ bản
+
+```shell
+sudo yum install freetype-devel libXft-devel ncurses-devel openblas-devel blas-devel lapack-devel atlas-devel kernel-headers unzip wget pkgconfig zip zlib-devel libcurl-devel cmake curl unzip dh-autoreconf git python-devel glog-devel protobuf-devel
+```
+
+Cho build-essential
+
+```shell
+sudo yum group install "Development Tools"
+```
+
+Để cài đặt swig
+
+```shell
+sudo yum install pcre-devel
+wget http://prdownloads.sourceforge.net/swig/swig-3.0.10.tar.gz
+tar xvzf swig-3.0.10.tar.gz
+cd swig-3.0.10
+./configure --prefix=${RUN}
+make
+make install
+```
+
+Để cài đặt gfortran
+
+```shell
+sudo yum install centos-release-scl-rh
+sudo yum --enablerepo=centos-sclo-rh-testing install devtoolset-7-gcc-gfortran
+```
+
+Để cài đặt pip và các gói chương trình khác
+
+```shell
+sudo yum install epel-release
+sudo yum install python-pip
+pip install matplotlib numpy pandas scikit-learn pydot
+```
+
+### Cài đặt
+
+Làm theo bước 1-5 của mục _Sử dụng các phương tiện cơ bản để cài đặt SINGA trên Ubuntu_
+
+### Kiểm tra (testing)
+
+Bạn có thể chạy unit tests bằng cách,
+
+```shell
+$ ./bin/test_singa
+```
+
+Bạn sẽ thấy tất cả các trường hợp test kèm theo kết quả test. Nếu SINGA thông
+qua tất cả các test, bạn đã cài đặt SINGA thành công.
+
+## Compile SINGA trên Windows
+
+Hướng dẫn cài đặt trên Windows với Python vui lòng xem tại
+[mục install-win](install-win.md).
+
+## Chi tiết bổ sung về các lựa chọn biên dịch (compilation)
+
+### USE_MODULES (không còn sử dụng)
+
+Nếu protobuf và openblas không được cài đặt, bạn có thể compile SINGA cùng với
+chúng.
+
+```shell
+# In SINGA ROOT folder
+$ mkdir build
+$ cd build
+$ cmake -DUSE_MODULES=ON ..
+$ make
+```
+
+cmake sẽ tải OpenBlas và Protobuf (2.6.1) sau đó compile cùng với SINGA.
+
+Bạn có thể sử dụng `ccmake ..` để định dạng các lựa chọn biên dịch
+(compilation). Nếu chương trình phụ thuộc (dependent libraries) nào không có
+trong đường dẫn hệ thống mặc định, bạn cần phải export các biến environment sau:
+
+```shell
+export CMAKE_INCLUDE_PATH=<path to the header file folder>
+export CMAKE_LIBRARY_PATH=<path to the lib file folder>
+```
+
+### USE_PYTHON
+
+Là lựa chọn để compile Python wrapper cho SINGA,
+
+```shell
+$ cmake -DUSE_PYTHON=ON ..
+$ make
+$ cd python
+$ pip install .
+```
+
+### USE_CUDA
+
+Chúng tôi khuyến khích cài đặt CUDA và
+[cuDNN](https://developer.nvidia.com/cudnn) để chạy SINGA trên GPUs nhằm có kết
+quả tốt nhất.
+
+SINGA đã được kiểm nghiệm chạy trên CUDA 9/10, và cuDNN 7. Nếu cuDNN được cài
+đặt vào thư mục không thuộc hệ thống, vd. /home/bob/local/cudnn/, cần chạy các
+lệnh sau để cmake và runtime có thể tìm được
+
+```shell
+$ export CMAKE_INCLUDE_PATH=/home/bob/local/cudnn/include:$CMAKE_INCLUDE_PATH
+$ export CMAKE_LIBRARY_PATH=/home/bob/local/cudnn/lib64:$CMAKE_LIBRARY_PATH
+$ export LD_LIBRARY_PATH=/home/bob/local/cudnn/lib64:$LD_LIBRARY_PATH
+```
+
+Các lựa chọn cmake cho CUDA và cuDNN cần được kích hoạt
+
+```shell
+# các Dependent libs đã được cài đặt
+$ cmake -DUSE_CUDA=ON ..
+$ make
+```
+
+### USE_DNNL
+
+Người dùng có thể kích hoạt DNNL để cải thiện hiệu quả cho chương trình CPU.
+
+Hướng dẫn cài đặt DNNL [tại đây](https://github.com/intel/mkl-dnn#installation).
+
+SINGA đã được thử nghiệm chạy trên DNNL v1.1.
+
+Để chạy SINGA với DNNL:
+
+```shell
+# các Dependent libs đã được cài đặt
+$ cmake -DUSE_DNNL=ON ..
+$ make
+```
+
+### USE_OPENCL
+
+SINGA sử dụng opencl-headers và viennacl (phiên bản 1.7.1 hoặc mới hơn) để hỗ
+trợ OpenCL, có thể được cài đặt qua
+
+```shell
+# Trên Ubuntu 16.04
+$ sudo apt-get install opencl-headers libviennacl-dev
+# Trên Fedora
+$ sudo yum install opencl-headers viennacl
+```
+
+Bên cạnh đó, bạn cần OpenCL Installable Client Driver (ICD) cho nền tảng mà bạn
+muốn chạy OpenCL.
+
+- Với AMD và nVidia GPUs, driver package nên cài đúng bản OpenCL ICD.
+- Với Intel CPUs và/hoặc GPUs, có thể tải driver từ
+  [Intel website.](https://software.intel.com/en-us/articles/opencl-drivers) Lưu
+  ý rằng driver này chỉ hỗ trợ các phiên bản mới của CPUs và Iris GPUs.
+- Với các bản Intel CPUs cũ hơn, bạn có thể sử dụng gói `beignet-opencl-icd`.
+
+Lưu ý rằng chạy OpenCL trên CPUs không được khuyến khích bởi tốc độ chậm. Việc
+truyền dữ liệu bộ nhớ mất thời gian ở mức hàng giây (hàng nghìn ms trên CPUs so
+với vài ms trên GPUs).
+
+Có thể xem thêm thông tin về cách thiết lập environment có chạy OpenCL tại
+[đây](https://wiki.tiker.net/OpenCLHowTo).
+
+Nếu phiên bản của gói chương trình ViennaCL thấp hơn 1.7.1, bạn cần phải tạo từ
+nguồn:
+
+Clone [nguồn tại đây](https://github.com/viennacl/viennacl-dev), chọn (checkout)
+tag `release-1.7.1` để cài đặt. Bạn cần nhớ thêm đường dẫn vào phần `PATH` và
+tạo libraries vào `LD_LIBRARY_PATH`.
+
+Để cài SINGA với hỗ trợ OpenCL (đã thử trên SINGA 1.1):
+
+```shell
+$ cmake -DUSE_OPENCL=ON ..
+$ make
+```
+
+### GÓI CHƯƠNG TRÌNH (PACKAGE)
+
+Cài đặt này được sử dụng để tạo gói chương trình Debian package. Để PACKAGE=ON
+và tạo gói chương trình với lệnh như sau:
+
+```shell
+$ cmake -DPACKAGE=ON
+$ make package
+```
+
+## Câu hỏi thường gặp (Q&A)
+
+- Q: Gặp lỗi khi 'import singa'
+
+  A: Vui lòng kiểm tra chi tiết lỗi từ
+  `python -c "from singa import _singa_wrap"`. Đôi khi lỗi xảy ra bởi các
+  dependent libraries, vd. có nhiều phiên bản protobuf, thiếu cudnn, hoặc phiên
+  bản numpy không tương thích. Các bước sau đưa ra giải pháp cho từng trường
+  hợp:
+
+  1. Kiểm tra cudnn và cuda. Nếu thiếu cudnn hoặc không tương thích với phiên
+     bản của wheel, bạn có thể tải phiên bản đúng của cudnn vào thư mục
+     ~/local/cudnn/ và
+
+     ```shell
+     $ echo "export LD_LIBRARY_PATH=/home/<yourname>/local/cudnn/lib64:$LD_LIBRARY_PATH" >> ~/.bashrc
+     ```
+
+  2. Nếu lỗi liên quan tới protobuf, bạn có thể cài đặt (3.6.1) từ nguồn vào một
+     thư mục trong máy của bạn (local), chẳng hạn ~/local/. Giải nén file tar,
+     sau đó
+
+     ```shell
+     $ ./configure --prefix=/home/<yourname>/local
+     $ make && make install
+     $ echo "export LD_LIBRARY_PATH=/home/<yourname>/local/lib:$LD_LIBRARY_PATH" >> ~/.bashrc
+     $ source ~/.bashrc
+     ```
+
+  3. Nếu không tìm được libs nào bao gồm python, thì tạo virtual env sử dụng
+     `pip` hoặc `conda`;
+
+  4. Nếu lỗi không do các nguyên nhân trên thì đi tới thư mục của
+     `_singa_wrap.so`,
+
+     ```shell
+     $ python
+     >> import importlib
+     >> importlib.import_module('_singa_wrap')
+     ```
+
+  kiểm tra thông báo lỗi. Ví dụ nếu phiên bản numpy không tương thích, thông báo
+  lỗi sẽ là
+
+  ```shell
+  RuntimeError: module compiled against API version 0xb but this version of numpy is 0xa
+  ```
+
+  sau đó bạn cần phải nâng cấp numpy.
+
+* Q: Lỗi khi chạy `cmake ..`, không tìm được dependent libraries.
+
+  A: Nếu bạn vẫn chưa cài đặt libraries đó, thì cài đặt chúng. Nếu bạn cài
+  libraries trong thư mục bên ngoài thư mục system, chẳng hạn như /usr/local,
+  bạn cần export các biến sau đây
+
+  ```shell
+  $ export CMAKE_INCLUDE_PATH=<path to your header file folder>
+  $ export CMAKE_LIBRARY_PATH=<path to your lib file folder>
+  ```
+
+- Q: Lỗi từ `make`, vd. linking phase
+
+  A: Nếu libraries nằm trong thư mục không phải là thư mục system mặc định trong
+  đường dẫn, bạn cần export các biến sau
+
+  ```shell
+  $ export LIBRARY_PATH=<path to your lib file folder>
+  $ export LD_LIBRARY_PATH=<path to your lib file folder>
+  ```
+
+* Q: Lỗi từ các tệp tin headers vd. 'cblas.h no such file or directory exists'
+
+  A: Bạn cần bao gồm các thư mục cblas.h vào CPLUS_INCLUDE_PATH, e.g.,
+
+  ```shell
+  $ export CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$CPLUS_INCLUDE_PATH
+  ```
+
+* Q: Khi compile SINGA, gặp lỗi `SSE2 instruction set not enabled`
+
+  A: Bạn có thể thử lệnh sau:
+
+  ```shell
+  $ make CFLAGS='-msse2' CXXFLAGS='-msse2'
+  ```
+
+* Q: Gặp lỗi `ImportError: cannot import name enum_type_wrapper` từ
+  google.protobuf.internal khi tôi cố gắng import các tệp tin dạng .py.
+
+  A: Bạn cần cài đặt python cho protobuf, có thể cài đặt qua
+
+  ```shell
+  $ sudo apt-get install protobuf
+  ```
+
+hoặc từ nguồn
+
+```shell
+$ cd /PROTOBUF/SOURCE/FOLDER
+$ cd python
+$ python setup.py build
+$ python setup.py install
+```
+
+- Q: Khi tôi tạo OpenBLAS từ nguồn, tôi gặp yêu cầu cần phải có Fortran
+  compiler.
+
+  A: Bạn có thể compile OpenBLAS bằng cách
+
+  ```shell
+  $ make ONLY_CBLAS=1
+  ```
+
+  hoặc cài đặt sử dụng
+
+  ```shell
+  $ sudo apt-get install libopenblas-dev
+  ```
+
+- Q: Khi tôi tạo protocol buffer, thì bị thông báo `GLIBC++_3.4.20` không được
+  tìm thấy trong `/usr/lib64/libstdc++.so.6`?
+
+  A: Nghĩa là linker trong libstdc++.so.6 nhưng chương trình này thuộc về một
+  phiên bản cũ hơn của GCC đã được dùng để compile và link chương trình. Chương
+  trình phụ thuộc vào code viết trong phiên bản libstdc++ cập nhật thuộc về
+  phiên bản mới hơn của GCC, vì vậy linker phải chỉ ra cách để cài phiên bản
+  libstdc++ mới hơn được chia sẻ trong library. Cách đơn giản nhất để sửa lỗi
+  này là tìm phiên bản đúng cho libstdc++ và export nó vào LD_LIBRARY_PATH. Ví
+  dụ nếu GLIBC++\_3.4.20 có trong output của lệnh sau
+
+        $ strings /usr/local/lib64/libstdc++.so.6|grep GLIBC++
+
+  thì bạn chỉ cần tạo biến environment
+
+        $ export LD_LIBRARY_PATH=/usr/local/lib64:$LD_LIBRARY_PATH
+
+- Q: Khi tạo glog, nhận thông báo "src/logging_unittest.cc:83:20: error:
+  ‘gflags’ is not a namespace-name"
+
+  A: Có thể do bạn đã cài gflags với một namespace khác như là "google". vì thế
+  glog không thể tìm thấy 'gflags' namespace. Do cài glog thì không cần phải có
+  gflags. Nên bạn cần sửa tệp tin configure.ac thành ignore gflags.
+
+        1. cd to glog src directory
+        2. change line 125 of configure.ac  to "AC_CHECK_LIB(gflags, main, ac_cv_have_libgflags=0, ac_cv_have_libgflags=0)"
+        3. autoreconf
+
+  Sau đó bạn có thể cài lại glog.
+
+- Q: Khi sử dụng virtual environment, bất cứ khi nào tôi chạy pip install, pip
+  sẽ tự cài lại numpy. Tuy nhiên, numpy này không được sử dụng khi tôi
+  `import numpy`
+
+  A: Lỗi có thể gây ra bởi `PYTHONPATH` vốn nên được để trống (empty) khi bạn sử
+  dụng virtual environment nhằm tránh conflict với đường dẫn của virtual
+  environment.
+
+- Q: Khi compile PySINGA từ nguồn, có lỗi compilation do thiếu
+  `<numpy/objectarray.h>`
+
+  A: Vui lòng cài đặt numpy và export đường dẫn của tệp tin numpy header như sau
+
+        $ export CPLUS_INCLUDE_PATH=`python -c "import numpy; print(numpy.get_include())"`:$CPLUS_INCLUDE_PATH
+
+- Q: Khi chạy SINGA trên Mac OS X, tôi gặp lỗi "Fatal Python error:
+  PyThreadState_Get: no current thread Abort trap: 6"
+
+  A: Lỗi này thường xảy ra khi bạn có nhiều phiên bản Python trong hệ thống, và
+  bạn cài SINGA qua pip (vấn đề này có thể được giải quyết nếu cài đặt bằng
+  conda), vd. một bên qua OS và một bên cài đặt qua Homebrew. Python dùng trong
+  PySINGA phải trùng với Python interpreter đang chạy. Bạn có thể kiểm tra trình thông dịch
+  (interpreter) của mình bằng `which python` và kiểm tra bản Python dùng trong
+  PySINGA qua `otool -L <path to _singa_wrap.so>`. Để sửa lỗi này, bạn compile
+  SINGA với đúng phiên bản mà SINGA cần. Cụ thể, nếu bạn tạo PySINGA từ nguồn,
+  bạn cần cụ thể đường dẫn khi gọi
+  [cmake](http://stackoverflow.com/questions/15291500/i-have-2-versions-of-python-installed-but-cmake-is-using-older-version-how-do)
+
+        $ cmake -DPYTHON_LIBRARY=`python-config --prefix`/lib/libpython2.7.dylib -DPYTHON_INCLUDE_DIR=`python-config --prefix`/include/python2.7/ ..
+
+  Nếu cài đặt PySINGA từ gói binary packages, vd. debian hay wheel, thì bạn cần
+  thay đổi trình thông dịch của python (python interpreter), vd., reset \$PATH
+  để đường dẫn đúng của Python ở đằng trước.
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/contribute-code.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/contribute-code.md
new file mode 100644
index 0000000..9040734
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/contribute-code.md
@@ -0,0 +1,128 @@
+---
+id: version-4.0.0_Viet-contribute-code
+title: Tham gia viết code
+original_id: contribute-code
+---
+
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed [...]
+
+## Định dạng mã code
+
+Nền tảng code của SINGA tuân theo định dạng Google cho cả code
+[CPP](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml) và
+[Python](http://google.github.io/styleguide/pyguide.html).
+
+Một cách đơn giản để thực hiện định dạng lập trình Google là sử dụng linting và
+các công cụ định dạng trong Visual Studio Code editor:
+
+- [C/C++ extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cpptools)
+- [Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python)
+- [cpplint extension](https://marketplace.visualstudio.com/items?itemName=mine.cpplint)
+- [Clang-Format](https://marketplace.visualstudio.com/items?itemName=xaver.clang-format)
+
+Sau khi cài extensions, chỉnh sửa tệp tin `settings.json`.
+
+```json
+{
+  "[cpp]": {
+    "editor.defaultFormatter": "xaver.clang-format"
+  },
+  "cpplint.cpplintPath": "path/to/cpplint",
+
+  "editor.formatOnSave": true,
+  "python.formatting.provider": "yapf",
+  "python.linting.enabled": true,
+  "python.linting.lintOnSave": true,
+  "clang-format.language.cpp.style": "google",
+  "python.formatting.yapfArgs": ["--style", "{based_on_style: google}"]
+}
+```
+
+Dựa vào nền tảng bạn đang sử dụng, tệp tin user settings được đặt tại đây:
+
+1. Windows %APPDATA%\Code\User\settings.json
+2. macOS "\$HOME/Library/Application Support/Code/User/settings.json"
+3. Linux "\$HOME/.config/Code/User/settings.json"
+
+Thông số cấu hình cụ thể có trong các tệp tin config file tương ứng. Những công
+cụ này sẽ tự động tìm kiếm các tập tin cấu hình configuration files trong root
+của dự án, vd. `.pylintrc`.
+
+#### Cài đặt công cụ
+
+Tốt nhất là tất cả người tham gia viết mã code sử dụng cùng một phiên bản công
+cụ định dạng mã code (clang-format 9.0.0 và yapf 0.29.0), để tất cả định dạng mã
+code sẽ giống nhau dù thuộc về các PRs khác nhau, nhằm tránh tạo conflict trong
+github pull request.
+
+Trước tiên, cài đặt LLVM 9.0 cung cấp clang-format phiên bản 9.0.0. Trang tải
+LLVM là:
+
+- [LLVM](http://releases.llvm.org/download.html#9.0.0)
+
+  - Trên Ubuntu
+
+    ```sh
+    sudo apt-get install clang-format-9
+    ```
+
+  - Trên Windows. Tải gói pre-built và cài đặt
+
+Sau đó, cài cpplint, pylint và yapf
+
+- Ubuntu or OSX:
+
+  ```
+  $ sudo pip install cpplint
+  $ which cpplint
+  /path/to/cpplint
+
+  $ pip install yapf==0.29.0
+  $ pip install pylint
+  ```
+
+- Windows: Cài Anaconda cho gói quản lý package management.
+
+  ```
+  $ pip install cpplint
+  $ where cpplint
+  C:/path/to/cpplint.exe
+
+  $ pip install yapf==0.29.0
+  $ pip install pylint
+  ```
+
+#### Sử dụng
+
+- Sau khi kích hoạt, linting sẽ tự động được áp dụng khi bạn chỉnh sửa các tập
+  tin mã code nguồn (source code file). Lỗi và cảnh báo sẽ hiển thị trên thanh
+  Visual Studio Code `PROBLEMS`.
+- Định dạng mã code có thể thực hiện bằng cách sử dụng Command
+  Palette(`Shift+Ctrl+P` cho Windows hay `Shift+Command+P` cho OSX) và gõ
+  `Format Document`.
+
+#### Gửi
+
+Bạn cần phải chữa lỗi định dạng nếu có trước khi gửi đi pull requests.
+
+## Tạo Environment
+
+Chúng tôi khuyến khích dùng Visual Studio Code để viết code. Có thể cài các
+Extensions như Python, C/C++, Code Spell Checker, autoDocstring, vim, Remote
+Development. Tham khảo cấu hình (vd., `settings.json`) của các extensions
+[tại đây](https://gist.github.com/nudles/3d23cfb6ffb30ca7636c45fe60278c55).
+
+Nếu bạn cập nhật mã code CPP, bạn cần recompile SINGA [từ nguồn](./build.md).
+Nên sử dụng các công cụ cài đặt cơ bản trong `*-devel` Docker images hay
+`conda build`.
+
+Nếu bạn chỉ cập nhật mã code Python, bạn cần cài đặt SINGA một lần, sau đó copy
+các tập tin Python cập nhật để thay thế chúng trong thư mục cài đặt Python,
+
+```shell
+cp python/singa/xx.py  <path to conda>/lib/python3.7/site-packages/singa/
+```
+
+## Trình Tự
+
+Vui lòng tham khảo mục [git workflow](./git-workflow.md).
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/contribute-docs.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/contribute-docs.md
new file mode 100644
index 0000000..7ac8763
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/contribute-docs.md
@@ -0,0 +1,106 @@
+---
+id: version-4.0.0_Viet-contribute-docs
+title: Tham gia chỉnh sửa Hướng Dẫn Sử Dụng
+original_id: contribute-docs
+---
+
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed [...]
+
+Hướng Dẫn Sử Dụng có hai dạng, dạng tập tin markdown và dạng sử dụng API
+reference. Tài liệu này giới thiệu vài công cụ và chỉ dẫn trong việc chuẩn bị
+các tập tin nguồn markdown và chú thích API.
+
+Tập tin markdown sẽ được sử dụng trong việc tạo trang HTML qua
+[Docusaurus](https://docusaurus.io/); Chú thích API (từ nguồn mã code) sẽ được
+sử dụng để tạo các trang tham khảo API sử dụng Sphinx (cho Python) và Doxygen
+(cho CPP).
+
+## Tập Tin Markdown
+
+Làm theo [định dạng Văn bản Google](https://developers.google.com/style). Ví dụ,
+
+1. Bỏ 'vui lòng' (please) khỏi bất cứ hướng dẫn sử dụng nào. 'Please click...'
+   thành 'Click ...'.
+2. Làm theo
+   [qui tắc viết hoa tiêu chuẩn](https://owl.purdue.edu/owl/general_writing/mechanics/help_with_capitals.html).
+3. Sử dụng 'bạn' thay cho 'chúng tôi' trong hướng dẫn.
+4. Sử dụng thì hiện tại và tránh sử dụng từ 'sẽ'
+5. Nên dùng dạng chủ động thay vì bị động
+
+Thêm vào đó, để cho nội dung hướng dẫn sử dụng thống nhất,
+
+1. Viết câu ngắn, chẳng hạn độ dài <=80
+2. Sử dụng đường dẫn tương đối, mặc định chúng ta đang ở thư mục root của repo,
+   vd., `docs-site/docs` để chỉ `singa-doc/docs-site/docs`
+3. Nhấn mạnh câu lệnh, đường dẫn, class, function và biến sử dụng backticks,
+   vd., `Tensor`, `singa-doc/docs-site/docs`.
+4. Để nêu bật các điều khoản/khái niệm, sử dụng _graph_ hoặc **graph**
+
+[Công cụ prettier](https://prettier.io/) được sử dụng bởi dự án này sẽ tự làm
+định dạng code dựa trên
+[cấu hình](https://github.com/apache/singa-doc/blob/master/docs-site/.prettierrc)
+khi thực hiện `git commit`. Ví dụ, nó sẽ gói chữ trong các tập tin markdown
+thành nhiều nhất 80 kí tự (trừ các dòng chú thích).
+
+Khi giới thiệu một khái niệm (concept) (vd., class `Tensor`), đưa ra khái quát
+chung (mục đích và mối quan hệ với các khái niệm khác), APIs và ví dụ. Google
+colab có thể được sử dụng để mô phỏng điều này.
+
+Tham khảo [trang](https://github.com/apache/singa-doc/tree/master/docs-site) để
+biết thêm chi tiết về cách chỉnh sửa các tập tin markdown và xây dựng website.
+
+## Tham Khảo API
+
+### CPP API
+
+Thực hiện theo
+[Mẫu chú thích của Google CPP](https://google.github.io/styleguide/cppguide.html#Comments).
+
+Để tạo văn bản, chạy "doxygen" từ thư mục doc (khuyến khích Doxygen >= 1.8)
+
+### Python API
+
+Thực hiện theo
+[Mẫu Google Python DocString](http://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings).
+
+## Visual Studio Code (vscode)
+
+Nếu bạn sử dụng vscode để viết code, các plugins sau sẽ giúp ích.
+
+### Docstring Snippet
+
+[autoDocstring](https://marketplace.visualstudio.com/items?itemName=njpwerner.autodocstring)
+tạo docstring của functions, classes, v.v. Lựa chọn định dạng DocString là
+`google`.
+
+### Kiểm Tra Lỗi Chính Tả
+
+[Code Spell Checker](https://marketplace.visualstudio.com/items?itemName=streetsidesoftware.code-spell-checker)
+có thể được cơ cấu để kiểm tra chú thích trong mã code, hoặc các tập tin .md và
+.rst.
+
+Để kiểm tra lỗi chính tả cho các dòng chú thích trong code Python, thêm vào các
+snippet sau qua `File - Preferences - User Snippets - python.json`
+
+    "cspell check" : {
+    "prefix": "cspell",
+    "body": [
+        "# Chỉ dẫn kiểm tra lỗi chính tả cho các dòng chú thích trong code python và c/cpp",
+        "# cSpell:includeRegExp #.* ",
+        "# cSpell:includeRegExp (\"\"\"|''')[^\1]*\1",
+        "# cSpell: CStyleComment",
+    ],
+    "description": "# chỉ kiểm tra lỗi chính tả cho chú thích trong python"
+    }
+
+Để kiểm tra lỗi chính tả cho các dòng chú thích trong code Cpp, thêm vào các
+snippet sau qua `File - Preferences - User Snippets - cpp.json`
+
+    "cspell check" : {
+    "prefix": "cspell",
+    "body": [
+        "// Chỉ dẫn kiểm tra lỗi chính tả cho các dòng chú thích trong code cpp",
+        "// cSpell:includeRegExp CStyleComment",
+    ],
+    "description": "# chỉ kiểm tra lỗi chính tả cho chú thích trong cpp"
+    }
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/device.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/device.md
new file mode 100644
index 0000000..8e2d1e2
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/device.md
@@ -0,0 +1,33 @@
+---
+id: version-4.0.0_Viet-device
+title: Device
+original_id: device
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Device dùng ở đây nghĩa là thiết bị phần cứng với bộ nhớ và các bộ phận máy
+tính. Tất cả [Tensor operations](./tensor) được sắp xếp bởi các thiết bị
+resident device khi chạy. Bộ nhớ của Tensor luôn luôn được quản lý bởi memory
+manager của thiết bị đó. Bởi vậy việc tận dụng tối đa bộ nhớ và thực hiện được
+tiến hành tại Device class.
+
+## Các thiết bị cụ thể
+
+Hiện tại, SINGA được chạy trên ba Device,
+
+1.  CudaGPU cho cạc Nvidia GPU card chạy code Cuda
+2.  CppCPU cho CPU chạy Cpp code
+3.  OpenclGPU cho cạc GPU chạy OpenCL code
+
+## Ví Dụ Sử Dụng
+
+Code dưới đây là ví dụ về việc tạo device:
+
+```python
+from singa import device
+cuda = device.create_cuda_gpu_on(0)  # sử dụng cạc GPU với ID 0
+host = device.get_default_device()  # tạo host mặc định cho device (CppCPU)
+ary1 = device.create_cuda_gpus(2)  # tạo 2 devices, bắt đầu từ ID 0
+ary2 = device.create_cuda_gpus([0,2])  # tạo 2 devices với ID 0 và 2
+```
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/dist-train.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/dist-train.md
new file mode 100644
index 0000000..94862a1
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/dist-train.md
@@ -0,0 +1,447 @@
+---
+id: version-4.0.0_Viet-dist-train
+title: Distributed Training
+original_id: dist-train
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA hỗ trợ data parallel training trên nhiều GPUs (trên một node hoặc nhiều
+node khác nhau). Sơ đồ sau mô phỏng data parallel training:
+
+![MPI.png](assets/MPI.png)
+
+Trong distributed training, mỗi chỉ lệnh (gọi là worker) chạy một training
+script trên một máy GPU. Mỗi chỉ lệnh (process) có một communication rank riêng.
+Dữ liệu để training được phân cho các worker và model thì được sao chép cho mỗi
+worker. Ở mỗi vòng, worker đọc một mini-batch dữ liệu (vd., 256 hình ảnh) từ
+phần được chia và chạy thuật toán BackPropagation để tính ra độ dốc (gradient)
+của weight, được lấy trung bình qua all-reduce (cung cấp bởi
+[NCCL](https://developer.nvidia.com/nccl)) để cập nhật weight theo thuật toán
+stochastic gradient descent (SGD).
+
+Hàm all-reduce operation bởi NCCL có thể được sử dụng để giảm và đồng bộ hoá độ
+dốc từ các máy GPU khác nhau. Xem thử training với 4 GPUs như dưới đây. Sau khi
+độ dốc (gradients) từ 4 GPUs được tính, all-reduce sẽ trả lại tổng độ dốc
+(gradient) cho các GPU và đưa tới mỗi GPU. Sau đó có thể dễ dàng tính ra độ dốc
+trung bình.
+
+![AllReduce.png](assets/AllReduce.png)
+
+## Sử Dụng
+
+SINGA áp dụng một module gọi là `DistOpt` (là dạng con của `Opt`) cho
+distributed training. Nó gói lại normal SGD optimizer và gọi `Communicator` để
+đồng bộ hoá độ dốc. Ví dụ sau mô phỏng cách sử dụng `DistOpt` để training một
+CNN model với dữ liệu MNIST. Nguồn code có thể tìm
+[tại đây](https://github.com/apache/singa/blob/master/examples/cnn/), và
+[Colab notebook]().
+
+### Code Ví Dụ
+
+1. Định nghĩa neural network model:
+
+```python
+class CNN(model.Model):
+
+    def __init__(self, num_classes=10, num_channels=1):
+        super(CNN, self).__init__()
+        self.conv1 = layer.Conv2d(num_channels, 20, 5, padding=0, activation="RELU")
+        self.conv2 = layer.Conv2d(20, 50, 5, padding=0, activation="RELU")
+        self.linear1 = layer.Linear(500)
+        self.linear2 = layer.Linear(num_classes)
+        self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+        self.relu = layer.ReLU()
+        self.flatten = layer.Flatten()
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.pooling1(y)
+        y = self.conv2(y)
+        y = self.pooling2(y)
+        y = self.flatten(y)
+        y = self.linear1(y)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y, dist_option='fp32', spars=0):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+
+        # cho phép nhiều lựa chọn dùng trong distributed training
+        # Tham khảo mục "Optimizations về Distributed Training"
+        if dist_option == 'fp32':
+            self.optimizer(loss)
+        elif dist_option == 'fp16':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option == 'sparseTopK':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=True,
+                                                      spars=spars)
+        elif dist_option == 'sparseThreshold':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=False,
+                                                      spars=spars)
+        return out, loss
+
+# tạo model
+model = CNN()
+```
+
+2. Tạo `DistOpt` instance và đính nó vào model đã tạo:
+
+```python
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+sgd = opt.DistOpt(sgd)
+model.set_optimizer(sgd)
+dev = device.create_cuda_gpu_on(sgd.local_rank)
+```
+
+Đây là giải thích cho các biến sử dụng trong code:
+
+(i) `dev`
+
+dev dùng để chỉ `Device` instance, nơi tải dữ liệu và chạy CNN model.
+
+(ii) `local_rank`
+
+Local rank chỉ số GPU mà chỉ lệnh (process) hiện tại đang sử dụng trên cùng một
+node. Ví dụ, nếu bạn đang sử dụng một node có 2 GPUs, `local_rank=0` nghĩa là
+chỉ lệnh này đang sử dụng máy GPU đầu tiên, trong khi `local_rank=1` nghĩa là
+đang sử dụng máy GPU thứ hai. Sử dụng MPI hay đa xử lý, bạn có thể chạy cùng một
+tập lệnh training chỉ khác giá trị của `local_rank`.
+
+(iii) `global_rank`
+
+Rank trong global biểu thị global rank cho tất cả các chỉ lệnh (process) trong
+các nodes mà bạn đang sử dụng. Lấy ví dụ trường hợp bạn có 3 nodes và mỗi một
+node có hai GPUs, `global_rank=0` nghĩa là chỉ lệnh đang sử dụng máy GPU đầu
+tiên ở node đầu tiên, `global_rank=2` nghĩa là chỉ lệnh đang sử dụng máy GPU đầu
+tiên ở node thứ 2, và `global_rank=4` nghĩa là chỉ lệnh đang sử dụng máy GPU đầu
+tiên ở node thứ 3.
+
+3. Tải và phân chia dữ liệu để training/validation
+
+```python
+def data_partition(dataset_x, dataset_y, global_rank, world_size):
+    data_per_rank = dataset_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    return dataset_x[idx_start:idx_end], dataset_y[idx_start:idx_end]
+
+train_x, train_y, test_x, test_y = load_dataset()
+train_x, train_y = data_partition(train_x, train_y,
+                                  sgd.global_rank, sgd.world_size)
+test_x, test_y = data_partition(test_x, test_y,
+                                sgd.global_rank, sgd.world_size)
+```
+
+Một phần của bộ dữ liệu (dataset) được trả lại cho `dev`.
+
+Tại đây, `world_size` thể hiện tổng số chỉ lệnh trong tất cả các node mà bạn
+đang sử dụng cho distributed training.
+
+4. Khởi tạo và đồng bộ các tham số của model cho tất cả workers:
+
+```python
+# Đồng bộ tham số ban đầu
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+model.compile([tx], is_train=True, use_graph=graph, sequential=True)
+...
+# Sử dụng cùng một random seed cho các ranks khác nhau
+seed = 0
+dev.SetRandSeed(seed)
+np.random.seed(seed)
+```
+
+5. Chạy BackPropagation và distributed SGD
+
+```python
+for epoch in range(max_epoch):
+    for b in range(num_train_batch):
+        x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
+        y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
+        tx.copy_from_numpy(x)
+        ty.copy_from_numpy(y)
+        # Train the model
+        out, loss = model(tx, ty)
+```
+
+### Hướng Dẫn Thực Hiện
+
+Có hai cách để bắt đầu quá trình training: MPI hoặc Python đa xử lý.
+
+#### Python Đa Xử Lý
+
+Chạy trên một node với nhiều GPUs, trong đó mỗi GPU là một worker.
+
+1. Đặt tất cả các training codes trong cùng một hàm (function)
+
+```python
+def train_mnist_cnn(nccl_id=None, local_rank=None, world_size=None):
+    ...
+```
+
+2. Tạo `mnist_multiprocess.py`
+
+```python
+if __name__ == '__main__':
+    # Generate a NCCL ID to be used for collective communication
+    nccl_id = singa.NcclIdHolder()
+
+    # Define the number of GPUs to be used in the training process
+    world_size = int(sys.argv[1])
+
+    # Define and launch the multi-processing
+    import multiprocessing
+    process = []
+    for local_rank in range(0, world_size):
+        process.append(multiprocessing.Process(target=train_mnist_cnn,
+                       args=(nccl_id, local_rank, world_size)))
+
+    for p in process:
+        p.start()
+```
+
+Dưới đây là giải thích cho các biến tạo ở trên:
+
+(i) `nccl_id`
+
+Lưu ý rằng chúng ta cần phải tạo một NCCL ID ở đây để sử dụng cho collective
+communication, sau đó gửi nó tới tất cả các chỉ lệnh. NCCL ID giống như là vé
+vào cửa, khi chỉ có chỉ lệnh với ID này có thể tham gia vào quá trình
+all-reduce. (Về sau nếu dùng MPI, thì việc sử dụng NCCL ID là không cần thiết,
+bởi vì ID được gửi đi bởi MPI trong code của chúng tôi một cách tự động)
+
+(ii) `world_size`
+
+world_size là số lượng máy GPUs bạn muốn sử dụng cho training.
+
+(iii) `local_rank`
+
+local_rank xác định local rank của distributed training và máy gpu được sử dụng
+trong chỉ lệnh. Trong code bên trên, for loop được sử dụng để chạy hàm train
+function, và local_rank chạy vòng từ 0 tới world_size. Trong trường hợp này, chỉ
+lệnh khác nhau có thể sử dụng máy GPUs khác nhau để training.
+
+Tham số để tạo `DistOpt` instance cần được cập nhật như sau:
+
+```python
+sgd = opt.DistOpt(sgd, nccl_id=nccl_id, local_rank=local_rank, world_size=world_size)
+```
+
+3. Chạy `mnist_multiprocess.py`
+
+```sh
+python mnist_multiprocess.py 2
+```
+
+Kết quả hiển thị tốc độ so với training trên một máy GPU.
+
+```
+Starting Epoch 0:
+Training loss = 408.909790, training accuracy = 0.880475
+Evaluation accuracy = 0.956430
+Starting Epoch 1:
+Training loss = 102.396790, training accuracy = 0.967415
+Evaluation accuracy = 0.977564
+Starting Epoch 2:
+Training loss = 69.217010, training accuracy = 0.977915
+Evaluation accuracy = 0.981370
+Starting Epoch 3:
+Training loss = 54.248390, training accuracy = 0.982823
+Evaluation accuracy = 0.984075
+Starting Epoch 4:
+Training loss = 45.213406, training accuracy = 0.985560
+Evaluation accuracy = 0.985276
+Starting Epoch 5:
+Training loss = 38.868435, training accuracy = 0.987764
+Evaluation accuracy = 0.986278
+Starting Epoch 6:
+Training loss = 34.078186, training accuracy = 0.989149
+Evaluation accuracy = 0.987881
+Starting Epoch 7:
+Training loss = 30.138697, training accuracy = 0.990451
+Evaluation accuracy = 0.988181
+Starting Epoch 8:
+Training loss = 26.854443, training accuracy = 0.991520
+Evaluation accuracy = 0.988682
+Starting Epoch 9:
+Training loss = 24.039650, training accuracy = 0.992405
+Evaluation accuracy = 0.989083
+```
+
+#### MPI
+
+Có thể dùng cho cả một node và nhiều node miễn là có nhiều máy GPUs.
+
+1. Tạo `mnist_dist.py`
+
+```python
+if __name__ == '__main__':
+    train_mnist_cnn()
+```
+
+2. Tạo một hostfile cho MPI, vd. hostfile dưới đây sử dụng 2 chỉ lệnh (vd., 2
+   GPUs) trên một node
+
+```txt
+localhost:2
+```
+
+3. Khởi động quá trình training qua `mpiexec`
+
+```sh
+mpiexec --hostfile host_file python mnist_dist.py
+```
+
+Kết quả có thể hiển thị tốc độ so với training trên một máy GPU.
+
+```
+Starting Epoch 0:
+Training loss = 383.969543, training accuracy = 0.886402
+Evaluation accuracy = 0.954327
+Starting Epoch 1:
+Training loss = 97.531479, training accuracy = 0.969451
+Evaluation accuracy = 0.977163
+Starting Epoch 2:
+Training loss = 67.166870, training accuracy = 0.978516
+Evaluation accuracy = 0.980769
+Starting Epoch 3:
+Training loss = 53.369656, training accuracy = 0.983040
+Evaluation accuracy = 0.983974
+Starting Epoch 4:
+Training loss = 45.100403, training accuracy = 0.985777
+Evaluation accuracy = 0.986078
+Starting Epoch 5:
+Training loss = 39.330826, training accuracy = 0.987447
+Evaluation accuracy = 0.987179
+Starting Epoch 6:
+Training loss = 34.655270, training accuracy = 0.988799
+Evaluation accuracy = 0.987780
+Starting Epoch 7:
+Training loss = 30.749735, training accuracy = 0.989984
+Evaluation accuracy = 0.988281
+Starting Epoch 8:
+Training loss = 27.422146, training accuracy = 0.991319
+Evaluation accuracy = 0.988582
+Starting Epoch 9:
+Training loss = 24.548153, training accuracy = 0.992171
+Evaluation accuracy = 0.988682
+```
+
+## Tối Ưu Hoá Distributed Training
+
+SINGA cung cấp chiến lược đa tối ưu hoá cho distributed training để giảm
+communication cost. Tham khảo API của `DistOpt` cho cấu hình của mỗi cách.
+
+Khi sử dụng `model.Model` để tạo một model, cần phải đặt các lựa chọn cho
+distributed training trong phương pháp `train_one_batch`. Tham khảo code ví dụ
+trên đầu trang. Bạn có thể chỉ cần copy code cho các lựa chọn và sử dụng nó cho
+các model khác. Với các lựa chọn xác định, ta có thể đặt tham số `dist_option` và
+`spars` khi bắt đầu training với `model(tx, ty, dist_option, spars)`
+
+### Không Tối Ưu Hoá
+
+```python
+out, loss = model(tx, ty)
+```
+
+`loss` là output tensor từ hàm loss function, vd., cross-entropy cho
+classification tasks.
+
+### Half-precision Gradients
+
+```python
+out, loss = model(tx, ty, dist_option = 'fp16')
+```
+
+Chuyển đổi giá trị độ dốc sang hiển thị dạng 16-bit (vd., half-precision) trước
+khi gọi hàm all-reduce.
+
+### Đồng Bộ Cục Bộ (Partial Synchronization)
+
+```python
+out, loss = model(tx, ty, dist_option = 'partialUpdate')
+```
+
+Ở mỗi vòng lặp (iteration), mỗi rank thực hiện việc cập nhật sgd. Sau đó chỉ một
+nhóm tham số là được tính trung bình để đồng bộ hoá. Điều này giúp tiết kiệm
+communication cost. Độ lớn của nhóm này được xác định khi tạo hàm `DistOpt`
+instance.
+
+### Phân Bổ Độ Dốc (Gradient Sparsification)
+
+Kế hoạch phân bổ để chọn ra một nhóm nhỏ độ dốc nhằm thực hiện all-reduce. Có
+hai cách:
+
+- Chọn K phần tử lớn nhất. spars là một phần (0 - 1) của tổng số phần tử được
+  chọn.
+
+```python
+out, loss = model(tx, ty, dist_option = 'sparseTopK', spars = spars)
+```
+
+- Tất cả độ dốc có giá trị tuyệt đối lớn hơn ngưỡng spars đặt trước được lựa
+  chọn.
+
+```python
+out, loss = model(tx, ty, dist_option = 'sparseThreshold', spars = spars)
+```
+
+Các hyper-parameter được cấu tạo khi tạo hàm `DistOpt` instance.
+
+## Thực Hiện
+
+Mục này chủ yếu dành cho các lập trình viên (developer) muốn biết lập trình
+trong distribute module được thực hiện như thế nào.
+
+### Giao Diện C cho Bộ Chuyển Mạch (communicator) NCCL
+
+Trước tiên, communication layer được lập trình bằng ngôn ngữ C
+[communicator.cc](https://github.com/apache/singa/blob/master/src/io/communicator.cc).
+Nó áp dụng NCCL library cho collective communication.
+
+Có hai hàm tạo nên communicator, một cho MPI và một cho đa phương thức
+(multiprocess).
+
+(i) Hàm tạo sử dụng MPI
+
+Hàm tạo bước đầu sẽ sử dụng global rank và world size, sau đó tính toán ra local
+rank. Tiếp theo, rank 0 sẽ tạo ra NCCL ID và phát nó lên mỗi rank. Sau đó, nó
+gọi hàm setup để khởi tạo NCCL communicator, cuda streams, và buffers.
+
+(ii) Hàm tạo sử dụng Python đa phương thức
+
+Hàm tạo bước đầu sẽ sử dụng rank, world size, và NCCL ID từ input argument. Sau
+đó, nó gọi hàm setup function để khởi tạo NCCL communicator, cuda streams, và
+buffers.
+
+Sau khi khởi động, nó thực hiện chức năng all-reduce để đồng bộ hoá các tham số
+model và độ dốc. Ví dụ, synch sử dụng một input tensor và tiến hành all-reduce
+qua đoạn chương trình NCCL. Sau khi gọi synch, cần gọi hàm wait để đợi hàm
+all-reduce operation kết thúc.
+
+### Giao Diện Python của DistOpt
+
+Sau đó, giao diện python sẽ tạo ra một
+[DistOpt](https://github.com/apache/singa/blob/master/python/singa/opt.py) class
+để gói một
+[optimizer](https://github.com/apache/singa/blob/master/python/singa/opt.py)
+object để thực hiện distributed training dựa trên MPI hoặc đa xử lý. Trong khi
+khởi động, nó tạo ra một NCCL communicator object (từ giao diện C đề cập ở mục
+nhỏ phía trên). Sau đó, communicator object này được sử dụng trong mỗi hàm
+all-reduce trong DistOpt.
+
+Trong MPI hoặc đa xử lý, mỗi chỉ lệnh có một rank, cho biết thông tin máy GPU
+nào qui trình này đang sử dụng. Dữ liệu training được chia nhỏ để mỗi chỉ lệnh
+có thể đánh giá sub-gradient dựa trên dữ liệu đã chia trước đó. Sau khi
+sub-gradient được tạo ra ở mỗi chỉ lệnh, độ dốc stochastic gradient tổng hợp sẽ
+được tạo ra bằng cách all-reduce các sub-gradients đánh giá bởi tất cả các chỉ
+lệnh.
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/download.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/download.md
new file mode 100644
index 0000000..17a3f70
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/download.md
@@ -0,0 +1,206 @@
+---
+id: version-4.0.0_Viet-downloads
+title: Tải SINGA
+original_id: downloads
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## Kiểm Chứng
+
+Để kiểm chứng tập tin tar.gz đã tải, tải
+[KEYS](https://www.apache.org/dist/singa/KEYS) và tập tin ASC sau đó thực hiện
+các lệnh sau
+
+```shell
+% gpg --import KEYS
+% gpg --verify downloaded_file.asc downloaded_file
+```
+
+Bạn có thể kiểm tra giá trị của SHA512 hoặc MD5 để xem liệu việc tải về đã hoàn
+thành chưa.
+
+## V3.1.0 (30 tháng 10 năm 2020):
+
+- [Apache SINGA 3.1.0](http://www.apache.org/dyn/closer.cgi/singa/3.1.0/apache-singa-3.1.0.tar.gz)
+  [\[SHA512\]](https://www.apache.org/dist/singa/3.1.0/apache-singa-3.1.0.tar.gz.sha512)
+  [\[ASC\]](https://www.apache.org/dist/singa/3.1.0/apache-singa-3.1.0.tar.gz.asc)
+- [Release Notes 3.1.0](http://singa.apache.org/docs/releases/RELEASE_NOTES_3.1.0)
+- Thay đổi chung:
+  - Cập nhật Tensor core:
+    - Hỗ trợ tensor transformation (reshape, transpose) cho tensors có tới 6
+      chiều (dimensions).
+    - Áp dụng traverse_unary_transform ở Cuda backend, tương tự như ở CPP
+      backend.
+  - Thêm hàm tensor operators vào autograd module.
+  - Cải tạo lại sonnx để
+    - Hỗ trợ việc tạo hàm operators từ cả layer và autograd.
+    - Viết lại SingaRep để SINGA representation mạnh và nhanh hơn.
+    - Thêm SONNXModel áp dụng từ Model để API và các tính năng đồng bộ với nhau.
+  - Thay thế Travis CI với trình tự Github. Thêm quản lý chất lượng và độ bao
+    phủ.
+  - Thêm Tập lệnh compiling và packaging nhằm tạo gói wheel packages cho
+    distribution.
+  - Fix bugs
+    - Hoàn thiện Tập lệnh training cho ví dụ về IMDB LSTM model.
+    - Ổn định lại hàm Tensor operation Mult khi sử dụng Broadcasting.
+    - Hàm Gaussian trong Tensor giờ có thể chạy trên Tensor với kích thước lẻ.
+    - Cập nhật hàm hỗ trợ chạy thử gradients() trong autograd để tìm tham số
+      gradient qua tham số python object id khi chạy thử.
+
+## V3.0.0 (18 tháng 4 năm 2020):
+
+- [Apache SINGA 3.0.0](https://archive.apache.org/dist/singa/3.0.0/apache-singa-3.0.0.tar.gz)
+  [\[SHA512\]](https://archive.apache.org/dist/singa/3.0.0/apache-singa-3.0.0.tar.gz.sha512)
+  [\[ASC\]](https://archive.apache.org/dist/singa/3.0.0/apache-singa-3.0.0.tar.gz.asc)
+- [Ghi Chú Phát Hành 3.0.0](http://singa.apache.org/docs/releases/RELEASE_NOTES_3.0.0)
+- Các tính năng mới và thay đổi chính,
+  - Nâng cấp ONNX. Thử nghiệm nhiều ONNX models trên SINGA.
+  - Thực hiện Distributed training với MPI và tối ưu hoá NCCL Communication qua
+    phân bổ và nén độ dốc, và truyền tải phân khúc.
+  - Xây dựng và tối ưu hoá tốc độ và bộ nhớ sử dụng graph của Computational
+    graph.
+  - Lập trang Tài Liệu sử dụng mới (singa.apache.org) và website tham khảo API
+    (apache-singa.rtfd.io).
+  - CI cho việc kiểm tra chất lượng mã code.
+  - Thay thế MKLDNN bằng DNNL
+  - Cập nhật APIs cho tensor để hỗ trợ hàm broadcasting.
+  - Tạo autograd operators mới để hỗ trợ các ONNX models.
+
+## Incubating v2.0.0 (20 tháng 4 năm 2019):
+
+- [Apache SINGA 2.0.0 (incubating)](https://archive.apache.org/dist/incubator/singa/2.0.0/apache-singa-incubating-2.0.0.tar.gz)
+  [\[SHA512\]](https://archive.apache.org/dist/incubator/singa/2.0.0/apache-singa-incubating-2.0.0.tar.gz.sha512)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/2.0.0/apache-singa-incubating-2.0.0.tar.gz.asc)
+- [Ghi Chú Phát Hành 2.0.0 (incubating)](http://singa.apache.org/docs/releases/RELEASE_NOTES_2.0.0.html)
+- Các tính năng mới và thay đổi chính,
+  - Nâng cấp autograd (cho Convolution networks và recurrent networks)
+  - Hỗ trợ ONNX
+  - Cải thiện hàm CPP operations qua Intel MKL DNN lib
+  - Thực hiện tensor broadcasting
+  - Chuyển Docker images dưới tên sử dụng trong Apache
+  - Cập nhật các phiên bản dependent lib trong conda-build config
+
+## Incubating v1.2.0 (6 tháng 6 năm 2018):
+
+- [Apache SINGA 1.2.0 (incubating)](https://archive.apache.org/dist/incubator/singa/1.2.0/apache-singa-incubating-1.2.0.tar.gz)
+  [\[SHA512\]](https://archive.apache.org/dist/incubator/singa/1.2.0/apache-singa-incubating-1.2.0.tar.gz.sha512)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/1.2.0/apache-singa-incubating-1.2.0.tar.gz.asc)
+- [Release Notes 1.2.0 (incubating)](http://singa.apache.org/docs/releases/RELEASE_NOTES_1.2.0.html)
+- Các tính năng mới và thay đổi chính,
+  - Thực hiện autograd (đang hỗ trợ MLP model)
+  - Nâng cấp PySinga để hỗ trợ Python 3
+  - Cải thiện Tensor class với mục stride
+  - Nâng cấp cuDNN từ V5 sang V7
+  - Thêm VGG, Inception V4, ResNet, và DenseNet cho ImageNet classification
+  - Tạo alias cho gói conda packages
+  - Hoàn thiện Tài liệu sử dụng bằng tiếng Trung
+  - Thêm hướng dẫn chạy Singa trên Windows
+  - Cập nhật compilation, CI
+  - Sửa lỗi nếu có
+
+## Incubating v1.1.0 (12 tháng 2 năm 2017):
+
+- [Apache SINGA 1.1.0 (incubating)](https://archive.apache.org/dist/incubator/singa/1.1.0/apache-singa-incubating-1.1.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/1.1.0/apache-singa-incubating-1.1.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/1.1.0/apache-singa-incubating-1.1.0.tar.gz.asc)
+- [Release Notes 1.1.0 (incubating)](http://singa.apache.org/docs/releases/RELEASE_NOTES_1.1.0.html)
+- Các tính năng mới và thay đổi chính,
+  - Tạo Docker images (phiên bản CPU và GPU)
+  - Tạo Amazon AMI cho SINGA (phiên bản CPU)
+  - Tích hợp với Jenkins để tự động tạo gói Wheel và Debian (cho cài đặt), và
+    cập nhật website.
+  - Nâng cấp FeedForwardNet, vd., hỗ trợ nhiều inputs và verbose mode để sửa lỗi
+  - Thêm Concat và Slice layers
+  - Mở rộng CrossEntropyLoss nhằm chấp nhận instance với nhiều labels
+  - Thêm image_tool.py với phương thức image augmentation
+  - Hỗ trợ tải và lưu model qua Snapshot API
+  - Compile SINGA source trên Windows
+  - Compile những dependent libraries bắt buộc cùng với SINGA code
+  - Kích hoạt Java binding (cơ bản) cho SINGA
+  - Thêm phiên bản ID trong kiểm soát tập tin
+  - Thêm gói sử dụng Rafiki cung cấp RESTFul APIs
+  - Thêm ví dụ pretrained từ Caffe, bao gồm GoogleNet
+
+## Incubating v1.0.0 (8 tháng 9 năm 2016):
+
+- [Apache SINGA 1.0.0 (incubating)](https://archive.apache.org/dist/incubator/singa/1.0.0/apache-singa-incubating-1.0.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/1.0.0/apache-singa-incubating-1.0.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/1.0.0/apache-singa-incubating-1.0.0.tar.gz.asc)
+- [Release Notes 1.0.0 (incubating)](http://singa.apache.org/docs/releases/RELEASE_NOTES_1.0.0.html)
+- Các tính năng mới và thay đổi chính,
+  - Tạo Tensor nhằm hỗ trợ nhiều model khác nhau.
+  - Tạo Device để chạy trên các thiết bị phần cứng khác nhau, bao gồm CPU,
+    (Nvidia/AMD) GPU và FPGA (sẽ thử nghiệm ở các phiên bản mới).
+  - Thay thế GNU autotool với cmake khi compilation.
+  - Hỗ trợ Mac OS
+  - Cải thiện Python binding, bao gồm cài đặt và lập trình.
+  - Tạo thêm nhiều deep learning models, bao gồm VGG và ResNet
+  - Thêm IO classes để đọc/viết tập tin và mã hoá/giải mã dữ liệu
+  - Các thành phần network communication mới trực tiếp từ Socket.
+  - Cudnn V5 với Dropout và RNN layers.
+  - Thay thế công cụ xây dựng website từ maven sang Sphinx
+  - Tích hợp Travis-CI
+
+## Incubating v0.3.0 (20 tháng 4 năm 2016):
+
+- [Apache SINGA 0.3.0 (incubating)](https://archive.apache.org/dist/incubator/singa/0.3.0/apache-singa-incubating-0.3.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/0.3.0/apache-singa-incubating-0.3.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/0.3.0/apache-singa-incubating-0.3.0.tar.gz.asc)
+- [Release Notes 0.3.0 (incubating)](http://singa.apache.org/docs/releases/RELEASE_NOTES_0.3.0.html)
+- Các tính năng mới và thay đổi chính,
+  - Training trên nhóm máy GPU: cho phép training các deep learning models trên
+    một nhóm máy GPU
+  - Cải thiện Python wrapper khiến cho job configuration trở nên dễ dàng, bao
+    gồm neural net và thuật toán SGD.
+  - Thêm cập nhật SGD updaters mới, bao gồm Adam, AdaDelta và AdaMax.
+  - Cài đặt cần ít dependent libraries hơn cho mỗi node training.
+  - Đa dạng training với CPU và GPU.
+  - Hỗ trợ cuDNN V4.
+  - Tìm nạp trước dữ liệu.
+  - Sửa lỗi nếu có.
+
+## Incubating v0.2.0 (14 tháng 1 năm 2016):
+
+- [Apache SINGA 0.2.0 (incubating)](https://archive.apache.org/dist/incubator/singa/0.2.0/apache-singa-incubating-0.2.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/0.2.0/apache-singa-incubating-0.2.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/0.2.0/apache-singa-incubating-0.2.0.tar.gz.asc)
+- [Release Notes 0.2.0 (incubating)](http://singa.apache.org/docs/releases/RELEASE_NOTES_0.2.0.html)
+- Các tính năng mới và thay đổi chính,
+  - Training trên GPU cho phép training các models phức tạp trên một node với
+    nhiều card GPU.
+  - Chia nhỏ Hybrid neural net hỗ trợ dữ liệu và model song song cùng lúc.
+  - Cải thiện Python wrapper khiến cho job configuration trở nên dễ dàng, bao
+    gồm neural net và thuật toán SGD.
+  - Áp dụng RNN model và thuật toán BPTT để hỗ trợ các ứng dụng dựa trên RNN
+    models, e.g., GRU.
+  - Tích hợp các phần mềm đám mây bao gồm Mesos, Docker và HDFS.
+  - Cung cấp hình ảnh cấu trúc neural net và thông tin layer, hỗ trợ việc sửa
+    lỗi.
+  - Hàm Linear algebra và các hàm ngẫu nhiên không dùng Blobs và chỉ điểm dữ
+    liệu thô.
+  - Tạo layers mới, bao gồm SoftmaxLayer, ArgSortLayer, DummyLayer, RNN layers
+    và cuDNN layers.
+  - Cập nhật Layer class để chứa nhiều data/grad Blobs.
+  - Trích xuất các features và thử nghiệm hiệu quả cho dữ liệu mới bằng cách tải
+    các tham số model đã được train từ trước.
+  - Thêm Store class cho hàm IO operations.
+
+## Incubating v0.1.0 (8 tháng 10 năm 2015):
+
+- [Apache SINGA 0.1.0 (incubating)](https://archive.apache.org/dist/incubator/singa/apache-singa-incubating-0.1.0.tar.gz)
+  [\[MD5\]](https://archive.apache.org/dist/incubator/singa/apache-singa-incubating-0.1.0.tar.gz.md5)
+  [\[ASC\]](https://archive.apache.org/dist/incubator/singa/apache-singa-incubating-0.1.0.tar.gz.asc)
+- [Amazon EC2 image](https://console.aws.amazon.com/ec2/v2/home?region=ap-southeast-1#LaunchInstanceWizard:ami=ami-b41001e6)
+- [Release Notes 0.1.0 (incubating)](http://singa.apache.org/docs/releases/RELEASE_NOTES_0.1.0.html)
+- Các thay đổi chính gồm có,
+  - Cài đặt sử dụng tiện ích GNU build
+  - Tập lệnh cho job management với zookeeper
+  - Lập trình model dựa trên NeuralNet và trích xuất Layer.
+  - Kết cấu hệ thống dựa trên Worker, Server và Stub.
+  - Training models từ ba model khác nhau, là feed-forward models, energy models
+    và RNN models.
+  - Distributed training frameworks đồng bộ và không đồng bộ sử
+    dụng CPU
+  - Điểm kiểm tra (Checkpoint) và khôi phục
+  - Kiểm tra đơn vị sử dụng gtest
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/examples.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/examples.md
new file mode 100644
index 0000000..ab7146a
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/examples.md
@@ -0,0 +1,69 @@
+---
+id: version-4.0.0_Viet-examples
+title: Ví Dụ
+original_id: examples
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Phần này đưa ra một vài ví dụ về việc thực hiện deep learning sử dụng SINGA. Mã
+nguồn (source code) được cung cấp trong SINGA repo trên
+[Github](https://github.com/apache/singa/tree/master/examples). Có thể tham khảo
+các ví dụ sử dụng SINGA Python APIs trên CPU hoặc một GPU trên
+[Google Colab](https://colab.research.google.com/). Bạn có thể trực tiếp chạy
+thử trên Google Cloud mà không cần tạo local environment. Đường dẫn tới mỗi ví
+dụ được cung cấp dưới đây.
+
+## Image Classification
+
+| Model       | Dataset                           | Đường dẫn                                                                                               |
+| ----------- | --------------------------------- | ------------------------------------------------------------------------------------------------------- |
+| CNN cơ bản  | MNIST, CIFAR10, CIFAR100          | [Colab](https://colab.research.google.com/drive/1fbGUs1AsoX6bU5F745RwQpohP4bHTktq)                      |
+| AlexNet     | ImageNet                          | [Cpp]()                                                                                                 |
+| VGG         | ImageNet                          | [Cpp](), [Python](), [Colab](https://colab.research.google.com/drive/14kxgRKtbjPCKKsDJVNi3AvTev81Gp_Ds) |
+| XceptionNet | MNIST, CIFAR10, CIFAR100          | [Python]()                                                                                              |
+| ResNet      | MNIST, CIFAR10, CIFAR100          | [Python](), [Colab](https://colab.research.google.com/drive/1u1RYefSsVbiP4I-5wiBKHjsT9L0FxLm9)          |
+| MobileNet   | ImageNet                          | [Colab](https://colab.research.google.com/drive/1HsixqJMIpKyEPhkbB8jy7NwNEFEAUWAf)                      |
+
+## Object Detection
+
+| Model       | Dataset    | Đường dẫn                                                                          |
+| ----------- | ---------- | ---------------------------------------------------------------------------------- |
+| Tiny YOLOv2 | Pascal VOC | [Colab](https://colab.research.google.com/drive/11V4I6cRjIJNUv5ZGsEGwqHuoQEie6b1T) |
+
+## Nhận diện Khuôn mặt và Cảm xúc
+
+| Model           | Dataset                                                                                                                                                | Đường dẫn                                                                          |
+| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------- |
+| ArcFace         | Refined MS-Celeb-1M                                                                                                                                    | [Colab](https://colab.research.google.com/drive/1qanaqUKGIDtifdzEzJOHjEj4kYzA9uJC) |
+| Emotion FerPlus | [Facial Expression Recognition Challenge](https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data) | [Colab](https://colab.research.google.com/drive/1XHtBQGRhe58PDi4LGYJzYueWBeWbO23r) |
+
+## Image Generation
+
+| Model | Dataset | Đường dẫn                                                                          |
+| ----- | ------- | ---------------------------------------------------------------------------------- |
+| GAN   | MNIST   | [Colab](https://colab.research.google.com/drive/1f86MNDW47DJqHoIqWD1tOxcyx2MWys8L) |
+| LSGAN | MNIST   | [Colab](https://colab.research.google.com/drive/1C6jNRf28vnFOI9JVM4lpkJPqxsnhxdol) |
+
+## Machine Comprehension
+
+| Model      | Dataset                                                                   | Đường dẫn                                                                          |
+| ---------- | ------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- |
+| Bert-Squad | [SQuAD v1.1](https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/) | [Colab](https://colab.research.google.com/drive/1kud-lUPjS_u-TkDAzihBTw0Vqr0FjCE-) |
+
+## Text Classification
+
+| Model       | Dataset | Đường dẫn  |
+| ----------- | ------- | ---------- |
+| Simple LSTM | IMDB    | [python]() |
+
+## Text Ranking
+
+| Model  | Dataset     | Đường dẫn  |
+| ------ | ----------- | ---------- |
+| BiLSTM | InsuranceQA | [python]() |
+
+## Misc.
+
+- Restricted Boltzmann Machine sử dụng dữ liệu MNIST, [nguồn](),
+  [Colab](https://colab.research.google.com/drive/19996noGu9JyHHkVmp4edBGu7PJSRQKsd).
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/git-workflow.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/git-workflow.md
new file mode 100644
index 0000000..76a546a
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/git-workflow.md
@@ -0,0 +1,130 @@
+---
+id: version-4.0.0_Viet-git-workflow
+title: Quy Trình Sử Dụng Git
+original_id: git-workflow
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## Dành cho Lập Trình Viên
+
+1. Fork [SINGA Github repository](https://github.com/apache/singa) về tài khoản
+   Github của bạn.
+
+2. Clone **repo** (viết tắt của repository) từ tài khoản Github của bạn
+
+   ```shell
+   git clone https://github.com/<Github account>/singa.git
+   git remote add upstream https://github.com/apache/singa.git
+   ```
+
+3. Tạo branch mới (vd., `feature-foo` hoặc `fixbug-foo`), chỉnh sửa và commit
+   code của bạn ở đây.
+
+   ```shell
+   git checkout dev
+   git checkout -b feature-foo
+   # write your code
+   git add <created/updated files>
+   git commit
+   ```
+
+   Nội dung lệnh commit nên bao gồm:
+
+   - Tiêu đề (Title) mô tả.
+   - Mô tả chi tiết. Nếu lệnh commit là sửa lỗi (bugs), tốt nhất là nên bao gồm
+     việc mô tả ngắn gọn lại vấn đề. Nếu thêm tính năng mới, có thể bao gồm động
+     cơ thúc đẩy/mục đích của tính năng mới này.
+
+   Nếu branch của bạn có nhiều commits nhỏ, bạn cần súc tích lại các commits
+   bằng cách
+
+   ```shell
+   git rebase -i <commit id>
+   ```
+
+   Bạn có thể
+   [squash và reword](https://help.github.com/en/articles/about-git-rebase) các
+   commits.
+
+4. Khi bạn đang xử lý các mã code, branch `dev` của SINGA có thể đang được cập
+   nhật bởi người khác; Trong trường hợp này, bạn cần pull dev mới nhất
+
+   ```shell
+   git checkout dev
+   git pull upstream dev:dev
+   ```
+
+5. [Rebase](https://git-scm.com/book/en/v2/Git-Branching-Rebasing) `feature-foo`
+   vào branch `dev` và push commits vào tài khoản Github của bạn (branch mới).
+   Lệnh rebase nhằm giúp cho lịch sử commit của bạn rõ ràng. Các lệnh git dưới
+   đây nên được thực hiện sau khi bạn commit các việc làm của mình:
+
+   ```shell
+   git checkout feature-foo
+   git rebase dev
+   git push origin feature-foo:feature-foo
+   ```
+
+   Lệnh rebase thực hiện các
+   [bước sau](https://git-scm.com/book/en/v2/Git-Branching-Rebasing): "Lệnh này
+   thực hiện bắt đầu từ hình thái ban đầu của hai branches (mà bạn đang sử dụng
+   hoặc bạn đang rebase vào), xác định sự khác nhau ở mỗi commit của branch bạn
+   đang sử dụng, lưu các điểm khác nhau vào tập tin tạm thời, điều chỉnh branch
+   hiện tại để có cùng commit với branch mà bạn đang rebase vào, rồi cuối cùng
+   áp dụng từng thay đổi một theo thứ tự." Bởi vậy, sau khi thực hiện, bạn sẽ
+   vẫn ở feature branch, nhưng commit IDs/hashes của bạn được thay đổi do các
+   điểm khác nhau đã được commit trong quá trình rebase; và branch của bạn giờ
+   đây chứa bản code cập nhật nhất từ branch dev và branch của bạn.
+
+6. Tạo một pull request (PR) vào branch dev của apache/singa trên website
+   Github. Nếu bạn muốn thông báo cho các thành viên khác đang làm việc trên
+   cùng một tập tin, bạn có thể tìm tập tin đó trên Github và nhấn vào "Blame"
+   để xem chú thích từng dòng một ai đã thay đổi code lần cuối cùng. Sau đó, bạn
+   có thể thêm @username trong mục mô tả PR để nhắc họ. Hãy nói rõ rằng những
+   đóng góp này là công sức của bạn và rằng bạn cấp bản quyền công sức này cho
+   dự án theo dạng bản quyền dự án mở. Những commits khác (vd., sửa lỗi) vào
+   branch mới này sẽ được tự động cập nhật vào pull request của bạn bởi Github.
+
+7. Đợi thành viên xét duyệt PR. Trong quá trình này, dev branch của SINGA có thể
+   được những người khác cập nhật, do vậy bạn cần phải
+   [merge the latest dev](https://docs.fast.ai/dev/git.html#how-to-keep-your-feature-branch-up-to-date)
+   để xử lý các conflicts. Một số người
+   [rebase PR vào branch dev mới nhất](https://github.com/edx/edx-platform/wiki/How-to-Rebase-a-Pull-Request)
+   thay vì merging. Tuy nhiên, nếu các thành viên khác fetch PR này để thêm các
+   tính năng mới rồi gửi PR, việc rebase sẽ gây ra **duplicate commits** (với
+   hash khác) ở PR mới. Xem
+   [Nguyên tắc vàng để Rebasing](https://www.atlassian.com/git/tutorials/merging-vs-rebasing)
+   để biết thêm chi tiết khi nào cần tránh rebase. Một giải pháp đơn giản để cập
+   nhật PR (nhằm xử lý conflicts hay lỗi commit) là checkout một branch mới từ
+   branch dev cập nhật nhất của Apache SINGA repo; copy và paste các mã code
+   được cập nhật/thêm vào; commit và gửi một PR mới.
+
+## Dành cho committers
+
+Committers có thể merge pull requests (PRs) vào dev branch của repo upstream.
+Trước khi merge mỗi PR, committer nên
+
+- kiểm tra thông điệp commit (nội dung và định dạng)
+- kiểm tra những thay đổi so với code hiện tại. Thay đổi về API nên được ghi
+  lại.
+- kiểm tra kết quả Travis testing cho định dạng mã code/tài liệu và unit tests
+
+Có hai cách để merge một pull request:
+
+- Trên Github. Làm theo [hướng dẫn](https://gitbox.apache.org/setup/) để kết nối
+  tài khoản Apache với tài khoản Github của bạn. Sau đó bạn có thể trực tiếp
+  merge PRs trên GitHub.
+- Để merge pull request https://github.com/apache/singa/pull/xxx qua command
+  line, thực hiện theo hướng dẫn sau:
+
+  ```shell
+  git clone https://github.com/apache/singa.git
+  git remote add asf https://gitbox.apache.org/repos/asf/singa.git
+  git fetch origin pull/xxx/head:prxxx
+  git checkout dev
+  git merge --no-ff prxxx
+  git push asf dev:dev
+  ```
+
+  Không sử dụng rebase để merge PR; và vô hiệu hoá fast forward.
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/graph.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/graph.md
new file mode 100644
index 0000000..ff5d20f
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/graph.md
@@ -0,0 +1,525 @@
+---
+id: version-4.0.0_Viet-graph
+title: Model
+original_id: graph
+---
+
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed [...]
+
+Forward và backward propagation trong mạng thần kinh nhân tạo (neural network)
+có thể sử dụng một tập hợp các hàm như convolution và pooling. Mỗi hàm nhận một
+vài input [tensors](./tensor) và áp dụng một [operator](./autograd) để tạo
+output tensors. Bằng việc thể hiện mỗi operator là một node và mỗi tensor là một
+edge, tất cả dạng hàm tạo thành một computational graph. Với computational
+graph, tối ưu hoá tốc độ và bộ nhớ có thể được tiến hành bởi việc đưa vào thực
+hiện việc phân bổ/giải phóng bộ nhớ và thao tác một cách hợp lý. Trong SINGA,
+người dùng chỉ cần xác định neural network model sử dụng API của hàm
+[Model](https://github.com/apache/singa/blob/master/python/singa/model.py).
+Graph được xây dựng và tối ưu hoá ở C++ phía sau một cách tự động.
+
+Theo đó, một mặt người dùng thực hiện network sử dụng API của hàm
+[Model](./graph) tuân theo phong cách lập trình bắt buộc như PyTorch. Có điều
+khác với PyTorch phải tái tạo lại các thao tác ở mỗi vòng lặp, SINGA buffer các
+thao tác để tạo computational graph một cách đầy đủ (khi tính năng này được kích
+hoạt) sau vòng lặp đầu tiên. Do đó, mặt khác, SINGA có computational graph giống
+như được tạo bởi các libraries sử dụng lập trình khai báo (declarative
+programming), như TensorFlow. Nên nó được tối ưu hoá qua graph.
+
+## Ví Dụ
+
+Mã code sau mô phỏng việc sử dụng API của hàm `Model`.
+
+1. Thiết lập model mới như một lớp con (subclass) của Model class.
+
+```Python
+class CNN(model.Model):
+
+    def __init__(self, num_classes=10, num_channels=1):
+        super(CNN, self).__init__()
+        self.conv1 = layer.Conv2d(num_channels, 20, 5, padding=0, activation="RELU")
+        self.conv2 = layer.Conv2d(20, 50, 5, padding=0, activation="RELU")
+        self.linear1 = layer.Linear(500)
+        self.linear2 = layer.Linear(num_classes)
+        self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+        self.relu = layer.ReLU()
+        self.flatten = layer.Flatten()
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.pooling1(y)
+        y = self.conv2(y)
+        y = self.pooling2(y)
+        y = self.flatten(y)
+        y = self.linear1(y)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
+
+    def train_one_batch(self, x, y):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+        self.optimizer(loss)
+        return out, loss
+```
+
+2. Tạo một instance cho model, optimizer, device, v.v. Compile model đó
+
+```python
+model = CNN()
+
+# khởi tạo optimizer và đính nó vào model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+
+# khởi tạo device
+dev = device.create_cuda_gpu()
+
+# input và target placeholders cho model
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+
+# compile model trước khi training
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
+```
+
+3. Train model theo vòng lặp
+
+```python
+for b in range(num_train_batch):
+    # tạo mini-batch tiếp theo
+    x, y = ...
+
+    # Copy dữ liệu vào input tensors
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    # Training với một batch
+    out, loss = model(tx, ty)
+```
+
+Ví dụ này có trên Google Colab notebook
+[tại đây](https://colab.research.google.com/drive/1fbGUs1AsoX6bU5F745RwQpohP4bHTktq).
+
+Các ví dụ khác:
+
+- [MLP](https://github.com/apache/singa/blob/master/examples/mlp/model.py)
+- [CNN](https://github.com/apache/singa/blob/master/examples/cnn/model/cnn.py)
+- [ResNet](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
+
+## Thực Hiện
+
+### Xây Dựng Graph
+
+SINGA tạo computational graph qua 3 bước:
+
+1. Buffer các thao tác
+2. Phân tích sự phụ thuộc (dependencies) giữa các thao tác
+3. Tạo nodes và edges dựa trên dependencies
+
+Sử dụng phép nhân ma trận từ dense layer của
+[MLP model](https://github.com/apache/singa/blob/master/examples/mlp/model.py)
+làm ví dụ. Quá trình này bắt đầu bằng việc gọi hàm `forward` của class MLP
+
+```python
+class MLP(model.Model):
+
+    def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
+        super(MLP, self).__init__()
+        self.linear1 = layer.Linear(perceptron_size)
+        ...
+
+    def forward(self, inputs):
+        y = self.linear1(inputs)
+        ...
+```
+
+Layer `Linear` tạo thành từ phép tính `matmul`. `autograd` áp dụng phép `matmul`
+bằng cách gọi hàm `Mult` được lấy từ CPP qua SWIG.
+
+```python
+# áp dụng matmul()
+singa.Mult(inputs, w)
+```
+
+Từ phía sau, hàm `Mult` được áp dụng bằng cách gọi `GEMV`, là một hàm
+CBLAS. Thay vì gọi hàm `GEMV` trực tiếp, `Mult` gửi đi `GEMV` và đối số
+(argument) tới thiết bị (device) như sau,
+
+```c++
+// Áp dụng Mult()
+C->device()->Exec(
+    [a, A, b, B, CRef](Context *ctx) mutable {
+        GEMV<DType, Lang>(a, A, B, b, &CRef, ctx);
+    },
+    read_blocks, {C->block()});
+```
+
+Hàm `Exec` function của `Device` buffer hàm này và các đối số của nó. Thêm vào
+đó, nó cũng có thông tin về các block (một block là một đoạn bộ nhớ cho một
+tensor) để đọc và viết bởi hàm này.
+
+Sau khi `Model.forward()` được thực hiện xong một lần, tất cả quá trình được
+buffer bởi `Device`. Tiếp theo, thông tin đọc/viết của tất cả quá trình sẽ được
+phân tích để tạo computational graph. Ví dụ, nếu một block `b` được viết bởi quá
+trình 01 và sau đó được đọc bởi quá trình 02 khác, chúng ta sẽ biết 02 phụ thuộc
+vào 01 và có edge có hướng từ 01 sang 02, thể hiện qua block `b` (hoặc tensor của
+nó). Sau đó một graph không tuần hoàn sẽ được tạo ra như dưới đây. Graph chỉ
+được tạo ra một lần.
+
+![Computational graph của MLP](assets/GraphOfMLP.png)
+
+<br/>**Sơ đồ 1 - Ví dụ Computational graph của MLP.**
+
+### Tối Ưu Hoá
+
+Hiện nay, các tối ưu hoá sau được thực hiện dựa trên computational graph.
+
+**Phân bổ thụ động (Lazy allocation)** Khi tensor/blocks được tạo ra, các thiết
+bị (devices) không phân bổ bộ nhớ cho chúng ngay lập tức. Thay vào đó, khi block
+được tiếp cận lần đầu tiên, bộ nhớ sẽ được phân bổ.
+
+**Tự động tái sử dụng (Automatic recycling)** Đếm số của mỗi tensor/block được
+tính dựa trên graph. Trước khi thực hiện quá trình nào, đếm số là số lượng hàm
+đọc block này. Trong quá trình thực hiện, khi một hàm nào được tiến hành, đếm số
+của mỗi block đầu vào bị trừ đi 1. Nếu đếm số của một block bằng 0, thì block
+này sẽ không được đọc lại nữa trong toàn bộ quá trình còn lại. Bởi vậy, bộ nhớ
+của nó được giải phóng một cách an toàn. Thêm vào đó, SINGA theo dõi việc sử
+dụng block bên ngoài graph. Nếu block được sử dụng bởi mã code Python (không
+phải các hàm autograd), nó sẽ không được tái sử dụng.
+
+**Chia sẻ bộ nhớ** SINGA sử dụng memory pool, như là
+[CnMem](https://github.com/NVIDIA/cnmem) để quản lý bộ nhớ CUDA. Với _Automatic
+recycling_ và memory pool, SINGA có thể chia sẻ bộ nhớ giữa các tensor. Xem xét
+hai hàm `c = a + b` và `d=2xc`. Trước khi thực hiện hàm thứ hai, theo như _Lazy
+allocation_ thì bộ nhớ của `d` cần được phân bổ. Giả sử `a` không được sử dụng
+ở toàn bộ quá trình còn lại. Theo _Automatic recycling_, block của `a` sẽ được
+giải phóng sau hàm đầu tiên. Vì thế, SINGA sẽ gửi bốn thao tác tới CUDA stream:
+addition, free `a`, malloc `d`, và multiplication. Memory pool sau đó có thể
+chia sẻ bộ nhớ được `a` giải phóng cho `d` thay vì yêu cầu GPU thực hiện real
+malloc cho `d`.
+
+Các kĩ thuật tối ưu hoá khác, ví dụ từ compilers, như common sub-expression
+elimination và parallelizing operations trên CUDA streams khác nhau cũng có thể
+được áp dụng.
+
+## Toán Tử (Operator) mới
+
+Mỗi toán tử được định nghĩa trong `autograd` module áp dụng hai hàm: forward và
+backward, được thực hiện bằng cách gọi toán tử (operator) từ backend. Để thêm
+một toán tử mới vào hàm `autograd`, bạn cần thêm nhiều toán tử ở backend.
+
+Lấy toán tử
+[Conv2d](https://github.com/apache/singa/blob/master/python/singa/autograd.py)
+làm ví dụ, từ phía Python, hàm forward và backward được thực hiện bằng cách gọi
+các toán tử từ backend dựa trên loại device.
+
+```python
+class _Conv2d(Operation):
+
+    def forward(self, x, W, b=None):
+        ......
+        if training:
+            if self.handle.bias_term:
+                self.inputs = (x, W, b) # ghi chép x, W, b
+            else:
+                self.inputs = (x, W)
+
+        if (type(self.handle) != singa.ConvHandle):
+            return singa.GpuConvForward(x, W, b, self.handle)
+        else:
+            return singa.CpuConvForward(x, W, b, self.handle)
+
+    def backward(self, dy):
+        if (type(self.handle) != singa.ConvHandle):
+            dx = singa.GpuConvBackwardx(dy, self.inputs[1], self.inputs[0],
+                                        self.handle)
+            dW = singa.GpuConvBackwardW(dy, self.inputs[0], self.inputs[1],
+                                        self.handle)
+            db = singa.GpuConvBackwardb(
+                dy, self.inputs[2],
+                self.handle) if self.handle.bias_term else None
+        else:
+            dx = singa.CpuConvBackwardx(dy, self.inputs[1], self.inputs[0],
+                                        self.handle)
+            dW = singa.CpuConvBackwardW(dy, self.inputs[0], self.inputs[1],
+                                        self.handle)
+            db = singa.CpuConvBackwardb(
+                dy, self.inputs[2],
+                self.handle) if self.handle.bias_term else None
+        if db:
+            return dx, dW, db
+        else:
+            return dx, dW
+```
+
+Mỗi toán tử ở backend nên được thực hiện theo cách sau:
+
+- Giả dụ toán tử là `foo()`; khi được thực hiện nên được gói vào trong một hàm
+  khác, như `_foo()`. `foo()` chuyển `_foo` cùng với các đối số như một hàm
+  lambda tới hàm `Device`'s `Exec` để buffer. Block để đọc và viết cũng được
+  chuyển cho `Exec`.
+
+- Tất cả đối số được sử dụng trong hàm lambda expression cần phải được thu thập
+  dựa trên các nguyên tắc sau.
+
+  - `thu thập bằng giá trị`: Nếu biến đối số (argument variable) là biến local
+    hoặc sẽ được giải phóng ngay (như intermediate tensors). Nếu không, những
+    biến số này sẽ bị huỷ khi `foo()` kết thúc.
+  - `thu thập theo tham chiếu`: Nếu biến số được ghi lại từ phía python hoặc là
+    một biến tồn tại lâu dài (như tham số W và ConvHandle trong Conv2d class).
+
+  - `mutable`: Biểu thức lambda expression nên có biến thẻ (mutable tag) nếu một
+    biến được thu thập theo giá trị bị thay đổi trong hàm `_foo()`
+
+Đây là một
+[ví dụ](https://github.com/apache/singa/blob/master/src/model/operation/convolution.cc)
+về operator được áp dụng ở backend.
+
+```c++
+Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x,
+                        const CudnnConvHandle &cch) {
+  CHECK_EQ(dy.device()->lang(), kCuda);
+
+  Tensor dx;
+  dx.ResetLike(x);
+
+  dy.device()->Exec(
+      /*
+       * dx is a local variable so it's captured by value
+       * dy is an intermediate tensor and isn't recorded on the python side
+       * W is an intermediate tensor but it's recorded on the python side
+       * cch is a variable and it's recorded on the python side
+       */
+      [dx, dy, &W, &cch](Context *ctx) mutable {
+        Block *wblock = W.block(), *dyblock = dy.block(), *dxblock = dx.block();
+        float alpha = 1.f, beta = 0.f;
+        cudnnConvolutionBackwardData(
+            ctx->cudnn_handle, &alpha, cch.filter_desc, wblock->data(),
+            cch.y_desc, dyblock->data(), cch.conv_desc, cch.bp_data_alg,
+            cch.workspace.block()->mutable_data(),
+            cch.workspace_count * sizeof(float), &beta, cch.x_desc,
+            dxblock->mutable_data());
+      },
+      {dy.block(), W.block()}, {dx.block(), cch.workspace.block()});
+      /* the lambda expression reads the blocks of tensor dy and W
+       * and writes the blocks of tensor dx and cch.workspace
+       */
+
+  return dx;
+}
+```
+
+## Điểm Chuẩn (Benchmark)
+
+### Trên một node
+
+- Thiết lập thí nghiệm
+  - Model
+    - Sử dụng layer: ResNet50 trong
+      [resnet.py](https://github.com/apache/singa/blob/master/examples/cnn/autograd/resnet_cifar10.py)
+    - Sử dụng model: ResNet50 trong
+      [resnet.py](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
+  - GPU: NVIDIA RTX 2080Ti
+- Kí hiệu
+  - `s` ：giây (second)
+  - `it` ： vòng lặp (iteration)
+  - `Mem`：sử dụng bộ nhớ tối đa trong một GPU
+  - `Throughput`：số lượng hình ảnh được xử lý mỗi giây
+  - `Time`：tổng thời gian
+  - `Speed`：vòng lặp mỗi giây
+  - `Reduction`：tốc độ giảm bộ nhớ sử dụng so với sử dụng layer
+  - `Speedup`: tốc độ tăng tốc so với dev branch
+- Kết quả
+  <table style="text-align: center">
+      <tr>
+          <th style="text-align: center">Batchsize</th>
+          <th style="text-align: center">Cases</th>
+          <th style="text-align: center">Mem(MB)</th>
+          <th style="text-align: center">Time(s)</th>
+          <th style="text-align: center">Speed(it/s)</th>
+          <th style="text-align: center">Throughput</th>
+          <th style="text-align: center">Reduction</th>
+          <th style="text-align: center">Speedup</th>
+      </tr>
+      <tr>
+          <td rowspan="4">16</td>
+          <td nowrap>layer</td>
+          <td>4975</td>
+          <td>14.1952</td>
+          <td>14.0893</td>
+          <td>225.4285</td>
+          <td>0.00%</td>
+          <td>1.0000</td>
+      </tr>
+      <tr>
+          <td nowrap>model:disable graph</td>
+          <td>4995</td>
+          <td>14.1264</td>
+          <td>14.1579</td>
+          <td>226.5261</td>
+          <td>-0.40%</td>
+          <td>1.0049</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, bfs</td>
+          <td>3283</td>
+          <td>13.7438</td>
+          <td>14.5520</td>
+          <td>232.8318</td>
+          <td>34.01%</td>
+          <td>1.0328</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, serial</td>
+          <td>3265</td>
+          <td>13.7420</td>
+          <td>14.5540</td>
+          <td>232.8635</td>
+          <td>34.37%</td>
+          <td>1.0330</td>
+      </tr>
+      <tr>
+          <td rowspan="4">32</td>
+          <td nowrap>layer</td>
+          <td>10119</td>
+          <td>13.4587</td>
+          <td>7.4302</td>
+          <td>237.7649</td>
+          <td>0.00%</td>
+          <td>1.0000</td>
+      </tr>
+      <tr>
+          <td nowrap>model:disable graph</td>
+          <td>10109</td>
+          <td>13.2952</td>
+          <td>7.5315</td>
+          <td>240.6875</td>
+          <td>0.10%</td>
+          <td>1.0123</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, bfs</td>
+          <td>6839</td>
+          <td>13.1059</td>
+          <td>7.6302</td>
+          <td>244.1648</td>
+          <td>32.41%</td>
+          <td>1.0269</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, serial</td>
+          <td>6845</td>
+          <td>13.0489</td>
+          <td>7.6635</td>
+          <td>245.2312</td>
+          <td>32.35%</td>
+          <td>1.0314</td>
+      </tr>
+  </table>
+
+### Đa quá trình (Multi processes)
+
+- Thiết lập thí nghiệm
+  - API
+    - Sử dụng Layer: ResNet50 trong
+      [resnet_dist.py](https://github.com/apache/singa/blob/master/examples/cnn/autograd/resnet_dist.py)
+    - Sử dụng Model: ResNet50 trong
+      [resnet.py](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
+  - GPU: NVIDIA RTX 2080Ti \* 2
+  - MPI: hai quá trình MPI trên một node
+- Kí hiệu: như trên
+- Kết quả
+  <table style="text-align: center">
+      <tr>
+          <th style="text-align: center">Batchsize</th>
+          <th style="text-align: center">Cases</th>
+          <th style="text-align: center">Mem(MB)</th>
+          <th style="text-align: center">Time(s)</th>
+          <th style="text-align: center">Speed(it/s)</th>
+          <th style="text-align: center">Throughput</th>
+          <th style="text-align: center">Reduction</th>
+          <th style="text-align: center">Speedup</th>
+      </tr>
+      <tr>
+          <td rowspan="4">16</td>
+          <td nowrap>layer</td>
+          <td>5439</td>
+          <td>17.3323</td>
+          <td>11.5391</td>
+          <td>369.2522</td>
+          <td>0.00%</td>
+          <td>1.0000</td>
+      </tr>
+      <tr>
+          <td nowrap>model:disable graph</td>
+          <td>5427</td>
+          <td>17.8232</td>
+          <td>11.2213</td>
+          <td>359.0831</td>
+          <td>0.22%</td>
+          <td>0.9725</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, bfs</td>
+          <td>3389</td>
+          <td>18.2310</td>
+          <td>10.9703</td>
+          <td>351.0504</td>
+          <td>37.69%</td>
+          <td>0.9507</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, serial</td>
+          <td>3437</td>
+          <td>17.0389</td>
+          <td>11.7378</td>
+          <td>375.6103</td>
+          <td>36.81%</td>
+          <td>1.0172</td>
+      </tr>
+      <tr>
+          <td rowspan="4">32</td>
+          <td nowrap>layer</td>
+          <td>10547</td>
+          <td>14.8635</td>
+          <td>6.7279</td>
+          <td>430.5858</td>
+          <td>0.00%</td>
+          <td>1.0000</td>
+      </tr>
+      <tr>
+          <td nowrap>model:disable graph</td>
+          <td>10503</td>
+          <td>14.7746</td>
+          <td>6.7684</td>
+          <td>433.1748</td>
+          <td>0.42%</td>
+          <td>1.0060</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, bfs</td>
+          <td>6935</td>
+          <td>14.8553</td>
+          <td>6.7316</td>
+          <td>430.8231</td>
+          <td>34.25%</td>
+          <td>1.0006</td>
+      </tr>
+      <tr>
+          <td nowrap>model:enable graph, serial</td>
+          <td>7027</td>
+          <td>14.3271</td>
+          <td>6.9798</td>
+          <td>446.7074</td>
+          <td>33.37%</td>
+          <td>1.0374</td>
+      </tr>
+  </table>
+
+### Kết Luận
+
+- Training với computational graph giúp giảm đáng kể khối bộ nhớ.
+- Hiện tại, tốc độ có cải thiện một chút. Nhiều tối ưu hoá có thể được thực hiện
+  giúp tăng hiệu quả.
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/history-singa.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/history-singa.md
new file mode 100644
index 0000000..e4b9143
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/history-singa.md
@@ -0,0 +1,43 @@
+---
+id: version-4.0.0_Viet-history-singa
+title: Lịch Sử Phát Triển SINGA
+original_id: history-singa
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## Lịch Sử Phát Triển
+
+SINGA được phát triển bởi DB System Group tại ĐH Quốc Gia Singapore (NUS) vào
+năm 2014, hợp tác với nhóm nghiên cứu cơ sở dữ liệu của ĐH Chiết Giang. Vui lòng
+cite các báo cáo khoa học dưới đây nếu bạn muốn sử dụng SINGA trong nghiên cứu
+của mình:
+
+- B.C. Ooi, K.-L. Tan, S. Wang, W. Wang, Q. Cai, G. Chen, J. Gao, Z. Luo, A. K.
+  H. Tung, Y. Wang, Z. Xie, M. Zhang, and K. Zheng.
+  [SINGA: A distributed deep learning platform](http://www.comp.nus.edu.sg/~ooibc/singaopen-mm15.pdf).
+  ACM Multimedia (Open Source Software Competition) 2015
+
+- W. Wang, G. Chen, T. T. A. Dinh, B. C. Ooi, K.-L.Tan, J. Gao, and S. Wang.
+  [SINGA: putting deep learning in the hands of multimedia users](http://www.comp.nus.edu.sg/~ooibc/singa-mm15.pdf).
+  ACM Multimedia 2015.
+
+Rafiki là một mô-đun của SINGA. Vui lòng cite các báo cáo khoa học dưới đây nếu
+bạn muốn sử dụng Rafiki trong nghiên cứu của mình:
+
+- Wei Wang, Jinyang Gao, Meihui Zhang, Sheng Wang, Gang Chen, Teck Khim Ng, Beng
+  Chin Ooi, Jie Shao, Moaz Reyad.
+  [Rafiki: Machine Learning as an Analytics Service System](http://www.vldb.org/pvldb/vol12/p128-wang.pdf).
+  [VLDB 2019](http://vldb.org/2019/)
+  ([BibTex](https://dblp.org/rec/bib2/journals/pvldb/WangWGZCNOS18.bib)).
+
+Các công ty như [NetEase](http://tech.163.com/17/0602/17/CLUL016I00098GJ5.html),
+[yzBigData](http://www.yzbigdata.com/en/index.html),
+[Shentilium](https://shentilium.com/), [Foodlg](http://www.foodlg.com/) và
+[Medilot](https://medilot.com/technologies) đang sử dụng SINGA cho các ứng dụng
+của họ.
+
+## Bản Quyền
+
+SINGA được phát hành bởi
+[Bản Quyền Apache phiên bản 2.0](http://www.apache.org/licenses/LICENSE-2.0)
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/how-to-release.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/how-to-release.md
new file mode 100644
index 0000000..f0f6dae
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/how-to-release.md
@@ -0,0 +1,207 @@
+---
+id: version-4.0.0_Viet-how-to-release
+title: Chuẩn bị trước khi phát hành
+original_id: how-to-release
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Đây là hướng dẫn cho
+[quá trình chuẩn bị trước khi phát hành](http://www.apache.org/dev/release-publishing.html)
+của SINGA.
+
+1. Lựa chọn người quản lý cho việc phát hành. Người quản lý chịu trách nhiệm
+   điều phối quá trình phát hành. Chữ ký của người quản lý (.asc) sẽ được tải
+   lên cùng với bản phát hành. Người quản lý tạo KEY (RSA 4096-bit) và tải nó
+   lên public key server. Để được tin cậy kết nối trên web, người quản lý cần
+   các người dùng Apache khác chứng thực (signed) Key của mình. Anh ta trước
+   tiên cần yêu cầu mentor giúp chứng thực key.
+   [Cách tạo Key](http://www.apache.org/dev/release-signing.html)?
+
+2. Kiểm tra bản quyền.
+   [FAQ](https://www.apache.org/legal/src-headers.html#faq-docs);
+   [Các bản SINGA đã phát hành](https://issues.apache.org/jira/projects/SINGA/issues/SINGA-447)
+
+   - Nền tảng code không bao gồm code của bên thứ 3 mà không tương thích với
+     APL;
+   - Các chương trình dependencies phải tương thích với APL. Các licenses giống
+     với GNU là không tương thích;
+   - Các tệp tin nguồn viết bởi chúng tôi PHẢI bao gồm license header của
+     Apache: http://www.apache.org/legal/src-headers.html. Chúng tôi cung cấp
+     script để chạy header trên tất cả các tệp tin.
+   - Cập nhật tệp tin LICENSE. Nếu code có chứa mã code của một bên thứ 3 trong
+     bản phát hành mà không phải APL, phải nêu rõ ở phần cuối của tệp tin
+     NOTICE.
+
+3. Nâng cấp phiên bản. Kiểm tra mã code và Tài liệu hướng dẫn
+
+   - Quá trình cài đặt không bị lỗi nào.
+   - Bao gồm tests cho những mục nhỏ (nhiều nhất có thể)
+   - Gói chương trình Conda chạy không bị lỗi.
+   - Tài liệu hướng dẫn trực tuyến trên trang web Apache là mới nhất.
+
+4. Chuẩn bị tệp tin RELEASE_NOTES (Lưu ý phát hành). Bao gồm các mục, Giới
+   thiệu, Tính năng nổi bật, Lỗi Bugs, (đường dẫn tới JIRA hoặc Github PR),
+   Những thay đổi, Danh sách thư viện Dependency, Các vấn đề không tương thích.
+   Làm theo
+   [ví dụ](http://commons.apache.org/proper/commons-digester/commons-digester-3.0/RELEASE-NOTES.txt).
+
+5. Gói các phiên bản phát hành. Bản phát hành cần được gói gọn thành:
+   apache-singa-VERSION.tar.gz. Trong bản phát hành không nên chứa bất kì tệp
+   tin dạng binary nào, bao gồm cả các tệp tin git. Tuy nhiên, các tệp CMake
+   compilation dựa vào git tag để tạo số phiên bản; để bỏ qua dependency này,
+   bạn cần cập nhật tệp tin CMakeLists.txt theo cách thủ công để tạo số phiên
+   bản.
+
+   ```
+   # xoá các dòng sau
+   include(GetGitRevisionDescription)
+   git_describe(VERSION --tags --dirty=-d)
+   string(REGEX REPLACE "^([0-9]+)\\..*" "\\1" VERSION_MAJOR "${VERSION}")
+   string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*" "\\1" VERSION_MINOR "${VERSION}")
+   string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" VERSION_PATCH "${VERSION}")
+
+   # thay đổi số
+   SET(PACKAGE_VERSION 3.0.0)
+   SET(VERSION 3.0.0)
+   SET(SINGA_MAJOR_VERSION 3)  # 0 -
+   SET(SINGA_MINOR_VERSION 0)  # 0 - 9
+   SET(SINGA_PATCH_VERSION 0)  # 0 - 99
+   ```
+
+   Tải gói chương trình lên
+   [stage repo](https://dist.apache.org/repos/dist/dev/singa/). Cần bao gồm các
+   tệp tin tar, signature, KEY và tệp tin SHA512 checksum. Không sử dụng MD5.
+   Xem chính sách tại
+   [đây](http://www.apache.org/dev/release-distribution#sigs-and-sums). Thư mục
+   stage cần bao gồm:
+
+   - apache-singa-VERSION.tar.gz
+   - apache-singa-VERSION.tar.gz.asc
+   - apache-singa-VERSION.tar.gz.sha512
+
+   Các lệnh để tạo tệp tin và tải chúng lên stage svn repo:
+
+   ```sh
+   # trong singa repo
+   rm -rf .git
+   rm -rf rafiki/*
+   cd ..
+   tar -czvf apache-singa-VERSION.tar.gz  singa/
+
+   mkdir stage
+   cd stage
+   svn co https://dist.apache.org/repos/dist/dev/singa/
+   cd singa
+   # copy tệp tin KEYS từ singa repo sang thư mục này nếu không có
+   cp ../../singa/KEYS .
+   mkdir VERSION
+   # copy tệp tin tar.gz
+   mv ../../apache-singa-VERSION.tar.gz VERSION/
+   cd VERSION
+   sha512sum apache-singa-VERSION.tar.gz > apache-singa-VERSION.tar.gz.sha512
+   gpg --armor --output apache-singa-VERSION.tar.gz.asc --detach-sig apache-singa-VERSION.tar.gz
+   cd ..
+   svn add VERSION
+   svn commit
+   ```
+
+6. Kêu gọi vote bằng cách gửi email. Xem ví dụ dưới đây.
+
+   ```
+   To: dev@singa.apache.org
+   Subject: [VOTE] Release apache-singa-X.Y.Z (release candidate N)
+
+   Hi all,
+
+   I have created a build for Apache SINGA 3.1.0, release candidate 2.
+
+   The release note is at
+   https://github.com/apache/singa/blob/master/RELEASE_NOTES.
+
+   The artifacts to be voted on are located here:
+   https://dist.apache.org/repos/dist/dev/singa/3.1.0.rc2/apache-singa-3.1.0.rc2.tar.gz
+    
+   The hashes of the artifacts are as follows:
+   SHA512: 84545499ad36da108c6a599edd1d853f82d331bc03273b5278515554866f0c698e881f956b2eabcb6b29c07fa9fa4ff1add5a777b58db8a6a2362cf383b5c04d 
+
+   Release artifacts are signed with the following key:
+   https://dist.apache.org/repos/dist/dev/singa/KEYS
+
+   The signature file is:
+   https://dist.apache.org/repos/dist/dev/singa/3.1.0.rc2/apache-singa-3.1.0.rc2.tar.gz.asc
+
+   The Github tag is at:
+   https://github.com/apache/singa/releases/tag/3.1.0.rc2
+
+   The documentation website is at
+   http://singa.apache.org/docs/next/installation/
+
+   Some examples are available for testing:
+   https://github.com/apache/singa/tree/master/examples
+
+   Please vote on releasing this package. The vote is open for at least 72 hours
+   and passes if a majority of at least three +1 votes are cast.
+
+   [ ] +1 Release this package as Apache SINGA X.Y.Z
+   [ ] 0 I don't feel strongly about it, but I'm okay with the release
+   [ ] -1 Do not release this package because...
+
+   Here is my vote: +1
+   ```
+
+7. Sau đó đợi ít nhất 48 giờ để nhận phản hồi. Bất kì PMC, committer hay
+   contributor đều có thể kiểm tra các tính năng trước khi phát hành, và đưa ra
+   nhận xét. Mọi người nên kiểm tra trước khi đưa ra vote +1. Nếu vote được
+   thông qua, vui lòng gửi email kết quả. Nếu không thì cần lặp lại trình tự từ
+   đầu.
+
+   ```
+   To: dev@singa.apache.org
+   Subject: [RESULT][vote] Release apache-singa-X.Y.Z (release candidate N)
+
+   Thanks to everyone who has voted and given their comments. The tally is as
+   follows.
+
+   N binding +1s: <names>
+
+   N non-binding +1s: <names>
+
+   No 0s or -1s.
+
+   I am delighted to announce that the proposal to release Apache SINGA X.Y.Z has
+   passed.
+   ```
+
+8. Tải gói chương trình để
+   [phân bổ](http://www.apache.org/dev/release-publishing.html#distribution)
+   tới https://dist.apache.org/repos/dist/release/singa/.
+
+9. Cập nhật trang Tải (Download) trên website SINGA. Tệp tin tar.gz PHẢI được
+   tải từ mirror, sử dụng closer.cgi script; các tạo tác khác PHẢI được tải từ
+   trang chủ Apache. Xem chi tiết tại
+   [đây](http://www.apache.org/dev/release-download-pages.html). Một vài nhận
+   xét chúng tôi nhận được trong các đợt phát hành trước: "Trang Tải chỉ nên
+   dẫn tới các bản phát hành chính thức, vì vậy không được bao gồm đường dẫn
+   tới GitHub.", "Đường dẫn tới KEYS, sigs và hashes không nên sử dụng
+   dist.apache.org; mà nên dùng https://www.apache.org/dist/singa/...;", "Và
+   bạn chỉ cần một đường dẫn tới KEYS, và cần có hướng dẫn cách sử dụng KEYS +
+   sig hay hash để chứng thực hoàn tất việc tải."
+
+10. Xoá tag RC và tập hợp gói conda packages.
+
+11. Xuất bản thông tin phát hành.
+
+    ```
+    To: announce@apache.org, dev@singa.apache.org
+    Subject: [ANNOUNCE] Apache SINGA X.Y.Z released
+
+    We are pleased to announce that SINGA X.Y.Z is released.
+
+    SINGA is a general distributed deep learning platform
+    for training big deep learning models over large datasets.
+    The release is available at: http://singa.apache.org/downloads.html
+    The main features of this release include XXX
+    We look forward to hearing your feedback, suggestions,
+    and contributions to the project.
+
+    On behalf of the SINGA team, {SINGA Team Member Name}
+    ```
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/install-win.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/install-win.md
new file mode 100644
index 0000000..22057dc
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/install-win.md
@@ -0,0 +1,396 @@
+---
+id: version-4.0.0_Viet-install-win
+title: Cách cài SINGA trên Windows
+original_id: install-win
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Quá trình cài đặt SINGA từ nguồn sử dụng Microsoft Windows bao gồm bốn bước: cài
+đặt thư viện dependencies, cài SINGA từ nguồn, (không bắt buộc) cài đặt python
+module và (không bắt buộc) chạy thử unit tests.
+
+## Cài đặt thư viện dependencies
+
+Bạn có thể tạo một thư mục để cài đặt thư viện dependencies.
+
+Các thư viện dependencies bao gồm:
+
+- Compiler và IDE
+  - Visual Studio. Công cụ biên tập mã này miễn phí và có thể được dùng trong
+    việc cài đặt SINGA. https://www.visualstudio.com/
+- CMake
+  - Có thể tải về qua http://cmake.org/
+  - Đảm bảo đường dẫn tới tệp thực thi (executable) của cmake nằm trong system
+    path, hoặc sử dụng đường dẫn đầy đủ khi gọi cmake.
+- SWIG
+
+  - Có thể tải từ http://swig.org/
+  - Đảm bảo đường dẫn tới tệp thực thi (executable) của swig nằm trong system
+    path, hoặc sử dụng đường dẫn đầy đủ khi gọi swig. Sử dụng các phiên bản cập
+    nhật như 3.0.12.
+
+- Protocol Buffers
+  - Tải các phiên bản phù hợp như 2.6.1:
+    https://github.com/google/protobuf/releases/tag/v2.6.1 .
+  - Tải cả hai tệp protobuf-2.6.1.zip và protoc-2.6.1-win32.zip .
+  - Giải nén cả hai tệp trên trong thư mục thư viện dependencies. Thêm đường dẫn
+    khả thi cho protoc vào system path, hoặc sử dụng đường dẫn đầy đủ khi gọi
+    hàm này.
+  - Mở Visual Studio solution có thể tìm trong thư mục vsproject.
+  - Thay đổi cài đặt thiết lập Settings tới Release and x64.
+  - Cài đặt libprotobuf project.
+- Openblas
+
+  - Tải phiên bản nguồn phù hợp như 0.2.20 từ http://www.openblas.net
+  - Giải nén nguồn trong thư mục thư viện dependencies.
+  - Nếu bạn không có chương trình Perl, tải perl environment như Strawberry Perl
+    (http://strawberryperl.com/)
+  - Cài đặt Visual Studio solution bằng lệnh sau từ thư mục nguồn:
+
+  ```bash
+  cmake -G "Visual Studio 15 2017 Win64"
+  ```
+
+  - Mở Visual Studio solution và thay đổi cấu hình cài đặt cho Release and x64.
+  - Cài libopenblas project
+
+- Google glog
+  - Tải phiên bản phù hợp như 0.3.5 từ https://github.com/google/glog/releases
+  - Giải nén nguồn trong thư mục thư viện dependencies.
+  - Mở Visual Studio solution.
+  - Thay đổi cài đặt thiết lập Settings tới Release and x64.
+  - Cài đặt libglog project
+
+## Cài SINGA từ nguồn
+
+- Tải code nguồn của SINGA
+- Cấu tạo các tệp protobuf:
+
+  - Tới thư mục src/proto
+
+  ```shell
+  mkdir python_out
+  protoc.exe *.proto --python_out python_out
+  ```
+
+- Tạo swig interfaces cho C++ và Python: Tới mục src/api
+
+  ```shell
+  swig -python -c++ singa.i
+  ```
+
+- Tạo Visual Studio solution cho SINGA: Đi tới thư mục nguồn SINGA
+
+  ```shell
+  mkdir build
+  cd build
+  ```
+
+- Gọi hàm cmake và thêm đường dẫn vào trong system của bạn, tương tự như ví dụ
+  sau:
+
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64" ^
+    -DGLOG_INCLUDE_DIR="D:/WinSinga/dependencies/glog-0.3.5/src/windows" ^
+    -DGLOG_LIBRARIES="D:/WinSinga/dependencies/glog-0.3.5/x64/Release" ^
+    -DCBLAS_INCLUDE_DIR="D:/WinSinga/dependencies/openblas-0.2.20/lapack-netlib/CBLAS/include" ^
+    -DCBLAS_LIBRARIES="D:/WinSinga/dependencies/openblas-0.2.20/lib/RELEASE" ^
+    -DProtobuf_INCLUDE_DIR="D:/WinSinga/dependencies/protobuf-2.6.1/src" ^
+    -DProtobuf_LIBRARIES="D:/WinSinga/dependencies/protobuf-2.6.1/vsprojects/x64/Release" ^
+    -DProtobuf_PROTOC_EXECUTABLE="D:/WinSinga/dependencies/protoc-2.6.1-win32/protoc.exe" ^
+    ..
+  ```
+
+- Mở generated solution trong Visual Studio
+- Thay đổi cài đặt thiết lập Settings tới Release and x64
+- Thêm tệp tin singa_wrap.cxx từ src/api tới singa_objects project
+- Trong singa_objects project, mở Additional Include Directories.
+- Thêm đường dẫn include của Python
+- Thêm đường dẫn include của numpy
+- Thêm đường dẫn include của protobuf
+- Trong định nghĩa preprocessor của singa_objects project, thêm USE_GLOG
+- Tạo (build) singa_objects project
+
+- Trong singa project:
+
+  - thêm singa_wrap.obj vào Thư viện Object
+  - đổi tên mục target thành \_singa_wrap
+  - đổi định dạng tệp target thành .pyd
+  - đổi định dạng cấu hình sang Dynamic Library (.dll)
+  - đi tới Additional Library Directories và thêm đường dẫn vào các thư viện
+    python, openblas, protobuf và glog
+  - đi tới các thư viện Dependencies bổ sung để thêm libopenblas.lib,
+    libglog.lib và libprotobuf.lib
+
+- tạo singa project
+
+## Cài đặt Python module
+
+- Đổi `_singa_wrap.so` thành `_singa_wrap.pyd` trong build/python/setup.py
+- Copy các tệp tin trong `src/proto/python_out` sang `build/python/singa/proto`
+
+- Không bắt buộc, tạo và kích hoạt virtual environment:
+
+  ```shell
+  mkdir SingaEnv
+  virtualenv SingaEnv
+  SingaEnv\Scripts\activate
+  ```
+
+- tới thư mục build/python và chạy:
+
+  ```shell
+  python setup.py install
+  ```
+
+- Sử dụng \_singa_wrap.pyd, libglog.dll và libopenblas.dll bằng cách thêm chúng
+  vào đường dẫn hoặc copy chúng vào thư mục gói chương trình singa trong gói
+  python site-packages
+
+- Xác nhận SINGA đã được cài đặt bằng cách chạy:
+
+  ```shell
+  python -c "from singa import tensor"
+  ```
+
+Tham khảo video quá trình cài đặt tại đây:
+
+[![youtube video](https://img.youtube.com/vi/cteER7WeiGk/0.jpg)](https://www.youtube.com/watch?v=cteER7WeiGk)
+
+## Chạy Unit Tests
+
+- Trong thư mục test, tạo Visual Studio solution:
+
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64"
+  ```
+
+- Mở generated solution trong Visual Studio.
+
+- Thay đổi cài đặt thiết lập Settings tới Release and x64.
+
+- Tạo glog project.
+
+- Trong mục test_singa project:
+
+  - Thêm USE_GLOG vào Định nghĩa Preprocessor.
+  - Trong Additional Include Directories, thêm đường dẫn của GLOG_INCLUDE_DIR,
+    CBLAS_INCLUDE_DIR và Protobuf_INCLUDE_DIR đã được dùng ở bước 2 bên trên.
+    Đồng thời thêm các thư mục build và build/include.
+  - Đi tới Additional Library Directories và thêm đường dẫn vào thư viện
+    openblas, protobuf và glog. Thêm build/src/singa_objects.dir/Release.
+  - Tới Thư viện Dependencies bổ sung và thêm libopenblas.lib, libglog.lib và
+    libprotobuf.lib. Sửa tên của hai thư viện: gtest.lib và singa_objects.lib.
+
+- Cài test_singa project.
+
+- Sử dụng libglog.dll và libopenblas.dll bằng cách thêm chúng vào đường dẫn hoặc
+  copy chúng vào thư mục test/release.
+
+- Tiến hành unit tests bằng cách
+
+  - Từ dòng lệnh:
+
+  ```shell
+  test_singa.exe
+  ```
+
+  - Từ Visual Studio:
+    - ấn chuột phải tại test_singa project và chọn 'Set as StartUp Project'.
+    - Từ mục Debug menu, chọn 'Start Without Debugging'
+
+Tham khảo video hướng dẫn chạy unit tests tại đây:
+
+[![youtube video](https://img.youtube.com/vi/393gPtzMN1k/0.jpg)](https://www.youtube.com/watch?v=393gPtzMN1k)
+
+## Cài đặt hỗ trợ GPU với CUDA
+
+Trong mục này, chúng tôi sẽ mở rộng các bước phía trên để sử dụng GPU.
+
+### Cài đặt thư viện Dependencies
+
+Ngoài các thư viện dependencies ở mục 1 phía trên, chúng ta cần:
+
+- CUDA
+
+  Tải phiên bản phù hợp như 9.1 từ https://developer.nvidia.com/cuda-downloads .
+  Đảm bảo bạn cài đặt Visual Studio integration module.
+
+- cuDNN
+
+  Tải phiên bản phù hợp như 7.1 từ https://developer.nvidia.com/cudnn
+
+- cnmem:
+
+  - Tải phiên bản mới nhất tại https://github.com/NVIDIA/cnmem
+  - Tạo Visual Studio solution:
+
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64"
+  ```
+
+  - Mở generated solution trong Visual Studio.
+  - Thay đổi cài đặt thiết lập Settings tới Release and x64.
+  - Tạo cnmem project.
+
+### Cài SINGA từ nguồn
+
+- Gọi hàm cmake và thêm đường dẫn vào trong system của bạn, tương tự như ví dụ
+  sau:
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64" ^
+    -DGLOG_INCLUDE_DIR="D:/WinSinga/dependencies/glog-0.3.5/src/windows" ^
+    -DGLOG_LIBRARIES="D:/WinSinga/dependencies/glog-0.3.5/x64/Release" ^
+    -DCBLAS_INCLUDE_DIR="D:/WinSinga/dependencies/openblas-0.2.20/lapack-netlib/CBLAS/include" ^
+    -DCBLAS_LIBRARIES="D:/WinSinga/dependencies/openblas-0.2.20/lib/RELEASE" ^
+    -DProtobuf_INCLUDE_DIR="D:/WinSinga/dependencies/protobuf-2.6.1/src" ^
+    -DProtobuf_LIBRARIES="D:\WinSinga/dependencies/protobuf-2.6.1/vsprojects/x64/Release" ^
+    -DProtobuf_PROTOC_EXECUTABLE="D:/WinSinga/dependencies/protoc-2.6.1-win32/protoc.exe" ^
+    -DCUDNN_INCLUDE_DIR=D:\WinSinga\dependencies\cudnn-9.1-windows10-x64-v7.1\cuda\include ^
+    -DCUDNN_LIBRARIES=D:\WinSinga\dependencies\cudnn-9.1-windows10-x64-v7.1\cuda\lib\x64 ^
+    -DSWIG_DIR=D:\WinSinga\dependencies\swigwin-3.0.12 ^
+    -DSWIG_EXECUTABLE=D:\WinSinga\dependencies\swigwin-3.0.12\swig.exe ^
+    -DUSE_CUDA=YES ^
+    -DCUDNN_VERSION=7 ^
+    ..
+  ```
+
+* Tạo swig interfaces cho C++ và Python: Tới mục src/api
+
+  ```shell
+  swig -python -c++ singa.i
+  ```
+
+* Mở generated solution trong Visual Studio
+
+* Thay đổi cài đặt thiết lập Settings tới Release and x64.
+
+#### Tạo singa_objects
+
+- Thêm tệp tin singa_wrap.cxx từ src/api tới singa_objects project
+- Trong singa_objects project, mở Additional Include Directories.
+- Thêm đường dẫn include của Python
+- Thêm đường dẫn include của numpy
+- Thêm đường dẫn include của protobuf
+- Bổ sung đường dẫn include cho CUDA, cuDNN và cnmem
+- Trong định nghĩa preprocessor của singa_objects project, thêm USE_GLOG,
+  USE_CUDA và USE_CUDNN. Xoá DISABLE_WARNINGS.
+- Tạo singa_objects project
+
+#### Tạo singa-kernel
+
+- Tạo một Visual Studio project mới dưới dạng "CUDA 9.1 Runtime". Đặt tên dạng
+  như singa-kernel.
+- project này chứa sẵn một tệp tin là kernel.cu. Xoá tệp tin này khỏi project.
+- Thêm tệp tin này: src/core/tensor/math_kernel.cu
+- Trong mục cài đặt project:
+
+  - Đặt Platform Toolset sang dạng "Visual Studio 2015 (v140)"
+  - Đổi Configuration Type sang " Static Library (.lib)"
+  - Trong mục Include Directories, thêm vào build/include.
+
+- Tạo singa-kernel project
+
+#### Cài đặt singa
+
+- Trong singa project:
+
+  - thêm singa_wrap.obj vào Object Libraries
+  - đổi tên target thành \_singa_wrap
+  - đổi định dạng target sang .pyd
+  - đổi định dạng cấu hình sang Dynamic Library (.dll)
+  - đi tới Additional Library Directories và thêm đường dẫn vào các thư viện
+    python, openblas, protobuf và glog
+  - thêm các đường dẫn thư viện cho singa-kernel, cnmem, cuda và cudnn.
+  - đi tới Additional Dependencies và thêm libopenblas.lib, libglog.lib và
+    libprotobuf.lib
+  - Đồng thời thêm: singa-kernel.lib, cnmem.lib, cudnn.lib, cuda.lib,
+    cublas.lib, curand.lib và cudart.lib.
+
+- tạo singa project
+
+### Cài đặt Python module
+
+- Đổi tên \_singa_wrap.so sang \_singa_wrap.pyd trong mục build/python/setup.py
+- Copy các tệp tin trong src/proto/python_out sang build/python/singa/proto
+
+- Không bắt buộc, tạo và kích hoạt virtual environment:
+
+  ```shell
+  mkdir SingaEnv
+  virtualenv SingaEnv
+  SingaEnv\Scripts\activate
+  ```
+
+- tới thư mục build/python và chạy:
+
+  ```shell
+  python setup.py install
+  ```
+
+- Sử dụng \_singa_wrap.pyd, libglog.dll, libopenblas.dll, cnmem.dll, CUDA
+  Runtime (e.g. cudart64_91.dll) và cuDNN (e.g. cudnn64_7.dll) bằng cách thêm
+  chúng vào đường dẫn hoặc copy chúng vào thư mục gói chương trình singa trong
+  gói python site-packages
+
+- Xác nhận SINGA đã được cài đặt bằng cách chạy:
+
+  ```shell
+  python -c "from singa import device; dev = device.create_cuda_gpu()"
+  ```
+
+Tham khảo video hướng dẫn cho mục này tại đây:
+
+[![youtube video](https://img.youtube.com/vi/YasKVjRtuDs/0.jpg)](https://www.youtube.com/watch?v=YasKVjRtuDs)
+
+### Chạy Unit Tests
+
+- Trong thư mục tests, tạo Visual Studio solution:
+
+  ```shell
+  cmake -G "Visual Studio 15 2017 Win64"
+  ```
+
+- Mở solution được tạo trong Visual Studio, hoặc thêm project vào singa solution
+  đã được tạo ở bước 5.2
+
+- Thay đổi cài đặt thiết lập Settings tới Release and x64.
+
+- Tạo glog project.
+
+- Trong test_singa project:
+
+  - Thêm USE_GLOG; USE_CUDA; USE_CUDNN vào Định Nghĩa Preprocessor.
+  - Trong Additional Include Directories, thêm đường dẫn của GLOG_INCLUDE_DIR,
+    CBLAS_INCLUDE_DIR và Protobuf_INCLUDE_DIR đã được sử dụng tại bước 5.2 ở
+    trên. Và thêm build, build/include, cùng thư mục include của CUDA và cuDNN.
+  - Tới Additional Library Directories và thêm đường dẫn của thư viện openblas,
+    protobuf và glog. Và thêm đường dẫn thư viện của
+    build/src/singa_objects.dir/Release, singa-kernel, cnmem, CUDA và cuDNN.
+  - Tới Additional Dependencies và thêm libopenblas.lib; libglog.lib;
+    libprotobuf.lib; cnmem.lib; cudnn.lib; cuda.lib; cublas.lib; curand.lib;
+    cudart.lib; singa-kernel.lib. Sửa tên của hai thư viện: gtest.lib và
+    singa_objects.lib.
+
+* Tạo test_singa project.
+
+* Sử dụng libglog.dll, libopenblas.dll, cnmem.dll, cudart64_91.dll và
+  cudnn64_7.dll bằng cách thêm chúng vào đường dẫn hoặc copy chúng vào thư mục
+  test/release.
+
+- Tiến hành unit tests bằng cách:
+
+  - Từ dòng lệnh:
+
+    ```shell
+    test_singa.exe
+    ```
+
+  - Từ Visual Studio:
+    - ấn chuột phải tại test_singa project và chọn 'Set as StartUp Project'.
+    - Từ mục Debug menu, chọn 'Start Without Debugging'
+
+Tham khảo video hướng dẫn chạy unit tests tại đây:
+
+[![youtube video](https://img.youtube.com/vi/YOjwtrvTPn4/0.jpg)](https://www.youtube.com/watch?v=YOjwtrvTPn4)
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/installation.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/installation.md
new file mode 100644
index 0000000..8d216e1
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/installation.md
@@ -0,0 +1,142 @@
+---
+id: version-4.0.0_Viet-installation
+title: Cài đặt
+original_id: installation
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## Sử dụng Pip
+
+[Miniconda3](https://conda.io/miniconda.html) được khuyến khích dùng với SINGA.
+Sau khi cài đặt miniconda, thực hiện các lệnh sau để cài đặt SINGA.
+
+1. Cho CPU
+   [![Mở trên Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/17RA056Brwk0vBQTFaZ-l9EbqwADO0NA9?usp=sharing)
+
+```bash
+pip install singa -f http://singa.apache.org/docs/next/wheel-cpu.html --trusted-host singa.apache.org
+```
+
+Bạn có thể cài đặt một phiên bản SINGA cụ thể sử dụng `singa==<version>`, thay
+thông tin `<version>`, v.d, `4.0.0`. Xem danh sách các phiên bản SINGA đang hoạt
+động ở đường dẫn.
+
+Để cài đặt phiên bản phát triển mới nhất, thay đường dẫn bằng
+http://singa.apache.org/docs/next/wheel-cpu-dev.html
+
+2. GPU với CUDA và cuDNN
+   [![Mở trên Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1W30IPCqj5fG8ADAQsFqclaCLyIclVcJL?usp=sharing)
+
+```bash
+pip install singa -f http://singa.apache.org/docs/next/wheel-gpu.html --trusted-host singa.apache.org
+```
+
+Bạn có thể thiết lập phiên bản SINGA và CUDA, như `singa==4.0.0+cuda10.2`. Danh
+sách tổ hợp phiên bản SINGA với CUDA được cung cấp trong đường dẫn.
+
+Để cài đặt phiên bản phát triển mới nhất, thay đường dẫn bằng
+http://singa.apache.org/docs/next/wheel-gpu-dev.html
+
+Lưu ý: phiên bản Python của Python environment trong máy của bạn sẽ được sử dụng
+để tìm gói wheel tương ứng. Ví dụ, nếu bạn sử dụng Python 3.6, thì gói wheel kết
+cấu trong Python 3.6 sẽ được pip chọn để cài đặt. Thực tế, tên của tệp tin wheel
+bao gồm phiên bản SINGA, phiên bản CUDA và Python. Vì thế, `pip` biết tệp tin
+wheel nào để tải và cài đặt.
+
+Tham khảo chú thích ở phần đầu của tệp tin `setup.py` về cách tạo các gói wheel
+packages.
+
+Nếu không có lỗi khi chạy
+
+```shell
+$ python -c "from singa import tensor"
+```
+
+thì bạn đã cài đặt SINGA thành công.
+
+## Sử dụng Docker
+
+Cài đặt Docker vào máy chủ local theo
+[hướng dẫn](https://docs.docker.com/install/). Thêm người dùng vào
+[nhóm docker](https://docs.docker.com/install/linux/linux-postinstall/) để chạy
+câu lệnh docker mà không cần dùng `sudo`.
+
+1. Cho CPU.
+
+```shell
+$ docker run -it apache/singa:X.Y.Z-cpu-ubuntu16.04 /bin/bash
+```
+
+2. Với GPU. Cài đặt [Nvidia-Docker](https://github.com/NVIDIA/nvidia-docker) sau
+   khi cài Docker.
+
+```shell
+$ nvidia-docker run -it apache/singa:X.Y.Z-cuda9.0-cudnn7.4.2-ubuntu16.04 /bin/bash
+```
+
+3. Xem danh sách toàn bộ SINGA Docker images (tags), tại
+   [trang web docker hub](https://hub.docker.com/r/apache/singa/). Với mỗi
+   docker image, tag được đặt tên theo
+
+```shell
+version-(cpu|gpu)[-devel]
+```
+
+| Tag         | Mô tả                                | Ví dụ giá trị                                                                                                                                                      |
+| ----------- | ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `version`   | phiên bản SINGA                      | '2.0.0-rc0', '2.0.0', '1.2.0'                                                                                                                                      |
+| `cpu`       | image không thể sử dụng cho GPUs     | 'cpu'                                                                                                                                                              |
+| `gpu`       | image có thể sử dụng cho Nvidia GPUs | 'gpu', or 'cudax.x-cudnnx.x' e.g., 'cuda10.0-cudnn7.3'                                                                                                             |
+| `devel`     | chỉ số phát triển                    | nếu không có, gói SINGA Python package chỉ được cài đặt cho runtime; nếu có, environment cũng được tạo ra, bạn có thể kết cấu lại SINGA từ nguồn tại '/root/singa' |
+| `OS`        | cho biết phiên bản OS                | 'ubuntu16.04', 'ubuntu18.04'                                                                                                                                       |
+
+## Từ nguồn
+
+Bạn có thể [tạo và cài đặt SINGA](build.md) từ mã code nguồn sử dụng các công cụ
+tạo chương trình hoặc conda-build, trên hệ điều hành máy chủ cục bộ (local host
+os) hay trong Docker container.
+
+## Câu Hỏi Thường Gặp
+
+- Q: Lỗi khi chạy `from singa import tensor`
+
+  A: Kiểm tra chi tiết lỗi từ
+
+  ```shell
+  python -c  "from singa import _singa_wrap"
+  # tới thư mục chứa _singa_wrap.so
+  ldd path to _singa_wrap.so
+  python
+  >> import importlib
+  >> importlib.import_module('_singa_wrap')
+  ```
+
+  Thư mục chứa `_singa_wrap.so` thường ở
+  `~/miniconda3/lib/python3.7/site-packages/singa`. Thông thường, lỗi này được
+  gây ra bởi sự không tương thích hoặc thiếu các thư viện dependent libraries,
+  v.d cuDNN hay protobuf. Cách giải quyết là tạo một virtual environment mới và
+  cài đặt SINGA trong environment đó, v.d,
+
+  ```shell
+  conda create -n singa
+  conda activate singa
+  conda install -c nusdbsystem -c conda-forge singa-cpu
+  ```
+
+- Q: Khi sử dụng virtual environment, mỗi khi tôi cài SINGA, numpy cũng tự động
+  bị cài lại. Tuy nhiên, numpy không được sử dụng khi chạy `import numpy`
+
+  A: Lỗi này có thể do biến `PYTHONPATH` environment lẽ ra phải để trống trong
+  khi bạn sử dụng virtual environment để tránh mâu thuẫn với đường dẫn của
+  virtual environment.
+
+- Q: Khi chạy SINGA trên Mac OS X, tôi gặp lỗi "Fatal Python error:
+  PyThreadState_Get: no current thread Abort trap: 6"
+
+  A: Lỗi này thường xảy ra khi bạn có nhiều phiên bản Python trong hệ thống,
+  v.d, bản của OS và bản được cài bởi Homebrew. Bản Python dùng cho SINGA phải
+  giống với bản Python interpreter. Bạn có thể kiểm tra interpreter của mình
+  bằng `which python` và kiểm tra bản Python dùng cho SINGA qua
+  `otool -L <path to _singa_wrap.so>`. Vấn đề này được giải quyết nếu SINGA được
+  cài qua conda.
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/issue-tracking.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/issue-tracking.md
new file mode 100644
index 0000000..558490b
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/issue-tracking.md
@@ -0,0 +1,13 @@
+---
+id: version-4.0.0_Viet-issue-tracking
+title: Theo Dõi Vấn Đề
+original_id: issue-tracking
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA sử dụng [JIRA](https://issues.apache.org/jira/browse/singa) để quản lý các
+vấn đề bao gồm lỗi (bugs), các tính năng mới và thảo luận.
+
+Chúng tôi đang trong quá trình chuyển sang mục
+[Github Issues](https://github.com/apache/singa/issues).
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/mail-lists.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/mail-lists.md
new file mode 100644
index 0000000..8648d6c
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/mail-lists.md
@@ -0,0 +1,17 @@
+---
+id: version-4.0.0_Viet-mail-lists
+title: Danh sách liên hệ của Dự Án
+original_id: mail-lists
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Có một số danh sách liên hệ được thiết lập cho dự án này. Mỗi danh sách bao gồm
+đường dẫn để đăng kí (subscribe), huỷ đăng kí (unsubscribe), và đường dẫn tới kho
+lưu trữ.
+
+| Tên        | Loại                                 | Đăng kí                                                        | Huỷ Đăng kí                                                          | Lưu Trữ                                                                             |
+| ---------- | ------------------------------------ | -------------------------------------------------------------- | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------- |
+| Phát triển | <de...@singa.incubator.apache.org>     | [Đăng kí](mailto:dev-subscribe@singa.incubator.apache.org)     | [Huỷ đăng kí](mailto:dev-unsubscribe@singa.incubator.apache.org)     | [mail-archives.apache.org](http://mail-archives.apache.org/mod_mbox/singa-dev/)     |
+| Đóng góp   | <co...@singa.incubator.apache.org> | [Đăng kí](mailto:commits-subscribe@singa.incubator.apache.org) | [Huỷ đăng kí](mailto:commits-unsubscribe@singa.incubator.apache.org) | [mail-archives.apache.org](http://mail-archives.apache.org/mod_mbox/singa-commits/) |
+| Bảo mật    | <se...@singa.apache.org>          | private                                                        | private                                                              | private                                                                             |
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/onnx.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/onnx.md
new file mode 100644
index 0000000..2fba93b
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/onnx.md
@@ -0,0 +1,410 @@
+---
+id: version-4.0.0_Viet-onnx
+title: ONNX
+original_id: onnx
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+[ONNX](https://onnx.ai/) là một định dạng đại diện mở dùng trong các model của
+machine learning, cho phép nhà phát triển AI sử dụng các models trên các
+libraries và công cụ khác nhau. SINGA hỗ trợ tải các models dạng ONNX trong
+training và inference, và lưu các models ở dạng ONNX với SINGA APIs (e.g.,
+[Module](./module)).
+
+SINGA đã được thử nghiệm với
+[phiên bản sau](https://github.com/onnx/onnx/blob/master/docs/Versioning.md) của
+ONNX.
+
+| Phiên bản ONNX | Phiên bản định dạng tệp tin | Opset phiên bản ai.onnx | Opset phiên bản ai.onnx.ml | Opset phiên bản ai.onnx.training |
+| -------------- | --------------------------- | ----------------------- | -------------------------- | -------------------------------- |
+| 1.6.0          | 6                           | 11                      | 2                          | -                                |
+
+## Sử dụng chung
+
+### Tải một ONNX Model trong SINGA
+
+Sau khi tải một ONNX model từ disk qua `onnx.load`, bạn chỉ cần cập nhật
+batch-size của input sử dụng `tensor.PlaceHolder` sau SINGA v3.0, shape của
+internal tensors sẽ tự động được tạo ra.
+
+Sau đó, bạn định nghĩa một class kế thừa từ `sonnx.SONNXModel` và triển khai
+hai phương thức `forward` cho quá trình forward và `train_one_batch` cho quá
+trình training. Sau khi gọi hàm `model.compile`, hàm SONNX sẽ lặp lại và dịch
+tất cả các nodes trong phạm vi graph của ONNX model sang các hàm SINGA, tải tất
+cả weights đã lưu trữ và tạo ra shape của từng tensor trung gian.
+
+```python3
+import onnx
+from singa import device
+from singa import sonnx
+
+class MyModel(sonnx.SONNXModel):
+
+    def __init__(self, onnx_model):
+        super(MyModel, self).__init__(onnx_model)
+
+    def forward(self, *x):
+        y = super(MyModel, self).forward(*x)
+        # Since SINGA model returns the output as a list,
+        # if there is only one output,
+        # you just need to take the first element.
+        return y[0]
+
+    def train_one_batch(self, x, y):
+        pass
+
+model_path = "PATH/To/ONNX/MODEL"
+onnx_model = onnx.load(model_path)
+
+# convert onnx model into SINGA model
+dev = device.create_cuda_gpu()
+x = tensor.PlaceHolder(INPUT.shape, device=dev)
+model = MyModel(onnx_model)
+model.compile([x], is_train=False, use_graph=True, sequential=True)
+```
+
+### Inference với SINGA model
+
+Sau khi tạo models, bạn có thể tiến hành inference bằng cách gọi hàm
+`model.forward`. Đầu vào và đầu ra phải là các instance của SINGA `Tensor`.
+
+```python3
+x = tensor.Tensor(device=dev, data=INPUT)
+y = model.forward(x)
+```
+
+### Lưu model của SINGA dưới dạng ONNX
+
+Với các tensors đầu vào và đầu ra được tạo ra bởi các hàm của model, bạn có thể
+truy nguyên đến tất cả các hàm nội bộ. Bởi vậy, một model SINGA được xác định
+bởi tensors đầu vào và đầu ra. Để biến một model SINGA sang dạng ONNX, bạn chỉ
+cần cung cấp danh sách tensor đầu vào và đầu ra.
+
+```python3
+# x is the input tensor, y is the output tensor
+sonnx.to_onnx([x], [y])
+```
+
+### Training lại với model ONNX
+
+Để train (hay luyện) một model ONNX sử dụng SINGA, bạn cần thực hiện
+`train_one_batch` từ `sonnx.SONNXModel` và đánh dấu `is_train=True` khi gọi hàm
+`model.compile`.
+
+```python3
+from singa import opt
+from singa import autograd
+
+class MyModel(sonnx.SONNXModel):
+
+    def __init__(self, onnx_model):
+        super(MyModel, self).__init__(onnx_model)
+
+    def forward(self, *x):
+        y = super(MyModel, self).forward(*x)
+        return y[0]
+
+    def train_one_batch(self, x, y, dist_option, spars):
+        out = self.forward(x)
+        loss = autograd.softmax_cross_entropy(out, y)
+        if dist_option == 'fp32':
+            self.optimizer.backward_and_update(loss)
+        elif dist_option == 'fp16':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option == 'sparseTopK':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=True,
+                                                      spars=spars)
+        elif dist_option == 'sparseThreshold':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=False,
+                                                      spars=spars)
+        return out, loss
+
+    def set_optimizer(self, optimizer):
+        self.optimizer = optimizer
+
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+model.compile([tx], is_train=True, use_graph=graph, sequential=True)
+```
+
+### Transfer-learning một model ONNX
+
+Bạn cũng có thể thêm một vài layers vào phần cuối của ONNX model để làm
+transfer-learning. Tham số `last_layers` chấp nhận một số nguyên âm để chỉ layer
+bị cắt ra. Ví dụ, `-1` nghĩa là cắt ra sau kết quả cuối cùng (không xoá bớt layer
+nào), `-2` nghĩa là cắt ra sau layer áp chót.
+
+```python3
+from singa import opt
+from singa import autograd
+
+class MyModel(sonnx.SONNXModel):
+
+    def __init__(self, onnx_model):
+        super(MyModel, self).__init__(onnx_model)
+        self.linear = layer.Linear(1000, 3)
+
+    def forward(self, *x):
+        # cut off after the last third layer
+        # and append a linear layer
+        y = super(MyModel, self).forward(*x, last_layers=-3)[0]
+        y = self.linear(y)
+        return y
+
+    def train_one_batch(self, x, y, dist_option, spars):
+        out = self.forward(x)
+        loss = autograd.softmax_cross_entropy(out, y)
+        if dist_option == 'fp32':
+            self.optimizer.backward_and_update(loss)
+        elif dist_option == 'fp16':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option == 'sparseTopK':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=True,
+                                                      spars=spars)
+        elif dist_option == 'sparseThreshold':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=False,
+                                                      spars=spars)
+        return out, loss
+
+    def set_optimizer(self, optimizer):
+        self.optimizer = optimizer
+
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+model.compile([tx], is_train=True, use_graph=graph, sequential=True)
+```
+
+## ONNX model zoo
+
+[ONNX Model Zoo](https://github.com/onnx/models) là tổ hợp các models ở dạng
+ONNX, đã được train có kết quả tốt nhất, đóng góp bởi cộng đồng thành viên.
+SINGA giờ đây đã hỗ trợ một số models CV và NLP. Chúng tôi dự định sẽ sớm hỗ trợ
+thêm các models khác.
+
+### Phân loại hình ảnh (Image Classification)
+
+Tổ hợp models này có đầu vào là hình ảnh, sau đó phân loại các đối tượng chính
+trong hình ảnh thành 1000 mục đối tượng như bàn phím, chuột, bút chì, và các
+động vật.
+
+| Model Class                                                                                         | Tham khảo                                               | Mô tả                                                                                                                                                                                                                                              | Đường dẫn                                                                             [...]
+| --------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- [...]
+| <b>[MobileNet](https://github.com/onnx/models/tree/master/vision/classification/mobilenet)</b>      | [Sandler et al.](https://arxiv.org/abs/1801.04381)      | deep neural network nhỏ, nhẹ phù hợp nhất cho điện thoại và ứng dụng hình ảnh đính kèm. <br>Top-5 error từ báo cáo - ~10%                                                                                                                          | [![Mở trên Colab](https://colab.research.google.com/assets/colab-badge.svg)](https:// [...]
+| <b>[ResNet18](https://github.com/onnx/models/tree/master/vision/classification/resnet)</b>          | [He et al.](https://arxiv.org/abs/1512.03385)           | Mô hình CNN (lên tới 152 layers). Sử dụng liên kết ngắn gọn để đạt độ chính xác cao hơn khi phân loại hình ảnh. <br> Top-5 error từ báo cáo - ~3.6%                                                                                                | [![Mở trên Colab](https://colab.research.google.com/assets/colab-badge.svg)](https:// [...]
+| <b>[VGG16](https://github.com/onnx/models/tree/master/vision/classification/vgg)</b>                | [Simonyan et al.](https://arxiv.org/abs/1409.1556)      | Mô hình CNN chuyên sâu (lên tới 19 layers). Tương tự như AlexNet nhưng sử dụng nhiều loại filters cỡ kernel nhỏ hơn mang lại độ chính xác cao hơn khi phân loại hình ảnh. <br>Top-5 từ báo cáo - ~8%                                               | [![Mở trên Colab](https://colab.research.google.com/assets/colab-badge.svg)](https:// [...]
+| <b>[ShuffleNet_V2](https://github.com/onnx/models/tree/master/vision/classification/shufflenet)</b> | [Simonyan et al.](https://arxiv.org/pdf/1707.01083.pdf) | Mô hình CNN cực kỳ hiệu quả trong sử dụng tài nguyên, được thiết kế đặc biệt cho các thiết bị di động. Mạng lưới thiết kế hệ mô hình sử dụng số liệu trực tiếp như tốc độ, thay vì các số liệu gián tiếp như FLOP. Top-1 error từ báo cáo - ~30.6% | [![Mở trên Colab](https://colab.research.google.com/drive/19HfRu3YHP_H2z3BcZujVFRp23_ [...]
+
+Chúng tôi cung cấp ví dụ re-training sử dụng VGG và ResNet, vui lòng xem tại
+`examples/onnx/training`.
+
+### Nhận Diện Đối Tượng (Object Detection)
+
+Các models Object detection nhận diện sự hiện diện của các đối tượng trong một
+hình ảnh và phân đoạn ra các khu vực của bức ảnh mà đối tượng được nhận diện.
+
+| Model Class                                                                                                       | Tham khảo                                             | Mô tả                                                                                                                                       | Đường dẫn                                                                                                                                               |
+| ----------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <b>[Tiny YOLOv2](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/tiny_yolov2)</b> | [Redmon et al.](https://arxiv.org/pdf/1612.08242.pdf) | Mô hình CNN thời gian thực cho Nhận diện đối tượng có thể nhận diện 20 loại đối tượng khác nhau. Phiên bản nhỏ của mô hình phức tạp Yolov2. | [![Mở trên Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/11V4I6cRjIJNUv5ZGsEGwqHuoQEie6b1T) |
+
+### Phân tích Khuôn Mặt (Face Analysis)
+
+Các mô hình Nhận Diện Khuôn Mặt xác định và/hoặc nhận diện khuôn mặt người và
+các trạng thái cảm xúc trong bức ảnh.
+
+| Model Class                                                                                               | Tham khảo                                          | Mô tả                                                                                                                                              | Đường dẫn                                                                                                                                               |
+| --------------------------------------------------------------------------------------------------------- | -------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| <b>[ArcFace](https://github.com/onnx/models/tree/master/vision/body_analysis/arcface)</b>                 | [Deng et al.](https://arxiv.org/abs/1801.07698)    | Mô hình dựa trên CNN để nhận diện khuôn mặt, học từ các đặc tính khác nhau trên khuôn mặt và tạo ra các embeddings cho hình ảnh khuôn mặt đầu vào. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qanaqUKGIDtifdzEzJOHjEj4kYzA9uJC) |
+| <b>[Emotion FerPlus](https://github.com/onnx/models/tree/master/vision/body_analysis/emotion_ferplus)</b> | [Barsoum et al.](https://arxiv.org/abs/1608.01041) | Mô hình CNN chuyên sâu nhận diện cảm xúc được train trên các hình ảnh khuôn mặt.                                                                   | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1XHtBQGRhe58PDi4LGYJzYueWBeWbO23r) |
+
+### Máy Hiểu (Machine Comprehension)
+
+Một dạng của mô hình xử lý ngôn ngữ tự nhiên giúp trả lời câu hỏi trên một đoạn
+ngôn ngữ cung cấp.
+
+| Model Class                                                                                           | Tham khảo                                                                                                                           | Mô tả                                                                                             | Đường dẫn                                                                                                                                                [...]
+| ----------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- [...]
+| <b>[BERT-Squad](https://github.com/onnx/models/tree/master/text/machine_comprehension/bert-squad)</b> | [Devlin et al.](https://arxiv.org/pdf/1810.04805.pdf)                                                                               | Mô hình này trả lời câu hỏi dựa trên ngữ cảnh của đoạn văn đầu vào.                               | [![Mở trên Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kud-lUPjS_u-TkDAzihBTw0Vqr0FjCE-)  [...]
+| <b>[RoBERTa](https://github.com/onnx/models/tree/master/text/machine_comprehension/roberta)</b>       | [Devlin et al.](https://arxiv.org/pdf/1907.11692.pdf)                                                                               | Mô hình transformer-based kích thước lớn, dự đoán ngữ nghĩa dựa trên đoạn văn đầu vào.            | [![Mở trên Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1F-c4LJSx3Cb2jW6tP7f8nAZDigyLH6iN?u [...]
+| <b>[GPT-2](https://github.com/onnx/models/tree/master/text/machine_comprehension/gpt-2)</b>           | [Devlin et al.](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) | Mô hình ngôn ngữ transformer-based kích thước lớn, đưa ra một đoạn chữ, rồi dự đoán từ tiếp theo. | [![Mở trên Colab](https://colab.research.google.com/drive/1ZlXLSIMppPch6HgzKRillJiUcWn3PiK7?usp=sharing)                                                 [...]
+
+## Các toán tử (Operators) được hỗ trợ
+
+Chúng tôi hỗ trợ các toán tử sau:
+
+- Acos
+- Acosh
+- Add
+- And
+- Asin
+- Asinh
+- Atan
+- Atanh
+- AveragePool
+- BatchNormalization
+- Cast
+- Ceil
+- Clip
+- Concat
+- ConstantOfShape
+- Conv
+- Cos
+- Cosh
+- Div
+- Dropout
+- Elu
+- Equal
+- Erf
+- Expand
+- Flatten
+- Gather
+- Gemm
+- GlobalAveragePool
+- Greater
+- HardSigmoid
+- Identity
+- LeakyRelu
+- Less
+- Log
+- MatMul
+- Max
+- MaxPool
+- Mean
+- Min
+- Mul
+- Neg
+- NonZero
+- Not
+- OneHot
+- Or
+- Pad
+- Pow
+- PRelu
+- Reciprocal
+- ReduceMean
+- ReduceSum
+- Relu
+- Reshape
+- ScatterElements
+- Selu
+- Shape
+- Sigmoid
+- Sign
+- Sin
+- Sinh
+- Slice
+- Softmax
+- Softplus
+- Softsign
+- Split
+- Sqrt
+- Squeeze
+- Sub
+- Sum
+- Tan
+- Tanh
+- Tile
+- Transpose
+- Unsqueeze
+- Upsample
+- Where
+- Xor
+
+### Các lưu ý đặc biệt cho ONNX backend
+
+- Conv, MaxPool và AveragePool
+
+  Đầu vào phải có shape 1d (`N*C*H`) hoặc 2d (`N*C*H*W`), trong khi `dilation`
+  phải là 1.
+
+- BatchNormalization
+
+  `epsilon` là 1e-05 và không được đổi.
+
+- Cast
+
+  Chỉ hỗ trợ float32 và int32, các dạng khác phải được cast thành hai dạng này.
+
+- Squeeze và Unsqueeze
+
+  Nếu gặp lỗi khi dùng `Squeeze` hay `Unsqueeze` giữa `Tensor` và Scalar, vui
+  lòng báo cho chúng tôi.
+
+- Empty tensor: Empty tensor không được chấp nhận trong SINGA.
+
+## Thực hiện
+
+Mã code của SINGA ONNX được đặt trong `python/singa/sonnx.py`. Có bốn lớp
+(class) chính, `SingaFrontend`, `SingaBackend`, `SingaRep` và `SONNXModel`.
+`SingaFrontend` qui đổi mô hình SINGA model sang mô hình ONNX model;
+`SingaBackend` biến mô hình ONNX model sang đối tượng `SingaRep` giúp lưu trữ
+tất cả các toán tử SINGA operators và tensors(tensor trong văn bản này nghĩa là
+SINGA `Tensor`); `SingaRep` có thể chạy giống như mô hình SINGA model.
+`SONNXModel` tạo ra từ `model.Model` xác định thống nhất API cho SINGA.
+
+### SingaFrontend
+
+Hàm function đầu vào của `SingaFrontend` là `singa_to_onnx_model` cũng được gọi
+là `to_onnx`. `singa_to_onnx_model` tạo ra mô hình ONNX model, và nó cũng tạo ra
+một ONNX graph bằng việc sử dụng `singa_to_onnx_graph`.
+
+`singa_to_onnx_graph` chấp nhận đầu ra của mô hình, và lặp lại đệ quy graph của
+mô hình SINGA model từ đầu ra để gom tất cả toán tử tạo thành một hàng. Tensors
+đầu vào và trực tiếp, v.d, weights để train, của mô hình SINGA model được chọn
+cùng một lúc. Đầu vào được lưu trong `onnx_model.graph.input`; đầu ra được lưu
+trong `onnx_model.graph.output`; và weights để train được lưu trong
+`onnx_model.graph.initializer`.
+
+Sau đó toán tử SINGA operator trong hàng được đổi sang từng toán tử ONNX
+operators. `_rename_operators` xác định tên toán tử giữa SINGA và ONNX.
+`_special_operators` xác định function sử dụng để biến đổi toán tử.
+
+Thêm vào đó, một vài toán tử trong SINGA có các định nghĩa khác với ONNX, chẳng
+hạn như, ONNX coi một vài thuộc tính của toán tử SINGA operators là đầu vào, vì
+thế `_unhandled_operators` xác định function nào dùng để xử lý toán tử đặc biệt.
+
+Do dạng bool được coi là dạng int32 trong SINGA, `_bool_operators` định nghĩa
+toán tử có thể chuyển sang dạng bool.
+
+### SingaBackend
+
+Function đầu vào của `SingaBackend` là `prepare` kiểm tra phiên bản nào của mô
+hình ONNX model rồi gọi `_onnx_model_to_singa_ops`.
+
+Chức năng của `_onnx_model_to_singa_ops` là để lấy SINGA tensors và operators.
+Các tensors được lưu trong một thư viện, theo tên trong ONNX, và operators được
+lưu trong hàng ở dạng `namedtuple('SingaOps', ['node', 'operator'])`. Với mỗi
+toán tử operator, `node` là một ví dụ từ OnnxNode được dùng để lưu các thông tin
+cơ bản của ONNX node; `operator` là forward function cho toán tử SINGA;
+
+`_onnx_model_to_singa_ops` có bốn bước, đầu tiên là gọi
+`_parse_graph_params` để lấy tất cả các tensors lưu trong `params`. Sau đó gọi
+hàm `_parse_graph_inputs_outputs` để lấy tất cả thông tin đầu vào đầu ra lưu
+trong `inputs` và `outputs`. Cuối cùng nó lặp lại tất cả các nodes trong ONNX
+graph và đẩy sang `_onnx_node_to_singa_op` như SINGA operators hoặc layers và
+lưu chúng thành `outputs`. Một vài weights được lưu trong ONNX node gọi là
+`Constant`, SONNX có thể xử lý chúng bằng `_onnx_constant_to_np` để lưu trong
+`params`.
+
+Cuối cùng class này trả lại một đối tượng `SingaRep` và lưu trên `params`,
+`inputs`, `outputs`, `layers`.
+
+### SingaRep
+
+`SingaBackend` lưu tất cả SINGA tensors và operators. `run` chấp nhận đầu vào
+của mô hình và chạy từng toán tử SINGA operators một, theo hàng của toán tử.
+Người dùng có thể sử dụng `last_layers` để xoá mô hình model sau vài layers cuối
+cùng.
+
+### SONNXModel
+
+`SONNXModel` được tạo từ `sonnx.SONNXModel` và thực hiện phương pháp `forward`
+để cung cấp một API đồng bộ với các mô hình SINGA.
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/optimizer.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/optimizer.md
new file mode 100644
index 0000000..4116d4a
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/optimizer.md
@@ -0,0 +1,128 @@
+---
+id: version-4.0.0_Viet-optimizer
+title: Optimizer
+original_id: optimizer
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA hỗ trợ đa dạng các thuật toán tối ưu (optimizers) phổ biến bao gồm
+stochastic gradient descent với momentum, Adam, RMSProp, AdaGrad, v.v. Với
+mỗi thuật toán tối ưu, SINGA hỗ trợ sử dụng decay scheduler để lên kế hoạch
+learning rate áp dụng trong các epochs khác nhau. Các thuật toán tối ưu và
+decay schedulers có trong `singa/opt.py`.
+
+## Tạo thuật toán tối ưu
+
+1. SGD với momentum
+
+```python
+# xác định hyperparameter learning rate
+lr = 0.001
+# xác định hyperparameter momentum
+momentum = 0.9
+# xác định hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+sgd = opt.SGD(lr=lr, momentum=momentum, weight_decay=weight_decay)
+```
+
+2. RMSProp
+
+```python
+# xác định hyperparameter learning rate
+lr = 0.001
+# xác định hyperparameter rho
+rho = 0.9
+# xác định hyperparameter epsilon
+epsilon = 1e-8
+# xác định hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+sgd = opt.RMSProp(lr=lr, rho=rho, epsilon=epsilon, weight_decay=weight_decay)
+```
+
+3. AdaGrad
+
+```python
+# xác định hyperparameter learning rate
+lr = 0.001
+# xác định hyperparameter epsilon
+epsilon = 1e-8
+# xác định hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+sgd = opt.AdaGrad(lr=lr, epsilon=epsilon, weight_decay=weight_decay)
+```
+
+4. Adam
+
+```python
+# xác định hyperparameter learning rate
+lr = 0.001
+# xác định hyperparameter beta 1
+beta_1= 0.9
+# xác định hyperparameter beta 2
+beta_2 = 0.999
+# xác định hyperparameter epsilon
+epsilon = 1e-8
+# xác định hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+sgd = opt.Adam(lr=lr, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, weight_decay=weight_decay)
+```
+
+## Tạo Decay Scheduler
+
+```python
+from singa import opt
+
+# xác định learning rate ban đầu
+lr_init = 0.001
+# xác định rate của decay trong decay scheduler
+decay_rate = 0.95
+# xác định learning rate schedule có ở dạng staircase shape
+staircase=True
+# xác định bước decay của decay scheduler (trong ví dụ này lr giảm sau mỗi 2 bước)
+decay_steps = 2
+
+# tạo decay scheduler, schedule của lr trở thành lr_init * (decay_rate ^ (step // decay_steps) )
+lr = opt.ExponentialDecay(lr_init, decay_steps, decay_rate, staircase)
+# sử dụng lr để tạo một thuật toán tối ưu
+sgd = opt.SGD(lr=lr, momentum=0.9, weight_decay=0.0001)
+```
+
+## Sử dụng thuật toán tối ưu trong Model API
+
+Khi tạo mô hình model, cần đính kèm thuật toán tối ưu vào model.
+
+```python
+# tạo CNN sử dụng Model API
+model = CNN()
+
+# khởi tạo thuật toán tối ưu và đính vào model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+```
+
+Sau đó, khi gọi hàm model, chạy phương pháp `train_one_batch` để sử dụng thuật
+toán tối ưu.
+
+Do vậy, một ví dụ cho lặp lại loop để tối ưu hoá model là:
+
+```python
+for b in range(num_train_batch):
+    # tạo mini-batch tiếp theo
+    x, y = ...
+
+    # Copy dữ liệu vào tensors đầu vào
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    # Train với một batch
+    out, loss = model(tx, ty)
+```
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/security.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/security.md
new file mode 100644
index 0000000..50a184e
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/security.md
@@ -0,0 +1,10 @@
+---
+id: version-4.0.0_Viet-security
+title: Bảo Mật
+original_id: security
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Người dùng có thể báo cáo vấn đề bảo mật tới
+[Mail List của Nhóm SINGA Security](mailto:security@singa.apache.org)
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/software-stack.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/software-stack.md
new file mode 100644
index 0000000..a098a23
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/software-stack.md
@@ -0,0 +1,146 @@
+---
+id: version-4.0.0_Viet-software-stack
+title: Software Stack
+original_id: software-stack
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Cấu trúc phần mềm của SINGA bao gồm hai cấp độ chính, cấp thấp backend classes
+và cấp giao diện Python. Hình 1 mô tả chúng cùng với phần cứng. Cấu tạo backend
+cung cấp cấu trúc dữ liệu cơ bản cho các mô hình deep learning, khái quát phần
+cứng để kế hoạch và thực hiện các phép tính, trong khi thành phần communication
+dùng cho distributed training. Giao diện Python tập hợp cấu trúc dữ liệu CPP và
+cung cấp các classes cấp cao bổ sung cho việc train neural network, giúp tiện
+lợi hơn khi thực hiện các mô hình neural network phức tạp.
+
+Sau đây, chúng tôi giới thiệu cấu trúc phần mềm từ dưới lên.
+
+![Cấu trúc phần mềm SINGA V3](assets/singav3.1-sw.png) <br/> **Hình 1 - Cấu trúc
+phần mềm SINGA V3.**
+
+## Backend cấp thấp
+
+### Device
+
+Mỗi trường hợp `Device`, hay thiết bị, được tạo ra từ một thiết bị phần cứng,
+v.d GPU hay CPU. `Device` quản lý bộ nhớ của cấu trúc dữ liệu, và lên kế hoạch
+hoạt động cho việc thực hiện, v.d, trên CUDA streams hay CPU threads. Dựa trên
+phần cứng và ngôn ngữ lập trình của nó, SINGA sử dụng các loại device cụ thể
+sau:
+
+- **CudaGPU** tượng trưng cho cạc Nvidia GPU. Đơn vị sử dụng là CUDA streams.
+- **CppCPU** là một CPU thông thường. Đơn vị sử dụng là CPU threads.
+- **OpenclGPU** là cạc GPU thông thường cho cả Nvidia và AMD. Đơn vị sử dụng là
+  CommandQueues. Do OpenCL tương thích với rất nhiều thiết bị phần cứng, như
+  FPGA và ARM, OpenclGPU có thể phù hợp với các thiết bị phần cứng khác.
+
+### Tensor
+
+Hàm `Tensor` class là một array đa chiều, lưu trữ biến models, như hình ảnh đầu
+vào và bản đồ đặc tính của convolution layer. Mỗi hàm `Tensor` (v.d một tensor)
+được đặt trong một thiết bị, giúp quản lý bộ nhớ của tensor và lên kế hoạch
+(phép tính) việc thực hiện với mỗi tensor. Hầu hết phép toán trong machine
+learning có thể thể hiện (dày hoặc mỏng) dựa trên nghĩa và việc sử dụng tensor.
+Bởi vậy SINGA có thể chạy đa dạng nhiều mô hình, bao gồm deep learning và các mô
+hình machine learning truyền thống khác.
+
+### Hàm Operator
+
+Có hai dạng hàm operators cho tensors, đại số tuyến tính (linear algebra) như
+nhân ma trận (matrix multiplication), và các hàm riêng của neural network như
+convolution và pooling. Các hàm đại số tuyến tính được dùng như `Tensor`
+functions và được áp dụng riêng rẽ với các thiết bị phần cứng khác nhau.
+
+- CppMath (tensor_math_cpp.h) thực hiện hoạt động tensor sử dụng Cpp cho CppCPU
+- CudaMath (tensor_math_cuda.h) thực hiện hoạt động tensor sử dụng CUDA cho
+  CudaGPU
+- OpenclMath (tensor_math_opencl.h) thực hiện hoạt động tensor sử dụng OpenCL
+  cho OpenclGPU
+
+Các toán tử neural network cụ thể cũng được sử dụng riêng rẽ như:
+
+- GpuConvForward (convolution.h) thực hiện hàm forward function của convolution
+  qua CuDNN trên Nvidia GPU.
+- CpuConvForward (convolution.h) thực hiện hàm forward function của convolution
+  qua CPP trên CPU.
+
+Thông thường, người dùng tạo một hàm `Device` và sử dụng nó để tạo ra các hàm
+`Tensor`. Khi gọi chức năng Tensor hoặc dùng trong neural network, việc thực
+hiện tương ứng cho thiết bị sử dụng sẽ được gọi. Nói cách khác, việc áp dụng các
+toán tử là trong suốt (transparent) với người dùng.
+
+Việc dùng Tensor và Device có thể được áp dụng rộng hơn cho đa dạng thiết bị
+phần cứng sử dụng các ngôn ngữ lập trình khác nhau. Mỗi thiết bị phần cứng mới
+sẽ được hỗ trợ bằng cách
+thêm một Device subclass mới và việc áp dụng tương ứng với các toán tử
+operators.
+
+Tối ưu hoá cho tốc độ và bộ nhớ được thực hiện bởi `Scheduler` và `MemPool` của
+`Device`. Ví dụ, `Scheduler` tạo ra một [computational graph](./graph) dựa theo
+thư viện chương trình của toán tử operators. Sau đó nó có thể tối ưu lệnh thực
+hiện của toán tử trong bộ nhớ chia sẻ và song song.
+
+### Communicator
+
+`Communicator` là để hỗ trợ [distributed training](./dist-train). Nó áp dụng
+communication protocols sử dụng sockets, MPI và NCCL. Thường người dùng chỉ cần
+gọi APIs cấp cao như `put()` và `get()` để gửi và nhận tensors. Tối ưu hoá
+Communication cho cấu trúc liên kết, kích cỡ tin nhắn, v.v được thực hiện nội
+bộ.
+
+## Giao diện Python
+
+Tất cả thành phần backend được thể hiện dạng Python modules thông qua SWIG. Thêm
+vào đó, các classes sau được thêm vào để hỗ trợ việc áp dụng cho các networks
+phức tạp.
+
+### Opt
+
+`Opt` và các lớp con áp dụng phương pháp (như SGD) để cập nhật các giá trị tham
+số model sử dụng tham số gradients. Một lớp con [DistOpt](./dist-train) đồng bộ
+gradients qua các workers trong distributed training bằng cách gọi phương pháp
+từ `Communicator`.
+
+### Hàm Operator
+
+Hàm `Operator` gói nhiều functions khác nhau sử dụng toán tử Tensor hoặc neural
+network từ backend. Ví dụ, hàm forward function và backward function `ReLU` tạo
+ra toán tử `ReLU` operator.
+
+### Layer
+
+`Layer` và các lớp con gói các toán tử operators bằng tham số. Ví dụ,
+convolution và linear operators có tham số weight và bias parameters. Tham số
+được duy trì bởi các lớp `Layer` tương ứng.
+
+### Autograd
+
+[Autograd](./autograd) sử dụng
+[reverse-mode automatic differentiation](https://rufflewind.com/2016-12-30/reverse-mode-automatic-differentiation)
+bằng cách ghi nhớ hoạt động của hàm forward functions của các toán tử rồi tự
+động gọi hàm backward functions ở chiều ngược lại. Tất cả các hàm functions có
+thể được hỗ trợ bởi `Scheduler` để tạo ra một [computational graph](./graph)
+nhằm tối ưu hoá hiệu quả và bộ nhớ.
+
+### Model
+
+[Model](./graph) cung cấp giao diện cơ bản để thực hiện các mô hình models mới.
+Bạn chỉ cần dùng `Model` và định nghĩa việc thực hiện forward propagation của
+model bằng cách tạo và gọi các layers của toán tử. `Model` sẽ thực hiện autograd
+và tự động cập nhật tham số thông qua `Opt` trong khi dữ liệu để training được
+bơm vào đó. Với `Model` API, SINGA có nhiều lợi thế trong cả lập trình mệnh lệnh
+và lập trình khai báo. Người dùng sử dụng một hệ thống sử dụng [Model](./graph)
+API theo dạng lập trình mệnh lệnh như PyTorch. Khác với PyTorch tạo lại phép
+thực thi operations ở mỗi vòng lặp, SINGA hỗ trợ phép thực thi qua cách tạo một
+computational graph hàm súc (khi tính năng này được sử dụng) sau vòng lặp đầu
+tiên. Graph tương tự như đã được tạo bởi các thư viện sử dụng lập trình khai
+báo, như TensorFlow. Vì thế, SINGA có thể áp dụng các kĩ thuật tối ưu hoá bộ nhớ
+và tốc độ qua computational graph.
+
+### ONNX
+
+Để hỗ trợ ONNX, SINGA áp dụng một [sonnx](./onnx) module, bao gồm:
+
+- SingaFrontend để lưu SINGA model ở định dạng onnx.
+- SingaBackend để tải model định dạng onnx sang SINGA cho training và inference.
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/source-repository.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/source-repository.md
new file mode 100644
index 0000000..dd5385b
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/source-repository.md
@@ -0,0 +1,24 @@
+---
+id: version-4.0.0_Viet-source-repository
+title: Source Repository
+original_id: source-repository
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Dự án này sử dụng [Git](http://git-scm.com/) để quản lý nguồn mã code (source
+code). Hướng dẫn sử dụng Git xem tại http://git-scm.com/documentation .
+
+## Repository
+
+Các đường dẫn sau tới repository của nguồn code online.
+
+- https://gitbox.apache.org/repos/asf?p=singa.git
+
+Bản Github mirror tại
+
+- https://github.com/apache/singa
+
+Mã code có thể cloned từ cả hai repo, vd.,
+
+    git clone https://github.com/apache/singa.git
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/team-list.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/team-list.md
new file mode 100644
index 0000000..34f62eb
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/team-list.md
@@ -0,0 +1,60 @@
+---
+id: version-4.0.0_Viet-team-list
+title: Danh sách nhân sự SINGA
+original_id: team-list
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Một dự án thành công là công sức của rất nhiều người tham gia. Một số thành viên
+viết code hoặc tài liệu, trong khi những người khác góp công sức quí báu vào thử
+nghiệm, đề xuất sửa chữa và gợi ý.
+
+Cộng đồng SINGA bao gồm các lập trình viên đa phần đến từ NUS, ĐH Chiết Giang,
+NetEase, ĐH Osaka, yzBigData, v.v
+
+## PMC
+
+| Tên           | Email                   | Tổ Chức                                       |
+| ------------- | ----------------------- | --------------------------------------------- |
+| Gang Chen     | cg@apache.org           | Zhejiang University                           |
+| Anh Dinh      | dinhtta@apache.org      | Singapore University of Technology and Design |
+| Ted Dunning   | tdunning@apache.org     | Apache Software Foundation                    |
+| Jinyang Gao   | jinyang@apache.org      | DAMO Academy, Alibaba Group                   |
+| Alan Gates    | gates@apache.org        | Apache Software Foundation                    |
+| Zhaojing Luo  | zhaojing@apache.org     | National University of Singapore              |
+| Thejas Nair   | thejas@apache.org       | Apache Software Foundation                    |
+| Beng Chin Ooi | ooibc@apache.org        | National University of Singapore              |
+| Moaz Reyad    | moaz@apache.org         | Université Grenoble Alpes                     |
+| Kian-Lee Tan  | tankianlee@apache.org   | National University of Singapore              |
+| Sheng Wang    | wangsh@apache.org       | DAMO Academy, Alibaba Group                   |
+| Wei Wang      | wangwei@apache.org      | National University of Singapore              |
+| Zhongle Xie   | zhongle@apache.org      | National University of Singapore              |
+| Sai Ho Yeung  | chrishkchris@apache.org | National University of Singapore              |
+| Meihui Zhang  | meihuizhang@apache.org  | Beijing Institute of Technology               |
+| Kaiping Zheng | kaiping@apache.org      | National University of Singapore              |
+
+## Committers
+
+| Tên          | Email                  | Tổ Chức                                       |
+| ------------ | ---------------------- | --------------------------------------------- |
+| Xiangrui Cai | caixr@apache.org       | Nankai University                             |
+| Chonho Lee   | chonho@apache.org      | Osaka University                              |
+| Shicong Lin  | shicong@apache.org     | National University of Singapore              |
+| Rulin Xing   | rulin@apache.org       | Huazhong University of Science and Technology |
+| Wanqi Xue    | xuewanqi@apache.org    | Nanyang Technological University              |
+| Joddiy Zhang | joddiyzhang@apache.org | National University of Singapore              |
+
+## Contributors
+
+| Tên                | Email                        | Tổ Chức                          |
+| ------------------ | ---------------------------- | -------------------------------- |
+| Haibo Chen         | hzchenhaibo@corp.netease.com | NetEase                          |
+| Shicheng Chen      | chengsc@comp.nus.edu.sg      | National University of Singapore |
+| Xin Ji             | vincent.j.xin@gmail.com      | Visenze, Singapore               |
+| Anthony K. H. Tung | atung@comp.nus.edu.sg        | National University of Singapore |
+| Ji Wang            | wangji@mzhtechnologies.com   | Hangzhou MZH Technologies        |
+| Yuan Wang          | wangyuan@corp.netease.com    | NetEase                          |
+| Wenfeng Wu         | dcswuw@gmail.com             | Freelancer, China                |
+| Kaiyuan Yang       | yangky@comp.nus.edu.sg       | National University of Singapore |
+| Chang Yao          | yaochang2009@gmail.com       | Hangzhou MZH Technologies        |
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/tensor.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/tensor.md
new file mode 100644
index 0000000..c30b3aa
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/tensor.md
@@ -0,0 +1,283 @@
+---
+id: version-4.0.0_Viet-tensor
+title: Tensor
+original_id: tensor
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+Mỗi thực thể Tensor instance là một array đa chiều được đặt trong một thực thể
+Device. Thực thể Tensor lưu các biến và cung cấp phép tính đại số tuyến tính cho
+các loại thiết bị phần cứng khác nhau mà không cần người dùng để ý. Lưu ý rằng
+người dùng cần đảm bảo các toán hạng tensor được đặt ở cùng một thiết bị ngoại
+trừ hàm copy.
+
+## Cách sử dụng Tensor
+
+### Tạo Tensor
+
+```python
+>>> import numpy as np
+>>> from singa import tensor
+>>> tensor.from_numpy( np.asarray([[1, 0, 0], [0, 1, 0]], dtype=np.float32) )
+[[1. 0. 0.]
+ [0. 1. 0.]]
+```
+
+### Chuyển sang numpy
+
+```python
+>>> a = np.asarray([[1, 0, 0], [0, 1, 0]], dtype=np.float32)
+>>> tensor.from_numpy(a)
+[[1. 0. 0.]
+ [0. 1. 0.]]
+>>> tensor.to_numpy(tensor.from_numpy(a))
+array([[1., 0., 0.],
+       [0., 1., 0.]], dtype=float32)
+```
+
+### Phương pháp Tensor
+
+```python
+>>> t = tensor.from_numpy(a)
+>>> t.transpose([1,0])
+[[1. 0.]
+ [0. 1.]
+ [0. 0.]]
+```
+
+biến đổi `tensor` tới 6 chiều
+
+```python
+>>> a = tensor.random((2,3,4,5,6,7))
+>>> a.shape
+(2, 3, 4, 5, 6, 7)
+>>> a.reshape((2,3,4,5,7,6)).transpose((3,2,1,0,4,5)).shape
+(5, 4, 3, 2, 7, 6)
+```
+
+### Phương pháp số học Tensor
+
+`tensor` được đánh giá trong thời gian thực.
+
+```python
+>>> t + 1
+[[2. 1. 1.]
+ [1. 2. 1.]]
+>>> t / 5
+[[0.2 0.  0. ]
+ [0.  0.2 0. ]]
+```
+
+Phép số học `tensor` với broadcasting:
+
+```python
+>>> a
+[[1. 2. 3.]
+ [4. 5. 6.]]
+>>> b
+[[1. 2. 3.]]
+>>> a + b
+[[2. 4. 6.]
+ [5. 7. 9.]]
+>>> a * b
+[[ 1.  4.  9.]
+ [ 4. 10. 18.]]
+>>> a / b
+[[1.  1.  1. ]
+ [4.  2.5 2. ]]
+>>> a/=b # inplace operation
+>>> a
+[[1.  1.  1. ]
+ [4.  2.5 2. ]]
+```
+
+`tensor` broadcasting trong phép nhân ma trận (GEMM)
+
+```python
+>>> from singa import tensor
+>>> a = tensor.random((2,2,2,3))
+>>> b = tensor.random((2,3,4))
+>>> tensor.mult(a,b).shape
+(2, 2, 2, 4)
+```
+
+### Hàm lập trình Tensor Functions
+
+Hàm Functions trong `singa.tensor` tạo ra đối tượng `tensor` mới sau khi áp dụng
+phép tính trong hàm function.
+
+```python
+>>> tensor.log(t+1)
+[[0.6931472 0.        0.       ]
+ [0.        0.6931472 0.       ]]
+```
+
+### Tensor ở các thiết bị khác nhau
+
+`tensor` được tạo ra trên máy chủ (CPU) từ ban đầu; nó cũng được tạo ra trên các
+thiết bị phần cứng khác nhau bằng cách cụ thể hoá `device`. Một `tensor` có thể
+chuyển giữa `device`s qua hàm `to_device()` function.
+
+```python
+>>> from singa import device
+>>> x = tensor.Tensor((2, 3), device.create_cuda_gpu())
+>>> x.gaussian(1,1)
+>>> x
+[[1.531889   1.0128608  0.12691343]
+ [2.1674204  3.083676   2.7421203 ]]
+>>> # move to host
+>>> x.to_device(device.get_default_device())
+```
+
+### Dùng Tensor để train MLP
+
+```python
+
+"""
+  Đoạn mã trích từ examples/mlp/module.py
+"""
+
+label = get_label()
+data = get_data()
+
+dev = device.create_cuda_gpu_on(0)
+sgd = opt.SGD(0.05)
+
+# định nghĩa tensor cho dữ liệu và nhãn đầu vào
+tx = tensor.Tensor((400, 2), dev, tensor.float32)
+ty = tensor.Tensor((400,), dev, tensor.int32)
+model = MLP(data_size=2, perceptron_size=3, num_classes=2)
+
+# đính model vào graph
+model.set_optimizer(sgd)
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
+model.train()
+
+for i in range(1001):
+    tx.copy_from_numpy(data)
+    ty.copy_from_numpy(label)
+    out, loss = model(tx, ty, 'fp32', spars=None)
+
+    if i % 100 == 0:
+        print("training loss = ", tensor.to_numpy(loss)[0])
+```
+
+Đầu ra:
+
+```bash
+$ python3 examples/mlp/module.py
+training loss =  0.6158037
+training loss =  0.52852553
+training loss =  0.4571422
+training loss =  0.37274635
+training loss =  0.30146334
+training loss =  0.24906921
+training loss =  0.21128304
+training loss =  0.18390492
+training loss =  0.16362564
+training loss =  0.148164
+training loss =  0.13589878
+```
+
+## Áp dụng Tensor
+
+Mục trước chỉ ra cách sử dụng chung của `Tensor`, việc áp dụng cụ thể được đưa
+ra sau đây. Đầu tiên, sẽ giới thiệu việc thiết lập tensors Python và C++. Phần
+sau sẽ nói về cách frontend (Python) và backend (C++) kết nối với nhau và cách
+để mở rộng chúng.
+
+### Python Tensor
+
+`Tensor` của lớp Python, được định nghĩa trong `python/singa/tensor.py`, cung
+cấp cách dùng tensor ở tầng cao, để thực hiện việc vận hành deep learning (qua
+[autograd](./autograd)), cũng như là quản lý dữ liệu bởi người dùng cuối.
+
+Hoạt động cơ bản của nó là gói xung quanh các phương pháp C++ tensor, cả phương
+pháp số học (như `sum`) và không số học (như `reshape`). Một vài phép số học cao
+cấp về sau được giới thiệu và áp dụng sử dụng thuần Python tensor API, như
+`tensordot`. Python Tensor APIs có thể sử dụng để thực hiện dễ dàng các phép
+tính neural network phức tạp với các phương pháp methods linh hoạt có sẵn.
+
+### C++ Tensor
+
+`Tensor` lớp C++, được định nghĩa trong `include/singa/core/tensor.h`, về cơ bản
+quản lý bộ nhớ nắm giữ dữ liệu, và cung cấp APIs tầm thấp cho các hàm thực hiện
+tensor. Đồng thời nó cũng cung cấp các phương pháp số học đa dạng (như `matmul`)
+bằng cách gói các chương trình backends khác nhau (CUDA, BLAS, cuBLAS, v.v.).
+
+#### Văn bản thực hiện và Khoá Bộ nhớ
+
+Hai khái niệm hay cấu trúc dữ liệu quan trọng của `Tensor` là việc áp dụng
+`device`, và khoá bộ nhớ `Block`.
+
+Mỗi `Tensor` được lưu theo nghĩa đen và quản lý bởi một thiết bị phần cứng, thể
+hiện theo nghĩa thực hành (CPU, GPU). Tính toán Tensor được thực hiện trên thiết
+bị.
+
+Dữ liệu Tensor trong hàm `Block`, được định nghĩa trong
+`include/singa/core/common.h`. `Block` chứa dữ liệu cơ sở, trong khi tensors
+chịu trách nhiệm về lý lịch dữ liệu metadata mô tả tensor, như `shape`,
+`strides`.
+
+#### Tensor Math Backends
+
+Để tận dụng các thư viện chương trình toán hiệu quả cung cấp bởi backend từ các
+thiết bị phần cứng khác nhau, SINGA cung cấp một bộ Tensor functions cho mỗi
+backend được hỗ trợ.
+
+- 'tensor_math_cpp.h' áp dụng vận hành sử dụng Cpp (với CBLAS) cho thiết bị
+  CppCPU.
+- 'tensor_math_cuda.h' áp dụng vận hành sử dụng Cuda (với cuBLAS) cho thiết bị
+  CudaGPU.
+- 'tensor_math_opencl.h' áp dụng vận hành sử dụng OpenCL cho thiết bị OpenclGPU.
+
+### Trình bày C++ APIs qua Python
+
+[SWIG](http://www.swig.org/) là công cụ có thể tự động qui đổi C++ APIs sang
+Python APIs. SINGA sử dụng SWIG để trình bày C++ APIs sang Python. Một vài tệp
+tin được tạo bởi SWIG, bao gồm `python/singa/singa_wrap.py`. Các Python mô-đun
+(như, `tensor`, `device` và `autograd`) nhập mô-đun để gọi C++ APIs để áp dụng
+hàm và lớp Python.
+
+```python
+import tensor
+
+t = tensor.Tensor(shape=(2, 3))
+```
+
+Ví dụ, khi một Python `Tensor` instance được tạo ra ở trên, việc áp dụng
+`Tensor` class tạo ra một instance của `Tensor` class định nghĩa trong
+`singa_wrap.py`, tương ứng với C++ `Tensor` class. Rõ ràng hơn, `Tensor` class
+trong `singa_wrap.py` để chỉ `CTensor` trong `tensor.py`.
+
+```python
+# trong tensor.py
+from . import singa_wrap as singa
+
+CTensor = singa.Tensor
+```
+
+### Tạo Hàm Tensor Functions mới
+
+Với nền tảng được mô tả phía trên, mở rộng hàm tensor functions có thể dễ dàng
+thực hiện từ dưới lên. Với các phép toán, các bước làm như sau:
+
+- Khai báo API mới cho `tensor.h`
+- Tạo mã code sử dụng tiền tố xác định trước trong `tensor.cc`, lấy
+  `GenUnaryTensorFn(Abs);` làm ví dụ.
+- Khai báo theo mẫu method/function trong `tensor_math.h`
+- Thực hiện áp dụng ít nhất cho CPU (`tensor_math_cpp.h`) và
+  GPU(`tensor_math_cuda.h`)
+- Trình API qua SWIG bằng cách thêm nó vào `src/api/core_tensor.i`
+- Định nghĩa Python Tensor API trong `tensor.py` bằng cách tự động gọi hàm
+  function được tạo trong `singa_wrap.py`
+- Viết unit tests khi phù hợp
+
+## Python API
+
+_đang cập nhật_
+
+## CPP API
+
+_đang cập nhật_
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/time-profiling.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/time-profiling.md
new file mode 100644
index 0000000..bcc7912
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/time-profiling.md
@@ -0,0 +1,164 @@
+---
+id: version-4.0.0_Viet-time-profiling
+title: Time Profiling
+original_id: time-profiling
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+SINGA hỗ trợ hồ sơ thời gian (time profiler) của mỗi toán tử được lưu tạm thời
+trong graph. Để tận dụng chức năng hồ sơ thời gian, chúng tôi trước tiên gọi
+method `device.SetVerbosity` để tạo độ dài cho hồ sơ thời gian, sau đó gọi hàm
+`device.PrintTimeProfiling` để in ra kết quả của hồ sơ thời gian.
+
+## Thiết lập Độ dài cho Hồ sơ thời gian
+
+Để sử dụng chức năng hồ sơ thời gian, bạn cần tạo độ dài cho nó. Có ba mức độ.
+Giá trị ban đầu đặt sẵn `verbosity == 0`, là không áp dụng hồ sơ thời gian. Khi
+bạn để `verbosity == 1`, nó sẽ lên hồ sơ thời gian chạy forward và backward. Khi
+`verbosity == 2`, nó sẽ lên hồ sơ thời gian cho mỗi buffered operation trong
+graph.
+
+Sau đây là mã code ví dụ để thiết lập chức năng hồ sơ thời gian:
+
+```python
+# tạo thiết bị
+from singa import device
+dev = device.create_cuda_gpu()
+# đặt độ dài
+verbosity = 2
+dev.SetVerbosity(verbosity)
+# không bắt buộc: bỏ qua 5 vòng lặp đầu tiên khi lên hồ sơ thời gian
+dev.SetSkipIteration(5)
+```
+
+Tiếp theo, sau khi kết thúc training ở cuối mỗi chương trình, chúng ta có thể in
+kết quả hồ sơ thời gian bằng cách gọi method `device.PrintTimeProfiling`:
+
+```python
+dev.PrintTimeProfiling()
+```
+
+## Ví dụ đầu ra cho các độ dài khác nhau
+
+Có thể chạy
+[ví dụ](https://github.com/apache/singa/blob/master/examples/cnn/benchmark.py)
+ResNet để xem kết quả với cách đặt độ dài khác nhau:
+
+1. `verbosity == 1`
+
+```
+Time Profiling:
+Forward Propagation Time : 0.0409127 sec
+Backward Propagation Time : 0.114813 sec
+```
+
+2. `verbosity == 2`
+
+```
+Time Profiling:
+OP_ID0. SetValue : 1.73722e-05 sec
+OP_ID1. cudnnConvForward : 0.000612724 sec
+OP_ID2. GpuBatchNormForwardTraining : 0.000559449 sec
+OP_ID3. ReLU : 0.000375004 sec
+OP_ID4. GpuPoolingForward : 0.000240041 sec
+OP_ID5. SetValue : 3.4176e-06 sec
+OP_ID6. cudnnConvForward : 0.000115619 sec
+OP_ID7. GpuBatchNormForwardTraining : 0.000150415 sec
+OP_ID8. ReLU : 9.95494e-05 sec
+OP_ID9. SetValue : 3.22432e-06 sec
+OP_ID10. cudnnConvForward : 0.000648668 sec
+OP_ID11. GpuBatchNormForwardTraining : 0.000149793 sec
+OP_ID12. ReLU : 9.92118e-05 sec
+OP_ID13. SetValue : 3.37728e-06 sec
+OP_ID14. cudnnConvForward : 0.000400953 sec
+OP_ID15. GpuBatchNormForwardTraining : 0.000572181 sec
+OP_ID16. SetValue : 3.21312e-06 sec
+OP_ID17. cudnnConvForward : 0.000398698 sec
+OP_ID18. GpuBatchNormForwardTraining : 0.00056836 sec
+OP_ID19. Add : 0.000542246 sec
+OP_ID20. ReLU : 0.000372783 sec
+OP_ID21. SetValue : 3.25312e-06 sec
+OP_ID22. cudnnConvForward : 0.000260731 sec
+OP_ID23. GpuBatchNormForwardTraining : 0.000149041 sec
+OP_ID24. ReLU : 9.9072e-05 sec
+OP_ID25. SetValue : 3.10592e-06 sec
+OP_ID26. cudnnConvForward : 0.000637481 sec
+OP_ID27. GpuBatchNormForwardTraining : 0.000152577 sec
+OP_ID28. ReLU : 9.90518e-05 sec
+OP_ID29. SetValue : 3.28224e-06 sec
+OP_ID30. cudnnConvForward : 0.000404586 sec
+OP_ID31. GpuBatchNormForwardTraining : 0.000569679 sec
+OP_ID32. Add : 0.000542291 sec
+OP_ID33. ReLU : 0.00037211 sec
+OP_ID34. SetValue : 3.13696e-06 sec
+OP_ID35. cudnnConvForward : 0.000261219 sec
+OP_ID36. GpuBatchNormForwardTraining : 0.000148281 sec
+OP_ID37. ReLU : 9.89299e-05 sec
+OP_ID38. SetValue : 3.25216e-06 sec
+OP_ID39. cudnnConvForward : 0.000633644 sec
+OP_ID40. GpuBatchNormForwardTraining : 0.000150711 sec
+OP_ID41. ReLU : 9.84902e-05 sec
+OP_ID42. SetValue : 3.18176e-06 sec
+OP_ID43. cudnnConvForward : 0.000402752 sec
+OP_ID44. GpuBatchNormForwardTraining : 0.000571523 sec
+OP_ID45. Add : 0.000542435 sec
+OP_ID46. ReLU : 0.000372539 sec
+OP_ID47. SetValue : 3.24672e-06 sec
+OP_ID48. cudnnConvForward : 0.000493054 sec
+OP_ID49. GpuBatchNormForwardTraining : 0.000293142 sec
+OP_ID50. ReLU : 0.000190047 sec
+OP_ID51. SetValue : 3.14784e-06 sec
+OP_ID52. cudnnConvForward : 0.00148837 sec
+OP_ID53. GpuBatchNormForwardTraining : 8.34794e-05 sec
+OP_ID54. ReLU : 5.23254e-05 sec
+OP_ID55. SetValue : 3.40096e-06 sec
+OP_ID56. cudnnConvForward : 0.000292971 sec
+OP_ID57. GpuBatchNormForwardTraining : 0.00029174 sec
+OP_ID58. SetValue : 3.3248e-06 sec
+OP_ID59. cudnnConvForward : 0.000590154 sec
+OP_ID60. GpuBatchNormForwardTraining : 0.000294149 sec
+OP_ID61. Add : 0.000275119 sec
+OP_ID62. ReLU : 0.000189268 sec
+OP_ID63. SetValue : 3.2704e-06 sec
+OP_ID64. cudnnConvForward : 0.000341232 sec
+OP_ID65. GpuBatchNormForwardTraining : 8.3304e-05 sec
+OP_ID66. ReLU : 5.23667e-05 sec
+OP_ID67. SetValue : 3.19936e-06 sec
+OP_ID68. cudnnConvForward : 0.000542484 sec
+OP_ID69. GpuBatchNormForwardTraining : 8.60537e-05 sec
+OP_ID70. ReLU : 5.2479e-05 sec
+OP_ID71. SetValue : 3.41824e-06 sec
+OP_ID72. cudnnConvForward : 0.000291295 sec
+OP_ID73. GpuBatchNormForwardTraining : 0.000292795 sec
+OP_ID74. Add : 0.000274438 sec
+OP_ID75. ReLU : 0.000189689 sec
+OP_ID76. SetValue : 3.21984e-06 sec
+OP_ID77. cudnnConvForward : 0.000338776 sec
+OP_ID78. GpuBatchNormForwardTraining : 8.484e-05 sec
+OP_ID79. ReLU : 5.29408e-05 sec
+OP_ID80. SetValue : 3.18208e-06 sec
+OP_ID81. cudnnConvForward : 0.000545542 sec
+OP_ID82. GpuBatchNormForwardTraining : 8.40976e-05 sec
+OP_ID83. ReLU : 5.2256e-05 sec
+OP_ID84. SetValue : 3.36256e-06 sec
+OP_ID85. cudnnConvForward : 0.000293003 sec
+OP_ID86. GpuBatchNormForwardTraining : 0.0002989 sec
+OP_ID87. Add : 0.000275041 sec
+OP_ID88. ReLU : 0.000189867 sec
+OP_ID89. SetValue : 3.1184e-06 sec
+OP_ID90. cudnnConvForward : 0.000340417 sec
+OP_ID91. GpuBatchNormForwardTraining : 8.39395e-05 sec
+OP_ID92. ReLU : 5.26544e-05 sec
+OP_ID93. SetValue : 3.2336e-06 sec
+OP_ID94. cudnnConvForward : 0.000539787 sec
+OP_ID95. GpuBatchNormForwardTraining : 8.2753e-05 sec
+OP_ID96. ReLU : 4.86758e-05 sec
+OP_ID97. SetValue : 3.24384e-06 sec
+OP_ID98. cudnnConvForward : 0.000287108 sec
+OP_ID99. GpuBatchNormForwardTraining : 0.000293127 sec
+OP_ID100. Add : 0.000269478 sec
+.
+.
+.
+```
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-cpu-dev.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-cpu-dev.md
new file mode 100644
index 0000000..5357e64
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-cpu-dev.md
@@ -0,0 +1,13 @@
+---
+id: version-4.0.0_Viet-wheel-cpu-dev
+title: Dùng cho CPU (phiên bản dev)
+original_id: wheel-cpu-dev
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## 3.0.0.dev200720
+
+- [Python 3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720-cp36-cp36m-manylinux2014_x86_64.whl)
+- [Python 3.7](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720-cp37-cp37m-manylinux2014_x86_64.whl)
+- [Python 3.8](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720-cp38-cp38-manylinux2014_x86_64.whl)
diff --git a/docs-site/docs/wheel-cpu.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-cpu.md
similarity index 66%
copy from docs-site/docs/wheel-cpu.md
copy to docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-cpu.md
index 295fced..56c80e0 100644
--- a/docs-site/docs/wheel-cpu.md
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-cpu.md
@@ -1,22 +1,11 @@
 ---
-id: wheel-cpu
-title: CPU only
+id: version-4.0.0_Viet-wheel-cpu
+title: Dùng cho CPU
+original_id: wheel-cpu
 ---
 
 <!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
 
-## 3.3.0
-
-- [Python 3.6](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0-cp36-cp36m-manylinux2014_x86_64.whl)
-- [Python 3.7](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0-cp37-cp37m-manylinux2014_x86_64.whl)
-- [Python 3.8](https://singa-wheel.s3.ap-southeast-1.amazonaws.com/singa-3.3.0-cp38-cp38-manylinux2014_x86_64.whl)
-
-## 3.2.0
-
-- [Python 3.6](https://www.comp.nus.edu.sg/~dbsystem/wheelhouse/singa-3.2.0-cp36-cp36m-manylinux2014_x86_64.whl)
-- [Python 3.7](https://www.comp.nus.edu.sg/~dbsystem/wheelhouse/singa-3.2.0-cp37-cp37m-manylinux2014_x86_64.whl)
-- [Python 3.8](https://www.comp.nus.edu.sg/~dbsystem/wheelhouse/singa-3.2.0-cp38-cp38-manylinux2014_x86_64.whl)
-
 ## 3.1.0
 
 - [Python 3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.1.0-cp36-cp36m-manylinux2014_x86_64.whl)
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-gpu-dev.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-gpu-dev.md
new file mode 100644
index 0000000..e751946
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-gpu-dev.md
@@ -0,0 +1,13 @@
+---
+id: version-4.0.0_Viet-wheel-gpu-dev
+title: Sử Dụng CUDA (phiên bản dev)
+original_id: wheel-gpu-dev
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## 3.0.0.dev200720
+
+- [CUDA10.2, cuDNN 7.6.5, Python 3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.7](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720%2Bcuda10.2-cp37-cp37m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.8](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0.dev200720%2Bcuda10.2-cp38-cp38-manylinux2014_x86_64.whl)
diff --git a/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-gpu.md b/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-gpu.md
new file mode 100644
index 0000000..6d109c4
--- /dev/null
+++ b/docs-site/website/versioned_docs/version-4.0.0_Viet/wheel-gpu.md
@@ -0,0 +1,22 @@
+---
+id: version-4.0.0_Viet-wheel-gpu
+title: Sử Dụng CUDA
+original_id: wheel-gpu
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agree [...]
+
+## 3.1.0
+
+- [CUDA10.2, cuDNN 7.6.5, Python
+  3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.1.0%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python
+  3.7](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.1.0%2Bcuda10.2-cp37-cp37m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python
+  3.8](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.1.0%2Bcuda10.2-cp38-cp38-manylinux2014_x86_64.whl)
+
+## 3.0.0
+
+- [CUDA10.2, cuDNN 7.6.5, Python 3.6](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0%2Bcuda10.2-cp36-cp36m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.7](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0%2Bcuda10.2-cp37-cp37m-manylinux2014_x86_64.whl)
+- [CUDA10.2, cuDNN 7.6.5, Python 3.8](https://singa-wheel.s3-ap-southeast-1.amazonaws.com/singa-3.0.0%2Bcuda10.2-cp38-cp38-manylinux2014_x86_64.whl)
diff --git a/docs-site/website/versioned_sidebars/version-4.0.0_Chinese-sidebars.json b/docs-site/website/versioned_sidebars/version-4.0.0_Chinese-sidebars.json
new file mode 100644
index 0000000..b6d4ce6
--- /dev/null
+++ b/docs-site/website/versioned_sidebars/version-4.0.0_Chinese-sidebars.json
@@ -0,0 +1,44 @@
+{
+  "version-4.0.0_Chinese-docs": {
+    "Getting Started": [
+      "version-4.0.0_Chinese-installation",
+      "version-4.0.0_Chinese-software-stack",
+      "version-4.0.0_Chinese-examples"
+    ],
+    "Guides": [
+      "version-4.0.0_Chinese-device",
+      "version-4.0.0_Chinese-tensor",
+      "version-4.0.0_Chinese-autograd",
+      "version-4.0.0_Chinese-optimizer",
+      "version-4.0.0_Chinese-graph",
+      "version-4.0.0_Chinese-onnx",
+      "version-4.0.0_Chinese-dist-train",
+      "version-4.0.0_Chinese-time-profiling",
+      "version-4.0.0_Chinese-half-precision"
+    ],
+    "Development": [
+      "version-4.0.0_Chinese-downloads",
+      "version-4.0.0_Chinese-build",
+      "version-4.0.0_Chinese-contribute-code",
+      "version-4.0.0_Chinese-contribute-docs",
+      "version-4.0.0_Chinese-how-to-release",
+      "version-4.0.0_Chinese-git-workflow"
+    ]
+  },
+  "version-4.0.0_Chinese-community": {
+    "Community": [
+      "version-4.0.0_Chinese-source-repository",
+      "version-4.0.0_Chinese-mail-lists",
+      "version-4.0.0_Chinese-issue-tracking",
+      "version-4.0.0_Chinese-security",
+      "version-4.0.0_Chinese-team-list",
+      "version-4.0.0_Chinese-history-singa"
+    ],
+    "Wheel Package": [
+      "version-4.0.0_Chinese-wheel-cpu",
+      "version-4.0.0_Chinese-wheel-cpu-dev",
+      "version-4.0.0_Chinese-wheel-gpu",
+      "version-4.0.0_Chinese-wheel-gpu-dev"
+    ]
+  }
+}
diff --git a/docs-site/website/versioned_sidebars/version-4.0.0_Viet-sidebars.json b/docs-site/website/versioned_sidebars/version-4.0.0_Viet-sidebars.json
new file mode 100644
index 0000000..cdd2926
--- /dev/null
+++ b/docs-site/website/versioned_sidebars/version-4.0.0_Viet-sidebars.json
@@ -0,0 +1,43 @@
+{
+  "version-4.0.0_Viet-docs": {
+    "Getting Started": [
+      "version-4.0.0_Viet-installation",
+      "version-4.0.0_Viet-software-stack",
+      "version-4.0.0_Viet-examples"
+    ],
+    "Guides": [
+      "version-4.0.0_Viet-device",
+      "version-4.0.0_Viet-tensor",
+      "version-4.0.0_Viet-autograd",
+      "version-4.0.0_Viet-optimizer",
+      "version-4.0.0_Viet-graph",
+      "version-4.0.0_Viet-onnx",
+      "version-4.0.0_Viet-dist-train",
+      "version-4.0.0_Viet-time-profiling"
+    ],
+    "Development": [
+      "version-4.0.0_Viet-downloads",
+      "version-4.0.0_Viet-build",
+      "version-4.0.0_Viet-contribute-code",
+      "version-4.0.0_Viet-contribute-docs",
+      "version-4.0.0_Viet-how-to-release",
+      "version-4.0.0_Viet-git-workflow"
+    ]
+  },
+  "version-4.0.0_Viet-community": {
+    "Community": [
+      "version-4.0.0_Viet-source-repository",
+      "version-4.0.0_Viet-mail-lists",
+      "version-4.0.0_Viet-issue-tracking",
+      "version-4.0.0_Viet-security",
+      "version-4.0.0_Viet-team-list",
+      "version-4.0.0_Viet-history-singa"
+    ],
+    "Wheel Package": [
+      "version-4.0.0_Viet-wheel-cpu",
+      "version-4.0.0_Viet-wheel-cpu-dev",
+      "version-4.0.0_Viet-wheel-gpu",
+      "version-4.0.0_Viet-wheel-gpu-dev"
+    ]
+  }
+}
diff --git a/docs-site/website/versions.json b/docs-site/website/versions.json
index b5c8827..6ca5be0 100644
--- a/docs-site/website/versions.json
+++ b/docs-site/website/versions.json
@@ -1,4 +1,5 @@
 [
+  "4.0.0",
   "3.3.0",
   "3.2.0",
   "3.2.0.rc1",
@@ -6,6 +7,6 @@
   "3.0.0",
   "3.0.0.rc1",
   "2.0.0",
-  "3.3.0_Chinese",
-  "3.3.0_Viet"
+  "4.0.0_Chinese",
+  "4.0.0_Viet"
 ]
diff --git a/docs-site/website/versions_otherlang.json b/docs-site/website/versions_otherlang.json
index de84c5c..462f5b3 100644
--- a/docs-site/website/versions_otherlang.json
+++ b/docs-site/website/versions_otherlang.json
@@ -1,4 +1,4 @@
 [
-  "3.3.0_Chinese",
-  "3.3.0_Viet"
+  "4.0.0_Chinese",
+  "4.0.0_Viet"
 ]