You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ju...@apache.org on 2023/12/18 21:01:00 UTC
(tvm) 01/03: [Unity] Bump fpA_intB_gemm (#16244)
This is an automated email from the ASF dual-hosted git repository.
junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git
commit e98fdea65460512b97ccc87be1f43e6e37486814
Author: Wuwei Lin <wu...@apache.org>
AuthorDate: Thu Dec 14 11:38:00 2023 -0800
[Unity] Bump fpA_intB_gemm (#16244)
Updated the preprocessing and the submodule to support 3D weights for MoE.
* update
* update
* update
---
3rdparty/cutlass_fpA_intB_gemm | 2 +-
src/runtime/contrib/cutlass/weight_preprocess.cc | 15 +++++++++------
tests/scripts/task_config_build_gpu.sh | 1 +
3 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/3rdparty/cutlass_fpA_intB_gemm b/3rdparty/cutlass_fpA_intB_gemm
index ed951b046f..74ee6cb468 160000
--- a/3rdparty/cutlass_fpA_intB_gemm
+++ b/3rdparty/cutlass_fpA_intB_gemm
@@ -1 +1 @@
-Subproject commit ed951b046f89ddfd990af8d2482e3350bda2fec6
+Subproject commit 74ee6cb46816267515c08eb78755d2b9b8db0bb4
diff --git a/src/runtime/contrib/cutlass/weight_preprocess.cc b/src/runtime/contrib/cutlass/weight_preprocess.cc
index ef80627cc7..4b378fa4a7 100644
--- a/src/runtime/contrib/cutlass/weight_preprocess.cc
+++ b/src/runtime/contrib/cutlass/weight_preprocess.cc
@@ -37,18 +37,21 @@ namespace runtime {
// The preprocessing functions are defined in C++, so we need to copy the input weight to CPU.
TVM_REGISTER_GLOBAL("cutlass.ft_preprocess_weight")
.set_body_typed([](NDArray packed_weight, int sm, bool is_int4) {
- int rows = packed_weight->shape[0];
- int cols = packed_weight->shape[1];
- std::vector<int8_t> input_cpu(rows * cols);
- std::vector<int8_t> output_cpu(rows * cols);
+ bool is_2d = packed_weight->ndim == 2;
+ int num_experts = is_2d ? 1 : packed_weight->shape[0];
+ int rows = packed_weight->shape[is_2d ? 0 : 1];
+ int cols = packed_weight->shape[is_2d ? 1 : 2];
+
+ std::vector<int8_t> input_cpu(num_experts * rows * cols);
+ std::vector<int8_t> output_cpu(num_experts * rows * cols);
packed_weight.CopyToBytes(input_cpu.data(), input_cpu.size());
// multiply cols by 2 since the "col" params in preprocess_weights refers to the column of
// the unpacked weight.
if (is_int4) {
cols *= 2;
}
- fastertransformer::preprocess_weights(output_cpu.data(), input_cpu.data(), rows, cols,
- is_int4, sm);
+ fastertransformer::preprocess_weights(output_cpu.data(), input_cpu.data(), num_experts, rows,
+ cols, is_int4, sm);
auto out = NDArray::Empty(packed_weight.Shape(), packed_weight->dtype, packed_weight->device);
out.CopyFromBytes(output_cpu.data(), output_cpu.size());
return out;
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 37ab0a87f1..e68e646ce1 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -54,3 +54,4 @@ echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
echo set\(USE_CUTLASS ON\) >> config.cmake
echo set\(USE_CMSISNN ON\) >> config.cmake
echo set\(USE_MSC ON\) >> config.cmake
+echo set\(CMAKE_CUDA_ARCHITECTURES 75\) >> config.cmake