You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ju...@apache.org on 2023/12/18 21:01:00 UTC
(tvm) 01/03: [Unity] Bump fpA_intB_gemm (#16244)

This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit e98fdea65460512b97ccc87be1f43e6e37486814
Author: Wuwei Lin <wu...@apache.org>
AuthorDate: Thu Dec 14 11:38:00 2023 -0800

    [Unity] Bump fpA_intB_gemm (#16244)
    
    Updated preprocessing and submodule to support 3D weights for MoE.
    
    * update
    
    * update
    
    * update
---
 3rdparty/cutlass_fpA_intB_gemm                   |  2 +-
 src/runtime/contrib/cutlass/weight_preprocess.cc | 15 +++++++++------
 tests/scripts/task_config_build_gpu.sh           |  1 +
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/3rdparty/cutlass_fpA_intB_gemm b/3rdparty/cutlass_fpA_intB_gemm
index ed951b046f..74ee6cb468 160000
--- a/3rdparty/cutlass_fpA_intB_gemm
+++ b/3rdparty/cutlass_fpA_intB_gemm
@@ -1 +1 @@
-Subproject commit ed951b046f89ddfd990af8d2482e3350bda2fec6
+Subproject commit 74ee6cb46816267515c08eb78755d2b9b8db0bb4
diff --git a/src/runtime/contrib/cutlass/weight_preprocess.cc b/src/runtime/contrib/cutlass/weight_preprocess.cc
index ef80627cc7..4b378fa4a7 100644
--- a/src/runtime/contrib/cutlass/weight_preprocess.cc
+++ b/src/runtime/contrib/cutlass/weight_preprocess.cc
@@ -37,18 +37,21 @@ namespace runtime {
 // The preprocessing functions are defined in C++, so we need to copy the input weight to CPU.
 TVM_REGISTER_GLOBAL("cutlass.ft_preprocess_weight")
     .set_body_typed([](NDArray packed_weight, int sm, bool is_int4) {
-      int rows = packed_weight->shape[0];
-      int cols = packed_weight->shape[1];
-      std::vector<int8_t> input_cpu(rows * cols);
-      std::vector<int8_t> output_cpu(rows * cols);
+      bool is_2d = packed_weight->ndim == 2;
+      int num_experts = is_2d ? 1 : packed_weight->shape[0];
+      int rows = packed_weight->shape[is_2d ? 0 : 1];
+      int cols = packed_weight->shape[is_2d ? 1 : 2];
+
+      std::vector<int8_t> input_cpu(num_experts * rows * cols);
+      std::vector<int8_t> output_cpu(num_experts * rows * cols);
       packed_weight.CopyToBytes(input_cpu.data(), input_cpu.size());
       // multiply cols by 2 since the "col" params in preprocess_weights refers to the column of
       // the unpacked weight.
       if (is_int4) {
         cols *= 2;
       }
-      fastertransformer::preprocess_weights(output_cpu.data(), input_cpu.data(), rows, cols,
-                                            is_int4, sm);
+      fastertransformer::preprocess_weights(output_cpu.data(), input_cpu.data(), num_experts, rows,
+                                            cols, is_int4, sm);
       auto out = NDArray::Empty(packed_weight.Shape(), packed_weight->dtype, packed_weight->device);
       out.CopyFromBytes(output_cpu.data(), output_cpu.size());
       return out;
diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh
index 37ab0a87f1..e68e646ce1 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -54,3 +54,4 @@ echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
 echo set\(USE_CUTLASS ON\) >> config.cmake
 echo set\(USE_CMSISNN ON\) >> config.cmake
 echo set\(USE_MSC ON\) >> config.cmake
+echo set\(CMAKE_CUDA_ARCHITECTURES 75\) >> config.cmake