Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/02/25 22:41:01 UTC

[GitHub] [tvm] comaniac commented on a change in pull request #10388: [BYOC][TENSORRT] Add support for FP16 on TensorRT BYOC flow

comaniac commented on a change in pull request #10388:
URL: https://github.com/apache/tvm/pull/10388#discussion_r815195307



##########
File path: src/runtime/contrib/tensorrt/tensorrt_builder.cc
##########
@@ -150,19 +165,30 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
   // Get outputs.
   node_output_map_[nid] = {};
   for (auto out : params.outputs) {
+    VLOG(1) << "Before forcing output tensor type: " << static_cast<int>(out->getType())
+            << std::endl;
+
+    out->setType(InferTensorType(&params));
+
     node_output_map_[nid].push_back(TensorRTOpInput(out));
+    VLOG(1) << "After forcing output tensor type: " << static_cast<int>(out->getType())
+            << std::endl;
   }
 }
 
 TensorRTEngineAndContext TensorRTBuilder::BuildEngine() {
   // Process graph to create INetworkDefinition.
-// Build engine.
-#if TRT_VERSION_GE(6, 0, 1)
+  // Build engine.
+
   config_ = builder_->createBuilderConfig();
   config_->setMaxWorkspaceSize(max_workspace_size_);
-  if (use_fp16_) {
-    config_->setFlag(nvinfer1::BuilderFlag::kFP16);
-  }
+
+  // According to documentation this is required for single FP precision. Always on doesnt seem to
+  // prevent pure FP32 execution

Review comment:
      nit: It would be better to provide a link to the documentation.
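
      For illustration, the flag-setting code with such a reference might look roughly like the sketch below. The NVIDIA developer-guide URL is only an example of the kind of link that could be cited, not necessarily the page the author had in mind:

      ```cpp
      // Sketch only. Example reference: NVIDIA TensorRT developer guide,
      // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/
      // (reduced-precision section). kFP16 permits, but does not force, FP16
      // kernels, so pure-FP32 networks are still executed correctly.
      config_->setFlag(nvinfer1::BuilderFlag::kFP16);
      ```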

##########
File path: src/runtime/contrib/tensorrt/tensorrt_builder.cc
##########
@@ -150,19 +165,30 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
   // Get outputs.
   node_output_map_[nid] = {};
   for (auto out : params.outputs) {
+    VLOG(1) << "Before forcing output tensor type: " << static_cast<int>(out->getType())
+            << std::endl;

Review comment:
      No need for a newline in the log.
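
      For example (sketch):

      ```cpp
      // No trailing std::endl; the logging macro already terminates the line
      // when the message is flushed.
      VLOG(1) << "Before forcing output tensor type: " << static_cast<int>(out->getType());
      ```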

##########
File path: src/runtime/contrib/tensorrt/tensorrt_ops.cc
##########
@@ -202,9 +202,14 @@ class ElementWiseBinaryOpConverter : public TensorRTOpConverter {
     auto it = op_map.find(params->op_name);
     ICHECK(it != op_map.end()) << "Unsupported elementwise type " << params->op_name;
     // Broadcast
+
     auto input0 = params->inputs.at(0).tensor;
+    VLOG(1) << " Input type 0: " << static_cast<int>(input0->getType()) << std::endl;

Review comment:
       ditto

##########
File path: tests/python/contrib/test_tensorrt.py
##########
@@ -113,15 +131,17 @@ def run_and_verify_func(config, target="cuda", run_module=True):
                         mode, mod=mod, device=dev, target=target
                     ).evaluate()
             else:
+                mod = relay.transform.InferType()(mod)
                 with tvm.transform.PassContext(opt_level=3):
                     func = relay.create_executor(
                         mode, mod=mod, device=dev, target=target
                     ).evaluate()
             if run_module:
                 result_dict[result_key] = func(**input_dict, **params)
 
-    if run_module:
-        assert_result_dict_holds(result_dict)
+            print(result_dict)

Review comment:
       remove

##########
File path: tests/python/contrib/test_tensorrt.py
##########
@@ -88,23 +99,30 @@ def run_and_verify_func(config, target="cuda", run_module=True):
     run_module: bool
 
         If True, the built module will be run after being compiled.
+
+    data_type: str
+        Check between single and double floating precision
     """
     f, input_shapes, is_param = config
-    params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32) for x in is_param}
+    params = {
+        x: np.random.uniform(-1, 1, input_shapes[x]).astype(dtype=data_type) for x in is_param
+    }
     input_dict = {
-        k: np.random.uniform(-1, 1, v).astype(np.float32)
+        k: np.random.uniform(-1, 1, v).astype(dtype=data_type)
         for k, v in input_shapes.items()
         if k not in is_param
     }
     dev = tvm.device(target)
 
     result_dict = dict()
-    for mode in ["graph", "vm"]:
-        for use_trt in [False, True]:
+    for mode in ["vm", "graph"]:
+        for use_trt in [True, False]:
             mod = tvm.IRModule()
             mod["main"] = f
             result_key = mode + ("_trt" if use_trt else "")
             if use_trt:
+                mod = relay.transform.InferType()(mod)
+                print(mod)

Review comment:
       remove

##########
File path: src/runtime/contrib/tensorrt/tensorrt_builder.cc
##########
@@ -204,19 +227,30 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() {
 
 nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr,
                                                         DLDeviceType src_device) {
+  VLOG(1) << "Device type for DLTensorAsWeight: " << dptr->device.device_type;
+  VLOG(1) << "DLType for DLTensorAsWeight: " << dptr->dtype;
+  VLOG(1) << "DLShape for DLTensorAsWeight: " << dptr->shape << std::endl;
+
   ICHECK_EQ(dptr->device.device_type, src_device);
-  ICHECK(static_cast<int>(dptr->dtype.code) == kDLFloat ||
-         static_cast<int>(dptr->dtype.code) == kDLInt);
-  const auto trt_dtype = static_cast<int>(dptr->dtype.code) == kDLFloat
-                             ? nvinfer1::DataType::kFLOAT
-                             : nvinfer1::DataType::kINT32;
+
+  // (@mikepapadim) here there is a bug with dptr pointer. dptr dtypes has bits of 16 and type of
+  // float16, but code is Float
+  auto trt_dtype = (static_cast<int>(dptr->dtype.bits) == 16) ? nvinfer1::DataType::kHALF
+                                                              : nvinfer1::DataType::kFLOAT;
+
+  VLOG(1) << "DLTensorAsWeight:";
+  VLOG(1) << "dptr->dtype   : " << dptr->dtype;
+  VLOG(1) << "dptr->dtype.code   : " << static_cast<int>(dptr->dtype.code);
+  VLOG(1) << "dptr->dtype.bits   : " << static_cast<int>(dptr->dtype.bits);
+  VLOG(1) << "TRT dtype for DLTensor   : " << static_cast<int>(trt_dtype) << std::endl;

Review comment:
       ditto

##########
File path: src/runtime/contrib/tensorrt/tensorrt_builder.cc
##########
@@ -204,19 +227,30 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() {
 
 nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr,
                                                         DLDeviceType src_device) {
+  VLOG(1) << "Device type for DLTensorAsWeight: " << dptr->device.device_type;
+  VLOG(1) << "DLType for DLTensorAsWeight: " << dptr->dtype;
+  VLOG(1) << "DLShape for DLTensorAsWeight: " << dptr->shape << std::endl;

Review comment:
       ditto

##########
File path: src/runtime/contrib/tensorrt/tensorrt_ops.cc
##########
@@ -20,6 +20,8 @@
  * \file runtime/contrib/tensorrt/tensorrt_ops.cc
  * \brief Converters from Relay ops into TensorRT layers. Converters should
  * inherit from TensorRTOpConverter and implement the Convert() method.
+ * Minimal version 7, 0, 0
+ *  TRT_VERSION_GE(5, 1, 5)

Review comment:
      Please update the TRT documentation (https://tvm.apache.org/docs/how_to/deploy/tensorrt.html) accordingly.
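
      Also, the two added header lines quoted above read ambiguously. A sketch of a clearer wording, assuming the intent is that the removed TRT_VERSION_GE guards now make TensorRT >= 7.0.0 the minimum supported version:

      ```cpp
      /*!
       * \file runtime/contrib/tensorrt/tensorrt_ops.cc
       * \brief Converters from Relay ops into TensorRT layers. Converters should
       * inherit from TensorRTOpConverter and implement the Convert() method.
       * \note The TRT_VERSION_GE(5, 1, 5) and TRT_VERSION_GE(7, 0, 0) guards were
       *       removed, so this file now assumes TensorRT >= 7.0.0.
       */
      ```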

##########
File path: src/runtime/contrib/tensorrt/tensorrt_builder.cc
##########
@@ -150,19 +165,30 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
   // Get outputs.
   node_output_map_[nid] = {};
   for (auto out : params.outputs) {
+    VLOG(1) << "Before forcing output tensor type: " << static_cast<int>(out->getType())
+            << std::endl;
+
+    out->setType(InferTensorType(&params));
+
     node_output_map_[nid].push_back(TensorRTOpInput(out));
+    VLOG(1) << "After forcing output tensor type: " << static_cast<int>(out->getType())
+            << std::endl;

Review comment:
       ditto

##########
File path: src/runtime/contrib/tensorrt/tensorrt_runtime.cc
##########
@@ -407,13 +406,14 @@ class TensorRTRuntime : public JSONRuntimeBase {
     // Using this key will only allow a single model per TVM_TENSORRT_CACHE_DIR directory. We could
     // instead use a hash of graph_json and all weights to allow many models in the same directory,
     // but the cost of computing the hash is high.
-    return symbol_name_ + (dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) ? "_fp16" : "_fp32");
+    return symbol_name_ + (dmlc::GetEnv("TVM_TENSORRT_USE_FP16", true) ? "_fp16" : "_fp32");
   }
 
   /*! \brief Retreive a GPU buffer for input or output or allocate if needed. */
   NDArray GetOrAllocateDeviceBuffer(int entry_id, int binding_index) {
     std::vector<int64_t> shape(data_entry_[entry_id]->shape,
                                data_entry_[entry_id]->shape + data_entry_[entry_id]->ndim);
+    VLOG(1) << " Device buffers: \n Entry id:  " << entry_id;

Review comment:
      The newline seems unnecessary.

##########
File path: src/runtime/contrib/tensorrt/tensorrt_ops.cc
##########
@@ -202,9 +202,14 @@ class ElementWiseBinaryOpConverter : public TensorRTOpConverter {
     auto it = op_map.find(params->op_name);
     ICHECK(it != op_map.end()) << "Unsupported elementwise type " << params->op_name;
     // Broadcast
+
     auto input0 = params->inputs.at(0).tensor;
+    VLOG(1) << " Input type 0: " << static_cast<int>(input0->getType()) << std::endl;
+
     auto input0_dims = TrtDimsToVector(input0->getDimensions());
     auto input1 = params->inputs.at(1).tensor;
+    VLOG(1) << " Input type at 1: " << static_cast<int>(input1->getType()) << std::endl;

Review comment:
       ditto

##########
File path: src/runtime/contrib/tensorrt/tensorrt_runtime.cc
##########
@@ -226,6 +225,7 @@ class TensorRTRuntime : public JSONRuntimeBase {
       int binding_index = engine->getBindingIndex(name.c_str());
       ICHECK_NE(binding_index, -1);
       if (data_entry_[eid]->device.device_type != kDLCUDA) {
+        VLOG(1) << " Get output buffers from the device : ";

Review comment:
      I don't see the output buffers actually being printed here?
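
      If the intent is to log them, a minimal sketch (reusing the `eid` and `binding_index` variables already in scope here) could be:

      ```cpp
      // Sketch: log which output buffer is being copied back from the device.
      VLOG(1) << "Copying output buffer for eid=" << eid
              << " (binding_index=" << binding_index << ") from device to host";
      ```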

##########
File path: tests/python/contrib/test_tensorrt.py
##########
@@ -169,50 +189,54 @@ def compile_and_run(mod, params, i_data, mode="vm", use_trt=True):
                 mod, params, i_data, mode=mode, use_trt=use_trt
             )
 
+    print(result_dict)

Review comment:
       remove

##########
File path: tests/python/contrib/test_tensorrt.py
##########
@@ -861,17 +911,28 @@ def get_graph(x_shape, pad_width):
     )
 
 
+def test_add(run_module):
+    def get_graph(x_shape):
+        x = relay.var("x", shape=(x_shape), dtype="float16")
+        y = relay.var("y", shape=(x_shape), dtype="float16")
+        out = relay.add(x, y)
+        f = relay.Function([x, y], out)
+        return f, {"x": x_shape, "y": x_shape}, []
+
+    run_and_verify_func(get_graph((1, 1000)), run_module=run_module, data_type="float16")
+
+
 def test_softmax(run_module):
     def get_graph(x_shape, axis):
-        x = relay.var("x", shape=(x_shape), dtype="float32")
+        x = relay.var("x", shape=(x_shape), dtype="float16")
         out = relay.nn.softmax(x, axis=axis)
         f = relay.Function([x], out)
         return f, {"x": x_shape}, []
 
-    run_and_verify_func(get_graph((1, 1000), axis=1), run_module=run_module)
-    run_and_verify_func(get_graph((1, 1000), axis=-1), run_module=run_module)
-    run_and_verify_func(get_graph((1, 3, 4), axis=-2), run_module=run_module)
-    run_and_verify_func(get_graph((1, 3, 4), axis=1), run_module=run_module)
+    run_and_verify_func(get_graph((1, 1000), axis=1), run_module=run_module, data_type="float16")
+    # run_and_verify_func(get_graph((1, 1000), axis=-1), run_module=run_module)
+    # run_and_verify_func(get_graph((1, 3, 4), axis=-2), run_module=run_module)
+    # run_and_verify_func(get_graph((1, 3, 4), axis=1), run_module=run_module)

Review comment:
       uncomment

##########
File path: src/runtime/contrib/tensorrt/tensorrt_ops.cc
##########
@@ -796,24 +794,16 @@ class UnaryOpConverter : public TensorRTOpConverter {
     // The following ops are supported by TRT but don't exist in relay yet:
     // recip, tan, sinh, cosh, asin, acos, asinh, acosh, atanh
     static const std::unordered_map<std::string, nvinfer1::UnaryOperation> op_map = {
-      {"exp", nvinfer1::UnaryOperation::kEXP},
-      {"log", nvinfer1::UnaryOperation::kLOG},
-      {"sqrt", nvinfer1::UnaryOperation::kSQRT},
-      {"abs", nvinfer1::UnaryOperation::kABS},
-      {"negative", nvinfer1::UnaryOperation::kNEG},
-#if TRT_VERSION_GE(5, 1, 5)
-      {"sin", nvinfer1::UnaryOperation::kSIN},
-      {"cos", nvinfer1::UnaryOperation::kCOS},
-      {"atan", nvinfer1::UnaryOperation::kATAN},
-      {"ceil", nvinfer1::UnaryOperation::kCEIL},
-      {"floor", nvinfer1::UnaryOperation::kFLOOR},
-#endif
-#if TRT_VERSION_GE(7, 0, 0)
-      {"erf", nvinfer1::UnaryOperation::kERF},
-#endif
+        {"exp", nvinfer1::UnaryOperation::kEXP},      {"log", nvinfer1::UnaryOperation::kLOG},
+        {"sqrt", nvinfer1::UnaryOperation::kSQRT},    {"abs", nvinfer1::UnaryOperation::kABS},
+        {"negative", nvinfer1::UnaryOperation::kNEG}, {"sin", nvinfer1::UnaryOperation::kSIN},
+        {"cos", nvinfer1::UnaryOperation::kCOS},      {"atan", nvinfer1::UnaryOperation::kATAN},
+        {"ceil", nvinfer1::UnaryOperation::kCEIL},    {"floor", nvinfer1::UnaryOperation::kFLOOR},
+        {"erf", nvinfer1::UnaryOperation::kERF},
     };
     auto it = op_map.find(params->op_name);
-    ICHECK(it != op_map.end()) << "Unsupported unary type " << params->op_name;
+    ICHECK(it != op_map.end()) << "Unsupported unary type " << params->op_name << std::endl
+                               << "Check TRT version ";

Review comment:
       ```suggestion
       ICHECK(it != op_map.end()) << "Unsupported unary type " << params->op_name
                                  << ". Please check TRT version >= 7.0.0";
   ```

##########
File path: tests/python/contrib/test_tensorrt.py
##########
@@ -421,12 +449,14 @@ def get_graph(
                             dilation=dilation,
                         ),
                         run_module=run_module,
+                        data_type="float16",
                     )
     run_and_verify_func(
         get_graph((1, 3, 16, 16), (3, 8, 7, 7), 3, [2, 2, 3, 3], [2, 2], [1, 1], 24),
         run_module=run_module,
     )
-    run_and_verify_func(get_graph((1, 3, 16, 16), (1, 3, 1, 1), channels=1), run_module=run_module)
+    # run_and_verify_func(
+    #     get_graph((1, 3, 16, 16), (1, 3, 1, 1), channels=1), run_module=run_module)

Review comment:
       Uncomment?

##########
File path: src/runtime/contrib/tensorrt/tensorrt_builder.cc
##########
@@ -150,19 +165,30 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
   // Get outputs.
   node_output_map_[nid] = {};
   for (auto out : params.outputs) {
+    VLOG(1) << "Before forcing output tensor type: " << static_cast<int>(out->getType())
+            << std::endl;
+
+    out->setType(InferTensorType(&params));
+
     node_output_map_[nid].push_back(TensorRTOpInput(out));
+    VLOG(1) << "After forcing output tensor type: " << static_cast<int>(out->getType())
+            << std::endl;
   }
 }
 
 TensorRTEngineAndContext TensorRTBuilder::BuildEngine() {
   // Process graph to create INetworkDefinition.
-// Build engine.
-#if TRT_VERSION_GE(6, 0, 1)
+  // Build engine.
+
   config_ = builder_->createBuilderConfig();
   config_->setMaxWorkspaceSize(max_workspace_size_);
-  if (use_fp16_) {
-    config_->setFlag(nvinfer1::BuilderFlag::kFP16);
-  }
+
+  // According to documentation this is required for single FP precision. Always on doesnt seem to
+  // prevent pure FP32 execution
+  config_->setFlag(nvinfer1::BuilderFlag::kFP16);
+
+  // Pass it explicitly
+  // config_->setFlag(nvinfer1::BuilderFlag::kDEBUG);

Review comment:
       Remove?

##########
File path: tests/python/contrib/test_tensorrt.py
##########
@@ -169,50 +189,54 @@ def compile_and_run(mod, params, i_data, mode="vm", use_trt=True):
                 mod, params, i_data, mode=mode, use_trt=use_trt
             )
 
+    print(result_dict)
+
     if run_module:
         assert_result_dict_holds(result_dict)
 
 
 def test_tensorrt_simple(run_module):
-    dtype = "float32"
-    xshape = (1, 3, 2, 2)
-    yshape = (1, 3, 1, 1)
-    zshape = (1, 1, 1, 1)
-    x = relay.var("x", shape=(xshape), dtype=dtype)
-    y = relay.var("y", shape=(yshape), dtype=dtype)
-    z = relay.var("z", shape=(zshape), dtype=dtype)
-    w = z * (x + y)
-    out = relay.nn.relu(w)
-    f = relay.Function([x, y, z], out)
-
-    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
-    y_data = np.random.uniform(-1, 1, yshape).astype(dtype)
-    z_data = np.random.uniform(-1, 1, zshape).astype(dtype)
+    for dtype in SUPPORTED_TYPES:
+        xshape = (1, 3, 2, 2)
+        yshape = (1, 3, 1, 1)
+        zshape = (1, 1, 1, 1)
+        x = relay.var("x", shape=(xshape), dtype=dtype)
+        y = relay.var("y", shape=(yshape), dtype=dtype)
+        z = relay.var("z", shape=(zshape), dtype=dtype)
+        w = z * (x + y)
+        out = relay.nn.relu(w)
+        f = relay.Function([x, y, z], out)
+        x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
+        y_data = np.random.uniform(-1, 1, yshape).astype(dtype)
+        z_data = np.random.uniform(-1, 1, zshape).astype(dtype)
 
-    result_dict = dict()
-    for mode in ["vm", "graph"]:
-        for use_trt in [True, False]:
-            mod = tvm.IRModule()
-            mod["main"] = f
-            result_key = mode + ("_trt" if use_trt else "")
-            if use_trt:
-                mod, config = tensorrt.partition_for_tensorrt(mod)
-                with tvm.transform.PassContext(
-                    opt_level=3, config={"relay.ext.tensorrt.options": config}
-                ):
-                    func = relay.create_executor(
-                        mode, mod=mod, device=tvm.cuda(0), target="cuda"
-                    ).evaluate()
-            else:
-                with tvm.transform.PassContext(opt_level=3):
-                    func = relay.create_executor(
-                        mode, mod=mod, device=tvm.cuda(0), target="cuda"
-                    ).evaluate()
-            if run_module:
-                result_dict[result_key] = func(x_data, y_data, z_data)
+        result_dict = dict()
+        for mode in ["vm", "graph"]:
+            for use_trt in [False, True]:
+                mod = tvm.IRModule()
+                mod["main"] = f
+                result_key = mode + ("_trt" if use_trt else "")
+                if use_trt:
+                    mod = relay.transform.InferType()(mod)
+                    mod, config = tensorrt.partition_for_tensorrt(mod)
+                    with tvm.transform.PassContext(
+                        opt_level=3, config={"relay.ext.tensorrt.options": config}
+                    ):
+                        func = relay.create_executor(
+                            mode, mod=mod, device=tvm.cuda(0), target="cuda"
+                        ).evaluate()
+                else:
+                    mod = relay.transform.InferType()(mod)
+                    with tvm.transform.PassContext(opt_level=3):
+                        func = relay.create_executor(
+                            mode, mod=mod, device=tvm.cuda(0), target="cuda"
+                        ).evaluate()
+                if run_module:
+                    result_dict[result_key] = func(x_data, y_data, z_data)
 
-    if run_module:
-        assert_result_dict_holds(result_dict)
+        print(result_dict)

Review comment:
       remove

##########
File path: tests/python/contrib/test_tensorrt.py
##########
@@ -471,7 +502,8 @@ def get_graph(
         f = relay.Function([x], out)
         return f, {"x": x_shape}, []
 
-    run_and_verify_func(get_graph(), run_module=run_module)
+    # for tp in ["float32", "float16", "int8", "uint8"]:

Review comment:
      Remove, or was this commented-out loop meant to be restored?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org