You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by le...@apache.org on 2022/09/14 09:48:09 UTC
[tvm] branch main updated: [OpenCLML] More ops and network coverage (#12762)
This is an automated email from the ASF dual-hosted git repository.
leandron pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 2aa0d1fbfc [OpenCLML] More ops and network coverage (#12762)
2aa0d1fbfc is described below
commit 2aa0d1fbfcf4a31e343cc6852fdc4abd660c850a
Author: Siva <qu...@quicinc.com>
AuthorDate: Wed Sep 14 15:18:03 2022 +0530
[OpenCLML] More ops and network coverage (#12762)
Added operators pooling (avg, max), binary operators (add, subtract, multiply, min, max) and concat.
Clip operator with min=0 and max=6 is remapped to relu6 to take advantage of CLML acceleration
without sub graphing this to fallback path.
Added new test cases for above listed operators and also end-to-end network test cases for Resnet50
& InceptionV3.
CLML support FP16 arithmetic mode which gives significant performance boost over FP32. This PR
enhances FP16 usage based on Operator datatype in relay graph.
Co-authored-by: Krishna Raju quic_kvegiraj@quicinc.com
Co-authored-by: Shwetank Singh quic_shwesing@quicinc.com
---
python/tvm/relay/op/contrib/clml.py | 35 ++-
src/relay/backend/contrib/clml/codegen.cc | 37 +++
src/runtime/contrib/clml/clml_runtime.cc | 315 +++++++++++++++++++----
tests/python/contrib/test_clml/infrastructure.py | 28 +-
tests/python/contrib/test_clml/test_network.py | 139 +++++++---
tests/python/contrib/test_clml/test_ops.py | 83 +++++-
6 files changed, 529 insertions(+), 108 deletions(-)
diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
index cacd10de28..d253544d45 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -23,7 +23,7 @@ from tvm._ffi import register_func
from tvm.relay import transform
from tvm.relay.build_module import bind_params_by_name
-from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item
+from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item, is_tuple
from .register import register_pattern_table
from ..strategy.generic import is_depthwise_conv2d
@@ -135,6 +135,7 @@ def clml_pattern_table():
"""Create a convolution pattern."""
pattern = is_op("nn.conv2d")(wildcard(), is_constant())
pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
+ pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
pattern = pattern.optional(
lambda x: is_op("nn.batch_norm")(
x, is_constant(), is_constant(), is_constant(), is_constant()
@@ -142,6 +143,7 @@ def clml_pattern_table():
)
pattern = pattern.optional(is_tuple_get_item)
pattern = pattern.optional(is_op("nn.relu"))
+ pattern = pattern.optional(is_op("clip"))
return pattern
def batch_norm_pattern():
@@ -152,10 +154,24 @@ def clml_pattern_table():
pattern = is_tuple_get_item(pattern)
return pattern
+ def concat_pattern():
+ """Create a concat pattern.
+
+ Returns
+ -------
+ pattern : dataflow_pattern.AltPattern
+ Denotes the concat pattern.
+ """
+ pattern = is_tuple(None)
+ pattern = is_op("concatenate")(pattern)
+
+ return pattern
+
def dense_pattern():
"""Create a dense pattern."""
pattern = is_op("nn.dense")(wildcard(), is_constant())
pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
+ pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
return pattern
def pad_pattern():
@@ -172,6 +188,13 @@ def clml_pattern_table():
call = call.args[0]
if isinstance(call, tvm.relay.expr.TupleGetItem):
call = call.tuple_value
+ elif call.op.name == "clip":
+ if call.attrs["a_min"] != 0.0 or call.attrs["a_max"] != 6.0:
+ return False
+ call = call.args[0]
+ if isinstance(call, tvm.relay.expr.TupleGetItem):
+ call = call.tuple_value
+
while call.op.name != "nn.conv2d":
call = call.args[0]
attrs, args = call.attrs, call.args
@@ -194,6 +217,7 @@ def clml_pattern_table():
("clml.conv2d", conv_pattern(), check_conv),
("clml.dense", dense_pattern()),
("clml.pad", pad_pattern()),
+ ("clml.concat", concat_pattern()),
("clml.batch_norm", batch_norm_pattern()),
]
@@ -207,11 +231,18 @@ def _register_external_op_helper(op_name, supported=True):
_register_external_op_helper("clip")
-_register_external_op_helper("relu")
+_register_external_op_helper("nn.relu")
_register_external_op_helper("nn.global_avg_pool2d")
_register_external_op_helper("nn.global_max_pool2d")
+_register_external_op_helper("nn.avg_pool2d")
+_register_external_op_helper("nn.max_pool2d")
_register_external_op_helper("nn.softmax")
_register_external_op_helper("reshape")
+_register_external_op_helper("add")
+_register_external_op_helper("subtract")
+_register_external_op_helper("multiply")
+_register_external_op_helper("minimum")
+_register_external_op_helper("maximum")
class OpAttrContext(object):
diff --git a/src/relay/backend/contrib/clml/codegen.cc b/src/relay/backend/contrib/clml/codegen.cc
index fa082a423d..b89f05e178 100644
--- a/src/relay/backend/contrib/clml/codegen.cc
+++ b/src/relay/backend/contrib/clml/codegen.cc
@@ -91,6 +91,8 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
json_node = CreateDenseJSONNode(cn);
} else if (name == "clml.pad") {
json_node = CreatePadJSONNode(cn);
+ } else if (name == "clml.concat") {
+ json_node = CreateConcatJSONNode(cn);
} else {
LOG(FATAL) << "Unrecognized CLML pattern: " << name;
}
@@ -148,6 +150,15 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
} else {
current_call = current_call->args[0].as<CallNode>();
}
+ } else if (backend::IsOp(current_call, "clip")) {
+ nodes.activation = current_call;
+ nodes.act_type = "relu6";
+ if (current_call->args[0].as<TupleGetItemNode>()) {
+ auto tuple_item = current_call->args[0].as<TupleGetItemNode>();
+ current_call = tuple_item->tuple.as<CallNode>();
+ } else {
+ current_call = current_call->args[0].as<CallNode>();
+ }
}
if (backend::IsOp(current_call, "nn.batch_norm")) {
nodes.bn = current_call;
@@ -279,6 +290,32 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
return json_node;
}
+ /*!
+ * \brief Create a JSON representation of a Concat operator.
+ *
+ * \param cn The call to be represented.
+ * \return A JSON representation of a specific operator.
+ */
+ std::shared_ptr<JSONGraphNode> CreateConcatJSONNode(const CallNode* cn) {
+ const auto* fn = cn->op.as<FunctionNode>();
+ ICHECK(fn);
+ const auto* concat = fn->body.as<CallNode>();
+
+ ICHECK(backend::IsOp(concat, "concatenate"));
+ const auto* concat_op = concat->op.as<OpNode>();
+ ICHECK(concat_op);
+ const std::string name = concat_op->name;
+
+ std::vector<JSONGraphNodeEntry> inputs;
+ for (auto arg : cn->args) {
+ inputs.push_back(VisitExpr(arg)[0]);
+ }
+
+ auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 1);
+ SetCallNodeAttribute(json_node, concat);
+ return json_node;
+ }
+
/*!
* \brief Create a JSON representation of a Dense operator.
*
diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index da41442ef9..cdc3b9a7b5 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -335,13 +335,15 @@ class CLMLRuntime : public JSONRuntimeBase {
size_t nid;
for (nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
if (node.GetOpType() == "input") {
- auto clml_input = MakeCLMLTensorFromJSONNode(node);
+ auto clml_input = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
this->layer_.storage_map.insert({nid, std::make_pair(clml_input, node)});
this->layer_.inputs.push_back(clml_input);
// Input copy placeholder Tensor
this->layer_.in_placeholder.push_back(
- MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM));
+ MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype));
} else if (node.GetOpType() == "kernel") {
auto op_name = node.GetOpName();
if ("nn.conv2d" == op_name) {
@@ -364,6 +366,11 @@ class CLMLRuntime : public JSONRuntimeBase {
auto out = CreateBatchNormLayer(&layer_, node);
this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
this->layer_.func_outs.push_back(out);
+ } else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
+ "nn.l2_pool2d" == op_name) {
+ auto out = CreatePoolingLayer(&layer_, node);
+ this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
+ this->layer_.func_outs.push_back(out);
} else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name) {
auto out = CreateGlobalPoolingLayer(&layer_, node);
this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
@@ -372,6 +379,10 @@ class CLMLRuntime : public JSONRuntimeBase {
auto out = CreateReshapeLayer(&layer_, node);
this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
this->layer_.func_outs.push_back(out);
+ } else if ("concatenate" == op_name) {
+ auto out = CreateConcatLayer(&layer_, node);
+ this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
+ this->layer_.func_outs.push_back(out);
} else if ("nn.dense" == op_name) {
auto out = CreateDenseLayer(&layer_, node);
this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
@@ -388,6 +399,11 @@ class CLMLRuntime : public JSONRuntimeBase {
auto out = CreateClipLayer(&layer_, node);
this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
this->layer_.func_outs.push_back(out);
+ } else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name ||
+ "minimum" == op_name || "maximum" == op_name) {
+ auto out = CreateBinaryLayer(&layer_, node);
+ this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
+ this->layer_.func_outs.push_back(out);
} else {
LOG(FATAL) << "Unsupported op: " << op_name;
}
@@ -396,10 +412,14 @@ class CLMLRuntime : public JSONRuntimeBase {
LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
}
}
- if (nid > 0) {
- this->layer_.outputs.push_back(this->layer_.storage_map[nid - 1].first);
+
+ for (size_t i = 0; i < outputs_.size(); ++i) {
+ nid = outputs_[i].id_;
+ DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ this->layer_.outputs.push_back(this->layer_.storage_map[nid].first);
this->layer_.out_placeholder.push_back(
- MakeCLMLTensorFromJSONNode(nodes_[nid - 1], CL_TENSOR_LAYOUT_NCHW_QCOM));
+ MakeCLMLTensorFromJSONNode(nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype));
}
// ALlocate device memories and initialize the params if any
cl_int result = 0;
@@ -558,6 +578,20 @@ class CLMLRuntime : public JSONRuntimeBase {
}
}
+ cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type,
+ const cl_channel_type& acc_type = CL_FLOAT) {
+ if (data_type == CL_FLOAT && acc_type == CL_FLOAT) {
+ return CL_ARITHMETIC_MODE_FP32_QCOM;
+ } else if (data_type == CL_HALF_FLOAT && acc_type == CL_FLOAT) {
+ return CL_ARITHMETIC_MODE_FP16_ACC32_QCOM;
+ } else if (data_type == CL_HALF_FLOAT && acc_type == CL_HALF_FLOAT) {
+ return CL_ARITHMETIC_MODE_FP16_QCOM;
+ } else {
+ LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime";
+ return CL_ARITHMETIC_MODE_FP32_QCOM;
+ }
+ }
+
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
const JSONGraphNode& tensor_rep, void* data, std::vector<size_t> c_shape,
cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint dtype = CL_FLOAT) {
@@ -634,6 +668,9 @@ class CLMLRuntime : public JSONRuntimeBase {
std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
std::vector<cl_uint> clml_padding = GetVectorValues(padding);
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
if (!node.HasAttr("padding")) {
clml_padding.resize(4);
std::fill(clml_padding.begin(), clml_padding.end(), 0);
@@ -668,7 +705,7 @@ class CLMLRuntime : public JSONRuntimeBase {
has_act = true;
}
cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
- CL_ARITHMETIC_MODE_FP32_QCOM};
+ cl_arithmetic_mode};
// Collect inputs and outputs, handling nn.conv2d.
std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
@@ -680,15 +717,15 @@ class CLMLRuntime : public JSONRuntimeBase {
has_bias = (num_inputs == 3) || (num_inputs == 7);
has_bn = (num_inputs == 6) || (num_inputs == 7);
// Input
- auto input = MakeCLMLTensorFromJSONEntry(inputs[0]);
-
+ auto input =
+ MakeCLMLTensorFromJSONEntry(inputs[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
// Weight
- auto weight = MakeCLMLTensorFromJSONEntry(inputs[1]);
-
+ auto weight =
+ MakeCLMLTensorFromJSONEntry(inputs[1], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
// Bias
auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
if (has_bias) {
- bias = MakeCLMLTensorFromJSONEntry(inputs[2]);
+ bias = MakeCLMLTensorFromJSONEntry(inputs[2], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
} else {
cl_ml_tensor_desc_qcom desc = {};
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
@@ -698,7 +735,7 @@ class CLMLRuntime : public JSONRuntimeBase {
bias->tensor = layer_.unusedTensor;
}
// Output
- auto output = MakeCLMLTensorFromJSONNode(node);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_convolution_desc_qcom conv_desc{mode,
groups,
4,
@@ -707,7 +744,7 @@ class CLMLRuntime : public JSONRuntimeBase {
{clml_strides[0], clml_strides[1]},
{clml_dilation[0], clml_dilation[1]},
0,
- CL_ARITHMETIC_MODE_FP32_QCOM};
+ cl_arithmetic_mode};
cl_ml_op_qcom op = NULL;
if (!has_bn) {
@@ -734,13 +771,16 @@ class CLMLRuntime : public JSONRuntimeBase {
auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
- bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape);
- bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape);
- bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape);
- bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape);
-
- cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
- CL_ARITHMETIC_MODE_FP32_QCOM};
+ bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape,
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape,
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape,
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape,
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+
+ cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
if (!has_act) {
result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM(
workspace->context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor,
@@ -772,11 +812,15 @@ class CLMLRuntime : public JSONRuntimeBase {
cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) {
cl_int result = 0;
cl_ml_op_qcom op = NULL;
- auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
- auto output = MakeCLMLTensorFromJSONNode(node);
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+ cl_dtype);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
- CL_ARITHMETIC_MODE_FP32_QCOM};
+ cl_arithmetic_mode};
cl_ml_tensor_desc_qcom desc = {};
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
@@ -805,7 +849,11 @@ class CLMLRuntime : public JSONRuntimeBase {
const JSONGraphNode& node) {
cl_int result = 0;
cl_ml_op_qcom op = NULL;
- auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+ cl_dtype);
int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
auto bn_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
std::vector<size_t> bn_shape = {1, 1, 1, 1};
@@ -814,15 +862,18 @@ class CLMLRuntime : public JSONRuntimeBase {
auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
- bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape);
- bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape);
- bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape);
- bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape);
+ bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape,
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape,
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape,
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape,
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
- auto output = MakeCLMLTensorFromJSONNode(node);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
- cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
- CL_ARITHMETIC_MODE_FP32_QCOM};
+ cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM(
workspace->context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor,
@@ -834,6 +885,61 @@ class CLMLRuntime : public JSONRuntimeBase {
return output;
}
+ /*!
+ * \brief Create a creating pooling layer.
+ *
+ * \note Currently global_max_pool2d and global_avg_pool2d are supported.
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreatePoolingLayer(CachedLayer* layer,
+ const JSONGraphNode& node) {
+ cl_int result = 0;
+ cl_ml_op_qcom op = NULL;
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+ cl_dtype);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+
+ std::vector<std::string> windows = node.GetAttr<std::vector<std::string>>("pool_size");
+ std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
+ std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
+ std::vector<cl_uint> clml_window = GetVectorValues(windows);
+ std::vector<cl_uint> clml_stride = GetVectorValues(strides);
+ std::vector<cl_uint> clml_padding = GetVectorValues(padding);
+
+ cl_ml_op_pooling_desc_qcom pool_desc = {
+ node.GetOpName() == "nn.max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
+ : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
+ 4, // reserved
+ {clml_padding[0], clml_padding[1]},
+ {clml_padding[2], clml_padding[3]},
+ {clml_stride[0], clml_stride[1]},
+ {clml_window[0], clml_window[1]},
+ CL_PROPAGATE_NAN_QCOM,
+ cl_arithmetic_mode,
+ };
+
+ cl_ml_tensor_desc_qcom desc = {};
+ cl_ml_tensor_qcom unusedTensor = NULL;
+ desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+ result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &unusedTensor);
+ ICHECK(unusedTensor && result == CL_SUCCESS) << ":" << result;
+
+ result =
+ h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->context, 0, &pool_desc, input->tensor,
+ unusedTensor, output->tensor, &op, tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
+
+ layer_.func_ins.push_back(input);
+ layer->function.push_back(op);
+ return output;
+ }
+
/*!
* \brief Create a global pooling layer.
*
@@ -846,8 +952,12 @@ class CLMLRuntime : public JSONRuntimeBase {
CachedLayer* layer, const JSONGraphNode& node) {
cl_int result = 0;
cl_ml_op_qcom op = NULL;
- auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
- auto output = MakeCLMLTensorFromJSONNode(node);
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+ cl_dtype);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
cl_ml_op_pooling_desc_qcom pool_desc = {
node.GetOpName() == "nn.global_max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
@@ -858,7 +968,7 @@ class CLMLRuntime : public JSONRuntimeBase {
{1, 1},
{in_dims.w, in_dims.h},
CL_PROPAGATE_NAN_QCOM,
- CL_ARITHMETIC_MODE_FP32_QCOM,
+ cl_arithmetic_mode,
};
cl_ml_tensor_desc_qcom desc = {};
@@ -887,14 +997,17 @@ class CLMLRuntime : public JSONRuntimeBase {
const JSONGraphNode& node) {
cl_int result = 0;
cl_ml_op_qcom op = NULL;
- auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+ cl_dtype);
auto out_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
- auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, CL_FLOAT, nullptr,
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype, nullptr,
{out_dims.n, out_dims.c, 1, 1});
cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM,
- CL_SOFTMAX_MODE_INSTANCE_QCOM,
- CL_ARITHMETIC_MODE_FP32_QCOM};
+ CL_SOFTMAX_MODE_INSTANCE_QCOM, cl_arithmetic_mode};
result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(workspace->context, 0, &softmax_desc,
input->tensor, output->tensor, &op, tuning_cache);
@@ -915,8 +1028,12 @@ class CLMLRuntime : public JSONRuntimeBase {
const JSONGraphNode& node) {
cl_int result = 0;
cl_ml_op_qcom op = NULL;
- auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
- auto output = MakeCLMLTensorFromJSONNode(node);
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+ cl_dtype);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
std::string pad_mode = node.GetAttr<std::vector<std::string>>("pad_mode")[0];
std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("pad_width");
@@ -936,7 +1053,7 @@ class CLMLRuntime : public JSONRuntimeBase {
clml_pad_mode,
{0, 0},
{clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
- CL_ARITHMETIC_MODE_FP32_QCOM};
+ cl_arithmetic_mode};
result = h_ClmlIntf->clCreateMLOpPadQCOM(workspace->context, 0, &pad_desc, input->tensor,
output->tensor, &op, tuning_cache);
@@ -957,8 +1074,11 @@ class CLMLRuntime : public JSONRuntimeBase {
const JSONGraphNode& node) {
cl_int result = 0;
cl_ml_op_qcom op = NULL;
- auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
- auto output = MakeCLMLTensorFromJSONNode(node);
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+ cl_dtype);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->context, 0, input->tensor,
output->tensor, &op, tuning_cache);
@@ -969,6 +1089,42 @@ class CLMLRuntime : public JSONRuntimeBase {
return output;
}
+ /*!
+ * \brief Create a concat layer.
+ *
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateConcatLayer(CachedLayer* layer,
+ const JSONGraphNode& node) {
+ cl_int result = 0;
+ cl_ml_op_qcom op = NULL;
+ std::vector<JSONGraphNodeEntry> input_ = node.GetInputs();
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ int inputSize = input_.size();
+ int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize];
+ for (int i = 0; i < inputSize; i++) {
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[i], {},
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ concatInputs[i] = input->tensor;
+ }
+ cl_ml_op_concat_desc_qcom concatDesc = {1, (cl_uint)inputSize, cl_arithmetic_mode};
+
+ result = h_ClmlIntf->clCreateMLOpConcatQCOM(workspace->context, 0, &concatDesc, concatInputs,
+ output->tensor, &op, tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << "Concat Error:" << result;
+
+ layer->function.push_back(op);
+
+ delete[] concatInputs;
+ return output;
+ }
+
/*!
* \brief Create a dense layer.
*
@@ -980,21 +1136,27 @@ class CLMLRuntime : public JSONRuntimeBase {
const JSONGraphNode& node) {
cl_int result = 0;
cl_ml_op_qcom op = NULL;
- auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ auto inp_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {1, inp_dims.c, 1, 1},
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto wt_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
bool has_bias = node.GetInputs().size() == 3 ? true : false;
-
- auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c});
+ auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c},
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
if (has_bias) {
auto bias_dims = get_tensor_dims(nodes_[node.GetInputs()[2].id_]);
- bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1});
+ bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1},
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
}
cl_ml_op_fully_connected_desc_qcom fc_desc = {1, CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM,
- CL_ARITHMETIC_MODE_FP32_QCOM};
+ cl_arithmetic_mode};
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
- auto output = MakeCLMLTensorFromJSONNode(node);
if (has_bias) {
result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(
workspace->context, 0, &fc_desc, input->tensor, weight->tensor, bias->tensor,
@@ -1021,15 +1183,17 @@ class CLMLRuntime : public JSONRuntimeBase {
const JSONGraphNode& node) {
cl_int result = 0;
cl_ml_op_qcom op = NULL;
- auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
- auto output = MakeCLMLTensorFromJSONNode(node);
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+ cl_dtype);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_float a_max = std::stof(node.GetAttr<std::vector<std::string>>("a_max")[0]);
cl_float a_min = std::stof(node.GetAttr<std::vector<std::string>>("a_min")[0]);
- cl_ml_op_clip_desc_qcom clip_desc = {CL_CLIP_BY_VALUE_QCOM,
- {{a_max}, CL_FLOAT},
- {{a_min}, CL_FLOAT},
- CL_ARITHMETIC_MODE_FP32_QCOM};
+ cl_ml_op_clip_desc_qcom clip_desc = {
+ CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode};
result = h_ClmlIntf->clCreateMLOpClipQCOM(workspace->context, 0, &clip_desc, input->tensor,
output->tensor, &op, tuning_cache);
@@ -1040,6 +1204,47 @@ class CLMLRuntime : public JSONRuntimeBase {
return output;
}
+ /*!
+ * \brief Create a Binary layer.
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateBinaryLayer(CachedLayer* layer,
+ const JSONGraphNode& node) {
+ cl_int result = 0;
+ cl_ml_op_qcom op = NULL;
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+ auto input_a = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {},
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ auto input_b = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {},
+ CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+ std::string op_name = node.GetOpName();
+ cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM;
+ if (op_name == "subtract")
+ binary_op = CL_TENSOR_OP_SUB_QCOM;
+ else if (op_name == "multiply")
+ binary_op = CL_TENSOR_OP_MUL_QCOM;
+ else if (op_name == "minimum")
+ binary_op = CL_TENSOR_OP_MIN_QCOM;
+ else if (op_name == "maximum")
+ binary_op = CL_TENSOR_OP_MAX_QCOM;
+ cl_ml_op_binary_desc_qcom add_desc = {
+ binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode};
+
+ result = h_ClmlIntf->clCreateMLOpBinaryQCOM(workspace->context, 0, &add_desc, input_a->tensor,
+ input_b->tensor, output->tensor, &op, tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << op_name << " Node Error:" << result;
+
+ layer_.func_ins.push_back(input_a);
+ layer_.func_ins.push_back(input_b);
+ layer->function.push_back(op);
+ return output;
+ }
+
/*!
* \brief The network layers represented by acl functions.
* \note Currently only supports a single layer.
diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py
index 0cf76079e8..08b11525ec 100644
--- a/tests/python/contrib/test_clml/infrastructure.py
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -29,6 +29,7 @@ from tvm import rpc
from tvm.contrib import graph_executor
from tvm.relay.op.contrib import clml
from tvm.contrib import utils
+from tvm import autotvm
from tvm.autotvm.measure import request_remote
from tvm.relay.expr_functor import ExprMutator, Call
@@ -144,35 +145,28 @@ def skip_codegen_test():
return True
-def build_module(mod, target, target_host, params=None, enable_clml=True):
+def build_module(mod, target, target_host, params=None, enable_clml=True, tune_log=""):
"""Build module with option to build for CLML."""
if isinstance(mod, tvm.relay.expr.Call):
mod = tvm.IRModule.from_expr(mod)
- with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
- if enable_clml:
- mod = clml.partition_for_clml(mod, params)
- relay.backend.te_compiler.get().clear()
- # print("Build Mod:", mod)
- return relay.build(mod, target=target, target_host=target_host, params=params)
+ with autotvm.apply_history_best(tune_log):
+ with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
+ if enable_clml:
+ mod = clml.partition_for_clml(mod, params)
+ relay.backend.te_compiler.get().clear()
+ return relay.build(mod, target=target, target_host=target_host, params=params)
def build_and_run(
- mod,
- inputs,
- outputs,
- params,
- device,
- enable_clml=True,
- no_runs=1,
- config=None,
+ mod, inputs, outputs, params, device, enable_clml=True, no_runs=1, config=None, tune_log=""
):
"""Build and run the relay module."""
if config is None:
config = {}
try:
- libm = build_module(mod, device.target, device.target_host, params, enable_clml)
+ libm = build_module(mod, device.target, device.target_host, params, enable_clml, tune_log)
clml_modules = extract_clml_modules(libm)
for mod in clml_modules:
@@ -198,7 +192,7 @@ def build_and_run(
for _ in range(no_runs):
gen_module.run()
out.append([gen_module.get_output(i) for i in range(outputs)])
- time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=50)
+ time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=1)
cost = time_f().mean
print("%g secs/iteration\n" % cost)
return out
diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py
index 405f5782ff..95f3a45baf 100644
--- a/tests/python/contrib/test_clml/test_network.py
+++ b/tests/python/contrib/test_clml/test_network.py
@@ -25,20 +25,13 @@ import tvm
from test_clml.infrastructure import skip_runtime_test, build_and_run, Device
-def _build_and_run_network(mod, params, inputs, data, device, atol, rtol):
+def _build_and_run_network(mod, params, inputs, data, device, atol, rtol, tvm_log=""):
"""Helper function to build and run a network."""
outputs = []
for clml in [True, False]:
outputs.append(
- build_and_run(
- mod,
- data,
- 1,
- params,
- device,
- enable_clml=clml,
- )[0]
+ build_and_run(mod, data, 1, params, device, enable_clml=clml, tune_log=tvm_log)[0][0]
)
return outputs
@@ -55,11 +48,7 @@ def _get_keras_model(keras_model, inputs_dict, data):
def get_bottom_top_model(model, layer_name):
layer = model.get_layer(layer_name)
bottom_input = model.layers[0].input
- bottom_output = bottom_input
- for layer in model.layers:
- bottom_output = layer(bottom_output)
- if layer.name == layer_name:
- break
+ bottom_output = layer.output
bottom_model = Model(bottom_input, bottom_output)
return bottom_model
@@ -81,6 +70,9 @@ def test_mobilenet():
def get_model():
from tensorflow.keras.applications import MobileNet
+ import tensorflow as tf
+
+ tf.keras.backend.clear_session()
mobilenet = MobileNet(
include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000
@@ -106,32 +98,113 @@ def test_mobilenet():
)
# test
- print("OpenCL:", outputs[0][0].asnumpy().shape)
- print("CLML:", outputs[1][0].asnumpy().shape)
+ print("OpenCL:", outputs[0].asnumpy().shape)
+ print("CLML:", outputs[1].asnumpy().shape)
- opencl_sort = np.argsort(outputs[1][0].asnumpy()).flatten()
- clml_sort = np.argsort(outputs[0][0].asnumpy()).flatten()
+ opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
+ clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
-"""
- tvm.testing.assert_allclose(
- ref_outputs, outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
- print("OpenCL to Keras looks good")
- tvm.testing.assert_allclose(
- outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
- print("OpenCL to CLML looks good")
- exit(0)
+def test_inception_v3():
+ Device.load("test_config.json")
+
+ if skip_runtime_test():
+ return
+
+ device = Device()
+ dtype = "float16"
+
+ def get_model():
+ from tensorflow.keras.applications import InceptionV3
+ import tensorflow as tf
+
+ tf.keras.backend.clear_session()
+
+ inceptionV3 = InceptionV3(
+ include_top=True, weights=None, input_shape=(299, 299, 3), classes=1000
+ )
+ inputs = {inceptionV3.input_names[0]: ((1, 3, 299, 299), "float16")}
+
+ data = {}
+ np.random.seed(0)
+ for name, (shape, dtype) in inputs.items():
+ if dtype == "uint8":
+ low, high = 0, 1
+ else:
+ low, high = -2, 1
+ data[name] = np.random.uniform(low, high, shape).astype(dtype)
+
+ mod, params, ref_outputs = _get_keras_model(inceptionV3, inputs, data)
+ return mod, params, inputs, data, ref_outputs
+
+ mod, params, inputs, input_data, ref_outputs = get_model()
+ outputs = _build_and_run_network(
+ mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
+ )
+
+ opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
+ clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
+
+ tvm.testing.assert_allclose(opencl_sort[:5], clml_sort[:5], rtol=1e-5, atol=1e-5)
+
+
+def test_resnet50v2():
+ Device.load("test_config.json")
+
+ if skip_runtime_test():
+ return
+
+ device = Device()
+ dtype = "float16"
+
+ def get_model():
+ from tensorflow.keras.applications import ResNet50V2
+ import tensorflow as tf
+
+ tf.keras.backend.clear_session()
- tvm.testing.assert_allclose(
- ref_outputs.transpose(0, 3, 1, 2), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
- print("OpenCL to Keras looks good")
- tvm.testing.assert_allclose(
- outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
- print("OpenCL to CLML looks good")
-"""
+ model = ResNet50V2(include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000)
+ inputs_dict = {model.input_names[0]: ((1, 3, 224, 224), "float32")}
+
+ data = {}
+ np.random.seed(0)
+
+ for name, (shape, dtype) in inputs_dict.items():
+ if dtype == "uint8":
+ low, high = 0, 1
+ else:
+ low, high = -1, 1
+ data[name] = np.random.uniform(low, high, shape).astype(dtype)
+
+ """Convert Keras graph to relay."""
+ inputs = {}
+ for name, (shape, _) in inputs_dict.items():
+ inputs[model.input_names[0]] = shape
+
+ ref_outputs = model.predict(data["input_1"].transpose(0, 2, 3, 1))
+
+ mod, params = relay.frontend.from_keras(model, inputs, layout="NCHW")
+
+ return mod, params, inputs, data, ref_outputs
+
+ mod, params, inputs, input_data, ref_outputs = get_model()
+ outputs = _build_and_run_network(
+ mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
+ )
+
+ # test
+ print("OpenCL:", outputs[0].asnumpy().shape)
+ print("CLML:", outputs[1].asnumpy().shape)
+
+ opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
+ clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
+
+ tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
if __name__ == "__main__":
test_mobilenet()
+ test_resnet50v2()
+ test_inception_v3()
diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py
index 13f49d1527..d14a5ec6e9 100644
--- a/tests/python/contrib/test_clml/test_ops.py
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -211,6 +211,87 @@ def test_batchnorm():
)
+def test_concat():
+ Device.load("test_config.json")
+
+ if skip_runtime_test():
+ return
+
+ device = Device()
+ dtype = "float16"
+ in_shape_1 = (1, 16, 16, 16)
+ in_shape_2 = (1, 16, 16, 16)
+ a = relay.var("input_1", shape=in_shape_1, dtype=dtype)
+ b = relay.var("input_2", shape=in_shape_2, dtype=dtype)
+ low, high = -1, 1
+ inputs = {
+ "input_1": tvm.nd.array(np.random.uniform(-1, 1, in_shape_1).astype(dtype)),
+ "input_2": tvm.nd.array(np.random.uniform(-1, 1, in_shape_2).astype(dtype)),
+ }
+
+ params = {}
+ func = relay.concatenate((a, b), axis=1)
+ mod = IRModule.from_expr(func)
+
+ opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
+ clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]
+
+ tvm.testing.assert_allclose(
+ clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+ )
+
+
+def test_avgpool():
+ Device.load("test_config.json")
+
+ if skip_runtime_test():
+ return
+
+ device = Device()
+ dtype = "float16"
+ trials = [
+ # input size pool_size stride paading
+ [(1, 64, 147, 147), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+ [(1, 192, 71, 71), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+ [(1, 288, 35, 35), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+ [(1, 768, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+ [(1, 2048, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+ [(1, 192, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+ [(1, 256, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+ [(1, 288, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+ [(1, 768, 17, 17), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+ [(1, 1280, 8, 8), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+ ]
+ params = {}
+ for (
+ input_shape,
+ pool_size,
+ stride,
+ padding,
+ pooling_type,
+ ) in trials:
+ a = relay.var("input_1", shape=input_shape, dtype=dtype)
+ input_arr = tvm.nd.array(np.random.uniform(-1, 1, input_shape).astype(dtype))
+ inputs = {
+ "input_1": input_arr,
+ }
+
+ if pooling_type == "max":
+ func = relay.nn.max_pool2d(a, pool_size=pool_size, strides=stride, padding=padding)
+ else:
+ func = relay.nn.avg_pool2d(a, pool_size=pool_size, strides=stride, padding=padding)
+ mod = IRModule.from_expr(func)
+
+ opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
+ clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]
+
+ tvm.testing.assert_allclose(
+ clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+ )
+
+
if __name__ == "__main__":
test_conv2d()
- test_batchnorm()
+ # test_batchnorm()
+ test_avgpool()
+ test_concat()