Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/06/24 11:31:58 UTC

[GitHub] [tvm] elvin-n opened a new pull request, #11878: Amalyshe/adreno static markup

elvin-n opened a new pull request, #11878:
URL: https://github.com/apache/tvm/pull/11878

   This PR is a split-out part of the original PR #11357 and should be merged after PR #11874, PR #11875, and PR #11876.
   @csullivan @mbs-octoml
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r931917655


##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+    D = relay.op.add(conv2, D)
+    D = D * relay.const(0.15, "float16")
+    D = relay.op.nn.relu(D)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.nn.relu(conv3)
+
+    mod = relay.Function([A, W1, B1, W2, W3], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global",
+        "global.texture",
+        "global.texture-weight",
+        "",

Review Comment:
   BTW, please don't be confused by the names of the memory scopes; the naming is historical. The texture layout is now determined by the algorithm in the `Scope()` function of annotate_texture_storage.cc, and the scopes rather refer to
   
   ```
   texture -> 123|4|5
   texture-weight -> 1|234|5
   texture-nchw -> 12|34|5
   ```
   
   These scopes are applied to any type of tensor - data/weights/bias. The division into buckets is determined only by the values in the shape.
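   
   As a minimal sketch of that bucket selection (illustrative Python only, mirroring the quoted C++ `Scope()` logic rather than code from this PR; suffix names follow the quoted C++ code, and `limit` stands for the `texture_spatial_limit` target attribute):
   
   ```python
   def texture_scope(shape, limit=16384):
       """Pick the most balanced 2d split of a packed 5d shape (last dim == 4)."""
       a0, a1, a2, a3 = shape[0], shape[1], shape[2], shape[3]
       candidates = {}  # |left - right| -> scope suffix
       for left, right, suffix in (
           (a0 * a1 * a2, a3, ""),        # 123|4|5 -> "global.texture"
           (a0 * a1, a2 * a3, "nhwc"),    # 12|34|5 -> "global.texture-nhwc"
           (a0, a1 * a2 * a3, "weight"),  # 1|234|5 -> "global.texture-weight"
       ):
           if left < limit and right < limit:
               candidates[abs(left - right)] = suffix
       if not candidates:
           return "global"  # no split fits the hardware limit
       suffix = candidates[min(candidates)]  # most balanced split wins
       return "global.texture" + ("-" + suffix if suffix else "")
   ```
   
   For example, a weight-like packed shape (32, 8, 2, 2, 4) maps to `global.texture-weight` because its 1|234 split is perfectly balanced (32 vs 32).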





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r931983992


##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+    D = relay.op.add(conv2, D)
+    D = D * relay.const(0.15, "float16")
+    D = relay.op.nn.relu(D)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.nn.relu(conv3)
+
+    mod = relay.Function([A, W1, B1, W2, W3], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global",
+        "global.texture",
+        "global.texture-weight",
+        "",
+        "",
+    ]
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_plan_device_issue1():

Review Comment:
   Done



##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+    D = relay.op.add(conv2, D)
+    D = D * relay.const(0.15, "float16")
+    D = relay.op.nn.relu(D)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.nn.relu(conv3)
+
+    mod = relay.Function([A, W1, B1, W2, W3], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global",
+        "global.texture",
+        "global.texture-weight",
+        "",
+        "",
+    ]
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_plan_device_issue1():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    conv2 = relay.nn.conv2d(
+        conv1,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+
+    mod = relay.Function([A, W1, W2], conv2)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "weight2": tvm.nd.array(filter_data2),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "",
+        "",
+    ]
+
+    static_memory_scope = []
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_branch_textures():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (96, 32, 2, 2)
+    filter_shape2 = (32, 96, 2, 2)
+    filter_shape3 = (5, 96, 2, 2)
+    bias_shape1 = (1, 96, 1, 1)
+    bias_shape2 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+    B2 = relay.var("bias2", shape=bias_shape2, dtype=dtype)
+
+    # C = relay.nn.relu(A)
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=96,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    conv2 = relay.op.add(conv2, B2)
+    conv2 = relay.op.nn.relu(conv2)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=5,
+        kernel_size=(2, 2),
+    )
+
+    t = relay.Tuple([conv2, conv3])
+    c = relay.op.concatenate(t, axis=1)
+
+    mod = relay.Function([A, W1, B1, W2, B2, W3], c)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    bias_data2 = np.zeros(bias_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    initializer("bias", bias_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "bias2": tvm.nd.array(bias_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ]
+
+    static_memory_scope = []
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_branch1_texture_params():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape0 = (32, 32, 1, 1)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    # bias_shape2 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W0 = relay.var("weight0", shape=filter_shape0, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv0 = relay.nn.conv2d(
+        A,
+        W0,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+
+    pool = relay.nn.avg_pool2d(conv0, pool_size=(2, 2), strides=(2, 2))
+    conv1 = relay.nn.conv2d(
+        pool,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 1, 1],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    conv1 = relay.op.add(conv1, B1)
+    conv1 = relay.op.nn.relu(conv1)
+
+    conv2 = relay.nn.conv2d(
+        pool,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+
+    conv3 = relay.nn.conv2d(
+        pool,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 1, 1, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    conv3 = relay.op.nn.relu(conv3)
+    res = relay.op.add(conv1, conv2)
+    res = relay.op.add(res, conv3)
+
+    mod = relay.Function([A, W0, W1, B1, W2, W3], res)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data0 = np.zeros(filter_shape0).astype(dtype)
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight0": tvm.nd.array(filter_data0),
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global.texture",
+        "",
+        "",
+    ]
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+#                      conv2d <- to get textures

Review Comment:
   Done





[GitHub] [tvm] csullivan commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
csullivan commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r935780618


##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -175,8 +173,26 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
     }
   }
 
+  /**
+   * Determines the name of the memory scope which can fit a tensor of the required shape
+   *
+   * The scope is "global" if the tensor does not satisfy the current flattening rules for
+   * textures (a texture currently has to be a 5d tensor with the value 4 in the last dimension)
+   *
+   * The packing layout inside the texture scope (the part after the dash) is derived
+   * from the shape itself. Hardware can have limits on the texture spatial dimensions and
+   * we must not exceed these sizes. In addition to fitting the h/w limits, we want a
+   * balanced packing where the final spatial sizes of the textures are not too different
+   * @param shape shape to be analyzed
+   * @param vd VirtualDevice of the tensor for which the memory scope is determined
+   * @return string representing the memory scope, either "global" or "global.texture-layout"
+   */
   std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {
-    if (vd != VirtualDevice::FullyUnconstrained()) {
+    // currently we support only textures made from 5d tensors

Review Comment:
   TODO(@csullivan, @elvin-n): Support more layouts with Buffer.axis_separators lowering. 
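   
   For concreteness (illustrative shape, not from the PR): with the default 16384 spatial limit, a packed shape (1, 8, 40, 40, 4) gives
   
   ```
   123|4 -> 320 x 40    (diff 280, most balanced -> "global.texture")
   12|34 -> 8 x 1600    (diff 1592)
   1|234 -> 1 x 12800   (diff 12799)
   ```
   
   so the plain "global.texture" packing is selected.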





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r931983714


##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for all targets represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevices with the
+ *    required memory_scope collected from CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns a mapping from expressions to their desired memory scopes
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with the "global" scope to handle the PlanDevices pass, which
+    // propagates virtual devices from outputs to inputs. At the same time, outputs must stay
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in the consumers map means that the consumer potentially
+      // dealt with textures, so it is safe to mark this expr with the global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // The initial algorithm maps only the outputs of each expr, which is not enough; the
+    // VirtualDevice of function variables must also be updated for proper codegen. Adding vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (const auto* fn = call->op.as<FunctionNode>()) {
+      if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+        primitive_supports_texture_ = false;
+        Visit(call->op);
+        if (primitive_supports_texture_) {
+          if (call->checked_type().as<TensorTypeNode>()) {
+            std::string scope = "global.texture";
+            if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+              if (ttype->shape.size() == 5) {
+                scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+              }
+            }
+            storage_scope_[call].push_back(scope);
+          } else {
+            const auto* tuple_type = call->type_as<TupleTypeNode>();
+            ICHECK(tuple_type);
+            // TODO(csullivan): Add support for mixed output storage scope.
+            // In current adreno storage planner all outputs of a
+            // primitive function are assumed to be of the same storage
+            // type. This should be easy to extend in the future.
+            for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+              storage_scope_[call].push_back("global.texture");
+            }
+          }
+          for (size_t i = 0; i < fn->params.size(); i++) {
+            args_to_vars_[call->args[i]].push_back(fn->params[i]);
+          }
+        }
+        // Add consumer storage scope information for call arguments
+        for (auto& arg : call->args) {
+          if (storage_scope_.count(call)) {
+            ICHECK(!HasMixedStorageOutputs(call))
+                << "Mixed output storage scopes are not currently supported";
+            consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+          } else {
+            consumer_storage_scopes_[arg.operator->()].push_back("global");
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // We have filled storage_scope_ for all callees that support textures.
+    // We still need to verify whether this call expects a texture; if it does not, remove the
+    // entry from storage_scope_, since storage_scope_ was initially filled only based on the
+    // knowledge that the function is able to work with textures, not that a texture is
+    // actually expected by the function's callee
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      std::map<int, std::string> diffs;
+      int limit =
+          vd->target->GetAttr<Integer>("texture_spatial_limit").value_or(Integer(16384))->value;
+      int a0 = shape[0].as<IntImmNode>()->value;
+      int a1 = shape[1].as<IntImmNode>()->value;
+      int a2 = shape[2].as<IntImmNode>()->value;
+      int a3 = shape[3].as<IntImmNode>()->value;
+
+      int d3l = a0 * a1 * a2;
+      int d3r = a3;
+      int diff3 = d3l > d3r ? d3l - d3r : d3r - d3l;
+      if (d3l < limit && d3r < limit) diffs[diff3] = "";
+
+      int d2l = a0 * a1;
+      int d2r = a2 * a3;
+      int diff2 = d2l > d2r ? d2l - d2r : d2r - d2l;
+      if (d2l < limit && d2r < limit) diffs[diff2] = "nhwc";
+
+      int d1l = a0;
+      int d1r = a1 * a2 * a3;
+      int diff1 = d1l > d1r ? d1l - d1r : d1r - d1l;
+      if (d1l < limit && d1r < limit) diffs[diff1] = "weight";
+      if (!diffs.empty()) {
+        std::string scope = "global.texture";
+        if (!diffs.begin()->second.empty()) {
+          scope += ("-" + diffs.begin()->second);
+        }
+        return scope;
+      }
+    }
+    return "global";
+  }
+
+  void ApplyConsumerScopeToInputs(const ExprNode* expr) {
+    std::string scope;
+    auto consumer_scopes_it = consumer_storage_scopes_.find(expr);
+    if (consumer_scopes_it != consumer_storage_scopes_.end()) {
+      std::string consumer_scope = GetConsumerScope(consumer_scopes_it->second);
+      ICHECK(!storage_scope_.count(expr))
+          << "Already propagated consumer scopes to input: " << GetRef<Expr>(expr);
+
+      bool expr_is_rgba_vectorizable = false;
+      if (const auto* ttype = expr->checked_type().as<TensorTypeNode>()) {
+        if (ttype->shape.size() == 5) {
+          scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(expr)));
+          if (scope != "global") {
+            auto inner_dim = ttype->shape.back().as<IntImmNode>();
+            if (inner_dim && inner_dim->value == 4) {
+              expr_is_rgba_vectorizable = true;
+            }
+          }
+        }
+      }
+
+      // Only propagate texture scope from consumers to input expr if
+      // the input shape of the input expr is rgba vectorizable.
+      if (consumer_scope.find("global.texture") != std::string::npos) {
+        if (expr_is_rgba_vectorizable) {
+          storage_scope_[expr].push_back(scope);
+        }
+      } else {
+        storage_scope_[expr].push_back(consumer_scope);
+      }
+    }
+  }
+
+  void LegalizeProducerStorage() {
+    for (auto& kv : consumer_storage_scopes_) {
+      const ExprNode* producer = kv.first;
+      std::string legal_scope = GetConsumerScope(kv.second);
+      if (storage_scope_.count(producer)) {
+        ICHECK(!HasMixedStorageOutputs(producer))
+            << "Mixed output storage scopes are not currently supported";
+        if (storage_scope_[producer][0].find(legal_scope) == std::string::npos) {
+          for (size_t i = 0; i < storage_scope_[producer].size(); i++) {
+            // Only support uniform storage scope across all outputs for now
+            storage_scope_[producer][i] = legal_scope;
+          }
+        }
+      }
+    }
+  }
+
+  std::string GetConsumerScope(const std::vector<std::string>& consumer_scopes) const {
+    if (!consumer_scopes.size()) {
+      return "global";
+    }
+    std::string texture_tag = "global.texture";
+    for (auto& consumer_scope : consumer_scopes) {
+      if (consumer_scope.find(texture_tag) == std::string::npos) {
+        return "global";
+      }
+    }
+    return texture_tag;
+  }
+
+  bool CanConsumeTextures(const std::vector<std::string>& consumer_scopes) const {
+    std::string texture_tag = "global.texture";
+    for (auto& consumer_scope : consumer_scopes) {
+      if (consumer_scope.find(texture_tag) == 0) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool HasMixedStorageOutputs(const ExprNode* expr) {
+    if (storage_scope_.count(expr)) {
+      std::string ref_scope = storage_scope_[expr][0];
+      for (std::string& scope : storage_scope_[expr]) {
+        if (scope != ref_scope) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  bool SupportsTextureStorage(const CallNode* call) const {
+    bool supports_texture_storage = false;
+    if (auto attrs = call->attrs.as<Conv2DAttrs>()) {
+      if (attrs->data_layout == "NCHW4c" && attrs->kernel_layout == "OIHW4o") {
+        supports_texture_storage = true;
+      } else if (attrs->data_layout == "NHWC4c" &&
+                 (attrs->kernel_layout == "HWOI4o" || attrs->kernel_layout == "HWIO4o" ||
+                  attrs->kernel_layout == "OIHW4o")) {
+        supports_texture_storage = true;
+      }
+    } else if (auto attrs = call->attrs.as<Conv2DWinogradAttrs>()) {
+      if ((attrs->data_layout == "NCHW4c" || attrs->data_layout == "NHWC4c") &&
+          (attrs->kernel_layout == "OIHW4o" || attrs->kernel_layout == "HWIO4o")) {
+        supports_texture_storage = true;
+      }
+    } else if (auto attrs = call->attrs.as<GlobalPool2DAttrs>()) {
+      if (attrs->layout == "NCHW4c") {
+        supports_texture_storage = true;
+      }
+    } else if (auto attrs = call->attrs.as<MaxPool2DAttrs>()) {
+      if (attrs->layout == "NCHW4c") {
+        supports_texture_storage = true;
+      }
+    } else if (auto attrs = call->attrs.as<AvgPool2DAttrs>()) {
+      if (attrs->layout == "NCHW4c") {
+        supports_texture_storage = true;
+      }
+    }
+
+    return supports_texture_storage;
+  }
+
+  /*! \brief Temporary state for marking whether a visited function
+   *         primitive supports texture storage scope */
+  bool primitive_supports_texture_ = false;
+  /*! \brief expr storage scope mapping for each output  */
+  std::unordered_map<const ExprNode*, std::vector<std::string>> storage_scope_;
+  /*! \brief output storage scopes used by consumers of expr key  */
+  std::unordered_map<const ExprNode*, std::vector<std::string>> consumer_storage_scopes_;
+  /*! \brief mapping of arguments to call to function variables*/
+  std::unordered_map<Expr, std::vector<Var>, ObjectPtrHash, ObjectPtrEqual> args_to_vars_;
+};
+
+}  // namespace
+
+/**
+ * @brief Rewrites the memory_scope part of virtual devices for expressions determined
+ * by the StorageInfo analysis pass
+ *
+ * Currently this workflow supports analysis and rewriting of VirtualDevice for
+ * Constants and function Variables
+ */
+class VDRewriter : public transform::DeviceAwareExprMutator {

Review Comment:
   Done



##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():

Review Comment:
   Done





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r931982942


##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for all targets represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevices with the
+ *    required memory_scope collected from CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns a mapping from expressions to their desired memory scopes
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with the "global" scope to handle the PlanDevices pass, which
+    // propagates virtual devices from outputs to inputs. At the same time, outputs must stay
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in the consumers map means that the consumer potentially
+      // dealt with textures, so it is safe to mark this expr with the global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // The initial algorithm maps only the outputs of each expr, which is not enough; the
+    // VirtualDevice of function variables must also be updated for proper codegen. Adding vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (const auto* fn = call->op.as<FunctionNode>()) {
+      if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+        primitive_supports_texture_ = false;
+        Visit(call->op);
+        if (primitive_supports_texture_) {
+          if (call->checked_type().as<TensorTypeNode>()) {
+            std::string scope = "global.texture";
+            if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+              if (ttype->shape.size() == 5) {
+                scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+              }
+            }
+            storage_scope_[call].push_back(scope);
+          } else {
+            const auto* tuple_type = call->type_as<TupleTypeNode>();
+            ICHECK(tuple_type);
+            // TODO(csullivan): Add support for mixed output storage scope.
+            // In current adreno storage planner all outputs of a
+            // primitive function are assumed to be of the same storage
+            // type. This should be easy to extend in the future.
+            for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+              storage_scope_[call].push_back("global.texture");
+            }
+          }
+          for (size_t i = 0; i < fn->params.size(); i++) {
+            args_to_vars_[call->args[i]].push_back(fn->params[i]);
+          }
+        }
+        // Add consumer storage scope information for call arguments
+        for (auto& arg : call->args) {
+          if (storage_scope_.count(call)) {
+            ICHECK(!HasMixedStorageOutputs(call))
+                << "Mixed output storage scopes are not currently supported";
+            consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+          } else {
+            consumer_storage_scopes_[arg.operator->()].push_back("global");
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // We have filled storage_scope_ for all callees that support textures.
+    // We still need to verify whether this call expects a texture; if it does not, remove the
+    // entry from storage_scope_, since storage_scope_ was initially filled only based on the
+    // knowledge that the function is able to work with textures, not that a texture is
+    // actually expected by the function's callee
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      std::map<int, std::string> diffs;
+      int limit =
+          vd->target->GetAttr<Integer>("texture_spatial_limit").value_or(Integer(16384))->value;
+      int a0 = shape[0].as<IntImmNode>()->value;
+      int a1 = shape[1].as<IntImmNode>()->value;
+      int a2 = shape[2].as<IntImmNode>()->value;
+      int a3 = shape[3].as<IntImmNode>()->value;
+
+      int d3l = a0 * a1 * a2;
+      int d3r = a3;
+      int diff3 = d3l > d3r ? d3l - d3r : d3r - d3l;
+      if (d3l < limit && d3r < limit) diffs[diff3] = "";
+
+      int d2l = a0 * a1;
+      int d2r = a2 * a3;
+      int diff2 = d2l > d2r ? d2l - d2r : d2r - d2l;
+      if (d2l < limit && d2r < limit) diffs[diff2] = "nhwc";
+
+      int d1l = a0;
+      int d1r = a1 * a2 * a3;
+      int diff1 = d1l > d1r ? d1l - d1r : d1r - d1l;
+      if (d1l < limit && d1r < limit) diffs[diff1] = "weight";
+      if (!diffs.empty()) {
+        std::string scope = "global.texture";
+        if (!diffs.begin()->second.empty()) {
+          scope += ("-" + diffs.begin()->second);
+        }
+        return scope;
+      }
+    }
+    return "global";
+  }
+
+  void ApplyConsumerScopeToInputs(const ExprNode* expr) {
+    std::string scope;
+    auto consumer_scopes_it = consumer_storage_scopes_.find(expr);
+    if (consumer_scopes_it != consumer_storage_scopes_.end()) {
+      std::string consumer_scope = GetConsumerScope(consumer_scopes_it->second);
+      ICHECK(!storage_scope_.count(expr))
+          << "Already propagated consumer scopes to input: " << GetRef<Expr>(expr);
+
+      bool expr_is_rgba_vectorizable = false;
+      if (const auto* ttype = expr->checked_type().as<TensorTypeNode>()) {
+        if (ttype->shape.size() == 5) {

Review Comment:
   moved verification into Scope(), added comment





[GitHub] [tvm] csullivan commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
csullivan commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r935772265


##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+    D = relay.op.add(conv2, D)
+    D = D * relay.const(0.15, "float16")
+    D = relay.op.nn.relu(D)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.nn.relu(conv3)
+
+    mod = relay.Function([A, W1, B1, W2, W3], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global",
+        "global.texture",
+        "global.texture-weight",
+        "",

Review Comment:
   Thanks.
   
   >These scopes are applied to any type of tensor - data/weights/bias. The division into buckets is determined only by the values in the shape.
   
   This is something I hope we can change down the line with the use of Buffer.axis_separators and `sch.transform_layout`.
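   
   A minimal sketch of what that could look like at the TIR level (hypothetical, not part of this PR; it assumes the `transform_layout` and `set_axis_separator` schedule primitives from mainline TVM):
   
   ```python
   import tvm
   from tvm import te, tir
   
   # Toy elementwise op over an NCHW tensor
   A = te.placeholder((1, 32, 40, 40), name="A", dtype="float32")
   B = te.compute(A.shape, lambda n, c, h, w: A[n, c, h, w] + 1.0, name="B")
   
   sch = tir.Schedule(te.create_prim_func([A, B]))
   block = sch.get_block("B")
   # NCHW -> NCHW4c: pack channels into groups of 4 (the RGBA lane of a texture)
   sch.transform_layout(block, ("write", 0),
                        lambda n, c, h, w: (n, c // 4, h, w, c % 4))
   # Mark where the flattened 5d index space splits into two physical axes;
   # the separator position here is an illustrative choice
   sch.set_axis_separator(block, ("write", 0), [2])
   ```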
   
   >To compare the mapped values, we have to build such a mapping based on the json. That is doable but requires more effort.
   
   If you have extra cycles for adding these ergonomics in a follow-up PR, it would make the unit testing more accessible for other developers to author.
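   
   A possible shape for that helper (hypothetical sketch; it assumes the annotated scopes appear under the "storage_scope" attr of the graph executor JSON, and `assert_storage_scopes` is not an existing utility):
   
   ```python
   import json
   
   def assert_storage_scopes(graph_json, expected_scopes):
       """Compare per-storage memory scopes from the graph JSON with expectations."""
       graph = json.loads(graph_json)
       # graph attrs entries are of the form ["list_str", [values...]]
       scopes = graph["attrs"]["storage_scope"][1]
       assert scopes == expected_scopes, f"{scopes} != {expected_scopes}"
   ```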





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r929951105


##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,641 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_2conv2d():

Review Comment:
   Removed entirely, since this test duplicates the situation in another test.





[GitHub] [tvm] echuraev commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
echuraev commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r927596550


##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,641 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_2conv2d():

Review Comment:
   Maybe we should rename this test to something more meaningful?



##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,528 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for all targets represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevices with the
+ *    required memory_scope collected from CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns a mapping from expressions to their desired memory scopes
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with the "global" scope to handle the PlanDevices pass, which
+    // propagates virtual devices from outputs to inputs. At the same time, outputs must stay
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in the consumers map means that the consumer potentially
+      // dealt with textures, so it is safe to mark this expr with the global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // The initial algorithm maps only the outputs of each expr, which is not enough; the
+    // VirtualDevice of function variables must also be updated for proper codegen. Adding vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (DeviceSupportsTextureStorage(GetRef<Expr>(call))) {
+      if (const auto* fn = call->op.as<FunctionNode>()) {
+        if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+          primitive_supports_texture_ = false;
+          Visit(call->op);
+          if (primitive_supports_texture_) {
+            if (call->checked_type().as<TensorTypeNode>()) {
+              std::string scope = "global.texture";
+              if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+                if (ttype->shape.size() == 5) {
+                  scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+                }
+              }
+              storage_scope_[call].push_back(scope);
+            } else {
+              const auto* tuple_type = call->type_as<TupleTypeNode>();
+              ICHECK(tuple_type);
+              // TODO(csullivan): Add support for mixed output storage scope.
+              // In current adreno storage planner all outputs of a
+              // primitive function are assumed to be of the same storage
+              // type. This should be easy to extend in the future.
+              for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+                storage_scope_[call].push_back("global.texture");
+              }
+            }
+            for (size_t i = 0; i < fn->params.size(); i++) {
+              args_to_vars_[call->args[i]].push_back(fn->params[i]);
+            }
+          }
+          // Add consumer storage scope information for call arguments
+          for (auto& arg : call->args) {
+            if (storage_scope_.count(call)) {
+              ICHECK(!HasMixedStorageOutputs(call))
+                  << "Mixed output storage scopes are not currently supported";
+              consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+            } else {
+              consumer_storage_scopes_[arg.operator->()].push_back("global");
+            }
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // All callees that support textures have been filled into storage_scope_.
+    // We still need to verify whether this call expects a texture; if it does not,
+    // remove it from storage_scope_, since storage_scope_ was initially filled based
+    // only on the knowledge that a function is able to work with textures, not that
+    // a texture is actually expected by the function's callee
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      std::map<int, std::string> diffs;
+      int limit =
+          vd->target->GetAttr<Integer>("texture_spatial_limit").value_or(Integer(16384))->value;
+      int a0 = shape[0].as<IntImmNode>()->value;
+      int a1 = shape[1].as<IntImmNode>()->value;
+      int a2 = shape[2].as<IntImmNode>()->value;
+      int a3 = shape[3].as<IntImmNode>()->value;
+
+      int d3l = a0 * a1 * a2;
+      int d3r = a3;
+      int diff3 = d3l > d3r ? d3l - d3r : d3r - d3l;
+      if (d3l < limit && d3r < limit) diffs[diff3] = "";
+
+      int d2l = a0 * a1;
+      int d2r = a2 * a3;
+      int diff2 = d2l > d2r ? d2l - d2r : d2r - d2l;
+      if (d2l < limit && d2r < limit) diffs[diff2] = "nhwc";
+
+      int d1l = a0;
+      int d1r = a1 * a2 * a3;
+      int diff1 = d1l > d1r ? d1l - d1r : d1r - d1l;
+      if (d1l < limit && d1r < limit) diffs[diff1] = "weight";
+      if (!diffs.empty()) {
+        std::string scope = "global.texture";
+        if (!diffs.begin()->second.empty()) {
+          scope += ("-" + diffs.begin()->second);
+        }
+        return scope;
+      }
+    }
+    return "global";
+  }
+
+  void ApplyConsumerScopeToInputs(const ExprNode* expr) {
+    std::string scope;
+    auto consumer_scopes_it = consumer_storage_scopes_.find(expr);
+    if (consumer_scopes_it != consumer_storage_scopes_.end()) {
+      std::string consumer_scope = GetConsumerScope(consumer_scopes_it->second);
+      ICHECK(!storage_scope_.count(expr))
+          << "Already propagated consumer scopes to input: " << GetRef<Expr>(expr);
+
+      bool expr_is_rgba_vectorizable = false;
+      if (const auto* ttype = expr->checked_type().as<TensorTypeNode>()) {
+        if (ttype->shape.size() == 5) {
+          scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(expr)));
+          if (scope != "global") {
+            auto inner_dim = ttype->shape.back().as<IntImmNode>();
+            if (inner_dim && inner_dim->value == 4) {
+              expr_is_rgba_vectorizable = true;
+            }
+          }
+        }
+      }
+
+      // Only propagate texture scope from consumers to input expr if
+      // the input shape of the input expr is rgba vectorizable.
+      if (consumer_scope.find("global.texture") != std::string::npos) {
+        if (expr_is_rgba_vectorizable) {
+          storage_scope_[expr].push_back(scope);
+        }
+      } else {
+        storage_scope_[expr].push_back(consumer_scope);
+      }
+    }
+  }
+
+  void LegalizeProducerStorage() {
+    for (auto& kv : consumer_storage_scopes_) {
+      const ExprNode* producer = kv.first;
+      std::string legal_scope = GetConsumerScope(kv.second);
+      if (storage_scope_.count(producer)) {
+        ICHECK(!HasMixedStorageOutputs(producer))
+            << "Mixed output storage scopes are not currently supported";
+        if (storage_scope_[producer][0].find(legal_scope) == std::string::npos) {
+          for (size_t i = 0; i < storage_scope_[producer].size(); i++) {
+            // Only support uniform storage scope across all outputs for now
+            storage_scope_[producer][i] = legal_scope;
+          }
+        }
+      }
+    }
+  }
+
+  bool DeviceSupportsTextureStorage(const Expr& expr) {
+    auto vd = GetVirtualDevice(expr);
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      if (Optional<String> t_device = vd->target->GetAttr<String>("device")) {
+        if (vd->target->kind->device_type == kDLOpenCL && t_device.defined()) {
+          if (t_device.value() == "adreno") {

Review Comment:
   Just an idea: perhaps we could add a method to `t_device` which reports whether the device supports textures or not?
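
   A minimal sketch of what such a helper could look like (hypothetical free
   function; the name `TargetSupportsTextures` is illustrative and not part of
   the current Target API):

       #include <tvm/target/target.h>

       // Sketch: report whether a target's device can use texture memory,
       // mirroring the check performed in DeviceSupportsTextureStorage above.
       bool TargetSupportsTextures(const tvm::Target& target) {
         if (target->kind->device_type != kDLOpenCL) return false;
         // "device" is the existing OpenCL target attribute, e.g. --device=adreno
         tvm::Optional<tvm::String> device = target->GetAttr<tvm::String>("device");
         return device.defined() && device.value() == "adreno";
       }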



##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,641 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_2conv2d():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (96, 32, 2, 2)
+    filter_shape2 = (32, 96, 2, 2)
+    bias_shape1 = (1, 96, 1, 1)
+    bias_shape2 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    B2 = relay.var("bias2", shape=bias_shape2, dtype=dtype)
+
+    # C = relay.nn.relu(A)
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=96,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv2, B2)
+    D = relay.op.nn.relu(D)
+
+    mod = relay.Function([A, W1, B1, W2, B2], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    bias_data2 = np.zeros(bias_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    initializer("bias", bias_data2)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "bias2": tvm.nd.array(bias_data2),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture-nhwc",
+        "global.texture-weight",
+        "global.texture-weight",
+        "",
+        "",
+    ]
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    # bias_shape2 = (1, 32, 1, 1)

Review Comment:
   Please remove the commented code



##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,528 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target-specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for each target represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the
+ *    memory_scope collected from CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns a mapping from expressions to desired memory scopes
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with "global" scope to handle the PlanDevices algo, which
+    // propagates virtual devices from outputs to inputs. At the same time outputs must be
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in consumer_storage_scopes_ means the consumer potentially
+      // dealt with textures, so it is safe to mark this expr with global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // the initial algo maps only the outputs of each expr, which is not enough: the
+    // VirtualDevice of function variables must also be updated for proper codegen. Add vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (DeviceSupportsTextureStorage(GetRef<Expr>(call))) {
+      if (const auto* fn = call->op.as<FunctionNode>()) {
+        if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+          primitive_supports_texture_ = false;
+          Visit(call->op);
+          if (primitive_supports_texture_) {
+            if (call->checked_type().as<TensorTypeNode>()) {
+              std::string scope = "global.texture";
+              if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+                if (ttype->shape.size() == 5) {
+                  scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+                }
+              }
+              storage_scope_[call].push_back(scope);
+            } else {
+              const auto* tuple_type = call->type_as<TupleTypeNode>();
+              ICHECK(tuple_type);
+              // TODO(csullivan): Add support for mixed output storage scope.
+              // In current adreno storage planner all outputs of a
+              // primitive function are assumed to be of the same storage
+              // type. This should be easy to extend in the future.
+              for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+                storage_scope_[call].push_back("global.texture");
+              }
+            }
+            for (size_t i = 0; i < fn->params.size(); i++) {
+              args_to_vars_[call->args[i]].push_back(fn->params[i]);
+            }
+          }
+          // Add consumer storage scope information for call arguments
+          for (auto& arg : call->args) {
+            if (storage_scope_.count(call)) {
+              ICHECK(!HasMixedStorageOutputs(call))
+                  << "Mixed output storage scopes are not currently supported";
+              consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+            } else {
+              consumer_storage_scopes_[arg.operator->()].push_back("global");
+            }
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // All callees that support textures have been filled into storage_scope_.
+    // We still need to verify whether this call expects a texture; if it does not,
+    // remove it from storage_scope_, since storage_scope_ was initially filled based
+    // only on the knowledge that a function is able to work with textures, not that
+    // a texture is actually expected by the function's callee
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      std::map<int, std::string> diffs;
+      int limit =
+          vd->target->GetAttr<Integer>("texture_spatial_limit").value_or(Integer(16384))->value;
+      int a0 = shape[0].as<IntImmNode>()->value;
+      int a1 = shape[1].as<IntImmNode>()->value;
+      int a2 = shape[2].as<IntImmNode>()->value;
+      int a3 = shape[3].as<IntImmNode>()->value;
+
+      int d3l = a0 * a1 * a2;
+      int d3r = a3;
+      int diff3 = d3l > d3r ? d3l - d3r : d3r - d3l;
+      if (d3l < limit && d3r < limit) diffs[diff3] = "";
+
+      int d2l = a0 * a1;
+      int d2r = a2 * a3;
+      int diff2 = d2l > d2r ? d2l - d2r : d2r - d2l;
+      if (d2l < limit && d2r < limit) diffs[diff2] = "nhwc";
+
+      int d1l = a0;
+      int d1r = a1 * a2 * a3;
+      int diff1 = d1l > d1r ? d1l - d1r : d1r - d1l;
+      if (d1l < limit && d1r < limit) diffs[diff1] = "weight";
+      if (!diffs.empty()) {
+        std::string scope = "global.texture";
+        if (!diffs.begin()->second.empty()) {
+          scope += ("-" + diffs.begin()->second);
+        }
+        return scope;
+      }
+    }
+    return "global";
+  }
+
+  void ApplyConsumerScopeToInputs(const ExprNode* expr) {
+    std::string scope;
+    auto consumer_scopes_it = consumer_storage_scopes_.find(expr);
+    if (consumer_scopes_it != consumer_storage_scopes_.end()) {
+      std::string consumer_scope = GetConsumerScope(consumer_scopes_it->second);
+      ICHECK(!storage_scope_.count(expr))
+          << "Already propagated consumer scopes to input: " << GetRef<Expr>(expr);
+
+      bool expr_is_rgba_vectorizable = false;
+      if (const auto* ttype = expr->checked_type().as<TensorTypeNode>()) {
+        if (ttype->shape.size() == 5) {
+          scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(expr)));
+          if (scope != "global") {
+            auto inner_dim = ttype->shape.back().as<IntImmNode>();
+            if (inner_dim && inner_dim->value == 4) {
+              expr_is_rgba_vectorizable = true;
+            }
+          }
+        }
+      }
+
+      // Only propagate texture scope from consumers to input expr if
+      // the input shape of the input expr is rgba vectorizable.
+      if (consumer_scope.find("global.texture") != std::string::npos) {
+        if (expr_is_rgba_vectorizable) {
+          storage_scope_[expr].push_back(scope);
+        }
+      } else {
+        storage_scope_[expr].push_back(consumer_scope);
+      }
+    }
+  }
+
+  void LegalizeProducerStorage() {
+    for (auto& kv : consumer_storage_scopes_) {
+      const ExprNode* producer = kv.first;
+      std::string legal_scope = GetConsumerScope(kv.second);
+      if (storage_scope_.count(producer)) {
+        ICHECK(!HasMixedStorageOutputs(producer))
+            << "Mixed output storage scopes are not currently supported";
+        if (storage_scope_[producer][0].find(legal_scope) == std::string::npos) {
+          for (size_t i = 0; i < storage_scope_[producer].size(); i++) {
+            // Only support uniform storage scope across all outputs for now
+            storage_scope_[producer][i] = legal_scope;
+          }
+        }
+      }
+    }
+  }
+
+  bool DeviceSupportsTextureStorage(const Expr& expr) {
+    auto vd = GetVirtualDevice(expr);
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      if (Optional<String> t_device = vd->target->GetAttr<String>("device")) {
+        if (vd->target->kind->device_type == kDLOpenCL && t_device.defined()) {
+          if (t_device.value() == "adreno") {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  std::string GetConsumerScope(const std::vector<std::string>& consumer_scopes) const {
+    if (!consumer_scopes.size()) {
+      return "global";
+    }
+    std::string texture_tag = "global.texture";
+    for (auto& consumer_scope : consumer_scopes) {
+      if (consumer_scope.find(texture_tag) == std::string::npos) {
+        return "global";
+      }
+    }
+    return texture_tag;
+  }
+
+  bool CanConsumeTextures(const std::vector<std::string>& consumer_scopes) const {
+    if (!consumer_scopes.size()) {
+      return false;
+    }

Review Comment:
   Can we remove this? We won't enter the loop anyway, and `false` will be returned. The same comment applies to `GetConsumerScope`.
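
   One caveat worth double-checking before dropping both: for
   `CanConsumeTextures` the early return is indeed redundant, but in
   `GetConsumerScope` an empty list skips the loop and falls through to
   `return texture_tag;`, yielding "global.texture" instead of "global". A
   minimal standalone sketch of the quoted loop (names are illustrative):

       #include <iostream>
       #include <string>
       #include <vector>

       // Same logic as the quoted GetConsumerScope, with the empty-check removed.
       std::string GetConsumerScopeNoEarlyReturn(const std::vector<std::string>& scopes) {
         std::string texture_tag = "global.texture";
         for (auto& scope : scopes) {
           if (scope.find(texture_tag) == std::string::npos) return "global";
         }
         return texture_tag;  // also reached when `scopes` is empty
       }

       int main() {
         std::cout << GetConsumerScopeNoEarlyReturn({}) << "\n";  // prints "global.texture"
       }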





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r932015279


##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target-specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for each target represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the
+ *    memory_scope collected from CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns a mapping from expressions to desired memory scopes
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with "global" scope to handle the PlanDevices algo, which
+    // propagates virtual devices from outputs to inputs. At the same time outputs must be
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in consumer_storage_scopes_ means the consumer potentially
+      // dealt with textures, so it is safe to mark this expr with global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // the initial algo maps only the outputs of each expr, which is not enough: the
+    // VirtualDevice of function variables must also be updated for proper codegen. Add vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (const auto* fn = call->op.as<FunctionNode>()) {
+      if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+        primitive_supports_texture_ = false;
+        Visit(call->op);
+        if (primitive_supports_texture_) {
+          if (call->checked_type().as<TensorTypeNode>()) {
+            std::string scope = "global.texture";
+            if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+              if (ttype->shape.size() == 5) {
+                scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+              }
+            }
+            storage_scope_[call].push_back(scope);
+          } else {
+            const auto* tuple_type = call->type_as<TupleTypeNode>();
+            ICHECK(tuple_type);
+            // TODO(csullivan): Add support for mixed output storage scope.
+            // In current adreno storage planner all outputs of a
+            // primitive function are assumed to be of the same storage
+            // type. This should be easy to extend in the future.
+            for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+              storage_scope_[call].push_back("global.texture");
+            }
+          }
+          for (size_t i = 0; i < fn->params.size(); i++) {
+            args_to_vars_[call->args[i]].push_back(fn->params[i]);
+          }
+        }
+        // Add consumer storage scope information for call arguments
+        for (auto& arg : call->args) {
+          if (storage_scope_.count(call)) {
+            ICHECK(!HasMixedStorageOutputs(call))
+                << "Mixed output storage scopes are not currently supported";
+            consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+          } else {
+            consumer_storage_scopes_[arg.operator->()].push_back("global");
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // All callees that support textures have been filled into storage_scope_.
+    // We still need to verify whether this call expects a texture; if it does not,
+    // remove it from storage_scope_, since storage_scope_ was initially filled based
+    // only on the knowledge that a function is able to work with textures, not that
+    // a texture is actually expected by the function's callee
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {

Review Comment:
   Done
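
   For readers following the thread, a small worked trace of the layout
   heuristic in the quoted `Scope()` (the shape below is illustrative; the
   limit comes from the target's `texture_spatial_limit` attribute, default
   16384):

       // Scope({1, 8, 40, 40, 4}, vd):
       //   d3l = 1*8*40 = 320, d3r = 40       -> diff3 = 280    (tag "")
       //   d2l = 1*8    = 8,   d2r = 40*40    -> diff2 = 1592   (tag "nhwc")
       //   d1l = 1,            d1r = 8*40*40  -> diff1 = 12799  (tag "weight")
       // All extents are below the limit, so every candidate is kept. The
       // smallest difference (280) wins, its tag is empty, and the function
       // returns "global.texture".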





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r931984198


##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target-specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for each target represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the
+ *    memory_scope collected from CollectStorageInfo

Review Comment:
   Done





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r929977746


##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,641 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_2conv2d():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (96, 32, 2, 2)
+    filter_shape2 = (32, 96, 2, 2)
+    bias_shape1 = (1, 96, 1, 1)
+    bias_shape2 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    B2 = relay.var("bias2", shape=bias_shape2, dtype=dtype)
+
+    # C = relay.nn.relu(A)
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=96,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv2, B2)
+    D = relay.op.nn.relu(D)
+
+    mod = relay.Function([A, W1, B1, W2, B2], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    bias_data2 = np.zeros(bias_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    initializer("bias", bias_data2)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "bias2": tvm.nd.array(bias_data2),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture-nhwc",
+        "global.texture-weight",
+        "global.texture-weight",
+        "",
+        "",
+    ]
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    # bias_shape2 = (1, 32, 1, 1)

Review Comment:
   done
   





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r929952390


##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,528 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target-specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for each target represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the
+ *    memory_scope collected from CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns a mapping from expressions to desired memory scopes
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with "global" scope to handle the PlanDevices algo, which
+    // propagates virtual devices from outputs to inputs. At the same time outputs must be
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in consumer_storage_scopes_ means the consumer potentially
+      // dealt with textures, so it is safe to mark this expr with global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // the initial algo maps only the outputs of each expr, which is not enough: the
+    // VirtualDevice of function variables must also be updated for proper codegen. Add vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (DeviceSupportsTextureStorage(GetRef<Expr>(call))) {
+      if (const auto* fn = call->op.as<FunctionNode>()) {
+        if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+          primitive_supports_texture_ = false;
+          Visit(call->op);
+          if (primitive_supports_texture_) {
+            if (call->checked_type().as<TensorTypeNode>()) {
+              std::string scope = "global.texture";
+              if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+                if (ttype->shape.size() == 5) {
+                  scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+                }
+              }
+              storage_scope_[call].push_back(scope);
+            } else {
+              const auto* tuple_type = call->type_as<TupleTypeNode>();
+              ICHECK(tuple_type);
+              // TODO(csullivan): Add support for mixed output storage scope.
+              // In current adreno storage planner all outputs of a
+              // primitive function are assumed to be of the same storage
+              // type. This should be easy to extend in the future.
+              for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+                storage_scope_[call].push_back("global.texture");
+              }
+            }
+            for (size_t i = 0; i < fn->params.size(); i++) {
+              args_to_vars_[call->args[i]].push_back(fn->params[i]);
+            }
+          }
+          // Add consumer storage scope information for call arguments
+          for (auto& arg : call->args) {
+            if (storage_scope_.count(call)) {
+              ICHECK(!HasMixedStorageOutputs(call))
+                  << "Mixed output storage scopes are not currently supported";
+              consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+            } else {
+              consumer_storage_scopes_[arg.operator->()].push_back("global");
+            }
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // All callees that support textures have been filled into storage_scope_.
+    // We still need to verify whether this call expects a texture; if it does not,
+    // remove it from storage_scope_, since storage_scope_ was initially filled based
+    // only on the knowledge that a function is able to work with textures, not that
+    // a texture is actually expected by the function's callee
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      std::map<int, std::string> diffs;
+      int limit =
+          vd->target->GetAttr<Integer>("texture_spatial_limit").value_or(Integer(16384))->value;
+      int a0 = shape[0].as<IntImmNode>()->value;
+      int a1 = shape[1].as<IntImmNode>()->value;
+      int a2 = shape[2].as<IntImmNode>()->value;
+      int a3 = shape[3].as<IntImmNode>()->value;
+
+      int d3l = a0 * a1 * a2;
+      int d3r = a3;
+      int diff3 = d3l > d3r ? d3l - d3r : d3r - d3l;
+      if (d3l < limit && d3r < limit) diffs[diff3] = "";
+
+      int d2l = a0 * a1;
+      int d2r = a2 * a3;
+      int diff2 = d2l > d2r ? d2l - d2r : d2r - d2l;
+      if (d2l < limit && d2r < limit) diffs[diff2] = "nhwc";
+
+      int d1l = a0;
+      int d1r = a1 * a2 * a3;
+      int diff1 = d1l > d1r ? d1l - d1r : d1r - d1l;
+      if (d1l < limit && d1r < limit) diffs[diff1] = "weight";
+      if (!diffs.empty()) {
+        std::string scope = "global.texture";
+        if (!diffs.begin()->second.empty()) {
+          scope += ("-" + diffs.begin()->second);
+        }
+        return scope;
+      }
+    }
+    return "global";
+  }
+
+  void ApplyConsumerScopeToInputs(const ExprNode* expr) {
+    std::string scope;
+    auto consumer_scopes_it = consumer_storage_scopes_.find(expr);
+    if (consumer_scopes_it != consumer_storage_scopes_.end()) {
+      std::string consumer_scope = GetConsumerScope(consumer_scopes_it->second);
+      ICHECK(!storage_scope_.count(expr))
+          << "Already propagated consumer scopes to input: " << GetRef<Expr>(expr);
+
+      bool expr_is_rgba_vectorizable = false;
+      if (const auto* ttype = expr->checked_type().as<TensorTypeNode>()) {
+        if (ttype->shape.size() == 5) {
+          scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(expr)));
+          if (scope != "global") {
+            auto inner_dim = ttype->shape.back().as<IntImmNode>();
+            if (inner_dim && inner_dim->value == 4) {
+              expr_is_rgba_vectorizable = true;
+            }
+          }
+        }
+      }
+
+      // Only propagate texture scope from consumers to input expr if
+      // the input shape of the input expr is rgba vectorizable.
+      if (consumer_scope.find("global.texture") != std::string::npos) {
+        if (expr_is_rgba_vectorizable) {
+          storage_scope_[expr].push_back(scope);
+        }
+      } else {
+        storage_scope_[expr].push_back(consumer_scope);
+      }
+    }
+  }
+
+  void LegalizeProducerStorage() {
+    for (auto& kv : consumer_storage_scopes_) {
+      const ExprNode* producer = kv.first;
+      std::string legal_scope = GetConsumerScope(kv.second);
+      if (storage_scope_.count(producer)) {
+        ICHECK(!HasMixedStorageOutputs(producer))
+            << "Mixed output storage scopes are not currently supported";
+        if (storage_scope_[producer][0].find(legal_scope) == std::string::npos) {
+          for (size_t i = 0; i < storage_scope_[producer].size(); i++) {
+            // Only support uniform storage scope across all outputs for now
+            storage_scope_[producer][i] = legal_scope;
+          }
+        }
+      }
+    }
+  }
+
+  bool DeviceSupportsTextureStorage(const Expr& expr) {
+    auto vd = GetVirtualDevice(expr);
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      if (Optional<String> t_device = vd->target->GetAttr<String>("device")) {
+        if (vd->target->kind->device_type == kDLOpenCL && t_device.defined()) {
+          if (t_device.value() == "adreno") {

Review Comment:
   removed this method entirely, since annotation is already executed only for the adreno target.
   Regarding verification of texture support - this should be extended separately.
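
   If that extension is picked up later, one possible interim shape for it
   (sketch only, using the presence of the existing `texture_spatial_limit`
   target attribute as a proxy for texture support; not what this PR
   implements):

       #include <tvm/target/target.h>

       // Sketch: treat an OpenCL target as texture-capable when it declares a
       // texture spatial limit, instead of hard-coding the device name.
       bool DeviceDeclaresTextureSupport(const tvm::Target& target) {
         if (target->kind->device_type != kDLOpenCL) return false;
         return target->GetAttr<tvm::Integer>("texture_spatial_limit").defined();
       }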



##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,528 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target-specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for each target represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the
+ *    memory_scope collected from CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns a mapping from expressions to desired memory scopes
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with "global" scope to handle the PlanDevices algo, which
+    // propagates virtual devices from outputs to inputs. At the same time outputs must be
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in consumer_storage_scopes_ means the consumer potentially
+      // dealt with textures, so it is safe to mark this expr with global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // the initial algo maps only the outputs of each expr, which is not enough: the
+    // VirtualDevice of function variables must also be updated for proper codegen. Add vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (DeviceSupportsTextureStorage(GetRef<Expr>(call))) {
+      if (const auto* fn = call->op.as<FunctionNode>()) {
+        if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+          primitive_supports_texture_ = false;
+          Visit(call->op);
+          if (primitive_supports_texture_) {
+            if (call->checked_type().as<TensorTypeNode>()) {
+              std::string scope = "global.texture";
+              if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+                if (ttype->shape.size() == 5) {
+                  scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+                }
+              }
+              storage_scope_[call].push_back(scope);
+            } else {
+              const auto* tuple_type = call->type_as<TupleTypeNode>();
+              ICHECK(tuple_type);
+              // TODO(csullivan): Add support for mixed output storage scope.
+              // In current adreno storage planner all outputs of a
+              // primitive function are assumed to be of the same storage
+              // type. This should be easy to extend in the future.
+              for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+                storage_scope_[call].push_back("global.texture");
+              }
+            }
+            for (size_t i = 0; i < fn->params.size(); i++) {
+              args_to_vars_[call->args[i]].push_back(fn->params[i]);
+            }
+          }
+          // Add consumer storage scope information for call arguments
+          for (auto& arg : call->args) {
+            if (storage_scope_.count(call)) {
+              ICHECK(!HasMixedStorageOutputs(call))
+                  << "Mixed output storage scopes are not currently supported";
+              consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+            } else {
+              consumer_storage_scopes_[arg.operator->()].push_back("global");
+            }
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // All callees that support textures have been filled into storage_scope_.
+    // We still need to verify whether this call expects a texture; if it does not,
+    // remove it from storage_scope_, since storage_scope_ was initially filled based
+    // only on the knowledge that a function is able to work with textures, not that
+    // a texture is actually expected by the function's callee
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      std::map<int, std::string> diffs;
+      int limit =
+          vd->target->GetAttr<Integer>("texture_spatial_limit").value_or(Integer(16384))->value;
+      int a0 = shape[0].as<IntImmNode>()->value;
+      int a1 = shape[1].as<IntImmNode>()->value;
+      int a2 = shape[2].as<IntImmNode>()->value;
+      int a3 = shape[3].as<IntImmNode>()->value;
+
+      int d3l = a0 * a1 * a2;
+      int d3r = a3;
+      int diff3 = d3l > d3r ? d3l - d3r : d3r - d3l;
+      if (d3l < limit && d3r < limit) diffs[diff3] = "";
+
+      int d2l = a0 * a1;
+      int d2r = a2 * a3;
+      int diff2 = d2l > d2r ? d2l - d2r : d2r - d2l;
+      if (d2l < limit && d2r < limit) diffs[diff2] = "nhwc";
+
+      int d1l = a0;
+      int d1r = a1 * a2 * a3;
+      int diff1 = d1l > d1r ? d1l - d1r : d1r - d1l;
+      if (d1l < limit && d1r < limit) diffs[diff1] = "weight";
+      if (!diffs.empty()) {
+        std::string scope = "global.texture";
+        if (!diffs.begin()->second.empty()) {
+          scope += ("-" + diffs.begin()->second);
+        }
+        return scope;
+      }
+    }
+    return "global";
+  }
+
+  void ApplyConsumerScopeToInputs(const ExprNode* expr) {
+    std::string scope;
+    auto consumer_scopes_it = consumer_storage_scopes_.find(expr);
+    if (consumer_scopes_it != consumer_storage_scopes_.end()) {
+      std::string consumer_scope = GetConsumerScope(consumer_scopes_it->second);
+      ICHECK(!storage_scope_.count(expr))
+          << "Already propagated consumer scopes to input: " << GetRef<Expr>(expr);
+
+      bool expr_is_rgba_vectorizable = false;
+      if (const auto* ttype = expr->checked_type().as<TensorTypeNode>()) {
+        if (ttype->shape.size() == 5) {
+          scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(expr)));
+          if (scope != "global") {
+            auto inner_dim = ttype->shape.back().as<IntImmNode>();
+            if (inner_dim && inner_dim->value == 4) {
+              expr_is_rgba_vectorizable = true;
+            }
+          }
+        }
+      }
+
+      // Only propagate texture scope from consumers to an input expr if
+      // that expr's shape is rgba vectorizable.
+      if (consumer_scope.find("global.texture") != std::string::npos) {
+        if (expr_is_rgba_vectorizable) {
+          storage_scope_[expr].push_back(scope);
+        }
+      } else {
+        storage_scope_[expr].push_back(consumer_scope);
+      }
+    }
+  }
+
+  void LegalizeProducerStorage() {
+    for (auto& kv : consumer_storage_scopes_) {
+      const ExprNode* producer = kv.first;
+      std::string legal_scope = GetConsumerScope(kv.second);
+      if (storage_scope_.count(producer)) {
+        ICHECK(!HasMixedStorageOutputs(producer))
+            << "Mixed output storage scopes are not currently supported";
+        if (storage_scope_[producer][0].find(legal_scope) == std::string::npos) {
+          for (size_t i = 0; i < storage_scope_[producer].size(); i++) {
+            // Only support uniform storage scope across all outputs for now
+            storage_scope_[producer][i] = legal_scope;
+          }
+        }
+      }
+    }
+  }
+
+  bool DeviceSupportsTextureStorage(const Expr& expr) {
+    auto vd = GetVirtualDevice(expr);
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      if (Optional<String> t_device = vd->target->GetAttr<String>("device")) {
+        if (vd->target->kind->device_type == kDLOpenCL && t_device.defined()) {
+          if (t_device.value() == "adreno") {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  std::string GetConsumerScope(const std::vector<std::string>& consumer_scopes) const {
+    if (!consumer_scopes.size()) {
+      return "global";
+    }
+    std::string texture_tag = "global.texture";
+    for (auto& consumer_scope : consumer_scopes) {
+      if (consumer_scope.find(texture_tag) == std::string::npos) {
+        return "global";
+      }
+    }
+    return texture_tag;
+  }
+
+  bool CanConsumeTextures(const std::vector<std::string>& consumer_scopes) const {
+    if (!consumer_scopes.size()) {
+      return false;
+    }

Review Comment:
   removed
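
The consumer-scope reductions quoted in the hunks above boil down to two small rules. A minimal Python model of them (the names mirror the C++ helpers; this is an illustration, not part of the TVM API):

```python
def get_consumer_scope(consumer_scopes):
    """A producer keeps texture scope only if *every* consumer wants a texture."""
    if not consumer_scopes:
        return "global"
    if all("global.texture" in scope for scope in consumer_scopes):
        return "global.texture"
    return "global"


def can_consume_textures(consumer_scopes):
    """A *single* texture-capable consumer is enough to pin an argument's scope."""
    return any(scope.startswith("global.texture") for scope in consumer_scopes)
```

This asymmetry is why LegalizeProducerStorage demotes a producer to "global" as soon as any one of its consumers reads through a buffer.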





[GitHub] [tvm] csullivan commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
csullivan commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r931581417


##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for all targets represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the required
+ *    memory_scop collected from the CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns mapping of expressions vs desired memory scope
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with "global" scope to handle the PlanDevices algo, which
+    // propagates virtual devices from outputs to inputs. At the same time outputs must be
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in consumers means that the consumer potentially
+      // dealt with textures; it is safe to mark this expr with global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // the initial algo maps only the outputs of the expr, which is not enough; we need to update
+    // VirtualDevice for function variables to get proper codegen. Adding vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (const auto* fn = call->op.as<FunctionNode>()) {
+      if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+        primitive_supports_texture_ = false;
+        Visit(call->op);
+        if (primitive_supports_texture_) {
+          if (call->checked_type().as<TensorTypeNode>()) {
+            std::string scope = "global.texture";
+            if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+              if (ttype->shape.size() == 5) {
+                scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+              }
+            }
+            storage_scope_[call].push_back(scope);
+          } else {
+            const auto* tuple_type = call->type_as<TupleTypeNode>();
+            ICHECK(tuple_type);
+            // TODO(csullivan): Add support for mixed output storage scope.
+            // In current adreno storage planner all outputs of a
+            // primitive function are assumed to be of the same storage
+            // type. This should be easy to extend in the future.
+            for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+              storage_scope_[call].push_back("global.texture");
+            }
+          }
+          for (size_t i = 0; i < fn->params.size(); i++) {
+            args_to_vars_[call->args[i]].push_back(fn->params[i]);
+          }
+        }
+        // Add consumer storage scope information for call arguments
+        for (auto& arg : call->args) {
+          if (storage_scope_.count(call)) {
+            ICHECK(!HasMixedStorageOutputs(call))
+                << "Mixed output storage scopes are not currently supported";
+            consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+          } else {
+            consumer_storage_scopes_[arg.operator->()].push_back("global");
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // We have all callees filled into storage_scope_ if they support textures.
+    // We need to verify whether this call expects a texture and, if it does not, remove it from
+    // storage_scope_, since storage_scope_ is initially filled only based on the knowledge
+    // that the function is able to work with textures, not that a texture is necessarily
+    // expected by the function callee
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      std::map<int, std::string> diffs;
+      int limit =
+          vd->target->GetAttr<Integer>("texture_spatial_limit").value_or(Integer(16384))->value;
+      int a0 = shape[0].as<IntImmNode>()->value;
+      int a1 = shape[1].as<IntImmNode>()->value;
+      int a2 = shape[2].as<IntImmNode>()->value;
+      int a3 = shape[3].as<IntImmNode>()->value;
+
+      int d3l = a0 * a1 * a2;
+      int d3r = a3;
+      int diff3 = d3l > d3r ? d3l - d3r : d3r - d3l;
+      if (d3l < limit && d3r < limit) diffs[diff3] = "";
+
+      int d2l = a0 * a1;
+      int d2r = a2 * a3;
+      int diff2 = d2l > d2r ? d2l - d2r : d2r - d2l;
+      if (d2l < limit && d2r < limit) diffs[diff2] = "nhwc";
+
+      int d1l = a0;
+      int d1r = a1 * a2 * a3;
+      int diff1 = d1l > d1r ? d1l - d1r : d1r - d1l;
+      if (d1l < limit && d1r < limit) diffs[diff1] = "weight";
+      if (!diffs.empty()) {
+        std::string scope = "global.texture";
+        if (!diffs.begin()->second.empty()) {
+          scope += ("-" + diffs.begin()->second);
+        }
+        return scope;
+      }
+    }
+    return "global";
+  }
+
+  void ApplyConsumerScopeToInputs(const ExprNode* expr) {
+    std::string scope;
+    auto consumer_scopes_it = consumer_storage_scopes_.find(expr);
+    if (consumer_scopes_it != consumer_storage_scopes_.end()) {
+      std::string consumer_scope = GetConsumerScope(consumer_scopes_it->second);
+      ICHECK(!storage_scope_.count(expr))
+          << "Already propagated consumer scopes to input: " << GetRef<Expr>(expr);
+
+      bool expr_is_rgba_vectorizable = false;
+      if (const auto* ttype = expr->checked_type().as<TensorTypeNode>()) {
+        if (ttype->shape.size() == 5) {
+          scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(expr)));
+          if (scope != "global") {
+            auto inner_dim = ttype->shape.back().as<IntImmNode>();
+            if (inner_dim && inner_dim->value == 4) {
+              expr_is_rgba_vectorizable = true;
+            }
+          }
+        }
+      }
+
+      // Only propagate texture scope from consumers to an input expr if
+      // that expr's shape is rgba vectorizable.
+      if (consumer_scope.find("global.texture") != std::string::npos) {
+        if (expr_is_rgba_vectorizable) {
+          storage_scope_[expr].push_back(scope);
+        }
+      } else {
+        storage_scope_[expr].push_back(consumer_scope);
+      }
+    }
+  }
+
+  void LegalizeProducerStorage() {
+    for (auto& kv : consumer_storage_scopes_) {
+      const ExprNode* producer = kv.first;
+      std::string legal_scope = GetConsumerScope(kv.second);
+      if (storage_scope_.count(producer)) {
+        ICHECK(!HasMixedStorageOutputs(producer))
+            << "Mixed output storage scopes are not currently supported";
+        if (storage_scope_[producer][0].find(legal_scope) == std::string::npos) {
+          for (size_t i = 0; i < storage_scope_[producer].size(); i++) {
+            // Only support uniform storage scope across all outputs for now
+            storage_scope_[producer][i] = legal_scope;
+          }
+        }
+      }
+    }
+  }
+
+  std::string GetConsumerScope(const std::vector<std::string>& consumer_scopes) const {
+    if (!consumer_scopes.size()) {
+      return "global";
+    }
+    std::string texture_tag = "global.texture";
+    for (auto& consumer_scope : consumer_scopes) {
+      if (consumer_scope.find(texture_tag) == std::string::npos) {
+        return "global";
+      }
+    }
+    return texture_tag;
+  }
+
+  bool CanConsumeTextures(const std::vector<std::string>& consumer_scopes) const {
+    std::string texture_tag = "global.texture";
+    for (auto& consumer_scope : consumer_scopes) {
+      if (consumer_scope.find(texture_tag) == 0) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool HasMixedStorageOutputs(const ExprNode* expr) {
+    if (storage_scope_.count(expr)) {
+      std::string ref_scope = storage_scope_[expr][0];
+      for (std::string& scope : storage_scope_[expr]) {
+        if (scope != ref_scope) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  bool SupportsTextureStorage(const CallNode* call) const {
+    bool supports_texture_storage = false;
+    if (auto attrs = call->attrs.as<Conv2DAttrs>()) {
+      if (attrs->data_layout == "NCHW4c" && attrs->kernel_layout == "OIHW4o") {
+        supports_texture_storage = true;
+      } else if (attrs->data_layout == "NHWC4c" &&
+                 (attrs->kernel_layout == "HWOI4o" || attrs->kernel_layout == "HWIO4o" ||
+                  attrs->kernel_layout == "OIHW4o")) {
+        supports_texture_storage = true;
+      }
+    } else if (auto attrs = call->attrs.as<Conv2DWinogradAttrs>()) {
+      if ((attrs->data_layout == "NCHW4c" || attrs->data_layout == "NHWC4c") &&
+          (attrs->kernel_layout == "OIHW4o" || attrs->kernel_layout == "HWIO4o")) {
+        supports_texture_storage = true;
+      }
+    } else if (auto attrs = call->attrs.as<GlobalPool2DAttrs>()) {
+      if (attrs->layout == "NCHW4c") {
+        supports_texture_storage = true;
+      }
+    } else if (auto attrs = call->attrs.as<MaxPool2DAttrs>()) {
+      if (attrs->layout == "NCHW4c") {
+        supports_texture_storage = true;
+      }
+    } else if (auto attrs = call->attrs.as<AvgPool2DAttrs>()) {
+      if (attrs->layout == "NCHW4c") {
+        supports_texture_storage = true;
+      }
+    }
+
+    return supports_texture_storage;
+  }
+
+  /*! \brief Temporary state for marking whether a visited function
+   *         primitive supports texture storage scope */
+  bool primitive_supports_texture_ = false;
+  /*! \brief expr storage scope mapping for each output  */
+  std::unordered_map<const ExprNode*, std::vector<std::string>> storage_scope_;
+  /*! \brief output storage scopes used by consumers of expr key  */
+  std::unordered_map<const ExprNode*, std::vector<std::string>> consumer_storage_scopes_;
+  /*! \brief mapping of call arguments to function variables */
+  std::unordered_map<Expr, std::vector<Var>, ObjectPtrHash, ObjectPtrEqual> args_to_vars_;
+};
+
+}  // namespace
+
+/**
+ * @brief Rewrites the memory_scope part of virtual devices for expressions defined
+ * by the StorageInfo analysis pass
+ *
+ * Currently this workflow supports analysis and rewriting of VirtualDevice for
+ * Constants and function Variables
+ */
+class VDRewriter : public transform::DeviceAwareExprMutator {

Review Comment:
   VDRewriter is a bit too generic a class name; it could be `RewriteVDStorageScopes` or similar.



##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for all targets represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the required
+ *    memory_scop collected from the CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns mapping of expressions vs desired memory scope
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with "global" scope to handle the PlanDevices algo, which
+    // propagates virtual devices from outputs to inputs. At the same time outputs must be
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in consumers means that the consumer potentially
+      // dealt with textures; it is safe to mark this expr with global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // the initial algo maps only the outputs of the expr, which is not enough; we need to update
+    // VirtualDevice for function variables to get proper codegen. Adding vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (const auto* fn = call->op.as<FunctionNode>()) {
+      if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+        primitive_supports_texture_ = false;
+        Visit(call->op);
+        if (primitive_supports_texture_) {
+          if (call->checked_type().as<TensorTypeNode>()) {
+            std::string scope = "global.texture";
+            if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+              if (ttype->shape.size() == 5) {
+                scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+              }
+            }
+            storage_scope_[call].push_back(scope);
+          } else {
+            const auto* tuple_type = call->type_as<TupleTypeNode>();
+            ICHECK(tuple_type);
+            // TODO(csullivan): Add support for mixed output storage scope.
+            // In current adreno storage planner all outputs of a
+            // primitive function are assumed to be of the same storage
+            // type. This should be easy to extend in the future.
+            for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+              storage_scope_[call].push_back("global.texture");
+            }
+          }
+          for (size_t i = 0; i < fn->params.size(); i++) {
+            args_to_vars_[call->args[i]].push_back(fn->params[i]);
+          }
+        }
+        // Add consumer storage scope information for call arguments
+        for (auto& arg : call->args) {
+          if (storage_scope_.count(call)) {
+            ICHECK(!HasMixedStorageOutputs(call))
+                << "Mixed output storage scopes are not currently supported";
+            consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+          } else {
+            consumer_storage_scopes_[arg.operator->()].push_back("global");
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // We have all callees filled into storage_scope_ if they support textures.
+    // We need to verify whether this call expects a texture and, if it does not, remove it from
+    // storage_scope_, since storage_scope_ is initially filled only based on the knowledge
+    // that the function is able to work with textures, not that a texture is necessarily
+    // expected by the function callee
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {

Review Comment:
   This function needs some documentation explaining its intent: it checks various limits on textures and assigns the correct texture layout to the storage scope based on the limit comparisons.
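
For readers of the archive, a rough Python sketch of that intent (the helper name and the worked example are illustrative; the 16384 default comes from the `texture_spatial_limit` fallback in the hunk above):

```python
def pick_texture_scope(shape, limit=16384):
    """Model of Scope(): fold a packed 5-d shape into a 2-d texture.

    Three candidate row/column splits of the four outer dimensions are
    tried; among those that fit under the per-dimension limit, the most
    balanced split wins (ties go to the later candidate, mirroring
    std::map assignment in the C++).
    """
    a0, a1, a2, a3 = [int(dim) for dim in shape[:4]]
    diffs = {}
    for tag, (rows, cols) in [
        ("", (a0 * a1 * a2, a3)),        # e.g. NCHW4c folded as (N*C*H, W)
        ("nhwc", (a0 * a1, a2 * a3)),    # folded as (N*H, W*C)
        ("weight", (a0, a1 * a2 * a3)),  # folded as (O, I*H*W)
    ]:
        if rows < limit and cols < limit:
            diffs[abs(rows - cols)] = tag  # later tags overwrite equal diffs
    if not diffs:
        return "global"
    tag = diffs[min(diffs)]
    return "global.texture" + ("-" + tag if tag else "")


# (1, 8, 40, 40, 4) yields splits (320, 40), (8, 1600) and (1, 12800);
# the first is the most balanced, so the scope is plain "global.texture".
assert pick_texture_scope((1, 8, 40, 40, 4)) == "global.texture"
```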



##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for all targets represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the required
+ *    memory_scop collected from the CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns mapping of expressions vs desired memory scope
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with "global" scope to handle the PlanDevices algo, which
+    // propagates virtual devices from outputs to inputs. At the same time outputs must be
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in consumers means that the consumer potentially
+      // dealt with textures; it is safe to mark this expr with global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // the initial algo maps only the outputs of the expr, which is not enough; we need to update
+    // VirtualDevice for function variables to get proper codegen. Adding vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (const auto* fn = call->op.as<FunctionNode>()) {
+      if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+        primitive_supports_texture_ = false;
+        Visit(call->op);
+        if (primitive_supports_texture_) {
+          if (call->checked_type().as<TensorTypeNode>()) {
+            std::string scope = "global.texture";
+            if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+              if (ttype->shape.size() == 5) {
+                scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+              }
+            }
+            storage_scope_[call].push_back(scope);
+          } else {
+            const auto* tuple_type = call->type_as<TupleTypeNode>();
+            ICHECK(tuple_type);
+            // TODO(csullivan): Add support for mixed output storage scope.
+            // In current adreno storage planner all outputs of a
+            // primitive function are assumed to be of the same storage
+            // type. This should be easy to extend in the future.

Review Comment:
   Can we relax this requirement now? I don't recall the outcome of our previous discussion on this. I recall you mentioning some change on the topic of heterogeneous scopes in a tuple.
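
If the requirement were relaxed, one hypothetical shape it could take (a sketch, not the PR's current behavior, reusing the pick_texture_scope helper modelled earlier in this thread) is a per-field scope derivation:

```python
def scopes_for_tuple_outputs(field_shapes, limit=16384):
    """Derive one scope per tuple field from that field's own shape."""
    return [
        pick_texture_scope(shape, limit) if len(shape) == 5 else "global"
        for shape in field_shapes
    ]
```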



##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for all targets represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the required
+ *    memory_scop collected from the CollectStorageInfo

Review Comment:
   ```suggestion
    *    memory_scope collected from the CollectStorageInfo
   ```



##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+    D = relay.op.add(conv2, D)
+    D = D * relay.const(0.15, "float16")
+    D = relay.op.nn.relu(D)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.nn.relu(conv3)
+
+    mod = relay.Function([A, W1, B1, W2, W3], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global",
+        "global.texture",
+        "global.texture-weight",
+        "",
+        "",
+    ]
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_plan_device_issue1():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    conv2 = relay.nn.conv2d(
+        conv1,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+
+    mod = relay.Function([A, W1, W2], conv2)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "weight2": tvm.nd.array(filter_data2),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "",
+        "",
+    ]
+
+    static_memory_scope = []
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_branch_textures():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (96, 32, 2, 2)
+    filter_shape2 = (32, 96, 2, 2)
+    filter_shape3 = (5, 96, 2, 2)
+    bias_shape1 = (1, 96, 1, 1)
+    bias_shape2 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+    B2 = relay.var("bias2", shape=bias_shape2, dtype=dtype)
+
+    # C = relay.nn.relu(A)
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=96,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    conv2 = relay.op.add(conv2, B2)
+    conv2 = relay.op.nn.relu(conv2)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=5,
+        kernel_size=(2, 2),
+    )
+
+    t = relay.Tuple([conv2, conv3])
+    c = relay.op.concatenate(t, axis=1)
+
+    mod = relay.Function([A, W1, B1, W2, B2, W3], c)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    bias_data2 = np.zeros(bias_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    initializer("bias", bias_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "bias2": tvm.nd.array(bias_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ]
+
+    static_memory_scope = []
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_branch1_texture_params():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape0 = (32, 32, 1, 1)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    # bias_shape2 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W0 = relay.var("weight0", shape=filter_shape0, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv0 = relay.nn.conv2d(
+        A,
+        W0,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+
+    pool = relay.nn.avg_pool2d(conv0, pool_size=(2, 2), strides=(2, 2))
+    conv1 = relay.nn.conv2d(
+        pool,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 1, 1],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    conv1 = relay.op.add(conv1, B1)
+    conv1 = relay.op.nn.relu(conv1)
+
+    conv2 = relay.nn.conv2d(
+        pool,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+
+    conv3 = relay.nn.conv2d(
+        pool,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 1, 1, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    conv3 = relay.op.nn.relu(conv3)
+    res = relay.op.add(conv1, conv2)
+    res = relay.op.add(res, conv3)
+
+    mod = relay.Function([A, W0, W1, B1, W2, W3], res)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data0 = np.zeros(filter_shape0).astype(dtype)
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight0": tvm.nd.array(filter_data0),
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global.texture",
+        "",
+        "",
+    ]
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+#                      conv2d <- to get textures

Review Comment:
   This diagram is helpful. Can you format it in a docstring? The above comment about turning the static_memory_scope list into a map would also greatly help in tying that list back to this diagram.
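
One possible shape for that (the function name and the topology drawn below are placeholders, not the real graph of the test that follows):

```python
def test_branching_scopes_example():
    r"""Check static memory scopes across a branching subgraph.

    Topology (data flows top to bottom)::

                 conv2d   <- to get textures
                /      \
           conv2d     conv2d
                \      /
             concatenate
    """
```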



##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():

Review Comment:
   Can you give a docstring description here? Specifically, below there are some ops in global.texture and others in global; it would be nice to know why that's expected.
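
A possible docstring along those lines (the rationale is inferred from the markup pass's rules, so treat it as a sketch for the author to confirm):

```python
def test_residual_block():
    """Residual block: conv2d -> add -> mul -> relu chains with a skip connection.

    conv2d outputs whose consumers can read textures stay in
    "global.texture", and weights/biases that fit the texture limits land
    in "global.texture-weight"; any op whose consumers read from buffers
    falls back to "global".
    """
```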



##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+    D = relay.op.add(conv2, D)
+    D = D * relay.const(0.15, "float16")
+    D = relay.op.nn.relu(D)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.nn.relu(conv3)
+
+    mod = relay.Function([A, W1, B1, W2, W3], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global",
+        "global.texture",
+        "global.texture-weight",
+        "",

Review Comment:
   It's great to have unit tests explicitly checking the scopes!
   That said, this list is a bit confusing. For example, this list has `global.texture-weight` scope as the final entry, which is a scope used for a weight, but that certainly is not the final expr in topo order. Even nicer would be a map from expr to scope, or, since it's checking against the JSON, maybe the function/kernel name for the key? This would make the test more intuitive.
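
A sketch of that suggestion (the `storage_scope` attribute layout in the graph JSON is an assumption here; adjust it to whatever the executor actually emits):

```python
import json


def assert_storage_scopes(graph_json, expected):
    """Check scopes by node name instead of by position in a flat list."""
    graph = json.loads(graph_json)
    scopes = graph["attrs"]["storage_scope"][1]  # assumed ["list_str", [...]] layout
    by_name = {node["name"]: scope for node, scope in zip(graph["nodes"], scopes)}
    for name, want in expected.items():
        assert by_name.get(name) == want, f"{name}: {by_name.get(name)!r} != {want!r}"


# e.g. assert_storage_scopes(graph, {"p0": "global.texture-weight"})
```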



##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+    D = relay.op.add(conv2, D)
+    D = D * relay.const(0.15, "float16")
+    D = relay.op.nn.relu(D)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.nn.relu(conv3)
+
+    mod = relay.Function([A, W1, B1, W2, W3], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global",
+        "global.texture",
+        "global.texture-weight",
+        "",
+        "",
+    ]
+
+    build_run_compare(mod, params1, {"data": input_shape}, dtype, target, static_memory_scope)
+
+
+@tvm.testing.requires_opencl
+def test_plan_device_issue1():

Review Comment:
   This test needs a better name and a docstring. Same for the tests below; just a simple description can help.



##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target specific relay passes which collect
+ * storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for all targets represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevice with the required
+ *    memory_scop collected from the CollectStorageInfo
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns mapping of expressions vs desired memory scope
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with "global" scope to handle the PlanDevices algo, which
+    // propagates virtual devices from outputs to inputs. At the same time outputs must be
+    // unconstrained to avoid a useless device_copy
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // a record in consumers means that the consumer potentially
+      // dealt with textures; it is safe to mark this expr with global scope
+      // even without verifying the consumer's output scopes
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // the initial algo maps only the outputs of the expr, which is not enough; we need to update
+    // VirtualDevice for function variables to get proper codegen. Adding vars to storage_map
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (const auto* fn = call->op.as<FunctionNode>()) {
+      if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+        primitive_supports_texture_ = false;
+        Visit(call->op);
+        if (primitive_supports_texture_) {
+          if (call->checked_type().as<TensorTypeNode>()) {
+            std::string scope = "global.texture";
+            if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+              if (ttype->shape.size() == 5) {
+                scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+              }
+            }
+            storage_scope_[call].push_back(scope);
+          } else {
+            const auto* tuple_type = call->type_as<TupleTypeNode>();
+            ICHECK(tuple_type);
+            // TODO(csullivan): Add support for mixed output storage scope.
+            // In current adreno storage planner all outputs of a
+            // primitive function are assumed to be of the same storage
+            // type. This should be easy to extend in the future.
+            for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+              storage_scope_[call].push_back("global.texture");
+            }
+          }
+          for (size_t i = 0; i < fn->params.size(); i++) {
+            args_to_vars_[call->args[i]].push_back(fn->params[i]);
+          }
+        }
+        // Add consumer storage scope information for call arguments
+        for (auto& arg : call->args) {
+          if (storage_scope_.count(call)) {
+            ICHECK(!HasMixedStorageOutputs(call))
+                << "Mixed output storage scopes are not currently supported";
+            consumer_storage_scopes_[arg.operator->()].push_back("global.texture");
+          } else {
+            consumer_storage_scopes_[arg.operator->()].push_back("global");
+          }
+        }
+      }
+    }
+
+    primitive_supports_texture_ = SupportsTextureStorage(call);
+
+    for (auto& arg : call->args) {
+      Visit(arg);
+    }
+    // At this point storage_scope_ contains all callees that support textures.
+    // We need to verify whether this call actually expects a texture; if it does not,
+    // remove the entry from storage_scope_, since storage_scope_ was initially filled
+    // based only on the knowledge that the function is able to work with textures,
+    // not that a texture is actually expected by the callee.
+    for (auto& arg : call->args) {
+      if (consumer_storage_scopes_.count(arg.operator->()) &&
+          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
+        storage_scope_.erase(arg.operator->());
+        if (const auto* cn = arg.as<CallNode>()) {
+          if (const auto* fn = cn->op.as<FunctionNode>()) {
+            storage_scope_.erase(fn->body.operator->());
+          }
+        }
+      }
+    }
+  }
+
+  std::string Scope(Array<PrimExpr> shape, const VirtualDevice& vd) {
+    if (vd != VirtualDevice::FullyUnconstrained()) {
+      std::map<int, std::string> diffs;
+      int limit =
+          vd->target->GetAttr<Integer>("texture_spatial_limit").value_or(Integer(16384))->value;
+      int a0 = shape[0].as<IntImmNode>()->value;
+      int a1 = shape[1].as<IntImmNode>()->value;
+      int a2 = shape[2].as<IntImmNode>()->value;
+      int a3 = shape[3].as<IntImmNode>()->value;
+
+      int d3l = a0 * a1 * a2;
+      int d3r = a3;
+      int diff3 = d3l > d3r ? d3l - d3r : d3r - d3l;
+      if (d3l < limit && d3r < limit) diffs[diff3] = "";
+
+      int d2l = a0 * a1;
+      int d2r = a2 * a3;
+      int diff2 = d2l > d2r ? d2l - d2r : d2r - d2l;
+      if (d2l < limit && d2r < limit) diffs[diff2] = "nhwc";
+
+      int d1l = a0;
+      int d1r = a1 * a2 * a3;
+      int diff1 = d1l > d1r ? d1l - d1r : d1r - d1l;
+      if (d1l < limit && d1r < limit) diffs[diff1] = "weight";
+      if (!diffs.empty()) {
+        std::string scope = "global.texture";
+        if (!diffs.begin()->second.empty()) {
+          scope += ("-" + diffs.begin()->second);
+        }
+        return scope;
+      }
+    }
+    return "global";
+  }
+
+  void ApplyConsumerScopeToInputs(const ExprNode* expr) {
+    std::string scope;
+    auto consumer_scopes_it = consumer_storage_scopes_.find(expr);
+    if (consumer_scopes_it != consumer_storage_scopes_.end()) {
+      std::string consumer_scope = GetConsumerScope(consumer_scopes_it->second);
+      ICHECK(!storage_scope_.count(expr))
+          << "Already propagated consumer scopes to input: " << GetRef<Expr>(expr);
+
+      bool expr_is_rgba_vectorizable = false;
+      if (const auto* ttype = expr->checked_type().as<TensorTypeNode>()) {
+        if (ttype->shape.size() == 5) {

Review Comment:
   This is essentially checking for packing of C with layouts like `NHWCxc` and `NCHWxc`, but `==5` is pretty mystical to the reader without that context. Can we do this in a way that is more clear? If not, perhaps pull that check into the Scope function and add a comment.
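
   For readers without that context, a rough sketch of the packing the rank-5 check targets (hypothetical helper, illustration only):

   ```python
   # Illustration only (not part of the PR): "shape.size() == 5" targets
   # channel-packed layouts such as NCHW4c, where the trailing dimension is the
   # packed block of channels that maps onto the texture's RGBA lanes.
   def pack_nchw_to_nchw4c(shape, block=4):
       n, c, h, w = shape
       assert c % block == 0, "channels must divide evenly into the packing block"
       return (n, c // block, h, w, block)

   print(pack_nchw_to_nchw4c((1, 32, 40, 40)))  # -> (1, 8, 40, 40, 4)
   ```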





[GitHub] [tvm] csullivan merged pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
csullivan merged PR #11878:
URL: https://github.com/apache/tvm/pull/11878




[GitHub] [tvm] TejashShah commented on pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
TejashShah commented on PR #11878:
URL: https://github.com/apache/tvm/pull/11878#issuecomment-1195739065

    @masahi @csullivan @junrushao1994, please take some time to review this code and offer some feedback to @elvin-n. Thanks.
   




[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r931845224


##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+    D = relay.op.add(conv2, D)
+    D = D * relay.const(0.15, "float16")
+    D = relay.op.nn.relu(D)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.nn.relu(conv3)
+
+    mod = relay.Function([A, W1, B1, W2, W3], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global",
+        "global.texture",
+        "global.texture-weight",
+        "",

Review Comment:
   The network output memory scope is "" (empty string), which is _ideologically_ a synonym for "global", but attempting to mark the tail with the "global" memory scope causes the PlanDevices pass to behave unexpectedly and fail the transformation. Sometimes PlanDevices tries to insert a device_copy from the global scope to the empty scope, and sometimes it just aborts.
   
   
   The idea of having a mapping of `op->scope` instead of just an array was dictated by how memory scopes are stored in the json. To compare the mapped values, we would have to build such a mapping based on the json. That is doable but requires more effort.
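
   A rough sketch of that json-based mapping idea (hypothetical; the exact "storage_scope" attribute layout in the graph json is an assumption here, not the PR's API):

   ```python
   import json

   # Hypothetical sketch: derive an op-name -> memory-scope mapping from the
   # compiled graph json so tests can compare per-op scopes instead of a flat
   # ordered list. Field names and alignment are assumptions.
   def scopes_by_op(graph_json_str):
       graph = json.loads(graph_json_str)
       scopes = graph["attrs"]["storage_scope"][1]  # assumed ["list_str", [...]]
       return {node["name"]: scope for node, scope in zip(graph["nodes"], scopes)}
   ```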





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r931917655


##########
tests/python/relay/test_conv2d_nchw_texture.py:
##########
@@ -435,3 +435,558 @@ def test_conv2d_vgg16_winograd_4d():
     graph = build_run_compare(mod, params1, {"data": input_shape}, dtype, target)
     matches = re.findall("winograd", graph)
     assert len(matches) > 0
+
+
+@tvm.testing.requires_opencl
+def test_residual_block():
+    target = "opencl --device=adreno"
+    dtype = "float16"
+
+    input_shape = (1, 32, 40, 40)
+    filter_shape1 = (32, 32, 2, 2)
+    filter_shape2 = (32, 32, 1, 1)
+    filter_shape3 = (32, 32, 2, 2)
+    bias_shape1 = (1, 32, 1, 1)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+    B1 = relay.var("bias1", shape=bias_shape1, dtype=dtype)
+    W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+    W3 = relay.var("weight3", shape=filter_shape3, dtype=dtype)
+
+    conv1 = relay.nn.conv2d(
+        A,
+        W1,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.add(conv1, B1)
+    D = relay.op.nn.relu(D)
+
+    conv2 = relay.nn.conv2d(
+        D,
+        W2,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[1, 1],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(1, 1),
+    )
+    D = relay.op.add(conv2, D)
+    D = D * relay.const(0.15, "float16")
+    D = relay.op.nn.relu(D)
+
+    conv3 = relay.nn.conv2d(
+        D,
+        W3,
+        data_layout="NCHW",
+        kernel_layout="OIHW",
+        padding=[0, 0, 0, 0],
+        strides=[2, 2],
+        out_dtype=dtype,
+        channels=32,
+        kernel_size=(2, 2),
+    )
+    D = relay.op.nn.relu(conv3)
+
+    mod = relay.Function([A, W1, B1, W2, W3], D)
+    np.random.seed(0)
+    initializer = relay.testing.init.Xavier()
+    filter_data1 = np.zeros(filter_shape1).astype(dtype)
+    bias_data1 = np.zeros(bias_shape1).astype(dtype)
+    initializer("weight", filter_data1)
+    initializer("bias", bias_data1)
+    filter_data2 = np.zeros(filter_shape2).astype(dtype)
+    initializer("weight", filter_data2)
+    filter_data3 = np.zeros(filter_shape3).astype(dtype)
+    initializer("weight", filter_data3)
+    params1 = {
+        "weight1": tvm.nd.array(filter_data1),
+        "bias1": tvm.nd.array(bias_data1),
+        "weight2": tvm.nd.array(filter_data2),
+        "weight3": tvm.nd.array(filter_data3),
+    }
+
+    static_memory_scope = [
+        "",
+        "global",
+        "global.texture-weight",
+        "global.texture-weight",
+        "global.texture",
+        "global.texture-weight",
+        "global",
+        "global.texture",
+        "global.texture-weight",
+        "",

Review Comment:
   BTW, please don't be confused by the names of the memory scopes. The naming is historical. The texture layout is now determined by the algorithm in the `Scope()` function of annotate_texture_storage.cc, and the names rather refer to the split points:
   ```
   texture -> 123|4|5
   texture-weight -> 1|234|5
   texture-nhwc -> 12|34|5
   ```
   
   These scopes are applied to any type of tensor - data/weights/bias. The division into buckets is determined only by the values in the shape.
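
   In Python terms, the bucketing in `Scope()` behaves roughly like this sketch (logic mirrored from the quoted C++; the helper name is hypothetical):

   ```python
   # Mirrors Scope() from annotate_texture_storage.cc for a 5-d shape whose
   # trailing dim is the packed block: try the three splits, drop any whose
   # sides exceed the texture spatial limit, and pick the most balanced one.
   def pick_texture_scope(shape, limit=16384):
       a0, a1, a2, a3 = shape[:4]
       splits = [
           (a0 * a1 * a2, a3, ""),        # 123|4|5 -> "global.texture"
           (a0 * a1, a2 * a3, "nhwc"),    # 12|34|5 -> "global.texture-nhwc"
           (a0, a1 * a2 * a3, "weight"),  # 1|234|5 -> "global.texture-weight"
       ]
       diffs = {}  # balance of the two sides -> scope suffix
       for left, right, suffix in splits:
           if left < limit and right < limit:
               diffs[abs(left - right)] = suffix
       if not diffs:
           return "global"  # nothing fits into a texture
       suffix = diffs[min(diffs)]
       return "global.texture" + ("-" + suffix if suffix else "")

   print(pick_texture_scope((32, 8, 2, 2, 4)))   # -> global.texture-weight
   print(pick_texture_scope((1, 8, 40, 40, 4)))  # -> global.texture
   ```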





[GitHub] [tvm] elvin-n commented on a diff in pull request #11878: [Adreno] Add markup pass of relay tensors for static texture planning

Posted by GitBox <gi...@apache.org>.
elvin-n commented on code in PR #11878:
URL: https://github.com/apache/tvm/pull/11878#discussion_r931971080


##########
src/relay/transforms/annotate_texture_storage.cc:
##########
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file annotate_texture_storage.cc
+ * \brief Collection of target specific relay passes which
+ * collect storage scope related information.
+ *
+ *  - CollectStorageInfo returns a mapping from relay expr
+ *    to a list of output storage scopes for each output.
+ *    These scopes are used during memory planning as well
+ *    as downstream when doing codegen and in the graph runtime when doing runtime dataspace
+ *    allocations.
+ *
+ *  - AnnotateMemoryScope calls *target.CollectStorageInfo for all targets represented
+ *    in the graph and rewrites the graph, modifying or inserting VirtualDevices with the
+ *    required memory_scope collected from CollectStorageInfo.
+ */
+
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/tir/expr.h>
+
+#include <memory>
+#include <unordered_map>
+
+#include "../transforms/device_aware_visitors.h"
+
+namespace tvm {
+namespace relay {
+namespace {
+
+/**
+ * @brief Analyzes the graph and returns a mapping from expressions to desired memory scopes
+ */
+class StorageInfo : private transform::DeviceAwareExprVisitor {
+ public:
+  StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
+
+  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+    StorageInfo storage_info;
+    storage_info.VisitExpr(expr);
+    storage_info.LegalizeProducerStorage();
+    Map<Expr, Array<String>> storage_map;
+    for (auto& kv : storage_info.storage_scope_) {
+      std::vector<String> storage_scopes;
+      std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
+      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+    }
+
+    // Fill the input arguments with "global" scope to handle the PlanDevices algorithm, which
+    // propagates virtual devices from outputs to inputs. At the same time outputs must stay
+    // unconstrained to avoid a useless device_copy.
+    for (const auto& cs : storage_info.consumer_storage_scopes_) {
+      // Having a record among the consumers means the consumer potentially
+      // deals with textures, so it is safe to mark this expr with global scope
+      // even without verifying the consumer's output scopes.
+      if (storage_info.CanConsumeTextures(cs.second) &&
+          storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
+        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+      }
+    }
+
+    // The initial algorithm only maps the outputs of each expr, which is not enough; the
+    // VirtualDevice of function variables must also be updated for proper codegen, so add
+    // the vars to storage_map as well.
+    for (const auto& a : storage_info.args_to_vars_) {
+      if (storage_map.count(a.first)) {
+        for (const auto& v : a.second) {
+          storage_map.Set(v, storage_map[a.first]);
+        }
+      }
+    }
+    return storage_map;
+  }
+
+ private:
+  void Visit(const Expr& expr) {
+    // Pre-order traversal to enable upward propagation
+    // of consumer storage scopes to producers when desirable.
+    if (const auto* fn = expr.as<FunctionNode>()) {
+      this->VisitExpr(fn->body);
+      for (const auto& param : fn->params) {
+        this->VisitExpr(param);
+      }
+    } else {
+      this->VisitExpr(expr);
+    }
+  }
+
+  void VisitExpr_(const VarNode* vn) final { ApplyConsumerScopeToInputs(vn); }
+
+  void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
+
+  void DeviceAwareVisitExpr_(const CallNode* call) final {
+    // Check the contents of this primitive function
+    if (const auto* fn = call->op.as<FunctionNode>()) {
+      if (fn->HasNonzeroAttr(attr::kPrimitive)) {
+        primitive_supports_texture_ = false;
+        Visit(call->op);
+        if (primitive_supports_texture_) {
+          if (call->checked_type().as<TensorTypeNode>()) {
+            std::string scope = "global.texture";
+            if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+              if (ttype->shape.size() == 5) {
+                scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+              }
+            }
+            storage_scope_[call].push_back(scope);
+          } else {
+            const auto* tuple_type = call->type_as<TupleTypeNode>();
+            ICHECK(tuple_type);
+            // TODO(csullivan): Add support for mixed output storage scope.
+            // In current adreno storage planner all outputs of a
+            // primitive function are assumed to be of the same storage
+            // type. This should be easy to extend in the future.

Review Comment:
   We need a use case with a layer producing a tuple whose fields have several scopes, e.g. producing 5d and 1d tensors. Currently we do not have such a test example, so we cannot move forward with the design and implementation. Until we have it, I propose to leave the TODO in this place.
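
   A rough sketch of what such a missing test case could look like (hypothetical, not an existing test):

   ```python
   import tvm
   from tvm import relay

   # Hypothetical example of a tuple whose fields would want different scopes:
   # a texture-friendly 4-d activation next to a small 1-d reduction result.
   data = relay.var("data", shape=(1, 32, 40, 40), dtype="float16")
   weight = relay.var("weight", shape=(32, 32, 1, 1), dtype="float16")
   conv = relay.nn.conv2d(data, weight, kernel_size=(1, 1), channels=32)
   vec = relay.sum(conv, axis=[0, 2, 3])  # shape (32,), not texture-packable
   out = relay.Tuple([conv, vec])
   mod = tvm.IRModule.from_expr(relay.Function([data, weight], out))
   print(mod)
   ```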


