You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/08/17 22:29:09 UTC

[GitHub] [tvm] csullivan commented on a diff in pull request #12411: [Hexagon] Asynchronous DMA support

csullivan commented on code in PR #12411:
URL: https://github.com/apache/tvm/pull/12411#discussion_r948483578


##########
src/runtime/hexagon/hexagon_user_dma.cc:
##########
@@ -90,32 +78,80 @@ int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) {
   dma_desc_set_src(dma_desc, src32);
   dma_desc_set_dst(dma_desc, dst32);
 
-  dmstart(dma_desc);
-  unsigned int status = dmwait() & DM0_STATUS_MASK;
-  unsigned int done = dma_desc_get_done(dma_desc);
+  // only for first DMA
+  if (first_dma_) {
+    // reset DMA engine
+    auto status = Init();
+    if (status != DM0_STATUS_IDLE) {
+      return DMA_FAILURE;
+    }
+
+    // `dmstart` first descriptor
+    dmstart(dma_desc);
+    first_dma_ = false;
+  } else {
+    // `dmlink` descriptor to tail
+    dmlink(dma_descriptors_.back(), dma_desc);
+  }
 
-  free(dma_desc);
+  // set descriptor as new tail
+  dma_descriptors_.push_back(dma_desc);
 
-  if (status == DM0_STATUS_IDLE && done == DESC_DONE_COMPLETE) {
-    return DMA_SUCCESS;
+  return DMA_SUCCESS;
+}
+
+void HexagonUserDMA::Wait(uint32_t max_dmas_in_flight) {
+  // wait (forever) until max DMAs in flight <= actual DMAs in flight
+  while (DMAsInFlight() > max_dmas_in_flight) {

Review Comment:
   We could add a timeout after some large period (e.g. 10 seconds) that logs a warning. That said, I'm more inclined to leave the infinite spin to start as it will be more evident of an issue than a buried log message. 



##########
tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc:
##########
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../src/runtime/hexagon/hexagon_user_dma.h"
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::hexagon;
+
+class HexagonUserDMATest : public ::testing::Test {
+  void SetUp() override {
+    src = malloc(length);
+    dst = malloc(length);
+    ASSERT_NE(src, nullptr);
+    ASSERT_NE(dst, nullptr);
+
+    src_char = static_cast<char*>(src);
+    dst_char = static_cast<char*>(dst);
+    for (uint32_t i = 0; i < length; ++i) {
+      src_char[i] = 1;
+      dst_char[i] = 0;
+    }
+  }
+  void TearDown() override {
+    free(src);
+    free(dst);
+  }
+
+ public:
+  int ret{0};
+  void* src{nullptr};
+  void* dst{nullptr};
+  char* src_char{nullptr};
+  char* dst_char{nullptr};
+  uint32_t length{0x400000};  // 4MB
+};
+
+TEST_F(HexagonUserDMATest, wait) {
+  HexagonUserDMA::Get().Wait(0);
+  HexagonUserDMA::Get().Wait(10);
+}
+
+TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); }
+
+TEST_F(HexagonUserDMATest, bad_copy) {
+  uint64_t bigaddr = 0x100000000;
+  void* src64 = reinterpret_cast<void*>(bigaddr);
+  void* dst64 = reinterpret_cast<void*>(bigaddr);
+  uint32_t biglength = 0x1000000;
+  ASSERT_NE(HexagonUserDMA::Get().Copy(dst64, src, length), DMA_SUCCESS);
+  ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src64, length), DMA_SUCCESS);
+  ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src, biglength), DMA_SUCCESS);
+}
+
+TEST_F(HexagonUserDMATest, sync_dma) {
+  // kick off 1 DMA
+  ret = HexagonUserDMA::Get().Copy(dst, src, length);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // wait for DMA to complete
+  HexagonUserDMA::Get().Wait(0);
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+}
+
+TEST_F(HexagonUserDMATest, async_dma) {
+  // kick off 10x duplicate DMAs
+  for (uint32_t i = 0; i < 10; ++i) {
+    ret = HexagonUserDMA::Get().Copy(dst, src, length);
+    ASSERT_EQ(ret, DMA_SUCCESS);
+  }
+
+  // verify at least 1 DMA in flight
+  // TODO: re-enable when CI runs on hardware - fails on simulator
+  // ASSERT_GT(HexagonUserDMA::Get().Poll(), 0);
+
+  // wait for at least 1 DMA to complete
+  HexagonUserDMA::Get().Wait(9);
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+}
+
+// TODO: Run non-pipelined case with sync DMA and execution time vs. pipelined case
+TEST_F(HexagonUserDMATest, pipeline) {
+  uint32_t pipeline_depth = 4;
+  uint32_t pipeline_length = length / pipeline_depth;
+
+  for (uint32_t i = 0; i < pipeline_depth; ++i) {
+    ret |= HexagonUserDMA::Get().Copy(dst_char + i * pipeline_length,
+                                      src_char + i * pipeline_length, pipeline_length);
+  }
+
+  HexagonUserDMA::Get().Wait(3);
+  for (uint32_t i = 0; i < pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+
+  HexagonUserDMA::Get().Wait(2);
+  for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+
+  HexagonUserDMA::Get().Wait(1);
+  for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+
+  HexagonUserDMA::Get().Wait(0);
+  for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) {
+    dst_char[i]++;
+  }
+
+  // verify
+  ASSERT_EQ(ret, 0);
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(dst_char[i], 2);
+  }
+}

Review Comment:
   Wonderful unit testing, as always! 👏 



##########
src/runtime/hexagon/hexagon_user_dma.cc:
##########
@@ -90,32 +78,80 @@ int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) {
   dma_desc_set_src(dma_desc, src32);
   dma_desc_set_dst(dma_desc, dst32);
 
-  dmstart(dma_desc);
-  unsigned int status = dmwait() & DM0_STATUS_MASK;
-  unsigned int done = dma_desc_get_done(dma_desc);
+  // only for first DMA
+  if (first_dma_) {
+    // reset DMA engine
+    auto status = Init();
+    if (status != DM0_STATUS_IDLE) {
+      return DMA_FAILURE;
+    }
+
+    // `dmstart` first descriptor
+    dmstart(dma_desc);
+    first_dma_ = false;
+  } else {
+    // `dmlink` descriptor to tail
+    dmlink(dma_descriptors_.back(), dma_desc);

Review Comment:
   Is there a limit to the number of descriptors we can link before requiring a new dmstart?
   
   



##########
src/runtime/hexagon/hexagon_user_dma.cc:
##########
@@ -90,32 +78,80 @@ int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) {
   dma_desc_set_src(dma_desc, src32);
   dma_desc_set_dst(dma_desc, dst32);
 
-  dmstart(dma_desc);
-  unsigned int status = dmwait() & DM0_STATUS_MASK;
-  unsigned int done = dma_desc_get_done(dma_desc);
+  // only for first DMA
+  if (first_dma_) {
+    // reset DMA engine
+    auto status = Init();
+    if (status != DM0_STATUS_IDLE) {
+      return DMA_FAILURE;
+    }
+
+    // `dmstart` first descriptor
+    dmstart(dma_desc);
+    first_dma_ = false;
+  } else {
+    // `dmlink` descriptor to tail
+    dmlink(dma_descriptors_.back(), dma_desc);
+  }
 
-  free(dma_desc);
+  // set descriptor as new tail
+  dma_descriptors_.push_back(dma_desc);
 
-  if (status == DM0_STATUS_IDLE && done == DESC_DONE_COMPLETE) {
-    return DMA_SUCCESS;
+  return DMA_SUCCESS;
+}
+
+void HexagonUserDMA::Wait(uint32_t max_dmas_in_flight) {
+  // wait (forever) until max DMAs in flight <= actual DMAs in flight
+  while (DMAsInFlight() > max_dmas_in_flight) {
+  }
+}
+
+uint32_t HexagonUserDMA::Poll() { return DMAsInFlight(); }
+
+uint32_t HexagonUserDMA::DMAsInFlight() {
+  // poll DMA engine to make sure DMA status is current
+  dmpoll();
+
+  // find the oldest DMA in flight
+  for (; oldest_dma_in_flight_ < dma_descriptors_.size(); ++oldest_dma_in_flight_) {
+    // read the `done` bit from the DMA descriptor and stop if incomplete
+    unsigned int done = dma_desc_get_done(dma_descriptors_[oldest_dma_in_flight_]);
+    if (done == DESC_DONE_INCOMPLETE) {
+      break;
+    }
+  }
+  // total DMAs in flight = total DMAs - oldest DMA in flight
+  return dma_descriptors_.size() - oldest_dma_in_flight_;
+}
+
+HexagonUserDMA::~HexagonUserDMA() {
+  Init();  // reset DMA engine
+  for (auto dma_desc : dma_descriptors_) {
+    free(dma_desc);
   }
-#endif
-  return DMA_FAILURE;
 }
 
 int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) {

Review Comment:
   Can we add `hexagon_user_dma_1d_async` and `hexagon_user_dma_1d_async_wait` equivalents now for use in tensorization?



##########
src/runtime/hexagon/hexagon_user_dma.cc:
##########
@@ -28,55 +30,41 @@ namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-int init_hexagon_user_dma() {
-#if __HEXAGON_ARCH__ >= 68
+unsigned int HexagonUserDMA::Init() {
   // reset DMA engine
   unsigned int status = dmpause() & DM0_STATUS_MASK;
-  if (status != DM0_STATUS_IDLE) {
-    return DMA_FAILURE;
-  }
-#endif
-  return DMA_SUCCESS;
+  return status;
 }
 
-int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) {
-#if __HEXAGON_ARCH__ >= 68
-  static int config_dma = init_hexagon_user_dma();
-  if (config_dma != DMA_SUCCESS) {
+int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) {
+  // length limited to 24 bits
+  if (length > DESC_LENGTH_MASK) {
     return DMA_FAILURE;
   }
 
-  uint64_t src64 = reinterpret_cast<uint64_t>(src);
   // source address limited to 32 bits
-  if (src64 > DESC_SRC_MASK) {
+  uint64_t src64 = reinterpret_cast<uint64_t>(src);
+  if (!src64 || src64 > DESC_SRC_MASK) {
     return DMA_FAILURE;
   }
 
-  uint64_t dst64 = reinterpret_cast<uint64_t>(dst);
   // destination address limited to 32 bits
-  if (dst64 > DESC_DST_MASK) {
-    return DMA_FAILURE;
-  }
-
-  // length limited to 24 bits
-  if (length > DESC_LENGTH_MASK) {
+  uint64_t dst64 = reinterpret_cast<uint64_t>(dst);
+  if (!dst64 || dst64 > DESC_DST_MASK) {
     return DMA_FAILURE;
   }
 
-  uint32_t src32 = src64 & DESC_SRC_MASK;
-  uint32_t dst32 = dst64 & DESC_DST_MASK;
+  uint32_t src32 = static_cast<uint32_t>(src64);
+  uint32_t dst32 = static_cast<uint32_t>(dst64);
 
+  // allocate new descriptor
   void* dma_desc = nullptr;
-
   int ret = posix_memalign(&dma_desc, DMA_DESC_2D_SIZE, DMA_DESC_2D_SIZE);
-  if (ret) {

Review Comment:
   Potential backlog item could be to allocate the descriptors from a preallocated pool to avoid runtime allocations for each copy. 



##########
tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc:
##########
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../src/runtime/hexagon/hexagon_user_dma.h"
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::hexagon;
+
+class HexagonUserDMATest : public ::testing::Test {
+  void SetUp() override {
+    src = malloc(length);
+    dst = malloc(length);
+    ASSERT_NE(src, nullptr);
+    ASSERT_NE(dst, nullptr);
+
+    src_char = static_cast<char*>(src);
+    dst_char = static_cast<char*>(dst);
+    for (uint32_t i = 0; i < length; ++i) {
+      src_char[i] = 1;
+      dst_char[i] = 0;
+    }
+  }
+  void TearDown() override {
+    free(src);
+    free(dst);
+  }
+
+ public:
+  int ret{0};
+  void* src{nullptr};
+  void* dst{nullptr};
+  char* src_char{nullptr};
+  char* dst_char{nullptr};
+  uint32_t length{0x400000};  // 4MB
+};
+
+TEST_F(HexagonUserDMATest, wait) {
+  HexagonUserDMA::Get().Wait(0);
+  HexagonUserDMA::Get().Wait(10);
+}
+
+TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); }
+
+TEST_F(HexagonUserDMATest, bad_copy) {
+  uint64_t bigaddr = 0x100000000;
+  void* src64 = reinterpret_cast<void*>(bigaddr);
+  void* dst64 = reinterpret_cast<void*>(bigaddr);
+  uint32_t biglength = 0x1000000;
+  ASSERT_NE(HexagonUserDMA::Get().Copy(dst64, src, length), DMA_SUCCESS);
+  ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src64, length), DMA_SUCCESS);
+  ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src, biglength), DMA_SUCCESS);
+}
+
+TEST_F(HexagonUserDMATest, sync_dma) {
+  // kick off 1 DMA
+  ret = HexagonUserDMA::Get().Copy(dst, src, length);
+  ASSERT_EQ(ret, DMA_SUCCESS);
+
+  // wait for DMA to complete
+  HexagonUserDMA::Get().Wait(0);
+
+  // verify
+  for (uint32_t i = 0; i < length; ++i) {
+    ASSERT_EQ(src_char[i], dst_char[i]);
+  }
+}
+
+TEST_F(HexagonUserDMATest, async_dma) {
+  // kick off 10x duplicate DMAs
+  for (uint32_t i = 0; i < 10; ++i) {
+    ret = HexagonUserDMA::Get().Copy(dst, src, length);
+    ASSERT_EQ(ret, DMA_SUCCESS);
+  }
+
+  // verify at least 1 DMA in flight
+  // TODO: re-enable when CI runs on hardware - fails on simulator
+  // ASSERT_GT(HexagonUserDMA::Get().Poll(), 0);

Review Comment:
   Should we re-enable this so it runs on hardware in nightly but skip when running on the simulator? e.g.
   
   ```
   auto sn = std::getenv("ANDROID_SERIAL_NUMBER");
   if (sn && sn == "simulator") { GTEST_SKIP(); }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org