You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2022/04/21 09:54:16 UTC

[tvm] branch main updated: Restart popen pool. (#11074)

This is an automated email from the ASF dual-hosted git repository.

masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new b952425b2d Restart popen pool. (#11074)
b952425b2d is described below

commit b952425b2d46076ccbc0a55953e31afbfac0da33
Author: Xiyou Zhou <xi...@octoml.ai>
AuthorDate: Thu Apr 21 02:54:11 2022 -0700

    Restart popen pool. (#11074)
    
    Retrigger CI.
    
    Address issues.
    
    Retrigger CI.
---
 python/tvm/meta_schedule/builder/local_builder.py | 35 +++++++++++++++++------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/python/tvm/meta_schedule/builder/local_builder.py b/python/tvm/meta_schedule/builder/local_builder.py
index 0d9ef6e4cf..eb1b1f377b 100644
--- a/python/tvm/meta_schedule/builder/local_builder.py
+++ b/python/tvm/meta_schedule/builder/local_builder.py
@@ -58,8 +58,12 @@ class LocalBuilder(PyBuilder):
     ----------
     pool : PopenPoolExecutor
         The process pool to run the build.
+    max_workers : int
+        The max number of Popen workers.
     timeout_sec : float
         The timeout in seconds for the build.
+    initializer : Optional[Callable[[], None]]
+        The initializer function for each popen worker.
     f_build : Union[None, str, T_BUILD]
         Name of the build function to be used.
         Defaults to `meta_schedule.builder.default_build`.
@@ -97,8 +101,9 @@ class LocalBuilder(PyBuilder):
     please send the registration logic via initializer.
     """
 
-    pool: PopenPoolExecutor
+    max_workers: int
     timeout_sec: float
+    initializer: Optional[Callable[[], None]]
     f_build: Union[None, str, T_BUILD]
     f_export: Union[None, str, T_EXPORT]
 
@@ -135,12 +140,9 @@ class LocalBuilder(PyBuilder):
             max_workers = cpu_count(logical=True)
         logger.info("LocalBuilder: max_workers = %d", max_workers)
 
-        self.pool = PopenPoolExecutor(
-            max_workers=max_workers,
-            timeout=timeout_sec,
-            initializer=initializer,
-        )
+        self.max_workers = max_workers
         self.timeout_sec = timeout_sec
+        self.initializer = initializer
         self.f_build = f_build
         self.f_export = f_export
         self._sanity_check()
@@ -149,8 +151,17 @@ class LocalBuilder(PyBuilder):
         results: List[BuilderResult] = []
         map_result: MapResult
 
+        # Here we restart the PopenPool every time because of a known memory leak issue with the
+        # PopenPool workers after a few uses. We don't apply the same to runners to avoid
+        # potential problems caused by async behaviour.
+        pool = PopenPoolExecutor(
+            max_workers=self.max_workers,
+            timeout=self.timeout_sec,
+            initializer=self.initializer,
+        )
+
         # Dispatch the build inputs to the worker processes.
-        for map_result in self.pool.map_with_error_catching(
+        for map_result in pool.map_with_error_catching(
             lambda x: _worker_func(*x),
             [
                 (
@@ -181,6 +192,7 @@ class LocalBuilder(PyBuilder):
                 )
             else:
                 raise ValueError("Unreachable: unexpected result: {map_result}")
+        del pool
         return results
 
     def _sanity_check(self) -> None:
@@ -188,8 +200,15 @@ class LocalBuilder(PyBuilder):
             get_global_func_with_default_on_worker(name=f_build, default=None)
             get_global_func_with_default_on_worker(name=f_export, default=None)
 
-        value = self.pool.submit(_check, self.f_build, self.f_export)
+        # Same reason for the single-use PopenPool as mentioned above.
+        pool = PopenPoolExecutor(
+            max_workers=self.max_workers,
+            timeout=self.timeout_sec,
+            initializer=self.initializer,
+        )
+        value = pool.submit(_check, self.f_build, self.f_export)
         value.result()
+        del pool
 
 
 def _worker_func(