You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ju...@apache.org on 2022/11/03 19:15:26 UTC

[tvm] branch main updated: [MetaSchedule] Fix Task Hanging in EvolutionarySearch (#13246)

This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 1d1db35236 [MetaSchedule] Fix Task Hanging in EvolutionarySearch (#13246)
1d1db35236 is described below

commit 1d1db352367e3dd435a58f41283ac75cdf9d8858
Author: Xiyou Zhou <xi...@octoml.ai>
AuthorDate: Thu Nov 3 14:15:17 2022 -0500

    [MetaSchedule] Fix Task Hanging in EvolutionarySearch (#13246)
    
    This PR introduces a new argument for EvolutionarySearch that limits the failures (defined as rounds of no new generated candidate) in the `SampleInitPopulation` stage. In this way we can avoid the task to be hanging forever in special cases, e.g., some postproc always fails. This should fix #12330.
---
 include/tvm/meta_schedule/search_strategy.h        |  2 +
 .../search_strategy/evolutionary_search.py         |  4 ++
 .../search_strategy/evolutionary_search.cc         | 18 ++++++-
 .../unittest/test_meta_schedule_search_strategy.py | 56 ++++++++++++++++++++++
 4 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/include/tvm/meta_schedule/search_strategy.h b/include/tvm/meta_schedule/search_strategy.h
index c2399eef08..3f44a2438d 100644
--- a/include/tvm/meta_schedule/search_strategy.h
+++ b/include/tvm/meta_schedule/search_strategy.h
@@ -200,6 +200,7 @@ class SearchStrategy : public runtime::ObjectRef {
    * \param population_size The initial sample population.
    * \param init_measured_ratio The ratio of measures samples in initial population.
    * \param init_min_unmeasured The minimal size of unmeasured population in the initial sampling.
+   * \param max_fail_count The max number of failure during initial sampling.
    * \param genetic_num_iters The iterations to run the genetic algorithm.
    * \param genetic_mutate_prob The probability of mutation.
    * \param genetic_max_fail_count The maximum number to try evolving the given trace.
@@ -208,6 +209,7 @@ class SearchStrategy : public runtime::ObjectRef {
   TVM_DLL static SearchStrategy EvolutionarySearch(int population_size,         //
                                                    double init_measured_ratio,  //
                                                    int init_min_unmeasured,     //
+                                                   int max_fail_count,          //
                                                    int genetic_num_iters,       //
                                                    double genetic_mutate_prob,  //
                                                    int genetic_max_fail_count,  //
diff --git a/python/tvm/meta_schedule/search_strategy/evolutionary_search.py b/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
index 2851ebe7b1..65e7ddc468 100644
--- a/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
+++ b/python/tvm/meta_schedule/search_strategy/evolutionary_search.py
@@ -35,6 +35,8 @@ class EvolutionarySearch(SearchStrategy):
         The ratio of measured samples in the initial population.
     init_min_unmeasured : int
         The minimal size of unmeasured population in the initial sampling.
+    max_fail_count : int
+        The maximum number of failure during initial sampling.
     genetic_num_iters : int
         The number of iterations for genetic algorithm.
     genetic_mutate_prob : float
@@ -59,6 +61,7 @@ class EvolutionarySearch(SearchStrategy):
         population_size: int = 2048,
         init_measured_ratio: float = 0.2,
         init_min_unmeasured: int = 50,
+        max_fail_count: int = 5,
         genetic_num_iters: int = 4,
         genetic_mutate_prob: float = 0.85,
         genetic_max_fail_count: int = 10,
@@ -70,6 +73,7 @@ class EvolutionarySearch(SearchStrategy):
             population_size,
             init_measured_ratio,
             init_min_unmeasured,
+            max_fail_count,
             genetic_num_iters,
             genetic_mutate_prob,
             genetic_max_fail_count,
diff --git a/src/meta_schedule/search_strategy/evolutionary_search.cc b/src/meta_schedule/search_strategy/evolutionary_search.cc
index 2cc45e01bb..cc99951239 100644
--- a/src/meta_schedule/search_strategy/evolutionary_search.cc
+++ b/src/meta_schedule/search_strategy/evolutionary_search.cc
@@ -365,6 +365,8 @@ class EvolutionarySearchNode : public SearchStrategyNode {
   double init_measured_ratio;
   /*! \brief The minimal size of unmeasured population in the initial sampling.*/
   int init_min_unmeasured;
+  /*! \brief The maximum number of failure during initial sampling. */
+  int max_fail_count;
   /*** Configuration: evolution ***/
   /*! \brief The number of iterations performed by generic algorithm. */
   int genetic_num_iters;
@@ -387,6 +389,7 @@ class EvolutionarySearchNode : public SearchStrategyNode {
     /*** Configuration: the initial population ***/
     v->Visit("init_measured_ratio", &init_measured_ratio);
     v->Visit("init_min_unmeasured", &init_min_unmeasured);
+    v->Visit("max_fail_count", &max_fail_count);
     /*** Configuration: evolution ***/
     v->Visit("genetic_num_iters", &genetic_num_iters);
     v->Visit("genetic_mutate_prob", &genetic_mutate_prob);
@@ -456,6 +459,7 @@ class EvolutionarySearchNode : public SearchStrategyNode {
     n->num_empty_iters_before_early_stop = this->num_empty_iters_before_early_stop;
     n->init_measured_ratio = this->init_measured_ratio;
     n->init_min_unmeasured = this->init_min_unmeasured;
+    n->max_fail_count = this->max_fail_count;
     n->genetic_num_iters = this->genetic_num_iters;
     n->genetic_mutate_prob = this->genetic_mutate_prob;
     n->genetic_max_fail_count = this->genetic_max_fail_count;
@@ -501,7 +505,9 @@ std::vector<Schedule> EvolutionarySearchNode::State::SampleInitPopulation(int nu
   auto _ = Profiler::TimedScope("EvoSearch/SampleInitPopulation");
   ThreadedTraceApply pp(self->postprocs_);
   std::vector<Schedule> out_schs;
-  while (static_cast<int>(out_schs.size()) < self->init_min_unmeasured) {
+  int fail_count = 0;
+  while (static_cast<int>(out_schs.size()) < self->init_min_unmeasured &&
+         fail_count < self->max_fail_count) {
     std::vector<Schedule> results(num, Schedule{nullptr});
     auto f_proc_unmeasured = [this, &results, &pp](int thread_id, int trace_id) -> void {
       PerThreadData& data = this->per_thread_data_.at(thread_id);
@@ -516,11 +522,14 @@ std::vector<Schedule> EvolutionarySearchNode::State::SampleInitPopulation(int nu
       }
     };
     support::parallel_for_dynamic(0, num, self->ctx_->num_threads, f_proc_unmeasured);
+    bool found_new = false;
     for (int i = 0; i < num; i++) {
       if (results[i].defined()) {
+        found_new = true;
         out_schs.push_back(results[i]);
       }
     }
+    fail_count += !found_new;
     TVM_PY_LOG(INFO, self->ctx_->logger) << "Sample-Init-Population summary:\n"
                                          << pp.SummarizeFailures();
   }
@@ -706,6 +715,11 @@ Optional<Array<MeasureCandidate>> EvolutionarySearchNode::State::GenerateMeasure
   TVM_PY_LOG(INFO, self->ctx_->logger)
       << "Picked top " << measured.size() << " candidate(s) from database";
   std::vector<Schedule> unmeasured = SampleInitPopulation(pop - measured.size());
+  if (static_cast<int>(unmeasured.size()) < self->init_min_unmeasured) {
+    TVM_PY_LOG(WARNING, self->ctx_->logger)
+        << "Cannot sample enough initial population, evolutionary search failed.";
+    return NullOpt;
+  }
   TVM_PY_LOG(INFO, self->ctx_->logger) << "Sampled " << unmeasured.size() << " candidate(s)";
   inits.insert(inits.end(), measured.begin(), measured.end());
   inits.insert(inits.end(), unmeasured.begin(), unmeasured.end());
@@ -737,6 +751,7 @@ size_t EvolutionarySearchNode::State::ModuleHash(const IRModule& mod) const {
 SearchStrategy SearchStrategy::EvolutionarySearch(int population_size,         //
                                                   double init_measured_ratio,  //
                                                   int init_min_unmeasured,     //
+                                                  int max_fail_count,          //
                                                   int genetic_num_iters,       //
                                                   double genetic_mutate_prob,  //
                                                   int genetic_max_fail_count,  //
@@ -749,6 +764,7 @@ SearchStrategy SearchStrategy::EvolutionarySearch(int population_size,         /
   n->num_empty_iters_before_early_stop = 5;
   n->init_measured_ratio = init_measured_ratio;
   n->init_min_unmeasured = init_min_unmeasured;
+  n->max_fail_count = max_fail_count;
   n->genetic_num_iters = genetic_num_iters;
   n->genetic_max_fail_count = genetic_max_fail_count;
   n->genetic_mutate_prob = genetic_mutate_prob;
diff --git a/tests/python/unittest/test_meta_schedule_search_strategy.py b/tests/python/unittest/test_meta_schedule_search_strategy.py
index e345544206..29c20ced04 100644
--- a/tests/python/unittest/test_meta_schedule_search_strategy.py
+++ b/tests/python/unittest/test_meta_schedule_search_strategy.py
@@ -22,6 +22,7 @@ import pytest
 import tvm
 import tvm.testing
 from tvm import meta_schedule as ms
+from tvm.meta_schedule.utils import derived_object
 from tvm.meta_schedule.testing.dummy_object import DummyMutator
 from tvm.script import tir as T
 from tvm.tir.schedule import Schedule, Trace
@@ -251,8 +252,63 @@ def test_meta_schedule_evolutionary_search_early_stop():  # pylint: disable = in
     assert num_trials_each_iter == [1, 0, 0, 0, 0]
 
 
+def test_meta_schedule_evolutionary_search_fail_init_population():  # pylint: disable = invalid-name
+    @derived_object
+    class AlwaysFailPostproc(ms.postproc.PyPostproc):
+        """A postproc that always fails."""
+
+        def _initialize_with_tune_context(self, context: ms.TuneContext) -> None:
+            pass
+
+        def apply(self, sch: Schedule) -> bool:
+            return False
+
+        def clone(self) -> "AlwaysFailPostproc":
+            return AlwaysFailPostproc()
+
+        def __str__(self) -> str:
+            return "AlwaysFailPostproc"
+
+    num_trials_per_iter = 10
+    max_trials_per_task = 2000
+
+    context = ms.TuneContext(
+        mod=Matmul,
+        space_generator=ms.space_generator.ScheduleFn(
+            sch_fn=_schedule_matmul,
+            sch_rules=[],
+            postprocs=[AlwaysFailPostproc()],
+            mutator_probs={
+                DummyMutator(): 1.0,
+            },
+        ),
+        search_strategy=ms.search_strategy.EvolutionarySearch(
+            population_size=5,
+            init_measured_ratio=0.1,
+            init_min_unmeasured=50,
+            genetic_num_iters=3,
+            genetic_mutate_prob=0.5,
+            genetic_max_fail_count=10,
+            eps_greedy=0.9,
+        ),
+        target=tvm.target.Target("llvm"),
+        num_threads=1,  # because we are using a mutator from the python side
+    )
+    strategy = context.search_strategy
+    strategy.pre_tuning(
+        max_trials=max_trials_per_task,
+        num_trials_per_iter=num_trials_per_iter,
+        design_spaces=context.space_generator.generate_design_space(context.mod),
+        database=ms.database.MemoryDatabase(),
+        cost_model=ms.cost_model.RandomModel(),
+    )
+    candidates = strategy.generate_measure_candidates()
+    assert candidates is None
+
+
 if __name__ == "__main__":
     test_meta_schedule_replay_func(ms.search_strategy.ReplayFunc)
     test_meta_schedule_replay_func(ms.search_strategy.ReplayTrace)
     test_meta_schedule_evolutionary_search()
     test_meta_schedule_evolutionary_search_early_stop()
+    test_meta_schedule_evolutionary_search_fail_init_population()