You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/06/10 01:15:19 UTC

[incubator-doris] branch master updated: [fix](load) fix streamload failure due to false unhealthy replica in concurrent stream load (#10007)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 3363b3aa19 [fix](load) fix streamload failure due to false unhealthy replica in concurrent stream load (#10007)
3363b3aa19 is described below

commit 3363b3aa1932132995637e1ea69f51ce3c6eacb6
Author: yixiutt <10...@users.noreply.github.com>
AuthorDate: Fri Jun 10 09:15:14 2022 +0800

    [fix](load) fix streamload failure due to false unhealthy replica in concurrent stream load (#10007)
    
    in concurrent stream load, fe will run publish version task concurrently,
    which cause publish task disorder in be.
    For example:
    fe publish task with version 1 2 3 4
    be may handle task with sequence 1 2 4 3
    In case above, when report tablet info, be found that version 4
    published but version 3 not visible, it'll report version miss to fe,
    and fe will set replica lastFailedVersion, and finally makes transaction
    commits fail while no quorum health replicas。
    
    Add a time condition if a version miss for 60 seconds then report version miss.
---
 be/src/olap/tablet.cpp         | 32 +++++++++++++++++++++++++++++---
 be/src/olap/tablet.h           |  6 +++++-
 be/src/olap/tablet_manager.cpp |  2 +-
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index df7f02a1f9..275511786f 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -77,7 +77,9 @@ Tablet::Tablet(TabletMetaSharedPtr tablet_meta, const StorageParamPB& storage_pa
           _cumulative_compaction_type(cumulative_compaction_type),
           _last_record_scan_count(0),
           _last_record_scan_count_timestamp(time(nullptr)),
-          _is_clone_occurred(false) {
+          _is_clone_occurred(false),
+          _last_missed_version(-1),
+          _last_missed_time_s(0) {
     // construct _timestamped_versioned_tracker from rs and stale rs meta
     _timestamped_version_tracker.construct_versioned_tracker(_tablet_meta->all_rs_metas(),
                                                              _tablet_meta->all_stale_rs_metas());
@@ -1277,7 +1279,10 @@ bool Tablet::_contains_rowset(const RowsetId rowset_id) {
     return false;
 }
 
-void Tablet::build_tablet_report_info(TTabletInfo* tablet_info) {
+// need check if consecutive version missing in full report
+// alter tablet will ignore this check
+void Tablet::build_tablet_report_info(TTabletInfo* tablet_info,
+                                      bool enable_consecutive_missing_check) {
     std::shared_lock rdlock(_meta_lock);
     tablet_info->tablet_id = _tablet_meta->tablet_id();
     tablet_info->schema_hash = _tablet_meta->schema_hash();
@@ -1290,7 +1295,28 @@ void Tablet::build_tablet_report_info(TTabletInfo* tablet_info) {
     Version max_version;
     bool has_version_cross;
     _max_continuous_version_from_beginning_unlocked(&cversion, &max_version, &has_version_cross);
-    tablet_info->__set_version_miss(cversion.second < max_version.second);
+    // cause publish version task runs concurrently, version may be flying
+    // so we add a consecutive miss check to solve this problem:
+    // if publish version 5 arrives but version 4 flying, we may judge replica miss version
+    // and set version miss in tablet_info, which makes fe treat this replica as unhealth
+    // and lead to other problems
+    if (enable_consecutive_missing_check) {
+        if (cversion.second < max_version.second) {
+            if (_last_missed_version == cversion.second + 1) {
+                if (_last_missed_time_s - MonotonicSeconds() >= 60) {
+                    // version missed for over 60 seconds
+                    tablet_info->__set_version_miss(true);
+                    _last_missed_version = -1;
+                    _last_missed_time_s = 0;
+                }
+            } else {
+                _last_missed_version = cversion.second + 1;
+                _last_missed_time_s = MonotonicSeconds();
+            }
+        }
+    } else {
+        tablet_info->__set_version_miss(cversion.second < max_version.second);
+    }
     // find rowset with max version
     auto iter = _rs_version_map.find(max_version);
     if (iter == _rs_version_map.end()) {
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index 80ebfed636..fbc9a57a29 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -226,7 +226,8 @@ public:
     // Rowset whose version range is not covered by this tablet is also useful.
     bool rowset_meta_is_useful(RowsetMetaSharedPtr rowset_meta);
 
-    void build_tablet_report_info(TTabletInfo* tablet_info);
+    void build_tablet_report_info(TTabletInfo* tablet_info,
+                                  bool enable_consecutive_missing_check = false);
 
     void generate_tablet_meta_copy(TabletMetaSharedPtr new_tablet_meta) const;
     // caller should hold the _meta_lock before calling this method
@@ -358,6 +359,9 @@ private:
     // whether clone task occurred during the tablet is in thread pool queue to wait for compaction
     std::atomic<bool> _is_clone_occurred;
 
+    int64_t _last_missed_version;
+    int64_t _last_missed_time_s;
+
     DISALLOW_COPY_AND_ASSIGN(Tablet);
 
 public:
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 06008d5c78..0675754d29 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -867,7 +867,7 @@ Status TabletManager::build_all_report_tablets_info(std::map<TTabletId, TTablet>
             TabletSharedPtr tablet_ptr = item.second;
             TTablet t_tablet;
             TTabletInfo tablet_info;
-            tablet_ptr->build_tablet_report_info(&tablet_info);
+            tablet_ptr->build_tablet_report_info(&tablet_info, true);
             // find expired transaction corresponding to this tablet
             TabletInfo tinfo(tablet_id, tablet_ptr->schema_hash(), tablet_ptr->tablet_uid());
             auto find = expire_txn_map.find(tinfo);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org