You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/06/10 01:15:19 UTC
[incubator-doris] branch master updated: [fix](load) fix streamload failure due to false unhealthy replica in concurrent stream load (#10007)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 3363b3aa19 [fix](load) fix streamload failure due to false unhealthy replica in concurrent stream load (#10007)
3363b3aa19 is described below
commit 3363b3aa1932132995637e1ea69f51ce3c6eacb6
Author: yixiutt <10...@users.noreply.github.com>
AuthorDate: Fri Jun 10 09:15:14 2022 +0800
[fix](load) fix streamload failure due to false unhealthy replica in concurrent stream load (#10007)
In concurrent stream load, FE runs publish version tasks concurrently,
which can cause the publish tasks to be handled out of order in BE.
For example:
FE publishes tasks with versions 1 2 3 4
BE may handle the tasks in the sequence 1 2 4 3
In the case above, when reporting tablet info, BE finds that version 4 is
published but version 3 is not yet visible, so it reports a version miss to FE.
FE then sets the replica's lastFailedVersion, and transaction commits
eventually fail because there is no quorum of healthy replicas.
Add a time condition: report a version miss only if the same version has been missing for 60 seconds.
---
be/src/olap/tablet.cpp | 32 +++++++++++++++++++++++++++++---
be/src/olap/tablet.h | 6 +++++-
be/src/olap/tablet_manager.cpp | 2 +-
3 files changed, 35 insertions(+), 5 deletions(-)
diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index df7f02a1f9..275511786f 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -77,7 +77,9 @@ Tablet::Tablet(TabletMetaSharedPtr tablet_meta, const StorageParamPB& storage_pa
_cumulative_compaction_type(cumulative_compaction_type),
_last_record_scan_count(0),
_last_record_scan_count_timestamp(time(nullptr)),
- _is_clone_occurred(false) {
+ _is_clone_occurred(false),
+ _last_missed_version(-1),
+ _last_missed_time_s(0) {
// construct _timestamped_versioned_tracker from rs and stale rs meta
_timestamped_version_tracker.construct_versioned_tracker(_tablet_meta->all_rs_metas(),
_tablet_meta->all_stale_rs_metas());
@@ -1277,7 +1279,10 @@ bool Tablet::_contains_rowset(const RowsetId rowset_id) {
return false;
}
-void Tablet::build_tablet_report_info(TTabletInfo* tablet_info) {
+// need check if consecutive version missing in full report
+// alter tablet will ignore this check
+void Tablet::build_tablet_report_info(TTabletInfo* tablet_info,
+ bool enable_consecutive_missing_check) {
std::shared_lock rdlock(_meta_lock);
tablet_info->tablet_id = _tablet_meta->tablet_id();
tablet_info->schema_hash = _tablet_meta->schema_hash();
@@ -1290,7 +1295,28 @@ void Tablet::build_tablet_report_info(TTabletInfo* tablet_info) {
Version max_version;
bool has_version_cross;
_max_continuous_version_from_beginning_unlocked(&cversion, &max_version, &has_version_cross);
- tablet_info->__set_version_miss(cversion.second < max_version.second);
+ // cause publish version task runs concurrently, version may be flying
+ // so we add a consecutive miss check to solve this problem:
+ // if publish version 5 arrives but version 4 flying, we may judge replica miss version
+ // and set version miss in tablet_info, which makes fe treat this replica as unhealth
+ // and lead to other problems
+ if (enable_consecutive_missing_check) {
+ if (cversion.second < max_version.second) {
+ if (_last_missed_version == cversion.second + 1) {
+ if (_last_missed_time_s - MonotonicSeconds() >= 60) {
+ // version missed for over 60 seconds
+ tablet_info->__set_version_miss(true);
+ _last_missed_version = -1;
+ _last_missed_time_s = 0;
+ }
+ } else {
+ _last_missed_version = cversion.second + 1;
+ _last_missed_time_s = MonotonicSeconds();
+ }
+ }
+ } else {
+ tablet_info->__set_version_miss(cversion.second < max_version.second);
+ }
// find rowset with max version
auto iter = _rs_version_map.find(max_version);
if (iter == _rs_version_map.end()) {
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index 80ebfed636..fbc9a57a29 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -226,7 +226,8 @@ public:
// Rowset whose version range is not covered by this tablet is also useful.
bool rowset_meta_is_useful(RowsetMetaSharedPtr rowset_meta);
- void build_tablet_report_info(TTabletInfo* tablet_info);
+ void build_tablet_report_info(TTabletInfo* tablet_info,
+ bool enable_consecutive_missing_check = false);
void generate_tablet_meta_copy(TabletMetaSharedPtr new_tablet_meta) const;
// caller should hold the _meta_lock before calling this method
@@ -358,6 +359,9 @@ private:
// whether clone task occurred during the tablet is in thread pool queue to wait for compaction
std::atomic<bool> _is_clone_occurred;
+ int64_t _last_missed_version;
+ int64_t _last_missed_time_s;
+
DISALLOW_COPY_AND_ASSIGN(Tablet);
public:
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 06008d5c78..0675754d29 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -867,7 +867,7 @@ Status TabletManager::build_all_report_tablets_info(std::map<TTabletId, TTablet>
TabletSharedPtr tablet_ptr = item.second;
TTablet t_tablet;
TTabletInfo tablet_info;
- tablet_ptr->build_tablet_report_info(&tablet_info);
+ tablet_ptr->build_tablet_report_info(&tablet_info, true);
// find expired transaction corresponding to this tablet
TabletInfo tinfo(tablet_id, tablet_ptr->schema_hash(), tablet_ptr->tablet_uid());
auto find = expire_txn_map.find(tinfo);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org