You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/06/10 15:48:35 UTC
[incubator-doris] 02/03: [fix](load) fix streamload failure due to false unhealthy replica in concurrent stream load (#10007)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch dev-1.0.1
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit 4bc3a364f8f8c4396a642fd2471e3204b8fb3d80
Author: yixiutt <10...@users.noreply.github.com>
AuthorDate: Fri Jun 10 09:15:14 2022 +0800
[fix](load) fix streamload failure due to false unhealthy replica in concurrent stream load (#10007)
In concurrent stream load, FE runs publish version tasks concurrently,
which can cause the publish tasks to be handled out of order in BE.
For example:
fe publish task with version 1 2 3 4
be may handle task with sequence 1 2 4 3
In the case above, when reporting tablet info, BE finds that version 4 is
published but version 3 is not yet visible, so it reports a version miss to FE.
FE then sets the replica's lastFailedVersion, which finally makes transaction
commits fail because there is no quorum of healthy replicas.
Add a time condition: report a version miss only if the same version has been missing for 60 seconds.
---
be/src/olap/tablet.cpp | 31 ++++++++++++++++++++++++++++---
be/src/olap/tablet.h | 6 +++++-
be/src/olap/tablet_manager.cpp | 2 +-
3 files changed, 34 insertions(+), 5 deletions(-)
diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index 8214c44494..6f84b1764f 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -76,7 +76,9 @@ Tablet::Tablet(TabletMetaSharedPtr tablet_meta, DataDir* data_dir,
_cumulative_compaction_type(cumulative_compaction_type),
_last_record_scan_count(0),
_last_record_scan_count_timestamp(time(nullptr)),
- _is_clone_occurred(false) {
+ _is_clone_occurred(false),
+ _last_missed_version(-1),
+ _last_missed_time_s(0) {
// construct _timestamped_versioned_tracker from rs and stale rs meta
_timestamped_version_tracker.construct_versioned_tracker(_tablet_meta->all_rs_metas(),
_tablet_meta->all_stale_rs_metas());
@@ -1285,7 +1287,9 @@ bool Tablet::_contains_rowset(const RowsetId rowset_id) {
return false;
}
-void Tablet::build_tablet_report_info(TTabletInfo* tablet_info) {
+// need check if consecutive version missing in full report
+// alter tablet will ignore this check
+void Tablet::build_tablet_report_info(TTabletInfo* tablet_info, bool enable_consecutive_missing_check) {
ReadLock rdlock(_meta_lock);
tablet_info->tablet_id = _tablet_meta->tablet_id();
tablet_info->schema_hash = _tablet_meta->schema_hash();
@@ -1298,7 +1302,28 @@ void Tablet::build_tablet_report_info(TTabletInfo* tablet_info) {
Version max_version;
bool has_version_cross;
_max_continuous_version_from_beginning_unlocked(&cversion, &max_version, &has_version_cross);
- tablet_info->__set_version_miss(cversion.second < max_version.second);
+ // Publish version tasks run concurrently, so a version may still be in flight.
+ // A consecutive-miss check is added to handle this problem:
+ // if publish version 5 arrives while version 4 is still in flight, we may judge that the
+ // replica misses a version and set version miss in tablet_info, which makes FE treat this
+ // replica as unhealthy and leads to other problems.
+ if (enable_consecutive_missing_check) {
+ if (cversion.second < max_version.second) {
+ if (_last_missed_version == cversion.second + 1) {
+ if (_last_missed_time_s - MonotonicSeconds() >= 60) {
+ // version missed for over 60 seconds
+ tablet_info->__set_version_miss(true);
+ _last_missed_version = -1;
+ _last_missed_time_s = 0;
+ }
+ } else {
+ _last_missed_version = cversion.second + 1;
+ _last_missed_time_s = MonotonicSeconds();
+ }
+ }
+ } else {
+ tablet_info->__set_version_miss(cversion.second < max_version.second);
+ }
// find rowset with max version
auto iter = _rs_version_map.find(max_version);
if (iter == _rs_version_map.end()) {
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index 5d0ad5ad05..114ca26e72 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -234,7 +234,8 @@ public:
// Rowset whose version range is not covered by this tablet is also useful.
bool rowset_meta_is_useful(RowsetMetaSharedPtr rowset_meta);
- void build_tablet_report_info(TTabletInfo* tablet_info);
+ void build_tablet_report_info(TTabletInfo* tablet_info,
+ bool enable_consecutive_missing_check = false);
void generate_tablet_meta_copy(TabletMetaSharedPtr new_tablet_meta) const;
// caller should hold the _meta_lock before calling this method
@@ -351,6 +352,9 @@ private:
// whether clone task occurred during the tablet is in thread pool queue to wait for compaction
std::atomic<bool> _is_clone_occurred;
+ int64_t _last_missed_version;
+ int64_t _last_missed_time_s;
+
DISALLOW_COPY_AND_ASSIGN(Tablet);
public:
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 88efecfe99..66d4031fe9 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -903,7 +903,7 @@ OLAPStatus TabletManager::build_all_report_tablets_info(
TTablet t_tablet;
for (TabletSharedPtr tablet_ptr : item.second.table_arr) {
TTabletInfo tablet_info;
- tablet_ptr->build_tablet_report_info(&tablet_info);
+ tablet_ptr->build_tablet_report_info(&tablet_info, true);
// find expired transaction corresponding to this tablet
TabletInfo tinfo(tablet_id, tablet_ptr->schema_hash(), tablet_ptr->tablet_uid());
auto find = expire_txn_map.find(tinfo);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org