You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2020/07/22 14:42:41 UTC

[incubator-doris] branch master updated: [TabletRepair] Delete bad replicas when no BE can be used to create new replica

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new e4f5a29  [TabletRepair] Delete bad replicas when no BE can be used to create new replica
e4f5a29 is described below

commit e4f5a2936bfecf7dd0465706f0cb2112fb852be6
Author: Lijia Liu <li...@yeah.net>
AuthorDate: Wed Jul 22 22:42:31 2020 +0800

    [TabletRepair] Delete bad replicas when no BE can be used to create new replica
    
    When no available BE is left to host a new replica while relocating, delete the bad replica first so that a BE is freed up for creating the new replica.
---
 .../java/org/apache/doris/catalog/OlapTable.java   |  4 +--
 .../main/java/org/apache/doris/catalog/Tablet.java | 33 ++++++++++++++++------
 .../java/org/apache/doris/clone/TabletChecker.java |  4 +--
 .../org/apache/doris/clone/TabletSchedCtx.java     |  4 +--
 .../org/apache/doris/clone/TabletScheduler.java    |  4 +--
 .../apache/doris/common/proc/StatisticProcDir.java | 18 ++++++------
 .../org/apache/doris/master/ReportHandler.java     |  4 +--
 7 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
index f8f4423..10603eb 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
@@ -1229,7 +1229,7 @@ public class OlapTable extends Table {
     }
 
     public boolean isStable(SystemInfoService infoService, TabletScheduler tabletScheduler, String clusterName) {
-        int availableBackendsNum = infoService.getClusterBackendIds(clusterName, true).size();
+        List<Long> aliveBeIdsInCluster = infoService.getClusterBackendIds(clusterName, true);
         for (Partition partition : idToPartition.values()) {
             long visibleVersion = partition.getVisibleVersion();
             long visibleVersionHash = partition.getVisibleVersionHash();
@@ -1242,7 +1242,7 @@ public class OlapTable extends Table {
 
                     Pair<TabletStatus, TabletSchedCtx.Priority> statusPair = tablet.getHealthStatusWithPriority(
                             infoService, clusterName, visibleVersion, visibleVersionHash, replicationNum,
-                            availableBackendsNum);
+                            aliveBeIdsInCluster);
                     if (statusPair.first != TabletStatus.HEALTHY) {
                         LOG.info("table {} is not stable because tablet {} status is {}. replicas: {}",
                                 id, tablet.getId(), statusPair.first, tablet.getReplicas());
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java
index d8eb691..87e9309 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java
@@ -42,6 +42,7 @@ import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 /**
  * This class represents the olap tablet related metadata.
@@ -407,7 +408,7 @@ public class Tablet extends MetaObject implements Writable {
     public Pair<TabletStatus, TabletSchedCtx.Priority> getHealthStatusWithPriority(
             SystemInfoService systemInfoService, String clusterName,
             long visibleVersion, long visibleVersionHash, int replicationNum,
-            int availableBackendsNum) {
+            List<Long> aliveBeIdsInCluster) {
 
         int alive = 0;
         int aliveAndVersionComplete = 0;
@@ -453,15 +454,16 @@ public class Tablet extends MetaObject implements Writable {
         }
 
         // 1. alive replicas are not enough
-        if (alive < replicationNum && replicas.size() >= availableBackendsNum
-                && availableBackendsNum >= replicationNum && replicationNum > 1) {
+        int aliveBackendsNum = aliveBeIdsInCluster.size();
+        if (alive < replicationNum && replicas.size() >= aliveBackendsNum
+                && aliveBackendsNum >= replicationNum && replicationNum > 1) {
             // there is no enough backend for us to create a new replica, so we have to delete an existing replica,
             // so there can be available backend for us to create a new replica.
             // And if there is only one replica, we will not handle it(maybe need human interference)
             // condition explain:
             // 1. alive < replicationNum: replica is missing or bad
-            // 2. replicas.size() >= availableBackendsNum: the existing replicas occupies all available backends
-            // 3. availableBackendsNum >= replicationNum: make sure after deleting, there will be at least one backend for new replica.
+            // 2. replicas.size() >= aliveBackendsNum: the existing replicas occupy all available backends
+            // 3. aliveBackendsNum >= replicationNum: make sure after deleting, there will be at least one backend for new replica.
             // 4. replicationNum > 1: if replication num is set to 1, do not delete any replica, for safety reason
             return Pair.create(TabletStatus.FORCE_REDUNDANT, TabletSchedCtx.Priority.VERY_HIGH);
         } else if (alive < (replicationNum / 2) + 1) {
@@ -484,10 +486,23 @@ public class Tablet extends MetaObject implements Writable {
         }
 
         // 3. replica is under relocating
-        if (stable < (replicationNum / 2) + 1) {
-            return Pair.create(TabletStatus.REPLICA_RELOCATING, TabletSchedCtx.Priority.NORMAL);
-        } else if (stable < replicationNum) {
-            return Pair.create(TabletStatus.REPLICA_RELOCATING, TabletSchedCtx.Priority.LOW);
+        if (stable < replicationNum) {
+            List<Long> replicaBeIds = replicas.stream()
+                    .map(Replica::getBackendId).collect(Collectors.toList());
+            List<Long> availableBeIds = aliveBeIdsInCluster.stream()
+                    .filter(systemInfoService::checkBackendAvailable)
+                    .collect(Collectors.toList());
+            if (replicaBeIds.containsAll(availableBeIds)
+                    && availableBeIds.size() >= replicationNum
+                    && replicationNum > 1) { // No BE can be chosen to create a new replica
+                return Pair.create(TabletStatus.FORCE_REDUNDANT,
+                        stable < (replicationNum / 2) + 1 ? TabletSchedCtx.Priority.NORMAL : TabletSchedCtx.Priority.LOW);
+            }
+            if (stable < (replicationNum / 2) + 1) {
+                return Pair.create(TabletStatus.REPLICA_RELOCATING, TabletSchedCtx.Priority.NORMAL);
+            } else if (stable < replicationNum) {
+                return Pair.create(TabletStatus.REPLICA_RELOCATING, TabletSchedCtx.Priority.LOW);
+            }
         }
 
         // 4. healthy replicas in cluster are not enough
diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java
index 850fe04..952f564 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java
@@ -206,7 +206,7 @@ public class TabletChecker extends MasterDaemon {
 
             db.readLock();
             try {
-                int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
+                List<Long> aliveBeIdsInCluster = infoService.getClusterBackendIds(db.getClusterName(), true);
                 for (Table table : db.getTables()) {
                     if (!table.needSchedule()) {
                         continue;
@@ -239,7 +239,7 @@ public class TabletChecker extends MasterDaemon {
                                         partition.getVisibleVersion(),
                                         partition.getVisibleVersionHash(),
                                         olapTbl.getPartitionInfo().getReplicationNum(partition.getId()),
-                                        availableBackendsNum);
+                                        aliveBeIdsInCluster);
 
                                 if (statusWithPrio.first == TabletStatus.HEALTHY) {
                                     // Only set last status check time when status is healthy.
diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
index 059306f..98e1dab 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
@@ -799,11 +799,11 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
                 throw new SchedException(Status.UNRECOVERABLE, "tablet does not exist");
             }
             
-            int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
+            List<Long> aliveBeIdsInCluster = infoService.getClusterBackendIds(db.getClusterName(), true);
             short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partitionId);
             Pair<TabletStatus, TabletSchedCtx.Priority> pair = tablet.getHealthStatusWithPriority(
                     infoService, db.getClusterName(), visibleVersion, visibleVersionHash, replicationNum,
-                    availableBackendsNum);
+                    aliveBeIdsInCluster);
             if (pair.first == TabletStatus.HEALTHY) {
                 throw new SchedException(Status.FINISHED, "tablet is healthy");
             }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
index 4ecfe56..83fa29c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
@@ -495,13 +495,13 @@ public class TabletScheduler extends MasterDaemon {
                 statusPair = Pair.create(st, Priority.HIGH);
                 tabletCtx.setColocateGroupBackendIds(backendsSet);
             } else {
-                int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
+                List<Long> aliveBeIdsInCluster = infoService.getClusterBackendIds(db.getClusterName(), true);
                 statusPair = tablet.getHealthStatusWithPriority(
                         infoService, tabletCtx.getCluster(),
                         partition.getVisibleVersion(),
                         partition.getVisibleVersionHash(),
                         tbl.getPartitionInfo().getReplicationNum(partition.getId()),
-                        availableBackendsNum);
+                        aliveBeIdsInCluster);
             }
 
             if (tabletCtx.getType() == TabletSchedCtx.Type.BALANCE && tableState != OlapTableState.NORMAL) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java
index e75a3fe..29f48c7 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java
@@ -17,6 +17,10 @@
 
 package org.apache.doris.common.proc;
 
+import com.google.common.base.Preconditions;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Multimap;
 import org.apache.doris.catalog.Catalog;
 import org.apache.doris.catalog.Database;
 import org.apache.doris.catalog.MaterializedIndex;
@@ -34,19 +38,13 @@ import org.apache.doris.common.util.ListComparator;
 import org.apache.doris.system.SystemInfoService;
 import org.apache.doris.task.AgentTaskQueue;
 import org.apache.doris.thrift.TTaskType;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Multimap;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-
 public class StatisticProcDir implements ProcDirInterface {
     public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
             .add("DbId").add("DbName").add("TableNum").add("PartitionNum")
@@ -108,7 +106,7 @@ public class StatisticProcDir implements ProcDirInterface {
             }
 
             ++totalDbNum;
-            int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
+            List<Long> aliveBeIdsInCluster = infoService.getClusterBackendIds(db.getClusterName(), true);
             db.readLock();
             try {
                 int dbTableNum = 0;
@@ -137,7 +135,7 @@ public class StatisticProcDir implements ProcDirInterface {
                                 Pair<TabletStatus, Priority> res = tablet.getHealthStatusWithPriority(
                                         infoService, db.getClusterName(),
                                         partition.getVisibleVersion(), partition.getVisibleVersionHash(),
-                                        replicationNum, availableBackendsNum);
+                                        replicationNum, aliveBeIdsInCluster);
 
                                 // here we treat REDUNDANT as HEALTHY, for user friendly.
                                 if (res.first != TabletStatus.HEALTHY && res.first != TabletStatus.REDUNDANT
diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
index c8c1d18..8d40812 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
@@ -1027,10 +1027,10 @@ public class ReportHandler extends Daemon {
                 return;
             }
 
-            int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
+            List<Long> aliveBeIdsInCluster = infoService.getClusterBackendIds(db.getClusterName(), true);
             Pair<TabletStatus, TabletSchedCtx.Priority> status = tablet.getHealthStatusWithPriority(infoService,
                     db.getClusterName(), visibleVersion, visibleVersionHash,
-                    replicationNum, availableBackendsNum);
+                    replicationNum, aliveBeIdsInCluster);
             
             if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING) {
                 long lastFailedVersion = -1L;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org