You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by la...@apache.org on 2020/08/15 14:59:47 UTC

[kudu] 19/23: [script] fix minos_control_server.py for newer versions

This is an automated email from the ASF dual-hosted git repository.

laiyingchun pushed a commit to tag kudu-1.12.0-mdh1.0.0-4c2c075-centos-release
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit b42d25b4dc5c41d3a379f0a4324910612472660c
Author: zhangyifan8 <zh...@xiaomi.com>
AuthorDate: Fri Jun 5 18:12:01 2020 +0800

    [script] fix minos_control_server.py for newer versions
    
    Fix minos_control_server.py to follow the rolling restart procedure described at
    https://kudu.apache.org/docs/administration.html#rolling_restart, and add some
    new metrics introduced in 1.12.0 to the Falcon screen configuration.
---
 src/kudu/scripts/falcon_screen.json      | 30 +++++++++++++-
 src/kudu/scripts/falcon_screen.py        |  1 +
 src/kudu/scripts/minos_control_server.py | 69 ++++++++++++++++++++++++++++----
 3 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/src/kudu/scripts/falcon_screen.json b/src/kudu/scripts/falcon_screen.json
index 26b7acd..8f467e5 100644
--- a/src/kudu/scripts/falcon_screen.json
+++ b/src/kudu/scripts/falcon_screen.json
@@ -44,6 +44,8 @@
           "metric=key_file_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=key_file_lookups service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=leader_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
+          "metric=last_read_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4",
+          "metric=last_write_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=live_row_count service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=log_append_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=log_bytes_logged service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -84,6 +86,7 @@
           "metric=state service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=tablet_active_scanners service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=time_since_last_leader_heartbeat service=kudu cluster=${cluster.name} level=${level} v=4",
+          "metric=transaction_memory_limit_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=transaction_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=undo_delta_block_estimated_retained_bytes service=kudu cluster=${cluster.name} level=${level} v=4",
           "metric=undo_delta_block_gc_bytes_deleted service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -173,7 +176,12 @@
         "metric=kudu-table-health service=kudu cluster=${cluster.name} level=${level} v=4"
       ],
       "table_l": [
+        "metric=last_read_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4",
+        "metric=last_write_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4",
         "metric=leader_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
+        "metric=live_row_count service=kudu cluster=${cluster.name} level=${level} v=4"
+      ],
+      "table_log": [
         "metric=log_append_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4",
         "metric=log_append_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4",
         "metric=log_append_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -207,8 +215,7 @@
         "metric=log_sync_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4",
         "metric=log_sync_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4",
         "metric=log_sync_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4",
-        "metric=log_sync_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4",
-        "metric=live_row_count service=kudu cluster=${cluster.name} level=${level} v=4"
+        "metric=log_sync_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4"
       ],
       "table_mn": [
         "metric=majority_done_ops service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -276,6 +283,7 @@
       "table_tw": [
         "metric=tablet_active_scanners service=kudu cluster=${cluster.name} level=${level} v=4",
         "metric=time_since_last_leader_heartbeat service=kudu cluster=${cluster.name} level=${level} v=4",
+        "metric=transaction_memory_limit_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
         "metric=transaction_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
         "metric=write_op_duration_client_propagated_consistency_mean service=kudu cluster=${cluster.name} level=${level} v=4",
         "metric=write_op_duration_client_propagated_consistency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -401,6 +409,7 @@
           "metric=rpcs_queue_overflow service=kudu cluster=${cluster.name} level=host v=4",
           "metric=rpcs_timed_out_in_queue service=kudu cluster=${cluster.name} level=host v=4",
           "metric=spinlock_contention_time service=kudu cluster=${cluster.name} level=host v=4",
+          "metric=sys_catalog_oversized_write_requests service=kudu cluster=${cluster.name} level=host v=4",
           "metric=tablet_copy_bytes_sent service=kudu cluster=${cluster.name} level=host v=4",
           "metric=tablet_copy_open_source_sessions service=kudu cluster=${cluster.name} level=host v=4",
           "metric=tcmalloc_current_total_thread_cache_bytes service=kudu cluster=${cluster.name} level=host v=4",
@@ -491,6 +500,7 @@
           "metric=log_block_manager_dead_containers_deleted service=kudu cluster=${cluster.name} level=host v=4",
           "metric=log_block_manager_full_containers service=kudu cluster=${cluster.name} level=host v=4",
           "metric=log_block_manager_holes_punched service=kudu cluster=${cluster.name} level=host v=4",
+          "metric=num_raft_leaders service=kudu cluster=${cluster.name} level=host v=4",
           "metric=op_apply_queue_length_percentile_99 service=kudu cluster=${cluster.name} level=host v=4",
           "metric=op_apply_queue_time_percentile_99 service=kudu cluster=${cluster.name} level=host v=4",
           "metric=op_apply_run_time_percentile_99 service=kudu cluster=${cluster.name} level=host v=4",
@@ -744,6 +754,22 @@
       ]
     },
     {
+      "screen": "${cluster.name} [metrics_log]",
+      "graphs": [
+        {
+          "title": "单表metrics",
+          "endpoints": ["${for.each.table}"],
+          "counters": {
+              "level": "table",
+              "template": "table_log"
+          },
+          "graph_type": "h",
+          "method": "",
+          "timespan": 86400
+        }
+      ]
+    },
+    {
       "screen": "${cluster.name} [metrics_mn]",
       "graphs": [
         {
diff --git a/src/kudu/scripts/falcon_screen.py b/src/kudu/scripts/falcon_screen.py
index 1695ca5..000d05f 100755
--- a/src/kudu/scripts/falcon_screen.py
+++ b/src/kudu/scripts/falcon_screen.py
@@ -39,6 +39,7 @@ screenIdList = {
         "[metrics_f]",
         "[metrics_ghijk]",
         "[metrics_l]",
+        "[metrics_log]",
         "[metrics_mn]",
         "[metrics_o]",
         "[metrics_r]",
diff --git a/src/kudu/scripts/minos_control_server.py b/src/kudu/scripts/minos_control_server.py
index 875802c..e35bf33 100755
--- a/src/kudu/scripts/minos_control_server.py
+++ b/src/kudu/scripts/minos_control_server.py
@@ -53,7 +53,7 @@ def get_host(host_port):
 
 def is_cluster_health():
     status, output = commands.getstatusoutput('${KUDU_HOME}/kudu cluster ksck @%s -consensus=false'
-                                              ' -ksck_format=json_compact -color=never'
+                                              ' -ksck_format=json_compact -quiescing_info=false -color=never'
                                               ' -sections=MASTER_SUMMARIES,TSERVER_SUMMARIES,TABLE_SUMMARIES'
                                               ' 2>/dev/null'
                                               % cluster)
@@ -140,6 +140,22 @@ def get_tablet_server_info(hostname, tservers_info):
     return rpc_address, uuid
 
 
+def get_cluster_version():
+    """Return the lowest version string in ksck's VERSION_SUMMARIES section ('9.9.9' at most)."""
+    def numeric_key(text):
+        # Compare release fields as ints; as plain strings '1.9.0' sorts above '1.11.0'.
+        fields = text.split('-')[0].split('.')
+        return [int(field) if field.isdigit() else 0 for field in fields], text
+    status, output = commands.getstatusoutput('${KUDU_HOME}/kudu cluster ksck @%s -sections=VERSION_SUMMARIES '
+                                              '-quiescing_info=false -ksck_format=json_compact'
+                                              % cluster)
+    exit_if_failed(status, output)
+    versions = [item['version'] for item in json.loads(output)['version_summaries']]
+    # '9.9.9' keeps the original sentinel: it is the result when no summary is reported.
+    # NOTE(review): callers still compare this value as a string (e.g. version < '1.11').
+    return min(versions + ['9.9.9'], key=numeric_key)
+
+
 def set_flag(rpc_address, seconds):
     cmd = ('${KUDU_HOME}/kudu tserver set_flag %s follower_unavailable_considered_failed_sec %s'
            % (rpc_address, seconds))
@@ -147,13 +163,42 @@ def set_flag(rpc_address, seconds):
     exit_if_failed(status, output)
 
 
+def maintain_tserver(op_type, ts_uuid):
+    """Run `kudu tserver state <op_type>` for ts_uuid and hand the result to exit_if_failed."""
+    state_cmd = '${KUDU_HOME}/kudu tserver state %s @%s %s' % (op_type, cluster, ts_uuid)
+    rc, out = commands.getstatusoutput(state_cmd)
+    exit_if_failed(rc, out)
+
+
+def wait_tserver_quiesce(rpc_address):
+    """Retry `kudu tserver quiesce start` once a second until it exits with status 0."""
+    print(time_header() + 'Start to quiesce tserver ' + rpc_address)
+    quiesce_cmd = '${KUDU_HOME}/kudu tserver quiesce start %s -error_if_not_fully_quiesced' % rpc_address
+    while True:
+        rc, out = commands.getstatusoutput(quiesce_cmd)
+        if rc == 0:
+            break
+        # Non-zero exit: report the tool's output and retry after a short pause.
+        print(time_header() + out)
+        time.sleep(1)
+    print(time_header() + 'Tablet server is fully quiesced.')
+
+
 def rebalance_cluster(blacklist_tserver_uuid):
     ignored_tservers_uuid = set()
     for node in known_unhealth_nodes:
         rpc_address, uuid = get_tablet_server_info(node, tservers_info)
         ignored_tservers_uuid.add(uuid)
-    cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -blacklist_tservers=%s -ignored_tservers=%s'
-           % (cluster, blacklist_tserver_uuid, str(','.join(ignored_tservers_uuid))))
+    if blacklist_tserver_uuid == '':
+        cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -ignored_tservers=%s -quiescing_info=false'
+               % (cluster, str(','.join(ignored_tservers_uuid))))
+    elif version < '1.11':
+        cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -blacklist_tservers=%s -ignored_tservers=%s -quiescing_info=false'
+               % (cluster, blacklist_tserver_uuid, str(','.join(ignored_tservers_uuid))))
+    else:
+        ignored_tservers_uuid.add(blacklist_tserver_uuid)
+        cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -ignored_tservers=%s -move_replicas_from_ignored_tservers'
+               % (cluster, str(','.join(ignored_tservers_uuid))))
     p = subprocess.Popen(cmd, stdout = subprocess.PIPE, shell=True)
     for line in iter(p.stdout.readline, b''):
         print line
@@ -184,8 +229,10 @@ check_parameter('You will rebalance cluster after operation: %s? (y/n)', rebalan
 
 tservers_info = get_tservers_info()
 wait_cluster_health()
+version = get_cluster_version()
+print('The cluster version(before rolling_update) is ' + version)
 
-if 'tablet_server' in job and operate in ['restart', 'rolling_update']:
+if version < '1.11' and 'tablet_server' in job and operate in ['restart', 'rolling_update']:
     for tserver in tservers_info:
         set_flag(tserver['rpc-addresses'], 7200)
 
@@ -195,13 +242,16 @@ for task in tasks:
         exit()
 
     if 'tablet_server' in job:
-        cmd = ('%s/deploy show kudu %s --job %s --task %d'
-          % (minos_client_path, cluster, job, task))
+        cmd = ('%s/deploy show kudu %s --job %s --task %d' % (minos_client_path, cluster, job, task))
         status, output = commands.getstatusoutput(cmd)
         exit_if_failed(status, output)
         print(output)
         hostname = parse_node_from_minos_output(output, job)
         rpc_address, uuid = get_tablet_server_info(hostname, tservers_info)
+        if version > '1.11':
+            maintain_tserver("enter_maintenance", uuid)
+            if version > '1.12':
+                wait_tserver_quiesce(rpc_address)
         if operate == 'stop':
             # migrate replicas on tserver
             rebalance_cluster(uuid)
@@ -218,12 +268,15 @@ for task in tasks:
     wait_cluster_health()
 
     if 'tablet_server' in job and operate in ['restart', 'rolling_update']:
-        set_flag(rpc_address, 7200)
+        if version < '1.11':
+            set_flag(rpc_address, 7200)
+        else:
+            maintain_tserver("exit_maintenance", uuid)
 
     print(time_header() + '==========================')
     time.sleep(10)
 
-if 'tablet_server' in job and operate in ['restart', 'rolling_update']:
+if version < '1.11' and 'tablet_server' in job and operate in ['restart', 'rolling_update']:
     for tserver in tservers_info:
         set_flag(tserver['rpc-addresses'], default_follower_unavailable_considered_failed_sec)