You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by la...@apache.org on 2020/08/15 14:59:47 UTC
[kudu] 19/23: [script] fix minos_control_server.py for newer versions
This is an automated email from the ASF dual-hosted git repository.
laiyingchun pushed a commit to tag kudu-1.12.0-mdh1.0.0-4c2c075-centos-release
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit b42d25b4dc5c41d3a379f0a4324910612472660c
Author: zhangyifan8 <zh...@xiaomi.com>
AuthorDate: Fri Jun 5 18:12:01 2020 +0800
[script] fix minos_control_server.py for newer versions
Fix minos_control_server.py to follow the rolling restart procedure described at
https://kudu.apache.org/docs/administration.html#rolling_restart. Also add some new metrics introduced in 1.12.0.
---
src/kudu/scripts/falcon_screen.json | 30 +++++++++++++-
src/kudu/scripts/falcon_screen.py | 1 +
src/kudu/scripts/minos_control_server.py | 69 ++++++++++++++++++++++++++++----
3 files changed, 90 insertions(+), 10 deletions(-)
diff --git a/src/kudu/scripts/falcon_screen.json b/src/kudu/scripts/falcon_screen.json
index 26b7acd..8f467e5 100644
--- a/src/kudu/scripts/falcon_screen.json
+++ b/src/kudu/scripts/falcon_screen.json
@@ -44,6 +44,8 @@
"metric=key_file_lookups_per_op_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=key_file_lookups service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=leader_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
+ "metric=last_read_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4",
+ "metric=last_write_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=live_row_count service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=log_append_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=log_bytes_logged service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -84,6 +86,7 @@
"metric=state service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=tablet_active_scanners service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=time_since_last_leader_heartbeat service=kudu cluster=${cluster.name} level=${level} v=4",
+ "metric=transaction_memory_limit_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=transaction_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=undo_delta_block_estimated_retained_bytes service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=undo_delta_block_gc_bytes_deleted service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -173,7 +176,12 @@
"metric=kudu-table-health service=kudu cluster=${cluster.name} level=${level} v=4"
],
"table_l": [
+ "metric=last_read_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4",
+ "metric=last_write_elapsed_time service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=leader_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
+ "metric=live_row_count service=kudu cluster=${cluster.name} level=${level} v=4"
+ ],
+ "table_log": [
"metric=log_append_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=log_append_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=log_append_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -207,8 +215,7 @@
"metric=log_sync_latency_mean service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=log_sync_latency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=log_sync_latency_percentile_95 service=kudu cluster=${cluster.name} level=${level} v=4",
- "metric=log_sync_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4",
- "metric=live_row_count service=kudu cluster=${cluster.name} level=${level} v=4"
+ "metric=log_sync_latency_percentile_99 service=kudu cluster=${cluster.name} level=${level} v=4"
],
"table_mn": [
"metric=majority_done_ops service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -276,6 +283,7 @@
"table_tw": [
"metric=tablet_active_scanners service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=time_since_last_leader_heartbeat service=kudu cluster=${cluster.name} level=${level} v=4",
+ "metric=transaction_memory_limit_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=transaction_memory_pressure_rejections service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=write_op_duration_client_propagated_consistency_mean service=kudu cluster=${cluster.name} level=${level} v=4",
"metric=write_op_duration_client_propagated_consistency_percentile_75 service=kudu cluster=${cluster.name} level=${level} v=4",
@@ -401,6 +409,7 @@
"metric=rpcs_queue_overflow service=kudu cluster=${cluster.name} level=host v=4",
"metric=rpcs_timed_out_in_queue service=kudu cluster=${cluster.name} level=host v=4",
"metric=spinlock_contention_time service=kudu cluster=${cluster.name} level=host v=4",
+ "metric=sys_catalog_oversized_write_requests service=kudu cluster=${cluster.name} level=host v=4",
"metric=tablet_copy_bytes_sent service=kudu cluster=${cluster.name} level=host v=4",
"metric=tablet_copy_open_source_sessions service=kudu cluster=${cluster.name} level=host v=4",
"metric=tcmalloc_current_total_thread_cache_bytes service=kudu cluster=${cluster.name} level=host v=4",
@@ -491,6 +500,7 @@
"metric=log_block_manager_dead_containers_deleted service=kudu cluster=${cluster.name} level=host v=4",
"metric=log_block_manager_full_containers service=kudu cluster=${cluster.name} level=host v=4",
"metric=log_block_manager_holes_punched service=kudu cluster=${cluster.name} level=host v=4",
+ "metric=num_raft_leaders service=kudu cluster=${cluster.name} level=host v=4",
"metric=op_apply_queue_length_percentile_99 service=kudu cluster=${cluster.name} level=host v=4",
"metric=op_apply_queue_time_percentile_99 service=kudu cluster=${cluster.name} level=host v=4",
"metric=op_apply_run_time_percentile_99 service=kudu cluster=${cluster.name} level=host v=4",
@@ -744,6 +754,22 @@
]
},
{
+ "screen": "${cluster.name} [metrics_log]",
+ "graphs": [
+ {
+ "title": "单表metrics",
+ "endpoints": ["${for.each.table}"],
+ "counters": {
+ "level": "table",
+ "template": "table_log"
+ },
+ "graph_type": "h",
+ "method": "",
+ "timespan": 86400
+ }
+ ]
+ },
+ {
"screen": "${cluster.name} [metrics_mn]",
"graphs": [
{
diff --git a/src/kudu/scripts/falcon_screen.py b/src/kudu/scripts/falcon_screen.py
index 1695ca5..000d05f 100755
--- a/src/kudu/scripts/falcon_screen.py
+++ b/src/kudu/scripts/falcon_screen.py
@@ -39,6 +39,7 @@ screenIdList = {
"[metrics_f]",
"[metrics_ghijk]",
"[metrics_l]",
+ "[metrics_log]",
"[metrics_mn]",
"[metrics_o]",
"[metrics_r]",
diff --git a/src/kudu/scripts/minos_control_server.py b/src/kudu/scripts/minos_control_server.py
index 875802c..e35bf33 100755
--- a/src/kudu/scripts/minos_control_server.py
+++ b/src/kudu/scripts/minos_control_server.py
@@ -53,7 +53,7 @@ def get_host(host_port):
def is_cluster_health():
status, output = commands.getstatusoutput('${KUDU_HOME}/kudu cluster ksck @%s -consensus=false'
- ' -ksck_format=json_compact -color=never'
+ ' -ksck_format=json_compact -quiescing_info=false -color=never'
' -sections=MASTER_SUMMARIES,TSERVER_SUMMARIES,TABLE_SUMMARIES'
' 2>/dev/null'
% cluster)
@@ -140,6 +140,22 @@ def get_tablet_server_info(hostname, tservers_info):
return rpc_address, uuid
+def get_cluster_version():
+ version_info = dict()
+ version = ''
+ min_version = '9.9.9'
+ status, output = commands.getstatusoutput('${KUDU_HOME}/kudu cluster ksck @%s -sections=VERSION_SUMMARIES '
+ '-quiescing_info=false -ksck_format=json_compact'
+ % cluster)
+ exit_if_failed(status, output)
+ version_info = json.loads(output)
+ for item in version_info['version_summaries']:
+ version = item['version']
+ if version < min_version:
+ min_version = version
+ return min_version
+
+
def set_flag(rpc_address, seconds):
cmd = ('${KUDU_HOME}/kudu tserver set_flag %s follower_unavailable_considered_failed_sec %s'
% (rpc_address, seconds))
@@ -147,13 +163,42 @@ def set_flag(rpc_address, seconds):
exit_if_failed(status, output)
+def maintain_tserver(op_type, ts_uuid):
+ cmd = ('${KUDU_HOME}/kudu tserver state %s @%s %s'
+ % (op_type, cluster, ts_uuid))
+ status, output = commands.getstatusoutput(cmd)
+ exit_if_failed(status, output)
+
+
+def wait_tserver_quiesce(rpc_address):
+ print(time_header() + 'Start to quiesce tserver ' + rpc_address)
+ cmd = ('${KUDU_HOME}/kudu tserver quiesce start %s -error_if_not_fully_quiesced' % (rpc_address))
+ is_quiesced = False
+ while not is_quiesced:
+ status, output = commands.getstatusoutput(cmd)
+ if status == 0:
+ print(time_header() + 'Tablet server is fully quiesced.')
+ is_quiesced = True
+ else:
+ print(time_header() + output)
+ time.sleep(1)
+
+
def rebalance_cluster(blacklist_tserver_uuid):
ignored_tservers_uuid = set()
for node in known_unhealth_nodes:
rpc_address, uuid = get_tablet_server_info(node, tservers_info)
ignored_tservers_uuid.add(uuid)
- cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -blacklist_tservers=%s -ignored_tservers=%s'
- % (cluster, blacklist_tserver_uuid, str(','.join(ignored_tservers_uuid))))
+ if blacklist_tserver_uuid == '':
+ cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -ignored_tservers=%s -quiescing_info=false'
+ % (cluster, str(','.join(ignored_tservers_uuid))))
+ elif version < '1.11':
+ cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -blacklist_tservers=%s -ignored_tservers=%s -quiescing_info=false'
+ % (cluster, blacklist_tserver_uuid, str(','.join(ignored_tservers_uuid))))
+ else:
+ ignored_tservers_uuid.add(blacklist_tserver_uuid)
+ cmd = ('${KUDU_HOME}/kudu cluster rebalance @%s -ignored_tservers=%s -move_replicas_from_ignored_tservers'
+ % (cluster, str(','.join(ignored_tservers_uuid))))
p = subprocess.Popen(cmd, stdout = subprocess.PIPE, shell=True)
for line in iter(p.stdout.readline, b''):
print line
@@ -184,8 +229,10 @@ check_parameter('You will rebalance cluster after operation: %s? (y/n)', rebalan
tservers_info = get_tservers_info()
wait_cluster_health()
+version = get_cluster_version()
+print('The cluster version(before rolling_update) is ' + version)
-if 'tablet_server' in job and operate in ['restart', 'rolling_update']:
+if version < '1.11' and 'tablet_server' in job and operate in ['restart', 'rolling_update']:
for tserver in tservers_info:
set_flag(tserver['rpc-addresses'], 7200)
@@ -195,13 +242,16 @@ for task in tasks:
exit()
if 'tablet_server' in job:
- cmd = ('%s/deploy show kudu %s --job %s --task %d'
- % (minos_client_path, cluster, job, task))
+ cmd = ('%s/deploy show kudu %s --job %s --task %d' % (minos_client_path, cluster, job, task))
status, output = commands.getstatusoutput(cmd)
exit_if_failed(status, output)
print(output)
hostname = parse_node_from_minos_output(output, job)
rpc_address, uuid = get_tablet_server_info(hostname, tservers_info)
+ if version > '1.11':
+ maintain_tserver("enter_maintenance", uuid)
+ if version > '1.12':
+ wait_tserver_quiesce(rpc_address)
if operate == 'stop':
# migrate replicas on tserver
rebalance_cluster(uuid)
@@ -218,12 +268,15 @@ for task in tasks:
wait_cluster_health()
if 'tablet_server' in job and operate in ['restart', 'rolling_update']:
- set_flag(rpc_address, 7200)
+ if version < '1.11':
+ set_flag(rpc_address, 7200)
+ else:
+ maintain_tserver("exit_maintenance", uuid)
print(time_header() + '==========================')
time.sleep(10)
-if 'tablet_server' in job and operate in ['restart', 'rolling_update']:
+if version < '1.11' and 'tablet_server' in job and operate in ['restart', 'rolling_update']:
for tserver in tservers_info:
set_flag(tserver['rpc-addresses'], default_follower_unavailable_considered_failed_sec)