# Reconstructed from the gitweb blobdiff of
# testcases/basic_func_tests/tc_002_pod_health_check.py (ta/cloudtaf).
"""Robot Framework test case tc_002: basic health check of a deployed stack.

The test checks the Kubernetes component statuses, that kubelet and the
kube-apiserver container are running, that the kube-system pods are running
and that systemd reports a healthy state on every node.
"""
from robot.api import logger
from robot.libraries.BuiltIn import BuiltIn
from decorators_for_robot_functionalities import *
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '../libraries/common'))
from test_constants import *  # noqa
import common_utils  # noqa


ex = BuiltIn().get_library_instance('execute_command')
STACK_INFOS = BuiltIn().get_library_instance('stack_infos')


def tc_002_pod_health_check():
    """Run every health-check step of this test case through ``common_utils.keyword_runner``."""
    steps = ['step1_check_componentstatus',
             'step2_check_kubelet_is_running',
             'step3_check_apiserver_is_running',
             'step4_check_all_kubernetes_pod',
             'step5_check_services_with_systemctl']
    common_utils.keyword_runner(steps)


@pabot_lock("health_check_1")
@pabot_lock("health_check_2")
def step1_check_componentstatus():
    """Check that every condition type reported by ``kubectl get componentstatus`` is Healthy.

    Raises:
        Exception: with the offending output line, for the first line that
            does not contain "Healthy" (this includes an empty output).
    """
    stdout = ex.execute_unix_command("kubectl get componentstatus -o json | jq .items[].conditions[].type")
    logger.console('\n')
    for line in stdout.split('\n'):
        if "Healthy" in line:
            logger.console(line)
        else:
            raise Exception(line)


@robot_log
def check_container_is_running(name, nodes):
    """Check that exactly one running, non-pause docker container matches ``name`` on each node.

    Args:
        name: container name fragment, used both as docker filter and grep pattern.
        nodes: mapping of node name to the node object passed to the remote
            executor (presumably a dict; it is iterated and indexed by key).

    Raises:
        Exception: on the first node where the count is not exactly 1; the
            message carries the matching ``docker ps`` lines for diagnosis.
    """
    for key in nodes:
        stdout = ex.execute_unix_command_on_remote_as_user("docker ps --filter status=running --filter name=" + name +
                                                           " | grep -v pause | grep " + name + " | wc -l ", nodes[key])
        if stdout == '1':
            logger.console("\n" + name + " container is running on node " + key + ".")
        else:
            stdout = ex.execute_unix_command_on_remote_as_user("docker ps | grep -v pause | grep " + name, nodes[key])
            # The leading space was missing here, gluing the name to "container".
            raise Exception(name + " container is NOT running on node " + key + "\n" + stdout)


@robot_log
def check_program_is_running(name, nodes):
    """Check that exactly one process matching ``name`` is listed by ``ps -aux`` on each node.

    Args:
        name: pattern given to ``grep`` over the ``ps -aux`` output.
        nodes: mapping of node name to the node object passed to the remote
            executor (presumably a dict; it is iterated and indexed by key).

    Raises:
        Exception: on the first node where the count is not exactly 1; the
            message carries the matching ``ps`` lines for diagnosis.
    """
    for key in nodes:
        stdout = ex.execute_unix_command_on_remote_as_user("ps -aux | grep '" + name + "' | grep -v 'color' | wc -l ",
                                                           nodes[key])
        if stdout == '1':
            logger.console("\n" + name + " is running on node " + key + ".")
        else:
            stdout = ex.execute_unix_command_on_remote_as_user("ps -aux | grep '" + name + "' | grep -v 'color'",
                                                               nodes[key])
            raise Exception(name + " is NOT running on node " + key + "\n" + stdout)


def step2_check_kubelet_is_running():
    """Check that kubelet and its healthcheck script are running on all nodes."""
    all_nodes = STACK_INFOS.get_all_nodes()
    check_program_is_running("/kubelet ", all_nodes)
    check_program_is_running("/kubelet_healthcheck.sh", all_nodes)


def step3_check_apiserver_is_running():
    """Check that the kube-apiserver container is running on the nodes returned by ``get_crf_nodes``."""
    crf_nodes = STACK_INFOS.get_crf_nodes()
    check_container_is_running("kube-apiserver", crf_nodes)


@pabot_lock("health_check_1")
def step4_check_all_kubernetes_pod():
    """Check that every kube-system pod is Running, apart from those in ``pods_skipped``.

    For each offending pod the output of ``kubectl logs`` is gathered next to
    this file before the step fails.

    Raises:
        Exception: with the filtered ``kubectl get po`` output when at least
            one pod line is neither Running nor skipped.
    """
    log_dir = os.path.dirname(__file__)
    namespace = "kube-system"
    # Lines matching "Running" or any skipped pod pattern are filtered out by grep -v.
    accepted = "|".join(["Running"] + list(pods_skipped))
    command = "kubectl get po -n " + namespace + " | tail -n +2 | grep -vP '" + accepted + "'"
    stdout = ex.execute_unix_command(command, fail_on_non_zero_rc=False, skip_prompt_in_command_output=True)[0]
    if not stdout:
        logger.console("\nAll kubernetes PODs are running.")
        return
    for line in stdout.split("\n"):
        fields = line.split()
        if not fields:
            # Blank line in the output: nothing to collect logs for.
            continue
        # With "-n <namespace>" the first column is the pod NAME (the second is
        # READY, e.g. "0/1"), so the namespace must be given explicitly.
        pod_name = fields[0]
        command = "kubectl logs --namespace " + namespace + " " + pod_name
        # NOTE(review): the "tc004_step1_" prefix looks copied from another test case; kept as-is.
        filename = "tc004_step1_" + pod_name + ".log"
        common_utils.gather_logs(command, filename, log_dir)
    raise Exception(stdout)


def step5_check_services_with_systemctl():
    """Check on every node that systemd is running with no queued jobs and no failed units.

    A node whose failed-unit list mentions any entry of ``services_skipped``
    is tolerated and only logged.

    Raises:
        Exception: with the systemctl status and failed-unit output of the
            first node that is unhealthy and not covered by ``services_skipped``.
    """
    all_nodes = STACK_INFOS.get_all_nodes()
    command = "systemctl status | grep -E 'State: running|Jobs: 0 queued|Failed: 0 units' | grep -v grep"
    expected = ["State: running", "Jobs: 0 queued", "Failed: 0 units"]
    for key in all_nodes:
        logger.console(key)
        stdout = "\nsystemctl status output:\n" + ex.execute_unix_command_on_remote_as_user(command, all_nodes[key])
        if all(x in stdout for x in expected):
            logger.console(stdout)
            continue
        # cat is needed here to remove the coloring of the systemctl for the robot logs
        failedservices = ex.execute_unix_command_on_remote_as_user("systemctl --failed | cat", all_nodes[key])
        # Reuse the output fetched above instead of querying the node a second time.
        stdout = stdout + "\n" + failedservices
        # TODO: cloud-final.service fails with unknown reason
        # NOTE(review): one skipped service among the failures tolerates all
        # failed units on the node - confirm this is intended.
        if any(service in failedservices for service in services_skipped):
            logger.console(stdout)
        else:
            raise Exception(stdout)