"""Robot Framework test case tc_002: health check of kubernetes components.

The steps query componentstatus, the kubelet processes, the kube-apiserver
containers, the kube-system pods and the systemd state of the nodes.
"""
from robot.api import logger
from robot.libraries.BuiltIn import BuiltIn
from decorators_for_robot_functionalities import *
import sys
import os
# The project-local modules below are only importable after this path is added,
# so the import order of this section must not be changed.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '../libraries/common'))
from test_constants import *  # noqa
import common_utils  # noqa

ex = BuiltIn().get_library_instance('execute_command')
STACK_INFOS = BuiltIn().get_library_instance('stack_infos')


def tc_002_pod_health_check():
    """Hand the names of the five health-check steps to the keyword runner."""
    steps = ['step1_check_componentstatus',
             'step2_check_kubelet_is_running',
             'step3_check_apiserver_is_running',
             'step4_check_all_kubernetes_pod',
             'step5_check_services_with_systemctl']
    common_utils.keyword_runner(steps)


@pabot_lock("health_check_1")
@pabot_lock("health_check_2")
def step1_check_componentstatus():
    """Check that every componentstatus condition type is "Healthy".

    Raises:
        Exception: with the offending output line, for the first line that
            does not contain "Healthy".
    """
    stdout = ex.execute_unix_command(
        "kubectl get componentstatus -o json | jq .items[].conditions[].type")
    logger.console('\n')
    for line in stdout.split('\n'):
        if "Healthy" not in line:
            raise Exception(line)
        logger.console(line)


@robot_log
def check_container_is_running(name, nodes):
    """Check that exactly one running, non-pause container matches *name*.

    Args:
        name: text used both as the docker name filter and as the grep pattern.
        nodes: mapping of node name to the node object handed to the remote
            executor.

    Raises:
        Exception: for the first node where the container count is not '1';
            the message carries the matching `docker ps` lines of that node.
    """
    for key in nodes:
        count_command = ("docker ps --filter status=running --filter name=" + name
                         + " | grep -v pause | grep " + name + " | wc -l ")
        stdout = ex.execute_unix_command_on_remote_as_user(count_command, nodes[key])
        if stdout == '1':
            logger.console("\n" + name + " container is running on node " + key + ".")
            continue
        # Collect the raw listing so the failure message shows what was found.
        stdout = ex.execute_unix_command_on_remote_as_user(
            "docker ps | grep -v pause | grep " + name, nodes[key])
        # The original message lacked the space between the name and "container".
        raise Exception(name + " container is NOT running on node " + key + "\n" + stdout)


@robot_log
def check_program_is_running(name, nodes):
    """Check that exactly one `ps -aux` line matches *name* on every node.

    Args:
        name: grep pattern identifying the process.
        nodes: mapping of node name to the node object handed to the remote
            executor.

    Raises:
        Exception: for the first node where the match count is not '1'; the
            message carries the matching `ps` lines of that node.
    """
    for key in nodes:
        list_command = "ps -aux | grep '" + name + "' | grep -v 'color'"
        stdout = ex.execute_unix_command_on_remote_as_user(
            list_command + " | wc -l ", nodes[key])
        if stdout == '1':
            logger.console("\n" + name + " is running on node " + key + ".")
            continue
        # Collect the raw listing so the failure message shows what was found.
        stdout = ex.execute_unix_command_on_remote_as_user(list_command, nodes[key])
        raise Exception(name + " is NOT running on node " + key + "\n" + stdout)


def step2_check_kubelet_is_running():
    """Check the kubelet and its healthcheck script on all nodes."""
    all_nodes = STACK_INFOS.get_all_nodes()
    check_program_is_running("/kubelet ", all_nodes)
    check_program_is_running("/kubelet_healthcheck.sh", all_nodes)


def step3_check_apiserver_is_running():
    """Check the kube-apiserver container on the nodes from get_crf_nodes()."""
    crf_nodes = STACK_INFOS.get_crf_nodes()
    check_container_is_running("kube-apiserver", crf_nodes)


@pabot_lock("health_check_1")
def step4_check_all_kubernetes_pod():
    """Check that every kube-system pod is Running or explicitly skipped.

    Pods whose listing line matches "Running" or any entry of `pods_skipped`
    are filtered out; for each remaining pod the logs are gathered next to
    this file.

    Raises:
        Exception: with the listing of the offending pods, if any remain.
    """
    log_dir = os.path.dirname(__file__)
    # Perl-style alternation of everything that is acceptable in a listing line.
    accepted = "|".join(["Running"] + list(pods_skipped))
    command = "kubectl get po -n kube-system | tail -n +2 | grep -vP '" + accepted + "'"
    stdout = ex.execute_unix_command(command, fail_on_non_zero_rc=False,
                                     skip_prompt_in_command_output=True)[0]
    if not stdout:
        logger.console("\nAll kubernetes PODs are running.")
        return
    for line in stdout.split("\n"):
        fields = line.split()
        if not fields:
            # A blank line carries no pod; indexing it would raise IndexError.
            continue
        # With "-n kube-system" the listing has no NAMESPACE column, so the
        # first field is the pod name (the second one is the READY counter,
        # e.g. "0/1", which is neither a pod name nor a valid file name).
        pod_name = fields[0]
        command = "kubectl logs --namespace kube-system " + pod_name
        filename = "tc004_step1_" + pod_name + ".log"
        common_utils.gather_logs(command, filename, log_dir)
    raise Exception(stdout)


def step5_check_services_with_systemctl():
    """Check the overall systemd state on all nodes.

    A node passes when its `systemctl status` summary contains
    "State: running", "Jobs: 0 queued" and "Failed: 0 units". Otherwise the
    failed unit list is fetched; the node is still accepted when that list
    mentions any entry of `services_skipped`.

    Raises:
        Exception: with the status summary and the failed unit list of the
            first node that fails without a skipped service being involved.
    """
    all_nodes = STACK_INFOS.get_all_nodes()
    command = "systemctl status | grep -E 'State: running|Jobs: 0 queued|Failed: 0 units' | grep -v grep"
    expected = ["State: running", "Jobs: 0 queued", "Failed: 0 units"]
    for key in all_nodes:
        logger.console(key)
        stdout = "\nsystemctl status output:\n" + ex.execute_unix_command_on_remote_as_user(
            command, all_nodes[key])
        if all(x in stdout for x in expected):
            logger.console(stdout)
            continue
        # cat is needed here to remove the coloring of the systemctl for the robot logs
        failedservices = ex.execute_unix_command_on_remote_as_user(
            "systemctl --failed | cat", all_nodes[key])
        # Reuse the list fetched above instead of running the command again.
        stdout = stdout + "\n" + failedservices
        # TODO: cloud-final.service fails with unknown reason
        if any(service in failedservices for service in services_skipped):
            logger.console(stdout)
        else:
            raise Exception(stdout)