testcases/basic_func_tests/tc_002_pod_health_check.py

   1 from robot.api import logger
   2 from robot.libraries.BuiltIn import BuiltIn
   3 from decorators_for_robot_functionalities import *
   4 import sys
   5 import os
   6 sys.path.append(os.path.join(os.path.dirname(__file__), '..', '../libraries/common'))
   7 from test_constants import *  # noqa
   8 import common_utils  # noqa
   9
  10
# Handles to Robot Framework libraries, looked up by name through BuiltIn.
# `ex` is used below to run shell commands (locally and on remote nodes);
# `STACK_INFOS` is used below to obtain the node mappings to check.
# NOTE(review): these lookups run at import time, so this module presumably
# can only be imported from inside a running Robot Framework execution where
# both libraries have already been imported - confirm.
ex = BuiltIn().get_library_instance('execute_command')
STACK_INFOS = BuiltIn().get_library_instance('stack_infos')
  13
  14
  15 def tc_002_pod_health_check():
  16     steps = ['step1_check_componentstatus',
  17              'step2_check_kubelet_is_running',
  18              'step3_check_apiserver_is_running',
  19              'step4_check_all_kubernetes_pod',
  20              'step5_check_services_with_systemctl']
  21     common_utils.keyword_runner(steps)
  22
  23
  24 @pabot_lock("health_check_1")
  25 @pabot_lock("health_check_2")
  26 def step1_check_componentstatus():
  27     stdout = ex.execute_unix_command("kubectl get componentstatus -o json | jq .items[].conditions[].type")
  28     logger.console('\n')
  29     for line in stdout.split('\n'):
  30         if "Healthy" in line:
  31             logger.console(line)
  32         else:
  33             raise Exception(line)
  34
  35
  36 @robot_log
  37 def check_container_is_running(name, nodes):
  38     for key in nodes:
  39         stdout = ex.execute_unix_command_on_remote_as_user("docker ps --filter status=running --filter name=" + name +
  40                                                            " | grep -v pause | grep " + name + " | wc -l ", nodes[key])
  41         if stdout == '1':
  42             logger.console("\n" + name + " container is running on node " + key + ".")
  43         else:
  44             stdout = ex.execute_unix_command_on_remote_as_user("docker ps | grep -v pause | grep " + name, nodes[key])
  45             raise Exception(name + "container is NOT running on node " + key + "\n" + stdout)
  46
  47
  48 @robot_log
  49 def check_program_is_running(name, nodes):
  50     for key in nodes:
  51         stdout = ex.execute_unix_command_on_remote_as_user("ps -aux | grep '" + name + "' | grep -v 'color' | wc -l ",
  52                                                            nodes[key])
  53         if stdout == '1':
  54             logger.console("\n" + name + " is running on node " + key + ".")
  55         else:
  56             stdout = ex.execute_unix_command_on_remote_as_user("ps -aux | grep '" + name + "' | grep -v 'color'",
  57                                                                nodes[key])
  58             raise Exception(name + " is NOT running on node " + key + "\n" + stdout)
  59
  60
  61 def step2_check_kubelet_is_running():
  62     all_nodes = STACK_INFOS.get_all_nodes()
  63     check_program_is_running("/kubelet ", all_nodes)
  64     check_program_is_running("/kubelet_healthcheck.sh", all_nodes)
  65
  66
  67 def step3_check_apiserver_is_running():
  68     crf_nodes = STACK_INFOS.get_crf_nodes()
  69     check_container_is_running("kube-apiserver", crf_nodes)
  70
  71
  72 @pabot_lock("health_check_1")
  73 def step4_check_all_kubernetes_pod():
  74     LOG_DIR = os.path.join(os.path.dirname(__file__))
  75     command = "kubectl get po -n kube-system | tail -n +2 | grep -vP 'Running"
  76     for pod in pods_skipped:
  77         command += '|'+pod
  78     command += "'"
  79     stdout = ex.execute_unix_command(command, fail_on_non_zero_rc=False, skip_prompt_in_command_output=True)[0]
  80     if not stdout:
  81         logger.console("\nAll kubernetes PODs are running.")
  82         return
  83     for line in stdout.split("\n"):
  84         line = line.split()
  85         command = "kubectl logs --namespace " + line[0] + " " + line[1]
  86         filename = "tc004_step1_" + line[1] + ".log"
  87         common_utils.gather_logs(command, filename, LOG_DIR)
  88     raise Exception(stdout)
  89
  90
  91 def step5_check_services_with_systemctl():
  92     all_nodes = STACK_INFOS.get_all_nodes()
  93     command = "systemctl status | grep -E 'State: running|Jobs: 0 queued|Failed: 0 units' | grep -v grep"
  94     for key in all_nodes:
  95         logger.console(key)
  96         stdout = "\nsystemctl status output:\n" + ex.execute_unix_command_on_remote_as_user(command, all_nodes[key])
  97         if all(x in stdout for x in ["State: running", "Jobs: 0 queued", "Failed: 0 units"]):
  98             logger.console(stdout)
  99         else:
 100             # cat is needed here to remove the coloring of the systemctl for the robot logs
 101             failedservices = ex.execute_unix_command_on_remote_as_user("systemctl --failed | cat", all_nodes[key])
 102             # TODO: cloud-final.service fails with unknown reason
 103             if any(service in failedservices for service in services_skipped):
 104                 stdout = stdout + "\n" + ex.execute_unix_command_on_remote_as_user("systemctl --failed | cat",
 105                                                                                    all_nodes[key])
 106                 logger.console(stdout)
 107             else:
 108                 stdout = stdout + "\n" + ex.execute_unix_command_on_remote_as_user("systemctl --failed | cat",
 109                                                                                    all_nodes[key])
 110                 raise Exception(stdout)