From 4fa2d523a4eced951f6e250cf926ce3d7edf1649 Mon Sep 17 00:00:00 2001 From: Balazs Szekeres Date: Tue, 12 Nov 2019 14:52:08 +0100 Subject: [PATCH] Kubelet healthcheck reimagined. Fix for: https://jira.akraino.org/browse/REC-71 Healthcheck now pings local kubelet healthz API. Every successful healthcheck loop now tries to uncordon the node. Change-Id: Ib7665f0864fbdd2feb4d5ab9116e7c34030ee3f4 Signed-off-by: Balazs Szekeres --- ansible/roles/kubelet/defaults/main.yaml | 2 + .../roles/kubelet/templates/kubelet_healthcheck.sh | 61 +++++++--------------- caas-kubernetes.spec | 2 +- 3 files changed, 21 insertions(+), 44 deletions(-) diff --git a/ansible/roles/kubelet/defaults/main.yaml b/ansible/roles/kubelet/defaults/main.yaml index c0c071b..7ce5660 100644 --- a/ansible/roles/kubelet/defaults/main.yaml +++ b/ansible/roles/kubelet/defaults/main.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +kubelet_healthcheck_port: 10248 + kubelet_kubeconfig_path: "{% if nodename | search('caas_master') %}/etc/kubernetes/kubeconfig/kubeletc.yml{% else %}/root/kubeletc.yml{% endif %}" common_kubelet_params: diff --git a/ansible/roles/kubelet/templates/kubelet_healthcheck.sh b/ansible/roles/kubelet/templates/kubelet_healthcheck.sh index 7cf4a08..6c9f46d 100644 --- a/ansible/roles/kubelet/templates/kubelet_healthcheck.sh +++ b/ansible/roles/kubelet/templates/kubelet_healthcheck.sh @@ -13,61 +13,36 @@ # See the License for the specific language governing permissions and # limitations under the License. -wait_for_file () { - while [[ ! 
-f $1 ]] - do - echo "Waiting for file $1" - sleep 1 - done -} - - -CERT_AUTH="/etc/openssl/ca.pem" -CLIENT_CER="/etc/kubernetes/ssl/kubelet-server.pem" -CLIENT_KEY="/etc/kubernetes/ssl/kubelet-server-key.pem" -wait_for_file $CERT_AUTH -wait_for_file $CLIENT_CER -wait_for_file $CLIENT_KEY - - -keepdoing="true" error=0 while true do - if [[ "$keepdoing" == "true" ]] - then - echo "Waiting for kubernetes node to become ready..." - uncordon_ready=$( /usr/bin/kubectl get node --show-labels | grep -i "{{ nodename }}" | grep -i "ready" | grep -i "SchedulingDisabled" | wc -l ) - if [[ "$uncordon_ready" -eq "1" ]] - then - keepdoing="false" - /usr/bin/kubectl uncordon {{ ansible_host }} || echo "Post start kubelet, this node was never cordoned." - echo "Node uncordoned, and ready!" - fi - node_ready=$( /usr/bin/kubectl get node --show-labels | grep -i "{{ nodename }}" | grep -i " ready " | wc -l ) - if [[ "$node_ready" -eq "1" ]] - then - keepdoing="false" - echo "Node become ready." - fi - fi set +e - result="$(wget --timeout 10 --tries 5 --ca-certificate $CERT_AUTH --certificate $CLIENT_CER --private-key $CLIENT_KEY --spider https://{{ ansible_host }}:10250/healthz 2>&1 | grep 'HTTP' | grep -E -o '[[:digit:]]{3}')" - + result="$(curl 127.0.0.1:{{ kubelet_healthcheck_port }}/healthz)" set -e - if [ "$result" == "200" ] + if [ "$result" == "ok" ] then - echo "Healtcheck success" + echo "Healtcheck success." error=0 + set +e + uncordonresult="$(/usr/bin/kubectl uncordon {{ ansible_host }} 2>&1)" + set -e + echo "$uncordonresult" else - echo "Healtcheck failed" + echo "Healtcheck failed." error=$(($error+1)) fi if [ "$error" -ge "5" ] then - echo "Error with kubelet (Healtcheck failed 5 times) restarting it" - systemctl restart kubelet.service + activeState="$(systemctl show -p ActiveState --value kubelet)" + if [[ "$activeState" == "deactivating" ]] || [[ "$activeState" == "activating" ]] + then + echo "Kubelet is possibly restarting." 
+ error=0 + else + echo "Error with kubelet (Healtcheck failed 5 times) restarting it." + systemctl restart kubelet.service + fi fi - sleep 30 + sleep 1 done diff --git a/caas-kubernetes.spec b/caas-kubernetes.spec index bcb42df..d387e7e 100644 --- a/caas-kubernetes.spec +++ b/caas-kubernetes.spec @@ -15,7 +15,7 @@ %define COMPONENT kubernetes %define RPM_NAME caas-%{COMPONENT} %define RPM_MAJOR_VERSION 1.16.2 -%define RPM_MINOR_VERSION 3 +%define RPM_MINOR_VERSION 4 %define IMAGE_TAG %{RPM_MAJOR_VERSION}-%{RPM_MINOR_VERSION} %define KUBERNETESPAUSE_VERSION 3.1 -- 2.16.6