#!/bin/bash

# Copyright 2014 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Validates that the cluster is healthy.
#
# Error codes are:
# 0 - success
# 1 - fatal (cluster is unlikely to work)
# 2 - non-fatal (encountered some errors, but cluster should be working correctly)
#
# Environment read by this script:
#   NUM_NODES                              expected node count (required)
#   ALLOWED_NOTREADY_NODES                 nodes allowed to stay not-ready (default 0)
#   CLUSTER_READY_ADDITIONAL_TIME_SECONDS  extra wait once enough nodes are
#                                          ready (default 30)
#   KUBERNETES_PROVIDER, MULTIZONE, PROJECT, REGION, NODE_INSTANCE_PREFIX
#                                          used to count instances on gce
#   REGISTER_MASTER_KUBELET                "true" adds the masters to the
#                                          expected node count
#   KUBE_USE_EXISTING_MASTER               "true" silences the "too many nodes"
#                                          warning

set -o errexit
set -o nounset
set -o pipefail

KUBE_ROOT="$(dirname "${BASH_SOURCE[0]}")/.."

if [[ -f "${KUBE_ROOT}/cluster/env.sh" ]]; then
  source "${KUBE_ROOT}/cluster/env.sh"
fi

# NOTE(review): the color_* variables and get-master-replicas-count used below
# are not defined in this file; presumably they come from these sourced
# libraries - confirm.
source "${KUBE_ROOT}/hack/lib/util.sh"
source "${KUBE_ROOT}/cluster/kube-util.sh"

# Run kubectl and retry upon failure.
# Arguments: passed through unchanged to cluster/kubectl.sh
# Outputs:   kubectl's own output; retry notices go to stderr
# Returns:   0 as soon as one invocation succeeds, 1 after three failures
function kubectl_retry() {
  local tries=3
  while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
    tries=$((tries - 1))
    if [[ ${tries} -le 0 ]]; then
      echo "('kubectl $*' failed, giving up)" >&2
      return 1
    fi
    echo "(kubectl failed, will retry ${tries} times)" >&2
    sleep 1
  done
}

ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
CLUSTER_READY_ADDITIONAL_TIME_SECONDS="${CLUSTER_READY_ADDITIONAL_TIME_SECONDS:-30}"

EXPECTED_NUM_NODES="${NUM_NODES}"

if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
  echo "Validating gce cluster, MULTIZONE=${MULTIZONE:-}"
  # In multizone mode we need to add instances for all nodes in the region.
  if [[ "${MULTIZONE:-}" == "true" ]]; then
    # The inner gcloud call produces a comma-separated list of the region's
    # zones; the outer call lists matching instances in those zones, one per
    # line, and wc -l counts them.
    EXPECTED_NUM_NODES=$(gcloud -q compute instances list --project="${PROJECT}" --format='[no-heading]' \
      --filter="name ~ '${NODE_INSTANCE_PREFIX}.*' AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter="region=${REGION}" --format='csv[no-heading](name)' | tr "\n" "," | sed "s/,$//"))" | wc -l)
    echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}"
  fi
fi

if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
  if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
    NUM_MASTERS=$(get-master-replicas-count)
  else
    NUM_MASTERS=1
  fi
  EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES + NUM_MASTERS))
fi

REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))

# Make several attempts to deal with slow cluster birth.
return_value=0
attempt=0
# Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
PAUSE_BETWEEN_ITERATIONS_SECONDS=15
MAX_ATTEMPTS=100
# Number of extra iterations that cover CLUSTER_READY_ADDITIONAL_TIME_SECONDS,
# rounded up.
ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1) / PAUSE_BETWEEN_ITERATIONS_SECONDS))

while true; do
  # Pause between iterations of this large outer loop.
  if [[ ${attempt} -gt 0 ]]; then
    sleep "${PAUSE_BETWEEN_ITERATIONS_SECONDS}"
  fi
  attempt=$((attempt + 1))

  # "kubectl get nodes" prints a header line followed by one line per node.
  #
  # Capture the output and gather 2 counts:
  # - Total number of nodes.
  # - Number of "ready" nodes.
  #
  # A failed lookup is not fatal by itself because during cluster bootstrapping
  # for clusters where the master node is registered, the apiserver will become
  # available and then get restarted as the kubelet configures the docker bridge.
  #
  # The exit status is captured through "|| res=$?" so that errexit does not
  # stop the whole script on an error.
  res=0
  node=$(kubectl_retry get nodes) || res=$?
  if [[ "${res}" -ne 0 ]]; then
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_red} Failed to get nodes.${color_norm}"
      exit 1
    else
      continue
    fi
  fi

  # Subtract one in both counts for the header line.
  found=$(($(echo "${node}" | wc -l) - 1))
  ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l) - 1))

  if ((found == EXPECTED_NUM_NODES)) && ((ready == EXPECTED_NUM_NODES)); then
    break
  elif ((found > EXPECTED_NUM_NODES)); then
    if [[ "${KUBE_USE_EXISTING_MASTER:-}" != "true" ]]; then
      echo -e "${color_red}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
    fi
    break
  elif ((ready > EXPECTED_NUM_NODES)); then
    echo -e "${color_red}Found ${ready} ready nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
    break
  else
    if [[ "${REQUIRED_NUM_NODES}" -le "${ready}" ]]; then
      echo -e "${color_green}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm}"
      # Set only the first time the required count is reached; later
      # iterations keep the deadline that was computed then.
      last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}"
    fi
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_yellow}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
      kubectl_retry get nodes
      if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then
        exit 1
      else
        return_value=2
        break
      fi
    else
      echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
    fi
  fi
done
echo "Found ${found} node(s)."
kubectl_retry get nodes

attempt=0
while true; do
  # The template below prints one "<type>:<status>" line per componentstatus,
  # taken from its first condition.
  #
  # Capture the output and gather 2 counts:
  # - Total number of componentstatuses.
  # - Number of "healthy" components.
  #
  # "|| true" keeps errexit from aborting: kubectl may fail, and grep -c exits
  # non-zero when the count is 0.
  cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
  componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
  healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true

  if ((componentstatuses > healthy)) || ((componentstatuses == 0)); then
    if ((attempt < 5)); then
      echo -e "${color_yellow}Cluster not working yet.${color_norm}"
      attempt=$((attempt + 1))
      sleep 30
    else
      echo -e " ${color_yellow}Validate output:${color_norm}"
      kubectl_retry get cs
      echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
      exit 1
    fi
  else
    break
  fi
done

echo "Validate output:"
kubectl_retry get cs || true
if [[ "${return_value}" == "0" ]]; then
  echo -e "${color_green}Cluster validation succeeded${color_norm}"
else
  echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}"
fi

exit "${return_value}"