Skip to content

Commit c9118fa

Browse files
authored
Check for additional node (pressure) conditions (#28)
* Check for additional node conditions * Check for additional node (pressure) conditions
1 parent 64371fc commit c9118fa

File tree

1 file changed

+80
-1
lines changed

1 file changed

+80
-1
lines changed

check_rancher2.sh

Lines changed: 80 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@
4545
# 20210413 1.5.0 Plugin now uses jq instead of jshon, fix cluster error check (#19) #
4646
# 20210504 1.6.0 Add usage performance data on single cluster check, fix project check #
4747
# 20210824 1.6.1 Fix cluster and project not found error (#24) #
48+
# 20211021 1.7.0 Check for additional node (pressure) conditions (#27) #
4849
##########################################################################################
4950
# (Pre-)Define some fixed variables
5051
STATE_OK=0 # define the exit code if status is OK
@@ -53,7 +54,7 @@ STATE_CRITICAL=2 # define the exit code if status is Critical
5354
STATE_UNKNOWN=3 # define the exit code if status is Unknown
5455
export PATH=/usr/local/bin:/usr/bin:/bin:$PATH # Set path
5556
proto=http # Protocol to use, default is http, can be overwritten with -S parameter
56-
version=1.6.1
57+
version=1.7.0
5758

5859
# Check for necessary commands
5960
for cmd in jq curl [
@@ -299,7 +300,12 @@ if [[ -z $clustername ]]; then
299300
declare -a node_names=( $(echo "$api_out_nodes" | jq -r '.data[].nodeName') )
300301
declare -a node_status=( $(echo "$api_out_nodes" | jq -r '.data[].state') )
301302
declare -a node_cluster_member=( $(echo "$api_out_nodes" | jq -r '.data[].clusterId') )
303+
# Collect the 1-based positions (within .data[]) of nodes that report a
# problem condition. jq prints exactly ONE line per node: the status of the
# requested condition type, or the placeholder "Missing" when the node has no
# such condition (or no conditions list at all). awk's FNR is therefore the
# node's position, and "position - 1" indexes node_names/node_cluster_member.
# Do not flatten .data[].conditions[] across nodes here: a single node lacking
# the condition would shift every following index onto the wrong node.
declare -a node_diskpressure=( $(echo "$api_out_nodes" | jq -r '.data[] | ([.conditions[]? | select(.type=="DiskPressure").status][0] // "Missing")' | awk '/True/ {print FNR}' ) )
declare -a node_memorypressure=( $(echo "$api_out_nodes" | jq -r '.data[] | ([.conditions[]? | select(.type=="MemoryPressure").status][0] // "Missing")' | awk '/True/ {print FNR}' ) )
# For the Ready condition the problem state is "False", not "True".
declare -a node_kubeletready=( $(echo "$api_out_nodes" | jq -r '.data[] | ([.conditions[]? | select(.type=="Ready").status][0] // "Missing")' | awk '/False/ {print FNR}' ) )
declare -a node_network=( $(echo "$api_out_nodes" | jq -r '.data[] | ([.conditions[]? | select(.type=="NetworkUnavailable").status][0] // "Missing")' | awk '/True/ {print FNR}' ) )
302307

308+
# Check node status (user controlled)
303309
i=0
304310
for node in ${node_names[*]}
305311
do
@@ -316,6 +322,35 @@ if [[ -z $clustername ]]; then
316322
let i++
317323
done
318324

325+
# Kubernetes-controlled node conditions (pressure, kubelet readiness, network).
# Each node_* array holds 1-based node positions; subtracting one gives the
# index into node_names / node_cluster_member. One nodeerrors entry is added
# per affected node.
if (( ${#node_diskpressure[@]} > 0 )); then
  for position in "${node_diskpressure[@]}"; do
    hostid=$(( position - 1 ))
    nodeerrors+=("${node_names[hostid]} in cluster ${node_cluster_member[hostid]} has Disk Pressure -")
  done
fi

if (( ${#node_memorypressure[@]} > 0 )); then
  for position in "${node_memorypressure[@]}"; do
    hostid=$(( position - 1 ))
    nodeerrors+=("${node_names[hostid]} in cluster ${node_cluster_member[hostid]} has Memory Pressure -")
  done
fi

if (( ${#node_kubeletready[@]} > 0 )); then
  for position in "${node_kubeletready[@]}"; do
    hostid=$(( position - 1 ))
    nodeerrors+=("Kubelet on node ${node_names[hostid]} in cluster ${node_cluster_member[hostid]} is not ready -")
  done
fi

if (( ${#node_network[@]} > 0 )); then
  for position in "${node_network[@]}"; do
    hostid=$(( position - 1 ))
    nodeerrors+=("Network on node ${node_names[hostid]} in cluster ${node_cluster_member[hostid]} is unavailable -")
  done
fi
353+
319354
if [[ ${#nodeerrors[*]} -gt 0 ]]; then
320355
echo "CHECK_RANCHER2 CRITICAL - ${nodeerrors[*]}|'nodes_total'=${#node_names[*]};;;; 'node_errors'=${#nodeerrors[*]};;;; 'node_ignored'=${#nodeignored[*]};;;;"
321356
exit ${STATE_CRITICAL}
@@ -331,6 +366,10 @@ else
331366

332367
# Check status of all nodes in a specific cluster
333368
api_out_nodes=$(curl -s ${selfsigned} -u "${apiuser}:${apipass}" "${proto}://${apihost}/v3/nodes/?clusterId=${clustername}")
369+
# Collect the 1-based positions (within .data[]) of nodes that report a
# problem condition. jq prints exactly ONE line per node: the status of the
# requested condition type, or the placeholder "Missing" when the node has no
# such condition (or no conditions list at all). awk's FNR is therefore the
# node's position, and "position - 1" indexes node_names.
# Do not flatten .data[].conditions[] across nodes here: a single node lacking
# the condition would shift every following index onto the wrong node.
declare -a node_diskpressure=( $(echo "$api_out_nodes" | jq -r '.data[] | ([.conditions[]? | select(.type=="DiskPressure").status][0] // "Missing")' | awk '/True/ {print FNR}' ) )
declare -a node_memorypressure=( $(echo "$api_out_nodes" | jq -r '.data[] | ([.conditions[]? | select(.type=="MemoryPressure").status][0] // "Missing")' | awk '/True/ {print FNR}' ) )
# For the Ready condition the problem state is "False", not "True".
declare -a node_kubeletready=( $(echo "$api_out_nodes" | jq -r '.data[] | ([.conditions[]? | select(.type=="Ready").status][0] // "Missing")' | awk '/False/ {print FNR}' ) )
declare -a node_network=( $(echo "$api_out_nodes" | jq -r '.data[] | ([.conditions[]? | select(.type=="NetworkUnavailable").status][0] // "Missing")' | awk '/True/ {print FNR}' ) )
334373

335374
# Check if that given cluster name exists
336375
if [[ -n $(echo "$api_out_nodes" | grep -i "NotFound") ]]
@@ -340,6 +379,7 @@ else
340379
declare -a node_names=( $(echo "$api_out_nodes" | jq -r '.data[].nodeName') )
341380
declare -a node_status=( $(echo "$api_out_nodes" | jq -r '.data[].state') )
342381

382+
# Check node status (user controlled)
343383
i=0
344384
for node in ${node_names[*]}
345385
do
@@ -356,6 +396,45 @@ else
356396
let i++
357397
done
358398

399+
# Handle node pressure situations and other conditions (Kubernetes controlled)

#######################################
# Append one nodeerrors entry per affected node of the selected cluster.
# Globals:   node_names, clustername (read); nodeerrors (appended)
# Arguments: $1   - text placed in front of the node name (may be empty)
#            $2   - condition text placed after the cluster name
#            $3.. - 1-based node positions (position N is node_names[N-1])
# Returns:   0; with no positions given nothing is appended
#######################################
add_node_condition_errors() {
  local prefix=$1 suffix=$2
  shift 2
  local position hostid
  for position in "$@"; do
    hostid=$(( position - 1 ))
    # The cluster is named via clustername: it is the clusterId this branch
    # queried for. NOTE(review): node_cluster_member is only seen being filled
    # in the all-clusters branch, so it would expand to nothing here - confirm.
    nodeerrors+=("${prefix}${node_names[hostid]} in cluster ${clustername} ${suffix} -")
  done
}

add_node_condition_errors "" "has Disk Pressure" "${node_diskpressure[@]}"
add_node_condition_errors "" "has Memory Pressure" "${node_memorypressure[@]}"
add_node_condition_errors "Kubelet on node " "is not ready" "${node_kubeletready[@]}"
add_node_condition_errors "Network on node " "is unavailable" "${node_network[@]}"
427+
428+
# Print the plugin result line (with performance data) for the nodes of the
# selected cluster and terminate. Each of the three outcomes below exits, so
# no statement placed after this block is ever reached.

# Any collected node error makes the whole check CRITICAL.
if (( ${#nodeerrors[@]} > 0 )); then
  echo "CHECK_RANCHER2 CRITICAL - ${nodeerrors[*]}|'nodes_total'=${#node_names[*]};;;; 'node_errors'=${#nodeerrors[*]};;;; 'node_ignored'=${#nodeignored[*]};;;;"
  exit ${STATE_CRITICAL}
fi

# No errors, but some nodes were ignored: still OK, list them as info.
if (( ${#nodeignored[@]} > 0 )); then
  echo "CHECK_RANCHER2 OK - All nodes OK - Info: ${nodeignored[*]}|'nodes_total'=${#node_names[*]};;;; 'node_errors'=${#nodeerrors[*]};;;; 'node_ignored'=${#nodeignored[*]};;;;"
  exit ${STATE_OK}
fi

# Nothing to report at all.
echo "CHECK_RANCHER2 OK - All ${#node_names[*]} nodes are active|'nodes_total'=${#node_names[*]};;;; 'node_errors'=${#nodeerrors[*]};;;; 'node_ignored'=${#nodeignored[*]};;;;"
exit ${STATE_OK}
359438
if [[ ${#nodeerrors[*]} -gt 0 ]]; then
360439
echo "CHECK_RANCHER2 CRITICAL - ${nodeerrors[*]}|'nodes_total'=${#node_names[*]};;;; 'node_errors'=${#nodeerrors[*]};;;; 'node_ignored'=${#nodeignored[*]};;;;"
361440
exit ${STATE_CRITICAL}

0 commit comments

Comments
 (0)