45
45
# 20210413 1.5.0 Plugin now uses jq instead of jshon, fix cluster error check (#19) #
46
46
# 20210504 1.6.0 Add usage performance data on single cluster check, fix project check #
47
47
# 20210824 1.6.1 Fix cluster and project not found error (#24) #
48
+ # 20211021 1.7.0 Check for additional node (pressure) conditions (#27) #
48
49
# #########################################################################################
49
50
# (Pre-)Define some fixed variables
50
51
STATE_OK=0 # define the exit code if status is OK
@@ -53,7 +54,7 @@ STATE_CRITICAL=2 # define the exit code if status is Critical
53
54
STATE_UNKNOWN=3 # define the exit code if status is Unknown
54
55
export PATH=/usr/local/bin:/usr/bin:/bin:$PATH # Set path
55
56
proto=http # Protocol to use, default is http, can be overwritten with -S parameter
56
- version=1.6.1
57
+ version=1.7.0
57
58
58
59
# Check for necessary commands
59
60
for cmd in jq curl [
@@ -299,7 +300,12 @@ if [[ -z $clustername ]]; then
299
300
# Per-node lookup tables. All arrays are built from the same API response, so
# element i of every array belongs to the same node. Values are single words
# (names, states, ids), which is why plain word splitting is sufficient here.
declare -a node_names=( $(echo "$api_out_nodes" | jq -r '.data[].nodeName') )
declare -a node_status=( $(echo "$api_out_nodes" | jq -r '.data[].state') )
declare -a node_cluster_member=( $(echo "$api_out_nodes" | jq -r '.data[].clusterId') )

# Kubernetes-reported node conditions. The filter prints exactly ONE status
# line per node ("Unknown" when the node has no such condition or no
# conditions list at all), so awk's FNR is always the 1-based position of the
# node in node_names. Without that guarantee a single node lacking a condition
# would shift every following index onto the wrong host.
node_condition_filter='.data[] | [.conditions[]? | select(.type == $type) | .status][0] // "Unknown"'
declare -a node_diskpressure=( $(echo "$api_out_nodes" | jq -r --arg type DiskPressure "$node_condition_filter" | awk '/True/ {print FNR}') )
declare -a node_memorypressure=( $(echo "$api_out_nodes" | jq -r --arg type MemoryPressure "$node_condition_filter" | awk '/True/ {print FNR}') )
# "Ready" is the only condition where False (not True) signals a problem.
declare -a node_kubeletready=( $(echo "$api_out_nodes" | jq -r --arg type Ready "$node_condition_filter" | awk '/False/ {print FNR}') )
declare -a node_network=( $(echo "$api_out_nodes" | jq -r --arg type NetworkUnavailable "$node_condition_filter" | awk '/True/ {print FNR}') )
302
307
308
+ # Check node status (user controlled)
303
309
i=0
304
310
for node in ${node_names[*]}
305
311
do
@@ -316,6 +322,35 @@ if [[ -z $clustername ]]; then
316
322
let i++
317
323
done
318
324
325
# Handle node pressure situations and other conditions (Kubernetes controlled)
#
# add_condition_errors PREFIX SUFFIX [INDEX...]
#   Appends one message per INDEX to the global nodeerrors array.
#   PREFIX  text placed directly in front of the node name (may be empty)
#   SUFFIX  description of the problem, placed after the cluster id
#   INDEX   1-based node position as produced by awk's FNR when the condition
#           arrays were built; it is shifted by one to address the 0-based
#           node_names / node_cluster_member arrays.
#   Calling it without any INDEX is a no-op, so no length check is needed.
add_condition_errors() {
  local prefix=$1 suffix=$2 n hostid
  shift 2
  for n in "$@"; do
    hostid=$(( n - 1 ))
    nodeerrors+=("${prefix}${node_names[$hostid]} in cluster ${node_cluster_member[$hostid]} ${suffix} -")
  done
}

add_condition_errors "" "has Disk Pressure" "${node_diskpressure[@]}"
add_condition_errors "" "has Memory Pressure" "${node_memorypressure[@]}"
add_condition_errors "Kubelet on node " "is not ready" "${node_kubeletready[@]}"
add_condition_errors "Network on node " "is unavailable" "${node_network[@]}"
353
+
319
354
if [[ ${# nodeerrors[*]} -gt 0 ]]; then
320
355
echo " CHECK_RANCHER2 CRITICAL - ${nodeerrors[*]} |'nodes_total'=${# node_names[*]} ;;;; 'node_errors'=${# nodeerrors[*]} ;;;; 'node_ignored'=${# nodeignored[*]} ;;;;"
321
356
exit ${STATE_CRITICAL}
@@ -331,6 +366,10 @@ else
331
366
332
367
# Check status of all nodes in a specific cluster
# ${selfsigned} is intentionally unquoted: when it is empty it must vanish
# instead of being passed to curl as an empty argument.
api_out_nodes=$(curl -s ${selfsigned} -u "${apiuser}:${apipass}" "${proto}://${apihost}/v3/nodes/?clusterId=${clustername}")

# Kubernetes-reported node conditions. The filter prints exactly ONE status
# line per node ("Unknown" when the node has no such condition or no
# conditions list at all), so awk's FNR is always the 1-based position of the
# node in the API node list. Without that guarantee a single node lacking a
# condition would shift every following index onto the wrong host.
node_condition_filter='.data[] | [.conditions[]? | select(.type == $type) | .status][0] // "Unknown"'
declare -a node_diskpressure=( $(echo "$api_out_nodes" | jq -r --arg type DiskPressure "$node_condition_filter" | awk '/True/ {print FNR}') )
declare -a node_memorypressure=( $(echo "$api_out_nodes" | jq -r --arg type MemoryPressure "$node_condition_filter" | awk '/True/ {print FNR}') )
# "Ready" is the only condition where False (not True) signals a problem.
declare -a node_kubeletready=( $(echo "$api_out_nodes" | jq -r --arg type Ready "$node_condition_filter" | awk '/False/ {print FNR}') )
declare -a node_network=( $(echo "$api_out_nodes" | jq -r --arg type NetworkUnavailable "$node_condition_filter" | awk '/True/ {print FNR}') )
334
373
335
374
# Check if that given cluster name exists
336
375
if [[ -n $( echo " $api_out_nodes " | grep -i " NotFound" ) ]]
340
379
# Per-node lookup tables; element i of every array belongs to the same node.
declare -a node_names=( $(echo "$api_out_nodes" | jq -r '.data[].nodeName') )
declare -a node_status=( $(echo "$api_out_nodes" | jq -r '.data[].state') )
# The condition messages in this branch print the cluster of each node, so the
# cluster id has to be collected here as well (same extraction as in the
# all-clusters branch); otherwise the messages contain an empty cluster name.
declare -a node_cluster_member=( $(echo "$api_out_nodes" | jq -r '.data[].clusterId') )
342
381
382
+ # Check node status (user controlled)
343
383
i=0
344
384
for node in ${node_names[*]}
345
385
do
@@ -356,6 +396,45 @@ else
356
396
let i++
357
397
done
358
398
399
# Handle node pressure situations and other conditions (Kubernetes controlled)
#
# add_condition_errors PREFIX SUFFIX [INDEX...]
#   Appends one message per INDEX to the global nodeerrors array.
#   PREFIX  text placed directly in front of the node name (may be empty)
#   SUFFIX  description of the problem, placed after the cluster name
#   INDEX   1-based node position as produced by awk's FNR when the condition
#           arrays were built; it is shifted by one to address the 0-based
#           node_names array.
#   The cluster shown is the node's entry in node_cluster_member when that
#   array is populated; otherwise it falls back to the cluster requested by
#   the user (${clustername}), so the message never shows an empty cluster.
#   Calling it without any INDEX is a no-op, so no length check is needed.
add_condition_errors() {
  local prefix=$1 suffix=$2 n hostid
  shift 2
  for n in "$@"; do
    hostid=$(( n - 1 ))
    nodeerrors+=("${prefix}${node_names[$hostid]} in cluster ${node_cluster_member[$hostid]:-${clustername}} ${suffix} -")
  done
}

add_condition_errors "" "has Disk Pressure" "${node_diskpressure[@]}"
add_condition_errors "" "has Memory Pressure" "${node_memorypressure[@]}"
add_condition_errors "Kubelet on node " "is not ready" "${node_kubeletready[@]}"
add_condition_errors "Network on node " "is unavailable" "${node_network[@]}"
427
+
428
# Report the overall node result for the selected cluster and terminate.
# The text after '|' is performance data; a value and its trailing ';;;;'
# must not be separated by blanks, otherwise the monitoring core cannot parse
# the metric.
# NOTE(review): every path below exits, so the identical
# "if [[ ${#nodeerrors[*]} -gt 0 ]]" check that follows this block can never
# be reached - one of the two result blocks should be removed.
if [[ ${#nodeerrors[*]} -gt 0 ]]; then
  # At least one node is in a bad state or reports a bad condition.
  echo "CHECK_RANCHER2 CRITICAL - ${nodeerrors[*]}|'nodes_total'=${#node_names[*]};;;; 'node_errors'=${#nodeerrors[*]};;;; 'node_ignored'=${#nodeignored[*]};;;;"
  exit ${STATE_CRITICAL}
elif [[ ${#nodeignored[*]} -gt 0 ]]; then
  # No errors, but some nodes were skipped; mention them for information.
  echo "CHECK_RANCHER2 OK - All nodes OK - Info: ${nodeignored[*]}|'nodes_total'=${#node_names[*]};;;; 'node_errors'=${#nodeerrors[*]};;;; 'node_ignored'=${#nodeignored[*]};;;;"
  exit ${STATE_OK}
else
  echo "CHECK_RANCHER2 OK - All ${#node_names[*]} nodes are active|'nodes_total'=${#node_names[*]};;;; 'node_errors'=${#nodeerrors[*]};;;; 'node_ignored'=${#nodeignored[*]};;;;"
  exit ${STATE_OK}
fi
359
438
if [[ ${# nodeerrors[*]} -gt 0 ]]; then
360
439
echo " CHECK_RANCHER2 CRITICAL - ${nodeerrors[*]} |'nodes_total'=${# node_names[*]} ;;;; 'node_errors'=${# nodeerrors[*]} ;;;; 'node_ignored'=${# nodeignored[*]} ;;;;"
361
440
exit ${STATE_CRITICAL}
0 commit comments