I use a “virtual” Icinga host object that carries the HA IP.
This host has the 116_cluster_nodes custom variable set (the list of HA node host names)
and a service with the same name as on the HA nodes, which uses the following DSL code as its check command:
/*
 * CheckCommand "116-cmd-only-one"
 *
 * Cluster check: reports OK only when exactly one of the nodes listed in the
 * "116_cluster_nodes" macro has the service named by the
 * "116-cluster-only-one-service" macro in state 0; any other count is
 * reported as CRITICAL.
 *
 * Both values handed to the dummy plugin are computed at runtime:
 *   --message  status prefix, per-node last check output and a perfdata
 *              counter of nodes whose service is in state 0
 *   --state    "ok" or "crit" (presumably mapped to an exit code by the
 *              dummy plugin; confirm against the plugin itself)
 *
 * The two lambdas are evaluated independently, so each one resolves the
 * macros and counts the nodes on its own; keep their logic in sync.
 */
object CheckCommand "116-cmd-only-one" {
  import "plugin-check-command"

  command = [ "/usr/lib64/nagios/plugins/dummy" ]
  timeout = 10s

  arguments += {
    "--message" = {
      description = "Message"
      required = true
      value = {{
        var cluster_nodes = macro("$116_cluster_nodes$")
        var only_one_service_name = macro("$116-cluster-only-one-service$")

        // Without an array of node names there is nothing to iterate over;
        // say so explicitly instead of failing with a type error in the loop.
        if (typeof(cluster_nodes) != Array) {
          return "CRITICAL: 116_cluster_nodes is not set or is not an array"
        }

        var up_count = 0
        var details = ""

        // Single pass: count healthy nodes and collect each node's output.
        for (node in cluster_nodes) {
          var svc = get_service(node, only_one_service_name)
          var node_output = ""

          if (!svc) {
            // get_service() yields null when the node has no such service;
            // the node is then simply not counted as up.
            node_output = "service not found"
          } else {
            if (svc.state == 0) {
              up_count += 1
            }

            // last_check_result is null until the service was checked once.
            if (svc.last_check_result) {
              node_output = svc.last_check_result.output
            } else {
              node_output = "no check result yet"
            }
          }

          details += node + ": " + only_one_service_name + ": " + node_output + " "
        }

        var output_status = "CRITICAL: "
        if (up_count == 1) {
          output_status = "OK: "
        }

        // Perfdata: number of nodes with the service in state 0; the warn and
        // crit ranges are both "1:1".
        return output_status + details + " | count_of_alive_" + only_one_service_name + "=" + up_count + ";1:1;1:1"
      }}
    }
    "--state" = {
      description = "State"
      value = {{
        var cluster_nodes = macro("$116_cluster_nodes$")
        var only_one_service_name = macro("$116-cluster-only-one-service$")

        // Same guard as in --message so that both arguments agree.
        if (typeof(cluster_nodes) != Array) {
          return "crit"
        }

        var up_count = 0

        for (node in cluster_nodes) {
          var svc = get_service(node, only_one_service_name)

          // A missing service counts as "not up" instead of raising an error.
          if (svc && svc.state == 0) {
            up_count += 1
          }
        }

        if (up_count == 1) {
          return "ok" // exactly one node runs the service
        }

        return "crit" // none or more than one node runs the service
      }}
    }
  }
}