Hi all, I would like to monitor my SQL cluster better. Currently it looks like I am monitoring node2 with a ping and the SQL services on node1. I would like to check whether the server moves to node2 on a failover, or whether there is some other problem in the cluster.
Does anyone have a good tip or ideas for me?
Best regards!
rivad
(Dominik)
June 7, 2022, 2:48pm
2
I use three host objects: one for each node and one for the cluster itself. The service is checked on both nodes, and on the cluster host I run a service that uses the Icinga DSL to evaluate the status of that service on both nodes.
// Host object that represents the cluster as a whole. The cluster-level
// check commands look up the member hosts through the "cluster_nodes"
// custom variable defined here.
object Host "cluster" {
  import "-tpl-host-cluster-dummy"

  address = "10.5.68.64"

  // Names of the Host objects that form the cluster.
  // The key must be exactly "cluster_nodes": the check commands resolve the
  // macro "$cluster_nodes$", which would not match a key with a leading
  // underscore ("_cluster_nodes").
  vars.cluster_nodes = [ "node1", "node2" ]
}
// Aggregated host check for a cluster.
//
// Reads the list of member host names from the "$cluster_nodes$" macro and
// passes two computed arguments to the dummy plugin:
//   --message  one line per node with that node's last host check output
//   --state    "ok" when at least as many nodes are up as are down,
//              otherwise "crit"
//
// Nothing is executed against the nodes themselves; the functions only read
// the state Icinga already holds for each node's Host object.
object CheckCommand "cmd-cluster-nodes" {
  import "plugin-check-command"

  command = [ "/usr/lib64/nagios/plugins/dummy" ]
  timeout = 10s

  arguments += {
    "--message" = {{
      var nodes = macro("$cluster_nodes$")

      // An unset or non-array macro must not abort the function in the loop.
      if (typeof(nodes) != Array) {
        nodes = []
      }

      var output = "Cluster hosts:\n"

      if (nodes.len() == 0) {
        output += "no cluster nodes configured (cluster_nodes is empty or not set)\n"
      }

      for (node in nodes) {
        var host_obj = get_host(node)

        // get_host() returns null for an unknown host name.
        if (host_obj == null) {
          output += node + ": host object not found\n"
          continue
        }

        // last_check_result is null until the host has been checked once.
        var result = host_obj.last_check_result
        if (result == null) {
          output += node + ": no check result yet\n"
        } else {
          output += node + ": " + result.output + "\n"
        }
      }

      return output
    }}

    "--state" = {{
      var nodes = macro("$cluster_nodes$")

      // Without a node list nothing can be verified; do not report "ok".
      if (typeof(nodes) != Array || nodes.len() == 0) {
        return "crit"
      }

      var up_count = 0
      var down_count = 0

      for (node in nodes) {
        var host_obj = get_host(node)

        // A missing host object is counted as down.
        if (host_obj == null || host_obj.state > 0) {
          down_count += 1
        } else {
          up_count += 1
        }
      }

      if (up_count >= down_count) {
        return "ok" // at least as many nodes up as down -> UP
      } else {
        return "crit" // more nodes down than up
      }
    }}
  }
}
The command path /usr/lib64/nagios/plugins/dummy is only needed because of the Icinga Director.
For the service check I use:
// "Exactly one" service check for a cluster.
//
// Macros read:
//   $cluster_nodes$             array of member host names
//   $cluster-only-one-service$  name of the service to look up on every node
//
// A node counts as "alive" when that service exists on it and its state is 0.
// The check is OK only when exactly one node is alive; zero or several alive
// nodes are reported as CRITICAL.
//
// Arguments passed to the dummy plugin:
//   --message  status prefix, per-node plugin output, and a perfdata value
//              with the number of alive nodes
//   --state    "ok" or "crit"
object CheckCommand "cmd-only-one" {
  import "plugin-check-command"

  command = [ "/usr/lib64/nagios/plugins/dummy" ]
  timeout = 10s

  arguments += {
    "--message" = {
      description = "Message"
      required = true
      value = {{
        var nodes = macro("$cluster_nodes$")
        var service_name = macro("$cluster-only-one-service$")

        // An unset or non-array macro must not abort the function in the loop.
        if (typeof(nodes) != Array) {
          nodes = []
        }

        var up_count = 0
        var details = ""

        for (node in nodes) {
          var svc = get_service(node, service_name)

          // get_service() returns null when the host or service is unknown.
          if (svc == null) {
            details += node + ": " + service_name + ": service object not found "
            continue
          }

          if (svc.state == 0) {
            up_count += 1
          }

          // last_check_result is null until the service has been checked once.
          var result = svc.last_check_result
          if (result == null) {
            details += node + ": " + service_name + ": no check result yet "
          } else {
            details += node + ": " + service_name + ": " + result.output + " "
          }
        }

        var output_status = "CRITICAL: "
        if (up_count == 1) {
          output_status = "OK: "
        }

        // The perfdata label is single-quoted so that service names that
        // contain spaces still form a valid label.
        var perfdata = " | 'count_of_alive_" + service_name + "'=" + up_count + ";1:1;1:1"

        return output_status + details + perfdata
      }}
    }

    "--state" = {
      description = "State"
      value = {{
        var nodes = macro("$cluster_nodes$")
        var service_name = macro("$cluster-only-one-service$")

        if (typeof(nodes) != Array) {
          nodes = []
        }

        var up_count = 0

        for (node in nodes) {
          var svc = get_service(node, service_name)

          // A missing service object is not counted as alive.
          if (svc != null && svc.state == 0) {
            up_count += 1
          }
        }

        if (up_count == 1) {
          return "ok" // exactly one node runs the service
        } else {
          return "crit" // service runs on no node or on more than one node
        }
      }}
    }
  }
}
Alerts on the nodes are disabled; only the cluster host sends notifications. It is also the only one that shows up on our dashboard, because I filter out objects with disabled notifications.
1 Like