Monitoring MS SQL Cluster

ShowMeYourSkil · June 6, 2022, 10:39am

Hi all, I would like to monitor my sql cluster better. Currently it looks like I am monitoring node2 with a ping and the SQL services on node1. I would like to check if the server goes to node2 on failover or if there is some other problem in the cluster.

Does anyone have a good tip or ideas for me?

Best regards!

rivad · June 7, 2022, 2:48pm

I use 3 hosts: the two nodes plus a separate "cluster" host. The service is checked on both nodes, and a service on the cluster host uses the Icinga DSL to evaluate the status of that service on both nodes.

// Virtual host representing the cluster as a whole; its check results are
// derived from the member nodes listed below.
object Host "cluster" {
    import "-tpl-host-cluster-dummy"
    address = "10.5.68.64"

    // Names of the Host objects that make up this cluster.
    vars["_cluster_nodes"] = [ "node1", "node2" ]

    // The check commands read the node list through the runtime macro
    // $cluster_nodes$, which resolves the custom variable "cluster_nodes"
    // (no leading underscore). Expose the same list under that name so the
    // macro resolves; the original variable is kept for anything that
    // still relies on it.
    vars.cluster_nodes = vars["_cluster_nodes"]
}

// Aggregated host check for a cluster.
//
// Reads the list of member host names from the custom variable
// "cluster_nodes" (via the $cluster_nodes$ macro) and passes two computed
// arguments to the dummy plugin:
//   --message  one line per node with that node's last check output
//   --state    "ok" when at least as many nodes are up as down, else "crit"
object CheckCommand "cmd-cluster-nodes" {
    import "plugin-check-command"
    command = [ "/usr/lib64/nagios/plugins/dummy" ]
    timeout = 10s
    arguments += {
        "--message" = {{
            var cluster_nodes = macro("$cluster_nodes$")

            // Without a proper node list the loop below would fail at
            // runtime; report the configuration problem instead.
            if (typeof(cluster_nodes) != Array) {
                return "Cluster hosts: custom variable 'cluster_nodes' is not set or is not an array"
            }

            var output = "Cluster hosts:\n"

            for (node in cluster_nodes) {
                var host = get_host(node)

                if (!host) {
                    // get_host() yields null for unknown host names.
                    output += node + ": host object not found\n"
                } else if (!host.last_check_result) {
                    // No check has been executed for this host yet.
                    output += node + ": no check result yet\n"
                } else {
                    output += node + ": " + host.last_check_result.output + "\n"
                }
            }

            return output
        }}
        "--state" = {{
            var cluster_nodes = macro("$cluster_nodes$")

            // A missing or malformed node list is a configuration error.
            if (typeof(cluster_nodes) != Array) {
                return "crit"
            }

            var up_count = 0
            var down_count = 0

            for (node in cluster_nodes) {
                var host = get_host(node)

                // A node that does not exist as a Host object is counted as
                // down, as is any host whose state is not 0 (UP).
                if (!host) {
                    down_count += 1
                } else if (host.state > 0) {
                    down_count += 1
                } else {
                    up_count += 1
                }
            }

            if (up_count >= down_count) {
                return "ok" // at least as many nodes up as down -> cluster is UP
            } else {
                return "crit" // more nodes down than up
            }
        }}
    }
}

The /usr/lib64/nagios/plugins/dummy plugin is only needed because of the Icinga Director.

For the service check I use:

// "Exactly one" service check for a cluster.
//
// Reads the member host names from the custom variable "cluster_nodes" and
// the service name from "cluster-only-one-service", then looks at that
// service on every node. The result is OK only when the service is in state
// OK (0) on exactly one node; any other count is CRITICAL.
//
// Arguments passed to the dummy plugin:
//   --message  status prefix, per-node last check output and a perfdata
//              value with the number of nodes on which the service is OK
//   --state    "ok" or "crit" according to the rule above
object CheckCommand "cmd-only-one" {
    import "plugin-check-command"
    command = [ "/usr/lib64/nagios/plugins/dummy" ]
    timeout = 10s
    arguments += {
        "--message" = {
            description = "Message"
            required = true
            value = {{
                var cluster_nodes = macro("$cluster_nodes$")
                var only_one_service_name = macro("$cluster-only-one-service$")

                // Without a proper node list the loops below would fail at
                // runtime; report the configuration problem instead.
                if (typeof(cluster_nodes) != Array) {
                    return "CRITICAL: custom variable 'cluster_nodes' is not set or is not an array"
                }

                var up_count = 0
                var details = ""

                for (node in cluster_nodes) {
                    var service = get_service(node, only_one_service_name)
                    var node_output = ""

                    if (!service) {
                        // get_service() yields null when the host or the
                        // service does not exist; such a node is not "up".
                        node_output = "service object not found"
                    } else {
                        if (service.state == 0) {
                            up_count += 1
                        }

                        if (service.last_check_result) {
                            node_output = service.last_check_result.output
                        } else {
                            // No check has been executed for this service yet.
                            node_output = "no check result yet"
                        }
                    }

                    details += node + ": " + only_one_service_name + ": " + node_output + " "
                }

                var output_status = ""
                if (up_count == 1) {
                    output_status = "OK: "
                } else {
                    output_status = "CRITICAL: "
                }

                // The perfdata label is single-quoted so that service names
                // containing spaces still produce valid performance data.
                var perfdata = " | 'count_of_alive_" + only_one_service_name + "'=" + up_count + ";1:1;1:1"

                return output_status + details + perfdata
            }}
        }
        "--state" = {
            description = "State"
            value = {{
                var cluster_nodes = macro("$cluster_nodes$")
                var only_one_service_name = macro("$cluster-only-one-service$")

                // A missing or malformed node list is a configuration error.
                if (typeof(cluster_nodes) != Array) {
                    return "crit"
                }

                var up_count = 0

                for (node in cluster_nodes) {
                    var service = get_service(node, only_one_service_name)

                    // Only an existing service in state 0 (OK) counts as up.
                    if (service && service.state == 0) {
                        up_count += 1
                    }
                }

                if (up_count == 1) {
                    return "ok" // service is OK on exactly one node
                } else {
                    return "crit" // service is OK on no node or on more than one
                }
            }}
        }
    }
}

Alerts on the nodes are disabled; only the cluster host sends notifications. It is also the only one that shows up on our dashboard, because I filter out objects with disabled notifications.