Calculating a state over multiple services

midtnight15 · September 4, 2022, 4:17pm

Hi, I am new to Icinga and am trying out this code:
Calculating a state over multiple services and from another post Calculating a state over multiple services - Post

I would like to know the CPU usage of multiple machines and have a graph built from the output of combined_state. I got the service to work in Icinga Web 2, but the state of combined_state for all the hosts never changes, even though the check_load state changes in a separate check. However, if I change the

// Applies a service named "http" to every host that sets vars.check_http.
// It runs the "load" check command (string literals need straight quotes).
apply Service "http" {
  check_command = "load"
  assign where host.vars.check_http
}

to:

// Applies a service named "http" to every host that sets vars.check_http.
// It runs the "random" check command (string literals need straight quotes).
apply Service "http" {
  check_command = "random"
  assign where host.vars.check_http
}

Then the state in combined_state change randomly.

moreamazingnick · September 5, 2022, 7:28am

In the tutorial they set up 20 dummy hosts and something like mock services. That's why they use the check command "random": it creates a random result between 0…3, which is why your results change randomly.

The service_state is a value between 0…3,
0 => OK
1 => WARNING
2 => CRITICAL
3 => UNKNOWN

But you want to know, or graph, the “CPU use”. I would recommend Grafana for creating these graphs.

midtnight15 · September 5, 2022, 10:52am

Thanks for the input.

I have changed the code to use real hosts and to call “check_load” instead of “random” for a CPU check. If I look at an individual host I get a graph for the CPU check, but not for combined_state, and that state is still just OK for all hosts.

Is there any good guide on how to get a graph from the service_state of combined_state?

moreamazingnick · September 6, 2022, 1:30pm

Combined graphs can be realised with Grafana and a performance data database.

rivad · September 6, 2022, 3:29pm

I use the following code, but because of the Director I need to use a dummy check plugin to reflect the calculated data back into Icinga 2:

Plugin: monitoring-plugins/check-plugins/dummy at main · Linuxfabrik/monitoring-plugins · GitHub

Value Type: Icinga DSL
– message:

function get_perfdata(service){
	  // Returns the value part of the service's first performance data entry.
	  // NOTE(review): assumes the entry is a string shaped like
	  // "label=value[c];warn;crit;..." - confirm for the monitored plugins.
	  var first_entry = service.last_check_result.performance_data[0]
	  // Everything that follows the label's "=".
	  var after_label = first_entry.split("=")[1]
	  // split() treats each character of its argument as a delimiter, so the
	  // value ends at the first "c" (counter unit) or ";" (threshold separator).
	  return after_label.split("c;")[0]
	}
	// Builds the plugin output for the comparison check: a status line followed
	// by one perfdata entry per compared service. The first collected value is
	// the reference; every other value must lie within +/- tolerance of it.
	var output = ""
	var hosts_to_compare = macro("$116_comparison_hosts$")
	var service_pattern = macro("$116_comparison_service_pattern$")
	var tolerance = macro("$116_comparison_tolerance$")
	var values = []
	var services = []

	for (host in hosts_to_compare) {
		var service_names = get_services(host).map(s => s.name)
		for (service_name in service_names) {
			if (match(service_pattern, service_name)) {
				// Never compare the comparison services with themselves.
				if (match("Compare Services*", service_name)) {
					continue
				}
				services.add(host + "!" + service_name)
				// Declared with var so the lookup stays local and does not
				// overwrite a "service" variable from an outer scope.
				var service = get_service(host, service_name)
				var check_result = service.last_check_result
				// A service that has not been checked yet has no check result;
				// report it as UNKNOWN instead of failing on a null access.
				if (check_result == null || len(check_result.performance_data) < 1) {
					return "[UNKNOWN] '" + host + "!" + service_name + "' has no performance data value in last_check_result!"
				}
				values.add(get_perfdata(service))
			}
		}
	}
	if (len(values) < 2) {
		return "[UNKNOWN] less then 2 values collected!"  
	}

	// Loop-invariant numbers: reference value and the allowed window around it.
	var reference = number(values[0])
	var allowed = number(tolerance)
	var value_min = reference - allowed
	var value_max = reference + allowed

	output = "[OK] all values in allowed tolerance |"

	// If several values are out of range, the last offender is the one reported.
	for (value in values) {
		if (number(value) + allowed < reference || number(value) - allowed > reference) {
			output = "[CRITICAL] value " + value + " not in allowed tolerance " + tolerance + " from first value " + values[0] + " |"
		}
	}

	// services[i] and values[i] were added together, so the values collected
	// above are reused instead of fetching every service a second time.
	for (i in range(len(services))) {
		output += " '" + services[i] + "'=" + values[i] + ";;" + value_min + ":" + value_max + ";"
	}

	return output

Value Type: Icinga DSL
– state:

function get_perfdata(service){
	  // Returns the value part of the service's first performance data entry.
	  // NOTE(review): assumes the entry is a string shaped like
	  // "label=value[c];warn;crit;..." - confirm for the monitored plugins.
	  var entry = service.last_check_result.performance_data[0]
	  // Drop the label: keep what follows the "=".
	  var value_and_thresholds = entry.split("=")[1]
	  // split() uses every character of its argument as a delimiter, so this
	  // cuts at the first "c" (counter unit) or ";" (threshold separator).
	  return value_and_thresholds.split("c;")[0]
	}
	// Computes the state keyword for the comparison check: "unk" when data is
	// missing, "crit" when any value is outside +/- tolerance of the first
	// collected value, "ok" otherwise.
	var hosts_to_compare = macro("$116_comparison_hosts$")
	var service_pattern = macro("$116_comparison_service_pattern$")
	var tolerance = macro("$116_comparison_tolerance$")
	var values = []

	for (host in hosts_to_compare) {
		var service_names = get_services(host).map(s => s.name)
		for (service_name in service_names) {
			if (match(service_pattern, service_name)) {
				// Never compare the comparison services with themselves.
				if (match("Compare Services*", service_name)) {
					continue
				}
				var service = get_service(host, service_name)
				var check_result = service.last_check_result
				// A service that has not been checked yet has no check result;
				// treat it as unknown instead of failing on a null access.
				if (check_result == null || len(check_result.performance_data) < 1) {
					return "unk"
				}
				values.add(get_perfdata(service))
			}
		}
	}
	if (len(values) < 2) {
		return "unk"  
	}

	// Loop-invariant numbers: reference value and allowed deviation.
	var reference = number(values[0])
	var allowed = number(tolerance)

	for (value in values) {
		if (number(value) + allowed < reference || number(value) - allowed > reference) {
			return "crit" //at least one service is not OK
		}
	}

	return "ok" //all is well