Notifications despite Dependencies

I’ve been building out an Icinga2/IcingaWeb2 system and am impressed by the capabilities and configuration. However (the fly in the ointment) I’m seeing a lot of both service critical and host down notifications that (I think) should be blocked by dependencies.

My setup is fairly straightforward. A few servers depend on their
upstream router, which lies between the servers and a single Icinga2
instance. Each runs a “Nagios NRPE” (tcp-nrpe) Service that has an
implicit dependency on its host. In turn, each host has many NRPE
services that have an explicit dependency on “Nagios NRPE”, as well as
their implicit host dependency.

The router host has only ping monitoring. When the router no longer
pings, I see two things quickly afterwards. “Nagios NRPE” notifies,
then the router notifies again, within a minute. I may or may not see
a few random services on the servers notify in that same timeframe.

Trying to follow best practices, I have the router on a shorter check and
retry time than the downstream servers.

I tried to force dependencies by making the implicit host dependencies explicit,
only to get 100% duplicate dependency error messages when checking config.

As a result of this set of behaviors, Icinga notifies at roughly 3
times the rate of the older Nagios system.

I’ll summarize my configs in a moment. Thank you for taking a look.

// Upstream router host. The "router-internal" dependency rule names this host
// as the parent (parent_host_name) and excludes it from being a child.
object Host "abcd-router" {
  import "corp-vpn-endpoint"
  address = "10.8.148.1"
  // NOTE(review): "vars..facility_code" (double dot) does not look like a valid
  // attribute path; the poster describes it as a copy-and-paste typo that is not
  // in the live config -- confirm the real key name.
  vars..facility_code = "abcd"
  vars.hostgroups = [ "abcd" ]
}


// Host-to-host dependency rule: every host whose facility code is "abcd",
// except the router itself, gets "abcd-router" as its parent host, with
// disable_notifications set on the dependency.
apply Dependency "router-internal" to Host {
  parent_host_name = "abcd-router"
  disable_notifications = true

  // NOTE(review): "host.vars..facility_code" (double dot) does not look like a
  // valid attribute path; the poster describes it as a copy-and-paste typo --
  // confirm the real key name and that it matches the key set on the hosts.
  assign where host.vars..facility_code == "abcd"
  // The parent host must not depend on itself.
  ignore where  host.name == "abcd-router"
}


// Server host shown as "abcd-ras2"; imports the "corp-linux-host" template,
// which is not included in this post.
object Host "abcd-ras2.site.getwellnetwork.com" {
  import "corp-linux-host"
  address = "10.8.148.15"
  vars.inpatient.facility_code = "abcd"
  
  vars.hostgroups = [ "abcd",  ]
  display_name = "abcd-ras2"
  // NOTE(review): "vars..facility_code" (double dot) does not look like a valid
  // attribute path; the poster describes it as a copy-and-paste typo -- confirm
  // the real key name. The "router-internal" rule filters on this value.
  vars..facility_code = "abcd"
}

// Server host shown as "abcd-ras1"; imports the "corp-linux-host" template,
// which is not included in this post.
object Host "abcd-ras1.site.getwellnetwork.com" {
  import "corp-linux-host"
  address = "10.8.148.14"
  vars.inpatient.facility_code = "abcd"
  
  vars.hostgroups = [ "abcd",  ]
  display_name = "abcd-ras1"
  // NOTE(review): "vars..facility_code" (double dot) does not look like a valid
  // attribute path; the poster describes it as a copy-and-paste typo -- confirm
  // the real key name. The "router-internal" rule filters on this value.
  vars..facility_code = "abcd"
}


/*
 * Dependency "nrpe": makes each NRPE-based service depend on the "tcp-nrpe"
 * service. It is applied to every service that sets vars.nrpe_dependency to true.
 */
apply Dependency "nrpe" to Service {
  assign where service.vars.nrpe_dependency == true

  parent_service_name = "tcp-nrpe"

  // Notifications of the dependent service are disabled through this dependency.
  disable_notifications = true

  // Poster's observation: with this set, the states remain unchanged and the
  // service is "late".
  disable_checks = true
}



/*
 * TCP NRPE check
 *
 * Applied to every NRPE-enabled host. It is the parent check for all other
 * NRPE checks, so if the port becomes unreachable, the other NRPE checks do
 * not send notifications on their own because of the dead port.
 */
apply Service "tcp-nrpe" {
  import "corp-generic-nrpe"

  assign where supports_nrpe(host)

  display_name = "Nagios NRPE"

  // Timing set on this service after the template import.
  retry_interval = 30s
  check_interval = 3m

  // "tcp" check command and its custom variables.
  check_command = "tcp"
  vars.tcp_ctime = 15
  vars.tcp_port = 5666
}


/*
 * Root partition disk check, run through the "corp-nrpe" check command.
 */
apply Service "nrpe-disk-root" {
  import "corp-generic-nrpe"

  assign where supports_nrpe(host)

  // Opts this service into the "nrpe" dependency rule.
  vars.nrpe_dependency = true

  display_name = "DiskRootPartition"

  // NRPE command and the arguments passed to it.
  vars.nrpe_arguments = [ "7%", "5%", "/" ]
  vars.nrpe_command = "check_disk"
  check_command = "corp-nrpe"
}

// Base template for the NRPE-related services in this post ("tcp-nrpe",
// "nrpe-disk-root"). Builds on "corp-generic-service" and adds an NRPE timeout
// custom variable.
template Service "corp-generic-nrpe" {
  import "corp-generic-service"
  // Duration literal; how the check command consumes it is not shown here.
  vars.nrpe_timeout = 35s
}


/**
 * Baseline settings for services. By convention every service
 * imports this template.
 */
template Service "corp-generic-service" {
  // Read by the service notification rule's "assign where" expression.
  vars.notification.use_slack = true

  // Default check timing; services may override these after importing.
  retry_interval = 30s
  check_interval = 10m
  max_check_attempts = 5
}


// Host template for VPN endpoints (imported by "abcd-router"). Builds on
// "corp-generic-host", which is not included in this post.
template Host "corp-vpn-endpoint" {
  import "corp-generic-host"

  // Read by the host notification rule, which uses it as times.begin and
  // also as part of its "assign where" condition.
  vars.vpn.delay = 6m

  // Check timing for hosts using this template.
  retry_interval = 15s
  check_interval = 2m
}


// Host notification for hosts that have use_slack enabled and define
// vars.vpn.delay.
apply Notification "corp-chat-vpn-notification-devops" to Host {
  import "corp-chat-host-notification-devops-tmpl"

  assign where host.vars.notification.use_slack == true && host.vars.vpn.delay

  // Delay the notification by the host's configured VPN delay.
  times.begin = host.vars.vpn.delay

  users = [ "slack" ]
}

// Service notification for services where both the host and the service have
// use_slack enabled. Hosts that define vars.vpn.delay are excluded.
apply Notification "corp-chat-service-notification-devops" to Service {
  import "corp-chat-service-notification-devops-tmpl"

  assign where host.vars.notification.use_slack == true && service.vars.notification.use_slack == true
  ignore where host.vars.vpn.delay

  // Only these service states are notified.
  states = [ Critical, OK ]
  users = [ "slack" ]
}


// Notification template for host notifications: builds on
// "corp-generic-notification" and selects the host notification command.
// The command definition itself is not included in this post.
template Notification "corp-chat-host-notification-devops-tmpl" {
  import "corp-generic-notification"
  command = "corp-cmd-chat-host-notification-devops"
}



// Notification template for service notifications: builds on
// "corp-generic-notification" and selects the service notification command.
// The command definition itself is not included in this post.
template Notification "corp-chat-service-notification-devops-tmpl" {
  import "corp-generic-notification"
  command = "corp-cmd-chat-service-notification-devops"
}


/**
* The notification template all other notification definitions should inherit from.
* It only sets the notification interval; both chat notification templates import it.
*/
template Notification "corp-generic-notification" {
  // Notification interval shared by every notification importing this template.
  interval = 4h
  
}

.

apply Dependency “router-internal” to Host {
parent_host_name = “abcd-router”
disable_notifications = true

assign where host.vars…facility_code == “abcd”
ignore where host.name == “abcd-router”
}

Maybe a minor quibble, but if the “…” in that stanza above is a direct copy-
n-paste from your live config, it could be a contributor to your problems…

Antony.

Thanks, that’s just a typo from the copy and paste, not in the actual configs

Here are the dependencies that are not stopping notifications:

Object 'abcd-ras2.site.getwellnetwork.com!nrpe-disk-root!nrpe' of type 'Dependency':
  % declared in '/etc/icinga2/corp.d/corp-services.conf', lines 125:1-125:34
  * __name = "abcd-ras2.site.getwellnetwork.com!nrpe-disk-root!nrpe"
  * child_host_name = "abcd-ras2.site.getwellnetwork.com"
    % = modified in '/etc/icinga2/corp.d/corp-services.conf', lines 125:1-125:34
  * child_service_name = "nrpe-disk-root"
    % = modified in '/etc/icinga2/corp.d/corp-services.conf', lines 125:1-125:34
  * disable_checks = true
    % = modified in '/etc/icinga2/corp.d/corp-services.conf', lines 127:3-127:23
  * disable_notifications = true
    % = modified in '/etc/icinga2/corp.d/corp-services.conf', lines 129:3-129:30
  * ignore_soft_states = true
  * name = "nrpe"
  * package = "_etc"
    % = modified in '/etc/icinga2/corp.d/corp-services.conf', lines 125:1-125:34
  * parent_host_name = "abcd-ras2.site.getwellnetwork.com"
    % = modified in '/etc/icinga2/corp.d/corp-services.conf', lines 125:1-125:34
  * parent_service_name = "tcp-nrpe"
    % = modified in '/etc/icinga2/corp.d/corp-services.conf', lines 126:3-126:34
  * period = ""
  * source_location
    * first_column = 1
    * first_line = 125
    * last_column = 34
    * last_line = 125
    * path = "/etc/icinga2/corp.d/corp-services.conf"
  * states = null
  * templates = [ "nrpe" ]
    % = modified in '/etc/icinga2/corp.d/corp-services.conf', lines 125:1-125:34
  * type = "Dependency"
  * vars = null
  * zone = ""


Object 'abcd-ras2.site.getwellnetwork.com!router-internal' of type 'Dependency':
  % declared in '/etc/icinga2/corp.d/routers.conf', lines 6790:1-6790:46
  * __name = "abcd-ras2.site.getwellnetwork.com!router-internal"
  * child_host_name = "abcd-ras2.site.getwellnetwork.com"
    % = modified in '/etc/icinga2/corp.d/routers.conf', lines 6790:1-6790:46
  * child_service_name = ""
  * disable_checks = false
  * disable_notifications = true
    % = modified in '/etc/icinga2/corp.d/routers.conf', lines 6792:3-6792:30
  * ignore_soft_states = true
  * name = "router-internal"
  * package = "_etc"
    % = modified in '/etc/icinga2/corp.d/routers.conf', lines 6790:1-6790:46
  * parent_host_name = "abcd-router"
    % = modified in '/etc/icinga2/corp.d/routers.conf', lines 6790:1-6790:46
    % = modified in '/etc/icinga2/corp.d/routers.conf', lines 6791:3-6791:38
  * parent_service_name = ""
  * period = ""
  * source_location
    * first_column = 1
    * first_line = 6790
    * last_column = 46
    * last_line = 6790
    * path = "/etc/icinga2/corp.d/routers.conf"
  * states = null
  * templates = [ "router-internal" ]
    % = modified in '/etc/icinga2/corp.d/routers.conf', lines 6790:1-6790:46
  * type = "Dependency"
  * vars = null
  * zone = ""

Is my question being ignored for a reason?