Service cluster-zone is not scheduled (anymore)

I’ve defined a service for checking every zone using the check command cluster-zone. This had been working for a long time, but it stopped working for existing satellite zones and has never executed for a new satellite.

In my test environment (currently r2.11.0-1) last execution was Feb 1, 2019.

At one customer site (currently r2.10.5-1) last execution for

  • satellite #1 was 2018-06, next check Jan 28
  • satellite #2 was Apr 15, 2019, next check Sep 25
  • satellite #3 was deployed on Sept 25 and the check is still pending

Triggering “Check now” does not help. The debug.log contains only entries such as:

[2019-10-15 09:40:04 +0200] information/ExternalCommandListener: Executing external command: [1571125204] SCHEDULE_FORCED_SVC_CHECK;mon1.site1.em.lan;icinga_zone;1571125204
[2019-10-15 09:40:04 +0200] notice/ExternalCommandProcessor: Rescheduling next check for service 'icinga_zone'

but the check isn’t executed, and in icingaweb2 the next check time counts backwards. The check result remains what it was when the check stopped working, i.e. OK (or still pending for the new satellite). Is this by design?!

Any idea what is wrong here and how to fix it?

It would help if you provided the service configuration and the output of icinga2 object list --type Service --name <servicename>

cat /etc/icinga2/conf.d/icinga/zone.conf

apply Service "icinga_zone" {
   display_name = "Icinga Zone"
   check_command = "cluster-zone"

   if (host.vars.cluster_zone) {
      vars.cluster_zone = host.vars.cluster_zone
   }

   assign where get_object("Endpoint", host.name)
   ignore where host.name == NodeName
}

icinga2 object list -n icinga_zone

Object 'mon1.site1.em.lan!icinga_zone' of type 'Service':
  % declared in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * __name = "mon1.site1.em.lan!icinga_zone"
  * action_url = ""
  * check_command = "cluster-zone"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 3:4-3:33
  * check_interval = 300
  * check_period = ""
  * check_timeout = null
  * command_endpoint = ""
  * display_name = "Icinga Zone"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 2:4-2:31
  * enable_active_checks = true
  * enable_event_handler = true
  * enable_flapping = false
  * enable_notifications = true
  * enable_passive_checks = true
  * enable_perfdata = true
  * event_command = ""
  * flapping_threshold = 0
  * flapping_threshold_high = 30
  * flapping_threshold_low = 25
  * groups = [ ]
  * host_name = "mon1.site1.em.lan"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * icon_image = ""
  * icon_image_alt = ""
  * max_check_attempts = 3
  * name = "icinga_zone"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * notes = ""
  * notes_url = ""
  * package = "_etc"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * retry_interval = 60
  * source_location
    * first_column = 0
    * first_line = 1
    * last_column = 26
    * last_line = 1
    * path = "/etc/icinga2/conf.d/icinga2/zone.conf"
  * templates = [ "icinga_zone" ]
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * type = "Service"
  * vars = null
  * volatile = false
  * zone = "site1.em.lan"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26

curl -k -s -u 'user:password' 'https://localhost:5665/v1/objects/services?service=mon1.site1.em.lan!icinga_zone' | python -m json.tool

{
    "results": [
        {
            "attrs": {
                "__name": "mon1.site1.em.lan!icinga_zone",
                "acknowledgement": 0.0,
                "acknowledgement_expiry": 0.0,
                "action_url": "",
                "active": true,
                "check_attempt": 1.0,
                "check_command": "cluster-zone",
                "check_interval": 300.0,
                "check_period": "",
                "check_timeout": null,
                "command_endpoint": "",
                "display_name": "Icinga Zone",
                "downtime_depth": 0.0,
                "enable_active_checks": true,
                "enable_event_handler": true,
                "enable_flapping": false,
                "enable_notifications": true,
                "enable_passive_checks": true,
                "enable_perfdata": true,
                "event_command": "",
                "flapping": false,
                "flapping_current": 0.0,
                "flapping_last_change": 0.0,
                "flapping_threshold": 0.0,
                "flapping_threshold_high": 30.0,
                "flapping_threshold_low": 25.0,
                "force_next_check": true,
                "force_next_notification": false,
                "groups": [],
                "ha_mode": 0.0,
                "handled": false,
                "host_name": "mon1.site1.em.lan",
                "icon_image": "",
                "icon_image_alt": "",
                "last_check": 1549022198.450375,
                "last_check_result": {
                    "active": true,
                    "check_source": "main.em.lan",
                    "command": null,
                    "execution_end": 1549022198.450375,
                    "execution_start": 1549022198.450254,
                    "exit_status": 0.0,
                    "output": "Zone 'site1.em.lan' is connected. Log lag: less than 1 millisecond",
                    "performance_data": [
                        {
                            "counter": false,
                            "crit": 0.0,
                            "label": "slave_lag",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "s",
                            "value": 0.0,
                            "warn": 0.0
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "last_messages_sent",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 1549022194.870903,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "last_messages_received",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 1549022194.169085,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "sum_messages_sent_per_second",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 0.3,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "sum_messages_received_per_second",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 1.2833333333333334,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "sum_bytes_sent_per_second",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 27.566666666666666,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "sum_bytes_received_per_second",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 457.73333333333335,
                            "warn": null
                        }
                    ],
                    "schedule_end": 1549022198.450375,
                    "schedule_start": 1549022198.45,
                    "state": 0.0,
                    "ttl": 0.0,
                    "type": "CheckResult",
                    "vars_after": {
                        "attempt": 1.0,
                        "reachable": true,
                        "state": 0.0,
                        "state_type": 1.0
                    },
                    "vars_before": {
                        "attempt": 1.0,
                        "reachable": true,
                        "state": 0.0,
                        "state_type": 1.0
                    }
                },
                "last_hard_state": 0.0,
                "last_hard_state_change": 1548241662.976568,
                "last_reachable": true,
                "last_state": 0.0,
                "last_state_change": 1548241662.976568,
                "last_state_critical": 1548241606.296162,
                "last_state_ok": 1549022198.4504,
                "last_state_type": 1.0,
                "last_state_unknown": 0.0,
                "last_state_unreachable": 0.0,
                "last_state_warning": 0.0,
                "max_check_attempts": 3.0,
                "name": "icinga_zone",
                "next_check": 1571143097.6620665,
                "notes": "",
                "notes_url": "",
                "original_attributes": null,
                "package": "_etc",
                "paused": false,
                "previous_state_change": 1569922221.539295,
                "problem": false,
                "retry_interval": 60.0,
                "severity": 8.0,
                "source_location": {
                    "first_column": 0.0,
                    "first_line": 1.0,
                    "last_column": 26.0,
                    "last_line": 1.0,
                    "path": "/etc/icinga2/conf.d/icinga2/zone.conf"
                },
                "state": 0.0,
                "state_type": 1.0,
                "templates": [
                    "icinga_zone"
                ],
                "type": "Service",
                "vars": null,
                "version": 0.0,
                "volatile": false,
                "zone": "site1.em.lan"
            },
            "joins": {},
            "meta": {},
            "name": "mon1.site1.em.lan!icinga_zone",
            "type": "Service"
        }
    ]
}

I copied your service configuration to my test lab and it works out of the box.
The only thing I can guess is that the zone of your endpoint "mon1.site1.em.lan" does not have the same name as the endpoint.

The cluster-zone command uses host.name as the zone name if vars.cluster_zone is not defined in the service.

my config for an endpoint looks like this

object Endpoint "DESKTOP-3FAH28J" {                                                                                                                                                                                                                                                                                                                                                         
}                                                                                                                                                                                                                                                                                                                                                                                           
                                                                                                                                                                                                                                                                                                                                                                                            
object Zone "DESKTOP-3FAH28J" {                                                                                                                                                                                                                                                                                                                                                             
  endpoints = [ "DESKTOP-3FAH28J" ]                                                                                                                                                                                                                                                                                                                                                         
  parent = "Master"                                                                                                                                                                                                                                                                                                                                                                         
}                                                                                                                                                                                                                                                                                                                                                                                           

and the host object is

object Host "DESKTOP-3FAH28J" {
  import "generic-host"


  address = "192.168.178.70"
  enable_notifications = false


  event_command = "automatic-downtime"
  vars.downtime_length = 30d


  vars.geolocation = "48.898310,9.092119"


  vars.os = "Windows"


  vars.windisks["disk c:"] = {
    nscp_disk_drive = "c:"
  }
  vars.windisks["disk d:"] = {
    nscp_disk_drive = "d:"
    nscp_disk_warning = "90"
    nscp_disk_critical = "95"
  }
  vars.windisks["disk e:"] = {
    nscp_disk_drive = "e:"
  }
}

Could you check/post your endpoint and object zone for your endpoint?

cat /etc/icinga2/zones.conf

object Endpoint "main.em.lan" {
}

object Zone "em.lan" {
   endpoints = [ "main.em.lan" ]
}

object Zone "global-templates" {
   global = true
}

object Zone "director-global" {
   global = true
}

object Zone "windows-commands" {
   global = true
}

object Zone "site1.em.lan" {
   endpoints = [ "mon1.site1.em.lan" ]
}

object Endpoint "mon1.site1.em.lan" {
   host = "192.168.3.211"
   port = "5665"
}

“host.conf”

object Host "mon1.site1.em.lan" {
   address = "192.168.3.211"
   check_command = "hostalive4]"
   zone = "site1.em.lan"
   vars.cluster_zone = "site1.em.lan"
...
}

Yep your zone name is not the host name. So you have to set the vars.cluster_zone to your zone name.

I’ve just edited my previous post. Sorry for mixing up.

I’ve set cluster_zone. But even if it was wrong the check should be scheduled (with wrong results of course).

I am not sure whether Icinga fires the cluster-zone check for an unknown zone.
But your problem is that the variable is empty for the service, so the check will look for a zone named 'mon1.site1.em.lan'.

Your output of the service shows

* vars = null

and it should show the variable content from host.vars.cluster_zone.

Oh, I’m so sorry. I posted the output of the old definition. (I had been cross-checking, because there must have been a change at some point: at the beginning, around Apr 2018, it was working without that variable.)

icinga2 object list -n icinga_zone

Object 'mon1.site1.em.lan!icinga_zone' of type 'Service':
  % declared in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * __name = "mon1.site1.em.lan!icinga_zone"
  * action_url = ""
  * check_command = "cluster-zone"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 3:4-3:33
  * check_interval = 300
  * check_period = ""
  * check_timeout = null
  * command_endpoint = ""
  * display_name = "Icinga Zone"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 2:4-2:31
  * enable_active_checks = true
  * enable_event_handler = true
  * enable_flapping = false
  * enable_notifications = true
  * enable_passive_checks = true
  * enable_perfdata = true
  * event_command = ""
  * flapping_threshold = 0
  * flapping_threshold_high = 30
  * flapping_threshold_low = 25
  * groups = [ ]
  * host_name = "mon1.site1.em.lan"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * icon_image = ""
  * icon_image_alt = ""
  * max_check_attempts = 3
  * name = "icinga_zone"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * notes = ""
  * notes_url = ""
  * package = "_etc"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * retry_interval = 60
  * source_location
    * first_column = 0
    * first_line = 1
    * last_column = 26
    * last_line = 1
    * path = "/etc/icinga2/conf.d/icinga2/zone.conf"
  * templates = [ "icinga_zone" ]
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
  * type = "Service"
  * vars
    * cluster_zone = "site1.em.lan"
      % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 5:7-5:48
  * volatile = false
  * zone = "site1.em.lan"
    % = modified in '/etc/icinga2/conf.d/icinga2/zone.conf', lines 1:0-1:26
curl -k -s -u 'user:password' 'https://localhost:5665/v1/objects/services?service=mon1.site1.em.lan!icinga_zone' | python -m json.tool
{
    "results": [
        {
            "attrs": {
                "__name": "mon1.site1.em.lan!icinga_zone",
                "acknowledgement": 0.0,
                "acknowledgement_expiry": 0.0,
                "action_url": "",
                "active": true,
                "check_attempt": 1.0,
                "check_command": "cluster-zone",
                "check_interval": 300.0,
                "check_period": "",
                "check_timeout": null,
                "command_endpoint": "",
                "display_name": "Icinga Zone",
                "downtime_depth": 0.0,
                "enable_active_checks": true,
                "enable_event_handler": true,
                "enable_flapping": false,
                "enable_notifications": true,
                "enable_passive_checks": true,
                "enable_perfdata": true,
                "event_command": "",
                "flapping": false,
                "flapping_current": 0.0,
                "flapping_last_change": 0.0,
                "flapping_threshold": 0.0,
                "flapping_threshold_high": 30.0,
                "flapping_threshold_low": 25.0,
                "force_next_check": true,
                "force_next_notification": false,
                "groups": [],
                "ha_mode": 0.0,
                "handled": false,
                "host_name": "mon1.site1.em.lan",
                "icon_image": "",
                "icon_image_alt": "",
                "last_check": 1549022198.450375,
                "last_check_result": {
                    "active": true,
                    "check_source": "main.em.lan",
                    "command": null,
                    "execution_end": 1549022198.450375,
                    "execution_start": 1549022198.450254,
                    "exit_status": 0.0,
                    "output": "Zone 'site1.em.lan' is connected. Log lag: less than 1 millisecond",
                    "performance_data": [
                        {
                            "counter": false,
                            "crit": 0.0,
                            "label": "slave_lag",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "s",
                            "value": 0.0,
                            "warn": 0.0
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "last_messages_sent",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 1549022194.870903,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "last_messages_received",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 1549022194.169085,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "sum_messages_sent_per_second",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 0.3,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "sum_messages_received_per_second",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 1.2833333333333334,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "sum_bytes_sent_per_second",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 27.566666666666666,
                            "warn": null
                        },
                        {
                            "counter": false,
                            "crit": null,
                            "label": "sum_bytes_received_per_second",
                            "max": null,
                            "min": null,
                            "type": "PerfdataValue",
                            "unit": "",
                            "value": 457.73333333333335,
                            "warn": null
                        }
                    ],
                    "schedule_end": 1549022198.450375,
                    "schedule_start": 1549022198.45,
                    "state": 0.0,
                    "ttl": 0.0,
                    "type": "CheckResult",
                    "vars_after": {
                        "attempt": 1.0,
                        "reachable": true,
                        "state": 0.0,
                        "state_type": 1.0
                    },
                    "vars_before": {
                        "attempt": 1.0,
                        "reachable": true,
                        "state": 0.0,
                        "state_type": 1.0
                    }
                },
                "last_hard_state": 0.0,
                "last_hard_state_change": 1548241662.976568,
                "last_reachable": true,
                "last_state": 0.0,
                "last_state_change": 1548241662.976568,
                "last_state_critical": 1548241606.296162,
                "last_state_ok": 1549022198.4504,
                "last_state_type": 1.0,
                "last_state_unknown": 0.0,
                "last_state_unreachable": 0.0,
                "last_state_warning": 0.0,
                "max_check_attempts": 3.0,
                "name": "icinga_zone",
                "next_check": 1571214669.2994041,
                "notes": "",
                "notes_url": "",
                "original_attributes": null,
                "package": "_etc",
                "paused": false,
                "previous_state_change": 1569922221.539295,
                "problem": false,
                "retry_interval": 60.0,
                "severity": 8.0,
                "source_location": {
                    "first_column": 0.0,
                    "first_line": 1.0,
                    "last_column": 26.0,
                    "last_line": 1.0,
                    "path": "/etc/icinga2/conf.d/icinga2/zone.conf"
                },
                "state": 0.0,
                "state_type": 1.0,
                "templates": [
                    "icinga_zone"
                ],
                "type": "Service",
                "vars": {
                    "cluster_zone": "site1.em.lan"
                },
                "version": 0.0,
                "volatile": false,
                "zone": "site1.em.lan"
            },
            "joins": {},
            "meta": {},
            "name": "mon1.site1.em.lan!icinga_zone",
            "type": "Service"
        }
    ]
}

Does it work now after all changes?

No, as I said, I realized that issue and did many (cross) checks.

Did you upgrade your icinga2 to 2.11? If yes, read the documentation about the config sync changes. With 2.11, icinga2 only replicates a zone if that zone is configured before the sync; otherwise the agent will not know anything about the zone. It is best to check the agent’s log.

I think I got it: the service is applied in the wrong zone. It looks like it has to be applied in the parent zone of the zone that is to be checked. I had been using this service without any change and it worked fine, but it stopped working at some point. So there must have been a change in the logic, but I could not find it.

I’m wondering why this issue only occurs on my machines. Do I have a general misconfiguration? Could anybody share their configuration, please? @lbetz: Are you reading here?