Hi everyone!
I have an issue with the agent health check: it never gets executed. I enabled debug logging and confirmed that the check is indeed never run.
I have simply copy-pasted the snippet from the documentation (cluster-zone with Masters and Agents). The check also worked (or at least I assume it did) when running 2.10, but after the upgrade to 2.11 and the necessary config changes on my side, it simply ceased working. I have done all the troubleshooting steps mentioned in the upgrade notes, but that didn’t help either.
I assume something is wrong, but I can’t seem to find what it might be.
Here’s my config:
icinga2 object list -n agent-health
Object 'my.agent.host!agent-health' of type 'Service': % declared in '/etc/icinga2/zones.d/master/cluster-health.conf', lines 11:1-11:28 * __name = "my.agent.host!agent-health" * action_url = "" * check_command = "cluster-zone" % = modified in '/etc/icinga2/zones.d/master/cluster-health.conf', lines 12:3-12:32 * check_interval = 300 * check_period = "" * check_timeout = null * command_endpoint = "" * display_name = "cluster-health-my.agent.host" % = modified in '/etc/icinga2/zones.d/master/cluster-health.conf', lines 14:3-14:46 * enable_active_checks = true * enable_event_handler = true * enable_flapping = false * enable_notifications = true * enable_passive_checks = true * enable_perfdata = true * event_command = "" * flapping_threshold = 0 * flapping_threshold_high = 30 * flapping_threshold_low = 25 * groups = [ ] * host_name = "my.agent.host" % = modified in '/etc/icinga2/zones.d/master/cluster-health.conf', lines 11:1-11:28 * icon_image = "" * icon_image_alt = "" * max_check_attempts = 3 * name = "agent-health" % = modified in '/etc/icinga2/zones.d/master/cluster-health.conf', lines 11:1-11:28 * notes = "" * notes_url = "" * package = "_etc" % = modified in '/etc/icinga2/zones.d/master/cluster-health.conf', lines 11:1-11:28 * retry_interval = 60 * source_location * first_column = 1 * first_line = 11 * last_column = 28 * last_line = 11 * path = "/etc/icinga2/zones.d/master/cluster-health.conf" * templates = [ "agent-health" ] % = modified in '/etc/icinga2/zones.d/master/cluster-health.conf', lines 11:1-11:28 * type = "Service" * vars * cluster_zone = "my.agent.host" % = modified in '/etc/icinga2/zones.d/master/cluster-health.conf', lines 17:3-17:31 * volatile = false * zone = "my.agent.host" % = modified in '/etc/icinga2/zones.d/master/cluster-health.conf', lines 11:1-11:28
zones.conf
object Zone "master" { endpoints = [ "my.master.host" ] } object Endpoint "my.master.host" { // That's us } object Endpoint "my.agent.host" { host = "127.0.0.1" // Localhost, because SSH tunnel port = 5666 // SSH tunnel port log_duration = 0 // Disable the replay log for command endpoint agents } object Zone "my.agent.host" { endpoints = [ "my.agent.host" ] parent = "master" } /* sync global commands */ object Zone "global-templates" { global = true } object Zone "director-global" { global = true }
zones.d/master/cluster-health.conf
apply Service "agent-health" { check_command = "cluster-zone" display_name = "cluster-health-" + host.name /* This follows the convention that the agent zone name is the FQDN which is the same as the host object name. */ vars.cluster_zone = host.name assign where host.vars.agent_endpoint }
curl -k -s -u 'root:passwort' 'https://127.0.0.1:5665/v1/objects/services?service=my.agent.host!agent-health' | jq .
{ "results": [ { "attrs": { "__name": "my.agent.host!agent-health", "acknowledgement": 0, "acknowledgement_expiry": 0, "action_url": "", "active": true, "check_attempt": 1, "check_command": "cluster-zone", "check_interval": 300, "check_period": "", "check_timeout": null, "command_endpoint": "", "display_name": "cluster-health-my.agent.host", "downtime_depth": 0, "enable_active_checks": true, "enable_event_handler": true, "enable_flapping": false, "enable_notifications": true, "enable_passive_checks": true, "enable_perfdata": true, "event_command": "", "flapping": false, "flapping_current": 0, "flapping_last_change": 0, "flapping_threshold": 0, "flapping_threshold_high": 30, "flapping_threshold_low": 25, "force_next_check": false, "force_next_notification": false, "groups": [], "ha_mode": 0, "handled": false, "host_name": "my.agent.host", "icon_image": "", "icon_image_alt": "", "last_check": -1, "last_check_result": null, "last_hard_state": 3, "last_hard_state_change": 0, "last_reachable": true, "last_state": 3, "last_state_change": 0, "last_state_critical": 0, "last_state_ok": 0, "last_state_type": 0, "last_state_unknown": 0, "last_state_unreachable": 0, "last_state_warning": 0, "max_check_attempts": 3, "name": "agent-health", "next_check": 1592728914.1601052, "notes": "", "notes_url": "", "original_attributes": null, "package": "_etc", "paused": false, "previous_state_change": 0, "problem": true, "retry_interval": 60, "severity": 24, "source_location": { "first_column": 1, "first_line": 11, "last_column": 28, "last_line": 11, "path": "/etc/icinga2/zones.d/master/cluster-health.conf" }, "state": 3, "state_type": 0, "templates": [ "agent-health" ], "type": "Service", "vars": { "cluster_zone": "my.agent.host" }, "version": 0, "volatile": false, "zone": "my.agent.host" }, "joins": {}, "meta": {}, "name": "my.agent.host!agent-health", "type": "Service" } ] }
Even rescheduling with force does nothing, no check is performed.
Does anyone see the issue, because I don’t?
Cheers
Steffen