UPDATE 2: It turns out the path to the check, /usr/lib64/nagios/plugins/check_ssh, is wrong; but then, why is it not flagged up as failed?
======
UPDATE: In fact, this is a problem for ALL the host checks; it is clearly a problem with the way they have been defined. Still investigating.
======
I have just discovered that a number of host objects don’t get updated:
<1> => get_object(Host,"cx1-51-3-1.cx1.hpc.ic.ac.uk")
{
__name = "cx1-51-3-1.cx1.hpc.ic.ac.uk"
acknowledgement = 0.000000
acknowledgement_expiry = 0.000000
action_url = ""
active = true
address = "cx1-51-3-1.cx1.hpc.ic.ac.uk"
address6 = ""
check_attempt = 1.000000
check_command = "ssh"
check_interval = 900.000000
check_period = ""
check_timeout = null
command_endpoint = ""
display_name = "cx1-51-3-1.cx1.hpc.ic.ac.uk"
downtime_depth = 0.000000
enable_active_checks = true
enable_event_handler = true
enable_flapping = false
enable_notifications = true
enable_passive_checks = true
enable_perfdata = true
event_command = ""
extensions = {
DbObject = {
type = "Object"
}
}
flapping = false
flapping_buffer = 0.000000
flapping_current = 0.000000
flapping_index = 16.000000
flapping_last_change = 0.000000
flapping_threshold = 0.000000
flapping_threshold_high = 30.000000
flapping_threshold_low = 25.000000
force_next_check = true
force_next_notification = false
groups = [ "cx1" ]
ha_mode = 0.000000
icon_image = ""
icon_image_alt = ""
last_check = 1554797352.802409
last_check_result = {
active = true
check_source = "admin.cx1.hpc.imperial.ac.uk"
command = [ "/usr/lib64/nagios/plugins/check_ssh", "cx1-51-3-1.cx1.hpc.ic.ac.uk" ]
execution_end = 1554797352.777902
execution_start = 1554797352.752162
exit_status = 0.000000
output = "SSH OK - OpenSSH_7.4 (protocol 2.0) "
performance_data = [ "time=0.015941s;;;0.000000;10.000000" ]
schedule_end = 1554797352.802409
schedule_start = 1554797352.710000
state = 0.000000
ttl = 0.000000
type = "CheckResult"
vars_after = {
attempt = 1.000000
reachable = true
state = 0.000000
state_type = 1.000000
}
vars_before = {
attempt = 1.000000
reachable = true
state = 0.000000
state_type = 1.000000
}
}
last_hard_state = 0.000000
last_hard_state_change = 1553908449.713271
last_hard_state_raw = 0.000000
last_reachable = true
last_state = 0.000000
last_state_change = 1553908449.713271
last_state_down = 1553908410.442074
last_state_raw = 0.000000
last_state_type = 1.000000
last_state_unreachable = 0.000000
last_state_up = 1554797352.804830
max_check_attempts = 3.000000
name = "cx1-51-3-1.cx1.hpc.ic.ac.uk"
next_check = 1557933802.000000
notes = ""
notes_url = ""
original_attributes = null
package = "_api"
pause_called = false
paused = false
resume_called = true
retry_interval = 30.000000
severity = 8.000000
source_location = {
first_column = 0.000000
first_line = 1.000000
last_column = 40.000000
last_line = 1.000000
path = "/var/lib/icinga2/api/packages/_api/4f4482f9-394c-4ecf-86d6-e5adeb32995b/conf.d/hosts/cx1-51-3-1.cx1.hpc.ic.ac.uk.conf"
}
start_called = true
state = 0.000000
state_loaded = true
state_raw = 0.000000
state_type = 1.000000
stop_called = false
templates = [ "cx1-51-3-1.cx1.hpc.ic.ac.uk", "generic-host" ]
type = "Host"
vars = {
agent = "ssh"
cluster = "cx1"
os = "Linux"
}
version = 1552650314.790992
volatile = false
zone = "cx1-zone"
}
This particular one hasn’t been updated since 9 Apr - how can that be? In the log I can see the passive checks coming in until about 20 minutes ago, when the server went down, but I’m not sure what the host check itself looks like in the logs.