postgresql: add monitoring of archiver failures (#66070)

This commit is contained in:
Pierre Ducroquet 2022-06-10 15:09:22 +02:00
parent 5a77f636af
commit 82718a9d33
1 changed file with 23 additions and 18 deletions

View File

@@ -190,24 +190,29 @@ def postgresql(ctn):
version = cluster["version"]
name = cluster["cluster"]
eo_postgresql.labels(ctn, version, name, "", "running").set(cluster["running"])
if "recovery" in cluster and cluster["recovery"] and cluster["running"]:
# we are on a standby, check it's connected to a master
receiver_statuses = run("sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'" % cluster["port"])
eo_postgresql.labels(ctn, version, name, "", "replicating").set(0)
for status in receiver_statuses.splitlines():
if status == "streaming":
eo_postgresql.labels(ctn, version, name, "", "replicating").inc()
elif cluster["running"]:
# we are on a primary... check the slots are good
slots = run("sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'" % cluster["port"])
active_slot_count = 0
for slot in slots.splitlines():
active, slot_name, delta = slot.split("|")
eo_postgresql.labels(ctn, version, name, slot_name, "slot-active").set(active == 't')
eo_postgresql.labels(ctn, version, name, slot_name, "slot-delta").set(int(delta))
if active == 't':
active_slot_count += 1
eo_postgresql.labels(ctn, version, name, "", "active-slot-count").set(active_slot_count)
if cluster["running"]:
# check the archiver status
archiver_failures = run("sudo -u postgres psql -p %s -tAc 'select failed_count from pg_stat_archiver;'" % cluster["port"])
eo_postgresql.labels(ctn, version, name, "", "archive_failed").set(int(archiver_failures))
if "recovery" in cluster and cluster["recovery"]:
# we are on a standby, check it's connected to a master
receiver_statuses = run("sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'" % cluster["port"])
eo_postgresql.labels(ctn, version, name, "", "replicating").set(0)
for status in receiver_statuses.splitlines():
if status == "streaming":
eo_postgresql.labels(ctn, version, name, "", "replicating").inc()
else:
# we are on a primary... check the slots are good
slots = run("sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'" % cluster["port"])
active_slot_count = 0
for slot in slots.splitlines():
active, slot_name, delta = slot.split("|")
eo_postgresql.labels(ctn, version, name, slot_name, "slot-active").set(active == 't')
eo_postgresql.labels(ctn, version, name, slot_name, "slot-delta").set(int(delta))
if active == 't':
active_slot_count += 1
eo_postgresql.labels(ctn, version, name, "", "active-slot-count").set(active_slot_count)
def rabbitmq(ctn):
rabbitmqctl = "/usr/sbin/rabbitmqctl"