From 82718a9d330d7d425a913b10ea37cb001defe039 Mon Sep 17 00:00:00 2001 From: Pierre Ducroquet Date: Fri, 10 Jun 2022 15:09:22 +0200 Subject: [PATCH] postgresql: add monitoring of archiver failures (#66070) --- .../prometheus-system-exporter.py | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/prometheus-entrouvert-exporter/prometheus-system-exporter.py b/prometheus-entrouvert-exporter/prometheus-system-exporter.py index d5976e9..9ffcbc2 100755 --- a/prometheus-entrouvert-exporter/prometheus-system-exporter.py +++ b/prometheus-entrouvert-exporter/prometheus-system-exporter.py @@ -190,24 +190,29 @@ def postgresql(ctn): version = cluster["version"] name = cluster["cluster"] eo_postgresql.labels(ctn, version, name, "", "running").set(cluster["running"]) - if "recovery" in cluster and cluster["recovery"] and cluster["running"]: - # we are on a standby, check it's connected to a master - receiver_statuses = run("sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'" % cluster["port"]) - eo_postgresql.labels(ctn, version, name, "", "replicating").set(0) - for status in receiver_statuses.splitlines(): - if status == "streaming": - eo_postgresql.labels(ctn, version, name, "", "replicating").inc() - elif cluster["running"]: - # we are on a primary... check the slots are good - slots = run("sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'" % cluster["port"]) - active_slot_count = 0 - for slot in slots.splitlines(): - active, slot_name, delta = slot.split("|") - eo_postgresql.labels(ctn, version, name, slot_name, "slot-active").set(active == 't') - eo_postgresql.labels(ctn, version, name, slot_name, "slot-delta").set(int(delta)) - if active == 't': - active_slot_count += 1 - eo_postgresql.labels(ctn, version, name, "", "active-slot-count").set(active_slot_count) + if cluster["running"]: + # check the archiver status + archiver_failures = run("sudo -u postgres psql -p %s -tAc 'select failed_count from pg_stat_archiver;'" % cluster["port"]) + eo_postgresql.labels(ctn, version, name, "", "archive_failed").set(int(archiver_failures)) + + if "recovery" in cluster and cluster["recovery"]: + # we are on a standby, check it's connected to a master + receiver_statuses = run("sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'" % cluster["port"]) + eo_postgresql.labels(ctn, version, name, "", "replicating").set(0) + for status in receiver_statuses.splitlines(): + if status == "streaming": + eo_postgresql.labels(ctn, version, name, "", "replicating").inc() + else: + # we are on a primary... check the slots are good + slots = run("sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'" % cluster["port"]) + active_slot_count = 0 + for slot in slots.splitlines(): + active, slot_name, delta = slot.split("|") + eo_postgresql.labels(ctn, version, name, slot_name, "slot-active").set(active == 't') + eo_postgresql.labels(ctn, version, name, slot_name, "slot-delta").set(int(delta)) + if active == 't': + active_slot_count += 1 + eo_postgresql.labels(ctn, version, name, "", "active-slot-count").set(active_slot_count) def rabbitmq(ctn): rabbitmqctl = "/usr/sbin/rabbitmqctl"