diff --git a/prometheus-entrouvert-exporter/prometheus-system-exporter.py b/prometheus-entrouvert-exporter/prometheus-system-exporter.py index c7bf581..d5976e9 100755 --- a/prometheus-entrouvert-exporter/prometheus-system-exporter.py +++ b/prometheus-entrouvert-exporter/prometheus-system-exporter.py @@ -8,6 +8,7 @@ import dbus import mailbox from git import Repo import glob +import json import os import psutil import requests @@ -46,7 +47,7 @@ eo_local_changes = Gauge("eo_local_changes", "", ["ctn"], registry=registry) eo_mailboxes = Gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"], registry=registry) eo_munin = Gauge("eo_munin", "munin", ["ctn", "name"], registry=registry) eo_nginx = Gauge("eo_nginx", "nginx", ["ctn", "name"], registry=registry) -eo_postgresql = Gauge("eo_postgresql", "postgresql", ["ctn", "role", "name"], registry=registry) +eo_postgresql = Gauge("eo_postgresql", "postgresql", ["ctn", "version", "cluster", "context", "name"], registry=registry) eo_rabbitmq = Gauge("eo_rabbitmq", "rabbitmq", ["ctn"], registry=registry) eo_threads = Gauge("eo_threads", "system threads", ["ctn"], registry=registry) eo_units = Gauge("eo_units", "systemd units", ["ctn", "name", "state"], registry=registry) @@ -181,35 +182,32 @@ def mailboxes(ctn): def postgresql(ctn): - if not glob.glob("/etc/postgresql/*/main/postgresql.conf"): + if not os.path.exists("/usr/bin/pg_lsclusters"): return - def get_last_backup_delta(): - backup_files = glob.glob('/var/lib/postgresql/backups/base/*') - if not backup_files: - return -1 - sorted_backup_files = sorted(backup_files, key=os.path.getmtime) - created = os.stat(sorted_backup_files[-1]).st_ctime - return (datetime.datetime.now() - datetime.datetime.fromtimestamp(created)).total_seconds() - - recovery = glob.glob("/var/lib/postgresql/*/*/recovery.conf") - if len(recovery) == 0: - role = "primary" - - eo_postgresql.labels(ctn, role, "backup_delta").set(get_last_backup_delta()) - eo_postgresql.labels(ctn, role, "replicators").set(0) - for p in psutil.process_iter(): - cmd = p.cmdline() - if cmd and ('walsender' in cmd[0] or 'wal sender' in cmd[0]): - eo_postgresql.labels(ctn, role, "replicators").inc() - else: - role = "secondary" - eo_postgresql.labels(ctn, role, "replicating").set(0) - for p in psutil.process_iter(): - cmd = p.cmdline() - if cmd and 'walreceiver' in cmd[0]: - eo_postgresql.labels(ctn, role, "replicating").inc() - + clusters = json.loads(run("pg_lsclusters --json")) + for cluster in clusters: + version = cluster["version"] + name = cluster["cluster"] + eo_postgresql.labels(ctn, version, name, "", "running").set(cluster["running"]) + if "recovery" in cluster and cluster["recovery"] and cluster["running"]: + # we are on a standby, check it's connected to a master + receiver_statuses = run("sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'" % cluster["port"]) + eo_postgresql.labels(ctn, version, name, "", "replicating").set(0) + for status in receiver_statuses.splitlines(): + if status == "streaming": + eo_postgresql.labels(ctn, version, name, "", "replicating").inc() + elif cluster["running"]: + # we are on a primary... check the slots are good + slots = run("sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'" % cluster["port"]) + active_slot_count = 0 + for slot in slots.splitlines(): + active, slot_name, delta = slot.split("|") + eo_postgresql.labels(ctn, version, name, slot_name, "slot-active").set(active == 't') + eo_postgresql.labels(ctn, version, name, slot_name, "slot-delta").set(int(delta)) + if active == 't': + active_slot_count += 1 + eo_postgresql.labels(ctn, version, name, "", "active-slot-count").set(active_slot_count) def rabbitmq(ctn): rabbitmqctl = "/usr/sbin/rabbitmqctl"