postgresql: redo monitoring with PG13 compatibility (#65244)

This commit is contained in:
Pierre Ducroquet 2022-05-16 12:49:04 +02:00
parent 761d1c468c
commit 5a77f636af
1 changed file with 26 additions and 28 deletions

View File

@@ -8,6 +8,7 @@ import dbus
import mailbox import mailbox
from git import Repo from git import Repo
import glob import glob
import json
import os import os
import psutil import psutil
import requests import requests
@@ -46,7 +47,7 @@ eo_local_changes = Gauge("eo_local_changes", "", ["ctn"], registry=registry)
eo_mailboxes = Gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"], registry=registry) eo_mailboxes = Gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"], registry=registry)
eo_munin = Gauge("eo_munin", "munin", ["ctn", "name"], registry=registry) eo_munin = Gauge("eo_munin", "munin", ["ctn", "name"], registry=registry)
eo_nginx = Gauge("eo_nginx", "nginx", ["ctn", "name"], registry=registry) eo_nginx = Gauge("eo_nginx", "nginx", ["ctn", "name"], registry=registry)
eo_postgresql = Gauge("eo_postgresql", "postgresql", ["ctn", "role", "name"], registry=registry) eo_postgresql = Gauge("eo_postgresql", "postgresql", ["ctn", "version", "cluster", "context", "name"], registry=registry)
eo_rabbitmq = Gauge("eo_rabbitmq", "rabbitmq", ["ctn"], registry=registry) eo_rabbitmq = Gauge("eo_rabbitmq", "rabbitmq", ["ctn"], registry=registry)
eo_threads = Gauge("eo_threads", "system threads", ["ctn"], registry=registry) eo_threads = Gauge("eo_threads", "system threads", ["ctn"], registry=registry)
eo_units = Gauge("eo_units", "systemd units", ["ctn", "name", "state"], registry=registry) eo_units = Gauge("eo_units", "systemd units", ["ctn", "name", "state"], registry=registry)
@@ -181,35 +182,32 @@ def mailboxes(ctn):
def postgresql(ctn): def postgresql(ctn):
if not glob.glob("/etc/postgresql/*/main/postgresql.conf"): if not os.path.exists("/usr/bin/pg_lsclusters"):
return return
def get_last_backup_delta(): clusters = json.loads(run("pg_lsclusters --json"))
backup_files = glob.glob('/var/lib/postgresql/backups/base/*') for cluster in clusters:
if not backup_files: version = cluster["version"]
return -1 name = cluster["cluster"]
sorted_backup_files = sorted(backup_files, key=os.path.getmtime) eo_postgresql.labels(ctn, version, name, "", "running").set(cluster["running"])
created = os.stat(sorted_backup_files[-1]).st_ctime if "recovery" in cluster and cluster["recovery"] and cluster["running"]:
return (datetime.datetime.now() - datetime.datetime.fromtimestamp(created)).total_seconds() # we are on a standby, check it's connected to a master
receiver_statuses = run("sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'" % cluster["port"])
recovery = glob.glob("/var/lib/postgresql/*/*/recovery.conf") eo_postgresql.labels(ctn, version, name, "", "replicating").set(0)
if len(recovery) == 0: for status in receiver_statuses.splitlines():
role = "primary" if status == "streaming":
eo_postgresql.labels(ctn, version, name, "", "replicating").inc()
eo_postgresql.labels(ctn, role, "backup_delta").set(get_last_backup_delta()) elif cluster["running"]:
eo_postgresql.labels(ctn, role, "replicators").set(0) # we are on a primary... check the slots are good
for p in psutil.process_iter(): slots = run("sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'" % cluster["port"])
cmd = p.cmdline() active_slot_count = 0
if cmd and ('walsender' in cmd[0] or 'wal sender' in cmd[0]): for slot in slots.splitlines():
eo_postgresql.labels(ctn, role, "replicators").inc() active, slot_name, delta = slot.split("|")
else: eo_postgresql.labels(ctn, version, name, slot_name, "slot-active").set(active == 't')
role = "secondary" eo_postgresql.labels(ctn, version, name, slot_name, "slot-delta").set(int(delta))
eo_postgresql.labels(ctn, role, "replicating").set(0) if active == 't':
for p in psutil.process_iter(): active_slot_count += 1
cmd = p.cmdline() eo_postgresql.labels(ctn, version, name, "", "active-slot-count").set(active_slot_count)
if cmd and 'walreceiver' in cmd[0]:
eo_postgresql.labels(ctn, role, "replicating").inc()
def rabbitmq(ctn): def rabbitmq(ctn):
rabbitmqctl = "/usr/sbin/rabbitmqctl" rabbitmqctl = "/usr/sbin/rabbitmqctl"