postgresql: redo monitoring with PG13 compatibility (#65244)

This commit is contained in:
Pierre Ducroquet 2022-05-16 12:49:04 +02:00
parent 761d1c468c
commit 5a77f636af
1 changed file with 26 additions and 28 deletions

View File

@@ -8,6 +8,7 @@ import dbus
import mailbox
from git import Repo
import glob
import json
import os
import psutil
import requests
@@ -46,7 +47,7 @@ eo_local_changes = Gauge("eo_local_changes", "", ["ctn"], registry=registry)
# Module-level gauges, all attached to the shared `registry`. For each one the
# arguments are: metric name, help text, list of label names.
eo_mailboxes = Gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"], registry=registry)
eo_munin = Gauge("eo_munin", "munin", ["ctn", "name"], registry=registry)
eo_nginx = Gauge("eo_nginx", "nginx", ["ctn", "name"], registry=registry)
# NOTE(review): this is a diff rendering without +/- markers. The next line is
# presumably the definition removed by the commit (labels: ctn, role, name) ...
eo_postgresql = Gauge("eo_postgresql", "postgresql", ["ctn", "role", "name"], registry=registry)
# ... and this is presumably its replacement (labels: ctn, version, cluster,
# context, name). Only one of the two is expected to exist in the real file.
eo_postgresql = Gauge("eo_postgresql", "postgresql", ["ctn", "version", "cluster", "context", "name"], registry=registry)
eo_rabbitmq = Gauge("eo_rabbitmq", "rabbitmq", ["ctn"], registry=registry)
eo_threads = Gauge("eo_threads", "system threads", ["ctn"], registry=registry)
eo_units = Gauge("eo_units", "systemd units", ["ctn", "name", "state"], registry=registry)
@@ -181,35 +182,32 @@ def mailboxes(ctn):
# Publish PostgreSQL health values for container `ctn` on the `eo_postgresql`
# gauge.
#
# NOTE(review): this span is a rendered diff with indentation and +/- markers
# stripped. It interleaves what appears to be the pre-commit body (config glob,
# backup age, WAL process counting; three label values per sample) with the
# post-commit body (pg_lsclusters JSON plus per-cluster psql queries; five
# label values per sample). It is not runnable as shown.
def postgresql(ctn):
# Guard (presumably the removed variant): bail out when no
# /etc/postgresql/*/main/postgresql.conf file exists.
if not glob.glob("/etc/postgresql/*/main/postgresql.conf"):
# Guard (presumably the added variant): bail out when the pg_lsclusters
# binary is not present at /usr/bin/pg_lsclusters.
if not os.path.exists("/usr/bin/pg_lsclusters"):
return
# --- body that labels samples with (ctn, role, name) ---
# Helper: age in seconds of the most recent base backup, or -1 when the
# backup directory has no entries.
def get_last_backup_delta():
backup_files = glob.glob('/var/lib/postgresql/backups/base/*')
if not backup_files:
return -1
# NOTE(review): the newest entry is chosen by mtime, but the age below is
# computed from that entry's st_ctime.
sorted_backup_files = sorted(backup_files, key=os.path.getmtime)
created = os.stat(sorted_backup_files[-1]).st_ctime
return (datetime.datetime.now() - datetime.datetime.fromtimestamp(created)).total_seconds()
# The role is derived from the presence of any recovery.conf under
# /var/lib/postgresql/*/*/ : none found means "primary".
recovery = glob.glob("/var/lib/postgresql/*/*/recovery.conf")
if len(recovery) == 0:
role = "primary"
eo_postgresql.labels(ctn, role, "backup_delta").set(get_last_backup_delta())
# Count processes whose first command-line element mentions a WAL sender
# (both the 'walsender' and 'wal sender' spellings are matched).
eo_postgresql.labels(ctn, role, "replicators").set(0)
for p in psutil.process_iter():
cmd = p.cmdline()
if cmd and ('walsender' in cmd[0] or 'wal sender' in cmd[0]):
eo_postgresql.labels(ctn, role, "replicators").inc()
else:
role = "secondary"
# Count processes whose first command-line element mentions 'walreceiver'.
eo_postgresql.labels(ctn, role, "replicating").set(0)
for p in psutil.process_iter():
cmd = p.cmdline()
if cmd and 'walreceiver' in cmd[0]:
eo_postgresql.labels(ctn, role, "replicating").inc()
# --- body that labels samples with (ctn, version, cluster, context, name) ---
# `run` is defined outside this view; presumably it returns the command's
# standard output as text, which is then parsed as JSON - TODO confirm.
clusters = json.loads(run("pg_lsclusters --json"))
for cluster in clusters:
version = cluster["version"]
name = cluster["cluster"]
# The fourth label value is left empty for cluster-wide samples and carries
# the slot name for per-slot samples further down.
eo_postgresql.labels(ctn, version, name, "", "running").set(cluster["running"])
# Standby branch: the cluster entry has a truthy "recovery" value and is running.
if "recovery" in cluster and cluster["recovery"] and cluster["running"]:
# we are on a standby, check it's connected to a master
receiver_statuses = run("sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'" % cluster["port"])
# "replicating" ends up as the number of output lines equal to "streaming".
eo_postgresql.labels(ctn, version, name, "", "replicating").set(0)
for status in receiver_statuses.splitlines():
if status == "streaming":
eo_postgresql.labels(ctn, version, name, "", "replicating").inc()
# Primary branch: any other running cluster. Clusters that are not running
# only get the "running" sample above.
elif cluster["running"]:
# we are on a primary... check the slots are good
slots = run("sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'" % cluster["port"])
active_slot_count = 0
for slot in slots.splitlines():
# Each output line is expected to hold exactly three '|'-separated fields;
# any other field count makes this unpacking raise ValueError.
active, slot_name, delta = slot.split("|")
# NOTE(review): the query already filters on `where active`, so inactive
# slots never reach this loop and "slot-active" is presumably always 1.
eo_postgresql.labels(ctn, version, name, slot_name, "slot-active").set(active == 't')
# NOTE(review): int() raises ValueError on an empty field, which is
# presumably what a NULL restart_lsn would print as - TODO confirm.
eo_postgresql.labels(ctn, version, name, slot_name, "slot-delta").set(int(delta))
if active == 't':
active_slot_count += 1
# Written once per running primary cluster, after all slot lines are processed.
eo_postgresql.labels(ctn, version, name, "", "active-slot-count").set(active_slot_count)
def rabbitmq(ctn):
rabbitmqctl = "/usr/sbin/rabbitmqctl"