publik-infra/prometheus-entrouvert-exporter/prometheus-system-exporter.py

268 lines
8.8 KiB
Python
Executable File

#!/usr/bin/python3
import apt
import argparse
from cryptography import x509
from cryptography.hazmat.backends import default_backend
import datetime
import dbus
import mailbox
from git import Repo
import glob
import os
import psutil
import requests
import shlex
import subprocess
from systemd import login, journal
import time
from prometheus_client import CollectorRegistry, Gauge
from prometheus_client.exposition import generate_latest
from prometheus_client.parser import text_string_to_metric_families
# Journald messages that journald() must not count as errors.
# Keys are systemd unit names (matched against the entry's _SYSTEMD_UNIT
# field); each value lists substrings that mark a message as ignorable.
JOURNALD_IGNORED_ERRORS = {
    "dovecot.service": [
        "Connection lost to LDAP server, reconnecting",
    ],
}
# Package cache opened once at import time; read by packages().
apt_cache = apt.Cache()
# Dedicated registry: only the eo_* gauges declared below are rendered by
# generate_latest() at the end of the script.
registry = CollectorRegistry()
# One gauge per check. Every gauge has a "ctn" label, filled with the value
# of the --ctn command-line option (the machine name when the script is
# re-run inside a machine by run_in_machines()).
# NOTE: each variable name must stay identical to its metric name, because
# run_in_machines() resolves gauges with globals()[metric_name].
# Incremented by the main loop each time a check raises.
eo_errors = Gauge("eo_errors", "failed tests", ["ctn"], registry=registry)
eo_certificates = Gauge("eo_certificates", "certificates", ["ctn", "name"], registry=registry)
eo_debian = Gauge("eo_debian", "debian os", ["ctn"], registry=registry)
eo_etckeeper = Gauge("eo_etckeeper", "etckeeper", ["ctn", "name"], registry=registry)
eo_exim = Gauge("eo_exim", "exim", ["ctn", "name"], registry=registry)
eo_journal = Gauge("eo_journal", "journald", ["ctn", "name"], registry=registry)
eo_local_changes = Gauge("eo_local_changes", "", ["ctn"], registry=registry)
eo_mailboxes = Gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"], registry=registry)
eo_munin = Gauge("eo_munin", "munin", ["ctn", "name"], registry=registry)
eo_nginx = Gauge("eo_nginx", "nginx", ["ctn", "name"], registry=registry)
# "role" is set to "primary" or "secondary" by postgresql().
eo_postgresql = Gauge("eo_postgresql", "postgresql", ["ctn", "role", "name"], registry=registry)
eo_rabbitmq = Gauge("eo_rabbitmq", "rabbitmq", ["ctn"], registry=registry)
eo_threads = Gauge("eo_threads", "system threads", ["ctn"], registry=registry)
eo_units = Gauge("eo_units", "systemd units", ["ctn", "state"], registry=registry)
eo_packages = Gauge("eo_packages", "packages", ["ctn", "state"], registry=registry)
def run(cmd):
    """Run *cmd* and return its standard output as text.

    *cmd* is a single command-line string; it is split with ``shlex.split``
    and executed without a shell. Each output line is decoded, stripped of
    trailing whitespace, and the lines are joined with ``"\\n"`` (so the
    result has no trailing newline). The exit status is not checked.

    Standard error is discarded. It used to be sent to a pipe that was never
    read, which could block the child forever once the pipe buffer filled up.
    The context manager also closes the stdout pipe and waits for the child,
    so no zombie process or open descriptor is left behind.

    Raises whatever ``subprocess.Popen`` raises when the program cannot be
    started (e.g. ``FileNotFoundError``).
    """
    argv = shlex.split(cmd)
    with subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) as proc:
        lines = [raw.decode().rstrip() for raw in proc.stdout]
    return "\n".join(lines)
def debian(ctn):
    """Publish the Debian version of this host in ``eo_debian``.

    The value is the content of ``/etc/debian_version`` parsed as a float,
    or 0 when that content mentions "sid".
    """
    with open('/etc/debian_version') as version_file:
        raw_version = version_file.read()
    # we shouldn't have non-stable versions, so they are reported as 0
    release = 0 if 'sid' in raw_version else float(raw_version.strip())
    eo_debian.labels(ctn).set(release)
def etckeeper(ctn):
    """Publish the state of the git repository in ``/etc``.

    Sets ``eo_etckeeper`` "dirty" to 1 or 0 depending on ``Repo.is_dirty()``
    and "untracked" to the number of untracked files.
    """
    etc_repo = Repo("/etc")
    dirty_flag = int(etc_repo.is_dirty())
    eo_etckeeper.labels(ctn, "dirty").set(dirty_flag)
    untracked_count = len(etc_repo.untracked_files)
    eo_etckeeper.labels(ctn, "untracked").set(untracked_count)
def exim(ctn):
    """Publish exim log error and deferred-message counters.

    ``eo_exim`` "errors" is incremented once per line of
    ``/var/log/exim4/mainlog`` that contains `` ** ``, except for lines that
    mention both the excluded address and ``support.google.com``.
    ``eo_exim`` "deferred" is set to the integer printed by ``exim -bpc``.
    """
    with open('/var/log/exim4/mainlog', 'rb') as mainlog:
        for entry in mainlog:
            if b" ** " not in entry:
                continue
            # known, expected bounce: not counted
            if b'benjamin.dauvergne+eo@gmail.com' in entry and b'support.google.com' in entry:
                continue
            eo_exim.labels(ctn, "errors").inc()
    queue_count = int(run("exim -bpc"))
    eo_exim.labels(ctn, "deferred").set(queue_count)
def certificates(ctn):
    """Publish the number of days left before each certificate expires.

    For every existing file in the hard-coded list, the PEM certificate is
    parsed and ``eo_certificates`` (labelled with the file path) is set to
    the whole number of days until its ``not_valid_after`` date. Missing
    files are skipped silently.
    """
    cert_paths = ["/etc/exim4/exim.crt"]
    # not_valid_after is a naive datetime expressed in UTC, so it has to be
    # compared with a naive UTC "now", not with local time.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    for path in cert_paths:
        if not os.path.isfile(path):
            continue
        # read as bytes inside a context manager so the handle is closed
        with open(path, "rb") as fh:
            pem = fh.read()
        cert = x509.load_pem_x509_certificate(pem, default_backend())
        remaining = (cert.not_valid_after - now).days
        eo_certificates.labels(ctn, path).set(remaining)
def journald(ctn):
    """Publish counts of recent high-priority journal entries.

    Looks at the last 15 minutes of the journal and fills ``eo_journal``:

    * "critical": number of entries returned after matching PRIORITY=2;
    * "error": entries returned after additionally matching PRIORITY=3,
      minus those whose message contains a substring listed in
      JOURNALD_IGNORED_ERRORS for their unit;
    * "network_failure": counted "error" entries whose message contains
      "Connected -> NetworkFailure" or "task nfsd".
    """
    reader = journal.Reader()
    window_start = time.time() - 15 * 60

    reader.seek_realtime(window_start)
    reader.add_match(PRIORITY=2)
    critical_gauge = eo_journal.labels(ctn, "critical")
    critical_gauge.set(len(list(reader)))

    # NOTE(review): the PRIORITY=2 match is still installed when PRIORITY=3
    # is added; matches on the same field are presumably ORed by journald, so
    # this second pass may also see priority-2 entries -- confirm intended.
    reader.seek_realtime(window_start)
    reader.add_match(PRIORITY=3)
    for entry in reader:
        message = entry["MESSAGE"]
        ignorable = JOURNALD_IGNORED_ERRORS.get(entry.get('_SYSTEMD_UNIT')) or []
        if any(fragment in message for fragment in ignorable):
            continue
        eo_journal.labels(ctn, "error").inc()
        if "Connected -> NetworkFailure" in message or "task nfsd" in message:
            eo_journal.labels(ctn, "network_failure").inc()
def local_changes(ctn):
    """Publish the number of lines of the local-changes log.

    Sets ``eo_local_changes`` to the line count of
    ``/var/log/check-local-changes.log``. Nothing is published when the
    file does not exist.
    """
    log_path = "/var/log/check-local-changes.log"
    if not os.path.isfile(log_path):
        return
    # context manager: the handle is closed deterministically, and lines are
    # counted while streaming instead of loading the whole file
    with open(log_path) as fh:
        line_count = sum(1 for _ in fh)
    eo_local_changes.labels(ctn).set(line_count)
def munin(ctn):
    """Publish the number of munin-node plugin failures of the last hour.

    Scans the current and the previously rotated munin-node log (when they
    exist) and sets ``eo_munin`` "errors" to the number of lines that
    contain "exited with status" and compare greater than the
    ``%Y/%m/%d-%H:%M:%S`` timestamp of one hour ago. The comparison is a
    plain string comparison, so it assumes lines start with a timestamp in
    that same format.
    """
    since = datetime.datetime.now() - datetime.timedelta(hours=1)
    since_str = since.strftime("%Y/%m/%d-%H:%M:%S")
    count = 0
    for filename in ("/var/log/munin/munin-node.log", "/var/log/munin/munin-node.log.1"):
        if not os.path.isfile(filename):
            continue
        # stream the log inside a context manager: no leaked handle and no
        # full copy of the file in memory
        with open(filename) as fh:
            count += sum(
                1 for line in fh if line > since_str and "exited with status" in line
            )
    eo_munin.labels(ctn, "errors").set(count)
def nginx(ctn):
    """Publish the number of active nginx connections.

    Fetches ``http://localhost/nginx_status`` and sets ``eo_nginx``
    "connections" from the "Active connections" line. Nothing is published
    when the connection cannot be established (SSL or connection error) or
    when the response status is not OK.

    The request is bounded by a timeout so that a stalled server cannot
    block the exporter forever. A server that accepts the connection but
    does not answer in time raises ``requests.exceptions.ReadTimeout``,
    which is left to the caller to record as a failed check.
    """
    try:
        response = requests.get("http://localhost/nginx_status", timeout=10)
    except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
        # nothing reachable on localhost: treated as "no nginx here"
        return
    if not response.ok:
        return
    for line in response.text.splitlines():
        if "Active connections" in line:
            active = int(line.split(':')[1].strip())
            eo_nginx.labels(ctn, "connections").set(active)
def packages(ctn):
    """Publish the number of installed packages that can be upgraded.

    Sets ``eo_packages`` "upgradable" to the number of packages of the apt
    cache whose ``is_upgradable`` property is true.

    The whole cache is scanned: ``Cache.get_changes()`` only lists packages
    that have already been marked for a change, which is nothing on a cache
    that was just opened, so the previous loop always reported 0. It also
    used ``isUpgradable``, the pre-0.8 spelling that current python-apt no
    longer provides.
    """
    upgradable = sum(1 for pkg in apt_cache if pkg.is_upgradable)
    eo_packages.labels(ctn, "upgradable").set(upgradable)
def mailboxes(ctn):
    """Publish the number of messages in recently modified local mailboxes.

    For every regular file in ``/var/spool/mail`` modified during the last
    30 days, sets ``eo_mailboxes`` (labelled with the file name) to the
    number of messages in that mbox. Does nothing on hosts that have
    ``/etc/dovecot/dovecot.conf``.
    """
    # servers where dovecot is installed are expected to hold mailboxes,
    # so they are skipped entirely
    if os.path.exists('/etc/dovecot/dovecot.conf'):
        return
    candidates = glob.glob("/var/spool/mail/*")
    cutoff = time.time() - 30 * 86400
    for path in candidates:
        # ignore non-files and mailboxes that didn't change for a long time
        if not os.path.isfile(path) or os.stat(path).st_mtime <= cutoff:
            continue
        owner = path.rsplit("/", 1)[-1]
        message_count = len(mailbox.mbox(path))
        eo_mailboxes.labels(ctn, owner).set(message_count)
def postgresql(ctn):
    """Publish PostgreSQL backup and replication gauges.

    Does nothing unless a ``/etc/postgresql/*/main/postgresql.conf`` exists.

    Without any ``recovery.conf`` under ``/var/lib/postgresql/*/*/`` the host
    is reported with role "primary": "backup_delta" is the age in seconds of
    the newest base backup (-1 when there is none) and "replicators" is the
    number of WAL sender processes. Otherwise the role is "secondary" and
    "replicating" is the number of WAL receiver processes.

    Processes that disappear, or cannot be inspected, while the process
    table is scanned are skipped instead of aborting the whole check.
    """
    if not glob.glob("/etc/postgresql/*/main/postgresql.conf"):
        return

    def get_last_backup_delta():
        """Return the age in seconds of the newest base backup, or -1."""
        backup_files = glob.glob('/var/lib/postgresql/backups/base/*')
        if not backup_files:
            return -1
        # newest entry is chosen by mtime; its age is computed from st_ctime
        sorted_backup_files = sorted(backup_files, key=os.path.getmtime)
        created = os.stat(sorted_backup_files[-1]).st_ctime
        return (datetime.datetime.now() - datetime.datetime.fromtimestamp(created)).total_seconds()

    def count_processes(markers):
        """Count processes whose first command-line item contains a marker."""
        count = 0
        for proc in psutil.process_iter():
            try:
                cmd = proc.cmdline()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # the process exited (or is off limits) since it was listed
                continue
            if cmd and any(marker in cmd[0] for marker in markers):
                count += 1
        return count

    # NOTE(review): only recovery.conf is looked for; PostgreSQL 12+
    # presumably marks standbys with a standby.signal file instead --
    # confirm whether such hosts should be detected as "secondary" too.
    recovery = glob.glob("/var/lib/postgresql/*/*/recovery.conf")
    if not recovery:
        role = "primary"
        eo_postgresql.labels(ctn, role, "backup_delta").set(get_last_backup_delta())
        replicators = count_processes(("walsender", "wal sender"))
        eo_postgresql.labels(ctn, role, "replicators").set(replicators)
    else:
        role = "secondary"
        replicating = count_processes(("walreceiver",))
        eo_postgresql.labels(ctn, role, "replicating").set(replicating)
def rabbitmq(ctn):
    """Publish the total number of messages held in RabbitMQ queues.

    When ``/usr/sbin/rabbitmqctl`` exists, runs
    ``rabbitmqctl list_queues messages`` and adds every output line that
    consists only of digits to ``eo_rabbitmq``. Other lines (headers,
    banners) are ignored.

    The output is processed line by line: ``run()`` returns one string, and
    iterating over it directly walked its characters, so a queue holding 12
    messages was counted as 1 + 2.
    """
    rabbitmqctl = "/usr/sbin/rabbitmqctl"
    if not os.path.isfile(rabbitmqctl):
        return
    output = run("%s list_queues messages" % rabbitmqctl)
    for line in output.splitlines():
        value = line.strip()
        if value.isdigit():
            eo_rabbitmq.labels(ctn).inc(int(value))
def threads(ctn):
    """Publish the total number of threads of all running processes.

    Sets ``eo_threads`` to the sum of ``num_threads()`` over every process
    returned by ``psutil.process_iter()``. Processes that exit (or cannot be
    inspected) between being listed and being queried are skipped, so that
    this ordinary race does not fail the whole check.
    """
    total = 0
    for proc in psutil.process_iter():
        try:
            total += proc.num_threads()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    eo_threads.labels(ctn).set(total)
def units(ctn):
    """Publish the number of failed systemd units.

    Asks the systemd manager over the system D-Bus for its unit list and
    sets ``eo_units`` "failed" to the number of entries whose fourth field
    equals "failed" and whose first field does not start with "user@"
    (presumably the active state and the unit name -- see the ListUnits
    D-Bus signature).
    """
    system_bus = dbus.SystemBus()
    systemd = system_bus.get_object("org.freedesktop.systemd1", "/org/freedesktop/systemd1")
    manager = dbus.Interface(systemd, "org.freedesktop.systemd1.Manager")
    failed = 0
    for unit in manager.ListUnits():
        if unit[3] == "failed" and not unit[0].startswith("user@"):
            failed += 1
    eo_units.labels(ctn, "failed").set(failed)
def run_in_machines(ctn):
    """Collect the metrics of every machine known to systemd.

    For each name returned by ``login.machine_names()``, runs the exporter
    inside that machine with ``systemd-run --machine`` (passing the machine
    name as ``--ctn``), parses its text output and copies every sample into
    the module-level gauge named after the metric. *ctn* itself is unused.
    """
    for machine in login.machine_names():
        output = run(
            "systemd-run --wait --pipe -q --machine %s /usr/bin/prometheus-entrouvert-exporter.py --ctn %s"
            % (machine, machine)
        )
        for family in text_string_to_metric_families(output):
            for sample in family.samples:
                # gauges are module globals named exactly like their metric
                gauge = globals()[family.name]
                gauge.labels(**sample.labels).set(sample.value)
# Script entry point: run every check, count the ones that raise in
# eo_errors, then print the registry in Prometheus text format.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--test", action="store_true", help="raise errors")
    parser.add_argument("--ctn", default="")
    args = parser.parse_args()

    checks = (
        certificates,
        debian,
        etckeeper,
        exim,
        journald,
        local_changes,
        mailboxes,
        munin,
        nginx,
        packages,
        postgresql,
        rabbitmq,
        threads,
        units,
        run_in_machines,
    )
    for check in checks:
        try:
            check(args.ctn)
        except Exception:
            # a failing check must not prevent the others from reporting
            eo_errors.labels(ctn=args.ctn).inc()
            if args.test:
                raise

    print(generate_latest(registry).decode())