publik-infra/prometheus-entrouvert-exporter/prometheus-system-exporter.py

#!/usr/bin/python3
import apt
import argparse
from cryptography import x509
from cryptography.hazmat.backends import default_backend
import datetime
import dbus
import mailbox
from git import Repo
import glob
import json
import os
import psutil
import requests
import shlex
import subprocess
from systemd import login, journal
import time
from prometheus_client import CollectorRegistry, Gauge
from prometheus_client.exposition import generate_latest
from prometheus_client.parser import text_string_to_metric_families

JOURNALD_IGNORED_ERRORS = {
    'dovecot.service': [
        'Connection lost to LDAP server, reconnecting',
    ],
    'ssh.service': [
        'maximum authentication attempts exceeded for ',
    ],
    '': [  # match all services (useful for ovpn*)
        'Connection reset, restarting [0]',
    ],
}

apt_cache = apt.Cache()
registry = CollectorRegistry()
eo_errors = Gauge("eo_errors", "failed tests", ["ctn"], registry=registry)
eo_certificates = Gauge("eo_certificates", "certificates", ["ctn", "name"], registry=registry)
eo_debian = Gauge("eo_debian", "debian os", ["ctn"], registry=registry)
eo_etckeeper = Gauge("eo_etckeeper", "etckeeper", ["ctn", "name"], registry=registry)
eo_exim = Gauge("eo_exim", "exim", ["ctn", "name"], registry=registry)
eo_journal = Gauge("eo_journal", "journald", ["ctn", "name"], registry=registry)
eo_local_changes = Gauge("eo_local_changes", "local changes", ["ctn"], registry=registry)
eo_mailboxes = Gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"], registry=registry)
eo_munin = Gauge("eo_munin", "munin", ["ctn", "name"], registry=registry)
eo_nginx = Gauge("eo_nginx", "nginx", ["ctn", "name"], registry=registry)
eo_postgresql = Gauge("eo_postgresql", "postgresql", ["ctn", "version", "cluster", "context", "name"], registry=registry)
eo_rabbitmq = Gauge("eo_rabbitmq", "rabbitmq", ["ctn"], registry=registry)
eo_threads = Gauge("eo_threads", "system threads", ["ctn"], registry=registry)
eo_units = Gauge("eo_units", "systemd units", ["ctn", "name", "state"], registry=registry)
eo_packages = Gauge("eo_packages", "packages", ["ctn", "state"], registry=registry)


def run(cmd):
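    # run a shell command and return its stdout as a single string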
    m = shlex.split(cmd)
    p = subprocess.Popen(m, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return "\n".join([l.decode().rstrip() for l in p.stdout.readlines()])


def debian(ctn):
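    # report the debian version found in /etc/debian_version (0 for sid)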
    with open('/etc/debian_version') as fh:
        content = fh.read()
    if 'sid' in content:
        # we shouldn't have non-stable versions
        ve = 0
    else:
        ve = float(content.strip())
    eo_debian.labels(ctn).set(ve)


def etckeeper(ctn):
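    # report whether the /etc git repository is dirty or has untracked files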
rep = Repo("/etc")
eo_etckeeper.labels(ctn, "dirty").set(int(rep.is_dirty()))
eo_etckeeper.labels(ctn, "untracked").set(len(rep.untracked_files))
def exim(ctn):
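    # count delivery errors in the exim4 mainlog and the size of the mail queue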
    if not os.path.exists('/usr/sbin/exim'):
        return
    with open('/var/log/exim4/mainlog', 'rb') as fh:
        for line in fh.readlines():
            if b" ** " in line and not (
                b'benjamin.dauvergne+eo@gmail.com' in line and b'support.google.com' in line
            ):
                eo_exim.labels(ctn, "errors").inc()
    deferred = int(run("exim -bpc"))
    eo_exim.labels(ctn, "deferred").set(deferred)


def certificates(ctn):
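    # report the number of days remaining before known local certificates expire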
certs = ["/etc/exim4/exim.crt"]
for cert in certs:
if os.path.isfile(cert):
c = x509.load_pem_x509_certificate(open(cert).read().encode(), default_backend())
remaining = (c.not_valid_after - datetime.datetime.today()).days
eo_certificates.labels(ctn, cert).set(remaining)
def journald(ctn):
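    # count critical and error journal messages from the last 15 minutes,
    # skipping known benign errors listed in JOURNALD_IGNORED_ERRORS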
    j = journal.Reader()
    fifteen = time.time() - 15 * 60
    j.seek_realtime(fifteen)
    j.add_match(PRIORITY=2)
    eo_journal.labels(ctn, "critical").set(len(list(j)))
    j.seek_realtime(fifteen)
    j.add_match(PRIORITY=3)
    for e in j:
        msg = e["MESSAGE"]
        ignored_message = False
        ignored_strings = (
            JOURNALD_IGNORED_ERRORS.get(e.get('_SYSTEMD_UNIT'), []) + JOURNALD_IGNORED_ERRORS['']
        )
        for ignored_string in ignored_strings:
            if ignored_string in msg:
                ignored_message = True
                break
        if ignored_message:
            continue
        eo_journal.labels(ctn, "error").inc()
        if "Connected -> NetworkFailure" in msg or "task nfsd" in msg:
            eo_journal.labels(ctn, "network_failure").inc()


def local_changes(ctn):
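    # report the number of lines in the check-local-changes log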
f = "/var/log/check-local-changes.log"
if os.path.isfile(f):
n = len(open(f).readlines())
eo_local_changes.labels(ctn).set(n)
def munin(ctn):
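    # count munin-node plugin failures ("exited with status") from the last hour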
    since = datetime.datetime.now() - datetime.timedelta(hours=1)
    since_str = since.strftime("%Y/%m/%d-%H:%M:%S")
    count = 0
    for filename in ["/var/log/munin/munin-node.log", "/var/log/munin/munin-node.log.1"]:
        if os.path.isfile(filename):
            count += len(
                [x for x in open(filename).readlines() if x > since_str and "exited with status" in x]
            )
    eo_munin.labels(ctn, "errors").set(count)


def nginx(ctn):
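    # report the number of active connections from the local nginx_status endpoint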
    try:
        r = requests.get("http://localhost/nginx_status")
    except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
        return
    if r.ok:
        for line in r.text.splitlines():
            if "Active connections" in line:
                n = int(line.split(':')[1].strip())
                eo_nginx.labels(ctn, "connections").set(n)


def packages(ctn):
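    # count upgradable packages among the apt cache's pending changes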
    n = 0
    for pkg in apt_cache.get_changes():
        if pkg.is_upgradable:
            n += 1
    eo_packages.labels(ctn, "upgradable").set(n)


def mailboxes(ctn):
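    # count messages in recently modified mbox files under /var/spool/mail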
    if os.path.exists('/etc/dovecot/dovecot.conf'):
        # skip servers where dovecot is installed, as they're expected to
        # have mailboxes.
        return
    boxes = glob.glob("/var/spool/mail/*")
    days_ago = time.time() - 30 * 86400
    for m in boxes:
        if not os.path.isfile(m):
            continue
        if not os.stat(m).st_mtime > days_ago:
            # skip mailboxes that didn't change for a long time
            continue
        n = m.split("/")[-1]
        c = len(mailbox.mbox(m))
        eo_mailboxes.labels(ctn, n).set(c)


def postgresql(ctn):
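    # report cluster status: running state, archiver failures, replication
    # status on standbys and replication slot health on primaries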
if not os.path.exists("/usr/bin/pg_lsclusters"):
return
clusters = json.loads(run("pg_lsclusters --json"))
for cluster in clusters:
version = cluster["version"]
name = cluster["cluster"]
eo_postgresql.labels(ctn, version, name, "", "running").set(cluster["running"])
if cluster["running"]:
# check the archiver status
archiver_failures = run("sudo -u postgres psql -p %s -tAc 'select failed_count from pg_stat_archiver;'" % cluster["port"])
eo_postgresql.labels(ctn, version, name, "", "archive_failed").set(int(archiver_failures))
if "recovery" in cluster and cluster["recovery"]:
# we are on a standby, check it's connected to a master
receiver_statuses = run("sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'" % cluster["port"])
eo_postgresql.labels(ctn, version, name, "", "replicating").set(0)
for status in receiver_statuses.splitlines():
if status == "streaming":
eo_postgresql.labels(ctn, version, name, "", "replicating").inc()
else:
# we are on a primary... check the slots are good
slots = run("sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'" % cluster["port"])
active_slot_count = 0
for slot in slots.splitlines():
active, slot_name, delta = slot.split("|")
eo_postgresql.labels(ctn, version, name, slot_name, "slot-active").set(active == 't')
eo_postgresql.labels(ctn, version, name, slot_name, "slot-delta").set(int(delta))
if active == 't':
active_slot_count += 1
eo_postgresql.labels(ctn, version, name, "", "active-slot-count").set(active_slot_count)
def rabbitmq(ctn):
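    # sum the number of messages queued in the local rabbitmq broker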
rabbitmqctl = "/usr/sbin/rabbitmqctl"
if os.path.isfile(rabbitmqctl):
for i in run("%s list_queues messages" % rabbitmqctl):
if i.isdigit():
eo_rabbitmq.labels(ctn).inc(int(i))
def threads(ctn):
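    # count the total number of threads across all processes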
    for p in psutil.process_iter():
        eo_threads.labels(ctn).inc(p.num_threads())


def units(ctn):
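    # count failed systemd units (ignoring user@ instances) and expose their names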
    bus = dbus.SystemBus()
    s = bus.get_object("org.freedesktop.systemd1", "/org/freedesktop/systemd1")
    manager = dbus.Interface(s, "org.freedesktop.systemd1.Manager")
    units = manager.ListUnits()
    n = len([u for u in units if u[3] == "failed" and not u[0].startswith("user@")])
    name = [
        u[0].replace('dbus.String', '') for u in units if u[3] == "failed" and not u[0].startswith("user@")
    ]
    if name:
        eo_units.labels(ctn, name, "failed").set(n)
    else:
        eo_units.labels(ctn, "", "failed").set(n)


def run_in_machines(ctn):
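    # run this exporter inside each local container and merge the returned metrics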
    for machine in login.machine_names():
        r = run(
            "systemd-run --wait --pipe -q --machine %s /usr/bin/prometheus-system-exporter.py --ctn %s"
            % (machine, machine)
        )
        current_metrics = text_string_to_metric_families(r)
        for m in current_metrics:
            for s in m.samples:
                metric = globals()[m.name]
                metric.labels(**s.labels).set(s.value)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--test", action="store_true", help="raise errors")
parser.add_argument("--ctn", default="")
args = parser.parse_args()
for test in [
certificates,
debian,
etckeeper,
exim,
journald,
local_changes,
mailboxes,
munin,
nginx,
packages,
postgresql,
rabbitmq,
threads,
units,
run_in_machines,
]:
try:
test(args.ctn)
except Exception:
eo_errors.labels(ctn=args.ctn).inc()
if args.test:
raise
print(generate_latest(registry).decode())