publik-infra/prometheus-entrouvert-exporter/prometheus-system-exporter.py

340 lines
12 KiB
Python
Executable File

#!/usr/bin/python3
import argparse
import datetime
import glob
import json
import mailbox
import os
import shlex
import subprocess
import time
import apt_pkg
import dbus
import psutil
import requests
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from git import Repo
from prometheus_client import CollectorRegistry, Gauge
from prometheus_client.exposition import generate_latest
from prometheus_client.parser import text_string_to_metric_families
from systemd import journal, login
# Substrings of error-level journal messages that must not be counted,
# keyed by the systemd unit that emitted them.
JOURNALD_IGNORED_ERRORS = {
    'dovecot.service': ['Connection lost to LDAP server, reconnecting'],
    'ssh.service': ['maximum authentication attempts exceeded for '],
    # the empty key is applied to every unit (useful for ovpn*)
    '': ['Connection reset, restarting [0]'],
}
# Dedicated registry: its content is what gets serialized and printed when the
# script is run.
registry = CollectorRegistry()
# NOTE: each variable name below equals its metric name; run_in_machines()
# relies on that when it resolves parsed metric names through globals().
# Incremented once for every check that raised an exception.
eo_errors = Gauge("eo_errors", "failed tests", ["ctn"], registry=registry)
# Days left before a certificate expires; "name" is the certificate path.
eo_certificates = Gauge("eo_certificates", "certificates", ["ctn", "name"], registry=registry)
# Debian release number (0 for sid).
eo_debian = Gauge("eo_debian", "debian os", ["ctn"], registry=registry)
# State of the /etc git repository ("dirty", "untracked").
eo_etckeeper = Gauge("eo_etckeeper", "etckeeper", ["ctn", "name"], registry=registry)
# Exim log errors and deferred message count.
eo_exim = Gauge("eo_exim", "exim", ["ctn", "name"], registry=registry)
# "HARAKIRI ON WORKER" journal messages, per systemd unit.
eo_harakiri = Gauge("eo_harakiri", "harakiri", ["ctn", "name", "unit"], registry=registry)
# Journal entry counts ("critical", "error", "network_failure").
eo_journal = Gauge("eo_journal", "journald", ["ctn", "name"], registry=registry)
# Number of lines in /var/log/check-local-changes.log.
eo_local_changes = Gauge("eo_local_changes", "", ["ctn"], registry=registry)
# Message count per local mailbox; "name" is the mailbox file name.
eo_mailboxes = Gauge("eo_mailboxes", "emails in local mailboxes", ["ctn", "name"], registry=registry)
# munin-node log lines reporting a plugin exit status.
eo_munin = Gauge("eo_munin", "munin", ["ctn", "name"], registry=registry)
# Active connections read from the nginx status page.
eo_nginx = Gauge("eo_nginx", "nginx", ["ctn", "name"], registry=registry)
# Per-cluster PostgreSQL state; "context" carries the replication slot name
# for slot metrics and is empty otherwise.
eo_postgresql = Gauge(
"eo_postgresql", "postgresql", ["ctn", "version", "cluster", "context", "name"], registry=registry
)
# Sum of the message counts reported by rabbitmqctl.
eo_rabbitmq = Gauge("eo_rabbitmq", "rabbitmq", ["ctn"], registry=registry)
# Total number of threads over all processes.
eo_threads = Gauge("eo_threads", "system threads", ["ctn"], registry=registry)
# Number of failed systemd units.
eo_units = Gauge("eo_units", "systemd units", ["ctn", "name", "state"], registry=registry)
# Number of upgradable packages.
eo_packages = Gauge("eo_packages", "packages", ["ctn", "state"], registry=registry)
# 0: running kernel is current, 1: package version differs, 2: /vmlinuz
# points to another release.
eo_kernel = Gauge("eo_kernel", "kernel update", ["ctn"], registry=registry)
def run(cmd):
    """Run *cmd* and return its standard output as text.

    *cmd* is split with shell-like rules (no shell is involved). Every output
    line is decoded and stripped of trailing whitespace, then the lines are
    joined with a newline. The exit status of the command is not checked.

    Standard error is discarded: piping it without ever reading it would
    block the child as soon as it wrote more than the pipe can hold.
    """
    argv = shlex.split(cmd)
    # The context manager closes the pipe and waits for the child, so no
    # zombie process or open file descriptor is left behind.
    with subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) as proc:
        lines = proc.stdout.readlines()
    return "\n".join(line.decode().rstrip() for line in lines)
def debian(ctn):
    """Export the Debian release number found in /etc/debian_version.

    A version string containing ``sid`` is exported as 0; anything else is
    converted to a float.
    """
    with open('/etc/debian_version') as version_file:
        raw_version = version_file.read()
    # we shouldn't have non-stable versions
    release = 0 if 'sid' in raw_version else float(raw_version.strip())
    eo_debian.labels(ctn).set(release)
def etckeeper(ctn):
    """Export the state of the git repository located in /etc.

    ``dirty`` is 1 when the repository reports itself as dirty, 0 otherwise;
    ``untracked`` is the number of untracked files.
    """
    etc_repo = Repo("/etc")
    dirty_flag = int(etc_repo.is_dirty())
    eo_etckeeper.labels(ctn, "dirty").set(dirty_flag)
    untracked_count = len(etc_repo.untracked_files)
    eo_etckeeper.labels(ctn, "untracked").set(untracked_count)
def exim(ctn):
    """Export exim log errors and the number of queued messages.

    Does nothing when /usr/sbin/exim is absent. ``errors`` is incremented for
    each main log line containing `` ** ``, except lines mentioning both a
    known address and ``support.google.com``. ``deferred`` is the integer
    printed by ``exim -bpc``.
    """
    if not os.path.exists('/usr/sbin/exim'):
        return
    with open('/var/log/exim4/mainlog', 'rb') as mainlog:
        for entry in mainlog:
            if b" ** " not in entry:
                continue
            if b'benjamin.dauvergne+eo@gmail.com' in entry and b'support.google.com' in entry:
                # known, accepted failure: not counted
                continue
            eo_exim.labels(ctn, "errors").inc()
    queue_size = run("exim -bpc")
    eo_exim.labels(ctn, "deferred").set(int(queue_size))
def certificates(ctn):
    """Export the number of days left before each known certificate expires.

    Certificates that do not exist on this host are skipped. The gauge is
    labelled with the certificate path.
    """
    certs = ["/etc/exim4/exim.crt"]
    for cert in certs:
        if not os.path.isfile(cert):
            continue
        # read as bytes (what the loader wants) and close the file promptly
        with open(cert, "rb") as fh:
            c = x509.load_pem_x509_certificate(fh.read(), default_backend())
        # not_valid_after is a naive datetime expressed in UTC, so it has to
        # be compared with a naive UTC "now" rather than with local time.
        now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        remaining = (c.not_valid_after - now_utc).days
        eo_certificates.labels(ctn, cert).set(remaining)
def harakiri(ctn):
    """Count "HARAKIRI ON WORKER" journal messages of the last 15 minutes.

    The gauge is incremented once per matching entry and labelled with the
    systemd unit that logged it (empty when the entry carries no unit).
    Entries without a textual message are skipped instead of aborting the
    whole check.
    """
    reader = journal.Reader()
    reader.seek_realtime(time.time() - 15 * 60)
    for entry in reader:
        message = entry.get('MESSAGE', '')
        if isinstance(message, bytes):
            # messages that are not valid text are handed over as bytes
            message = message.decode(errors='replace')
        if not isinstance(message, str):
            continue
        if 'HARAKIRI ON WORKER' in message:
            eo_harakiri.labels(ctn, 'errors', entry.get('_SYSTEMD_UNIT', '')).inc()
def journald(ctn):
    """Export counts of critical and error journal entries (last 15 minutes).

    * ``critical``: number of entries with priority 2.
    * ``error``: number of entries with priority 3 whose message contains
      none of the substrings listed in JOURNALD_IGNORED_ERRORS for their unit
      (plus the ones listed under the empty key).
    * ``network_failure``: subset of the counted errors that mention a
      network failure or a blocked nfsd task.
    """
    since = time.time() - 15 * 60
    reader = journal.Reader()

    reader.seek_realtime(since)
    reader.add_match(PRIORITY=2)
    eo_journal.labels(ctn, "critical").set(sum(1 for _ in reader))

    # Matches on the same field are OR-ed together: without this flush the
    # second pass would return priority 2 *and* 3 entries, counting every
    # critical entry a second time as an error.
    reader.flush_matches()
    reader.seek_realtime(since)
    reader.add_match(PRIORITY=3)
    for entry in reader:
        msg = entry.get("MESSAGE", "")
        if isinstance(msg, bytes):
            # messages that are not valid text are handed over as bytes
            msg = msg.decode(errors="replace")
        elif not isinstance(msg, str):
            msg = str(msg)
        ignored_strings = (
            JOURNALD_IGNORED_ERRORS.get(entry.get('_SYSTEMD_UNIT'), []) + JOURNALD_IGNORED_ERRORS['']
        )
        if any(ignored_string in msg for ignored_string in ignored_strings):
            continue
        eo_journal.labels(ctn, "error").inc()
        if "Connected -> NetworkFailure" in msg or "task nfsd" in msg:
            eo_journal.labels(ctn, "network_failure").inc()
def local_changes(ctn):
    """Export the number of lines of /var/log/check-local-changes.log.

    Nothing is exported when the log file does not exist.
    """
    log_path = "/var/log/check-local-changes.log"
    if not os.path.isfile(log_path):
        return
    # count while streaming, and make sure the file gets closed
    with open(log_path) as fh:
        line_count = sum(1 for _ in fh)
    eo_local_changes.labels(ctn).set(line_count)
def munin(ctn):
    """Export the number of munin-node plugin failures logged in the last hour.

    Both the current and the previous (rotated) log are scanned. A line is
    counted when it contains "exited with status" and sorts after the
    timestamp of one hour ago; this plain string comparison relies on log
    lines starting with a ``YYYY/MM/DD-HH:MM:SS`` timestamp.
    """
    since = datetime.datetime.now() - datetime.timedelta(hours=1)
    since_str = since.strftime("%Y/%m/%d-%H:%M:%S")
    count = 0
    for filename in ["/var/log/munin/munin-node.log", "/var/log/munin/munin-node.log.1"]:
        if not os.path.isfile(filename):
            continue
        # stream the log instead of loading it, and close it when done
        with open(filename) as fh:
            count += sum(1 for line in fh if line > since_str and "exited with status" in line)
    eo_munin.labels(ctn, "errors").set(count)
def nginx(ctn):
    """Export the number of active connections reported by nginx.

    The local status page is queried; hosts where nothing answers (connection
    or TLS error) are silently skipped, as are non-successful responses.

    A timeout bounds the request so that a stalled server cannot hang the
    whole exporter; a read timeout is raised to the caller instead of being
    ignored.
    """
    try:
        r = requests.get("http://localhost/nginx_status", timeout=10)
    except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
        return
    if not r.ok:
        return
    for line in r.text.splitlines():
        if "Active connections" in line:
            n = int(line.split(':')[1].strip())
            eo_nginx.labels(ctn, "connections").set(n)
def packages(ctn):
    """Export the number of installed packages that can be upgraded.

    A package is counted when it has a current version, is not marked for
    installation in the dependency cache, and is reported as upgradable.
    """
    apt_pkg.init()
    apt_cache = apt_pkg.Cache(None)
    dep_cache = apt_pkg.DepCache(apt_cache)
    dep_cache.read_pinfile()
    dep_cache.init()
    upgradable = sum(
        1
        for pkg in apt_cache.packages
        if pkg.current_ver is not None
        and not dep_cache.marked_install(pkg)
        and dep_cache.is_upgradable(pkg)
    )
    eo_packages.labels(ctn, "upgradable").set(upgradable)
def mailboxes(ctn):
    """Export the number of messages held in each local mailbox.

    Only regular files of /var/spool/mail modified during the last 30 days
    are considered; the gauge is labelled with the mailbox file name.
    """
    if os.path.exists('/etc/dovecot/dovecot.conf'):
        # skip servers where dovecot is installed as it's expected to have
        # mailboxes there.
        return
    spool_files = glob.glob("/var/spool/mail/*")
    cutoff = time.time() - 30 * 86400
    for path in spool_files:
        if not os.path.isfile(path):
            continue
        if os.stat(path).st_mtime <= cutoff:
            # skip mailboxes that didn't change for a long time
            continue
        box_name = path.rsplit("/", 1)[-1]
        message_count = len(mailbox.mbox(path))
        eo_mailboxes.labels(ctn, box_name).set(message_count)
def postgresql(ctn):
    """Export the state of every PostgreSQL cluster listed by pg_lsclusters.

    For each cluster: ``running``; then, for running clusters only,
    ``archive_failed`` and either ``replicating`` (cluster in recovery: number
    of WAL receivers in the "streaming" state) or, otherwise, per-slot
    ``slot-active`` / ``slot-delta`` plus ``active-slot-count``.

    Does nothing when /usr/bin/pg_lsclusters is absent.
    """
    if not os.path.exists("/usr/bin/pg_lsclusters"):
        return
    for info in json.loads(run("pg_lsclusters --json")):
        pg_version = info["version"]
        cluster_name = info["cluster"]
        is_running = info["running"]

        def gauge(context, metric):
            # every sample of this cluster shares the same leading labels
            return eo_postgresql.labels(ctn, pg_version, cluster_name, context, metric)

        gauge("", "running").set(is_running)
        if not is_running:
            continue

        # check the archiver status
        failed_archives = run(
            "sudo -u postgres psql -p %s -tAc 'select failed_count from pg_stat_archiver;'"
            % info["port"]
        )
        gauge("", "archive_failed").set(int(failed_archives))

        if info.get("recovery"):
            # we are on a standby, check it's connected to a master
            wal_receivers = run(
                "sudo -u postgres psql -p %s -tAc 'select status from pg_stat_wal_receiver ;'"
                % info["port"]
            )
            replicating = gauge("", "replicating")
            replicating.set(0)
            for state in wal_receivers.splitlines():
                if state == "streaming":
                    replicating.inc()
            continue

        # we are on a primary... check the slots are good
        slot_rows = run(
            "sudo -u postgres psql -p %s -tAc 'select active, slot_name, pg_wal_lsn_diff(pg_current_wal_lsn() , restart_lsn) from pg_replication_slots where active;'"
            % info["port"]
        )
        active_slots = 0
        for row in slot_rows.splitlines():
            active, slot_name, delta = row.split("|")
            slot_is_active = active == 't'
            gauge(slot_name, "slot-active").set(slot_is_active)
            gauge(slot_name, "slot-delta").set(int(delta))
            if slot_is_active:
                active_slots += 1
        gauge("", "active-slot-count").set(active_slots)
def rabbitmq(ctn):
    """Export the total number of messages waiting in RabbitMQ queues.

    Does nothing when rabbitmqctl is not installed. Every output line of
    ``rabbitmqctl list_queues messages`` made only of digits is added to the
    gauge; other lines (headers, banners) are ignored.
    """
    rabbitmqctl = "/usr/sbin/rabbitmqctl"
    if not os.path.isfile(rabbitmqctl):
        return
    # Iterate over lines: looping over the string itself would walk through
    # single characters and add up digits ("123" -> 6) instead of counts.
    for line in run("%s list_queues messages" % rabbitmqctl).splitlines():
        count = line.strip()
        if count.isdigit():
            eo_rabbitmq.labels(ctn).inc(int(count))
def threads(ctn):
    """Export the total number of threads over all running processes."""
    total = 0
    for proc in psutil.process_iter():
        try:
            total += proc.num_threads()
        except psutil.NoSuchProcess:
            # the process exited between enumeration and inspection; that
            # must not abort the whole count
            continue
    eo_threads.labels(ctn).set(total)
def units(ctn):
    """Export the number of failed systemd units.

    Units are listed through the systemd manager D-Bus interface. A unit is
    counted when its fourth field equals "failed" and its name does not start
    with "user@".
    """
    system_bus = dbus.SystemBus()
    systemd_object = system_bus.get_object("org.freedesktop.systemd1", "/org/freedesktop/systemd1")
    manager = dbus.Interface(systemd_object, "org.freedesktop.systemd1.Manager")
    failed_names = [
        unit[0].replace('dbus.String', '')
        for unit in manager.ListUnits()
        if unit[3] == "failed" and not unit[0].startswith("user@")
    ]
    # NOTE(review): the "name" label receives the whole list of names, not a
    # single unit name -- presumably rendered as its string form; confirm
    # this is the intended label value.
    name_label = failed_names if failed_names else ""
    eo_units.labels(ctn, name_label, "failed").set(len(failed_names))
def run_in_machines(ctn):
    """Run this exporter inside every machine and merge its metrics.

    The exporter is started in each machine returned by
    ``login.machine_names()`` with ``--ctn`` set to the machine name. Its
    output is parsed and every sample is copied into the module-level gauge
    bearing the metric's name. The *ctn* argument is not used here.
    """
    for machine in login.machine_names():
        exposition = run(
            "systemd-run --wait --pipe -q --machine %s /usr/bin/prometheus-system-exporter.py --ctn %s"
            % (machine, machine)
        )
        for family in text_string_to_metric_families(exposition):
            for sample in family.samples:
                # gauges are module globals named after their metric
                gauge = globals()[family.name]
                gauge.labels(**sample.labels).set(sample.value)
def _running_kernel_package_version(uname_version):
    """Return the package version embedded in the kernel version string.

    The version follows the "Debian" word, whose position varies: it is
    preceded by extra flags such as PREEMPT_DYNAMIC on recent kernels. When
    the word is absent, the fourth space-separated field is used.
    """
    fields = uname_version.split()
    if "Debian" in fields[:-1]:
        return fields[fields.index("Debian") + 1]
    return uname_version.split(" ")[3]


def check_kernel_version(ctn):
    """Export whether the running kernel is the one installed on disk.

    Gauge values:
      * 0 -- running kernel matches the installed one
      * 1 -- same release, but the installed package has another version
      * 2 -- /vmlinuz points to a different release (used at next boot)

    Nothing is exported when /vmlinuz does not exist.
    """
    # no check if no kernel installed (aka container)
    if not os.path.exists("/vmlinuz"):
        return
    uname = os.uname()
    current_release = uname.release
    # first check, simple : verify that the current release match /vmlinuz
    next_boot_release = os.path.basename(os.readlink("/vmlinuz"))
    if next_boot_release != "vmlinuz-%s" % current_release:
        eo_kernel.labels(ctn).set(2)
        return
    # second check, a bit harder : verify that the current version matches the package
    expected_line = b"Version: " + _running_kernel_package_version(uname.version).encode("ascii")
    status = subprocess.check_output(["dpkg", "--status", "linux-image-" + current_release])
    for line in status.split(b"\n"):
        if line.startswith(b"Version: "):
            if line != expected_line:
                eo_kernel.labels(ctn).set(1)
                return
            # only the first Version field is relevant
            break
    eo_kernel.labels(ctn).set(0)
if __name__ == "__main__":
    # Run every check, then print the registry in the Prometheus text format.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--test", action="store_true", help="raise errors")
    arg_parser.add_argument("--ctn", default="")
    options = arg_parser.parse_args()

    checks = (
        certificates,
        check_kernel_version,
        debian,
        etckeeper,
        exim,
        harakiri,
        journald,
        local_changes,
        mailboxes,
        munin,
        nginx,
        packages,
        postgresql,
        rabbitmq,
        threads,
        units,
        run_in_machines,
    )
    for check in checks:
        try:
            check(options.ctn)
        except Exception:
            # a failing check is counted and must not prevent the others
            # from running, unless --test asks for the traceback
            eo_errors.labels(ctn=options.ctn).inc()
            if options.test:
                raise

    print(generate_latest(registry).decode())