log errors for down connectors when it persists (#35380)

This commit is contained in:
Benjamin Dauvergne 2019-08-12 18:17:11 +02:00
parent 2374fed9d9
commit 3d98363db7
7 changed files with 292 additions and 55 deletions

View File

@ -1,6 +1,7 @@
from django import forms
from .models import ApiUser, AccessRight
from .models import ApiUser, AccessRight, AvailabilityParameters
class ApiUserForm(forms.ModelForm):
class Meta:
@ -17,3 +18,12 @@ class AccessRightForm(forms.ModelForm):
'resource_type': forms.HiddenInput(),
'resource_pk': forms.HiddenInput(),
}
class AvailabilityParametersForm(forms.ModelForm):
class Meta:
model = AvailabilityParameters
fields = ['run_check', 'notification_delays']
widgets = {
'notification_delays': forms.TextInput,
}

View File

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.20 on 2019-08-20 14:14
from __future__ import unicode_literals
from django.db import migrations, models
import passerelle.base.models
class Migration(migrations.Migration):
dependencies = [
('base', '0013_delete_templatevar'),
]
operations = [
migrations.AddField(
model_name='availabilityparameters',
name='notification_delays',
field=models.TextField(default=b'0', validators=[passerelle.base.models.validate_notification_delays]),
),
migrations.AddField(
model_name='resourcestatus',
name='last_notification_timestamp',
field=models.DateTimeField(blank=True, null=True),
),
]

View File

@ -8,6 +8,7 @@ import re
import sys
import traceback
import base64
import itertools
from django.apps import apps
from django.conf import settings
@ -19,6 +20,7 @@ from django.test import override_settings
from django.utils.text import slugify
from django.utils import timezone, six
from django.utils.translation import ugettext_lazy as _
from django.utils.timezone import now
from django.core.files.base import ContentFile
from django.contrib.contenttypes.models import ContentType
@ -400,7 +402,8 @@ class BaseResource(models.Model):
if not self.availability_parameters.run_check:
return
currently_down = self.down()
availability_parameters = self.availability_parameters
try:
self.check_status()
status = 'up'
@ -408,26 +411,66 @@ class BaseResource(models.Model):
except NotImplementedError:
return
except Exception as e:
from passerelle.utils.conversion import exception_to_text
status = 'down'
message = repr(e)[:500]
message = exception_to_text(e)[:500]
resource_type = ContentType.objects.get_for_model(self)
current_status = ResourceStatus.objects.filter(
resource_type=resource_type,
resource_pk=self.pk).first()
resource_type=resource_type,
resource_pk=self.pk).first()
if not current_status or status != current_status.status:
if status == 'down' and not currently_down:
self.logger.error(u'connector "%s" (%s) is now down', self, self.__class__.__name__)
if status == 'down' and not self.down(): # new downtime
if availability_parameters.has_zero_delay():
self.logger.error(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message)
else:
self.logger.warning(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message)
ResourceStatus(
resource_type=resource_type,
resource_pk=self.pk,
status=status,
message=message).save()
if status == 'up' and currently_down:
if status == 'up' and current_status:
self.logger.info(u'connector "%s" (%s) is back up', self, self.__class__.__name__)
elif status == 'down':
current_status.message = message
current_status.save()
# check last_notification_downtime and current downtime to see if it matches a new notification delay
last_notification_timestamp = current_status.last_notification_timestamp or current_status.start_timestamp
current_time = now()
downtime = (current_time - current_status.start_timestamp).total_seconds() // 60
last_notification_downtime = (
last_notification_timestamp - current_status.start_timestamp).total_seconds() // 60
for delay in availability_parameters.notification_delays_generator():
if not delay:
continue
if downtime >= delay:
if last_notification_downtime < delay:
days = downtime // 1440
hours = downtime // 60
if days > 1:
human_duration = 'for %d days' % days
elif hours > 1:
human_duration = 'for %d hours' % hours
else:
human_duration = 'for %d minutes' % downtime
self.logger.error(u'connector "%s" (%s) has been down %s: %s',
self, self.__class__.__name__,
human_duration,
message,
# when connector is down, logging is shutdown
force=True)
ResourceStatus.objects.filter(pk=current_status.pk).update(
message=message, last_notification_timestamp=current_time)
break
else:
ResourceStatus.objects.filter(pk=current_status.pk).update(
message=message)
break
else:
ResourceStatus.objects.filter(pk=current_status.pk).update(
message=message)
def hourly(self):
pass
@ -536,6 +579,26 @@ class LoggingParameters(models.Model):
unique_together = (('resource_type', 'resource_pk'))
def parse_notification_delays(value):
delays = [int(v.strip()) for v in value.split(',')]
if not all(delay >= 0 for delay in delays):
raise ValueError
if delays != sorted(delays):
raise ValueError
if len(set(delays)) != len(delays):
raise ValueError
return delays
def validate_notification_delays(value):
try:
if not value:
raise ValueError
parse_notification_delays(value)
except ValueError:
raise ValidationError(_('You must provide a list of increasing minutes delays'))
class AvailabilityParameters(models.Model):
resource_type = models.ForeignKey(ContentType)
resource_pk = models.PositiveIntegerField()
@ -543,6 +606,23 @@ class AvailabilityParameters(models.Model):
run_check = models.BooleanField(
default=True, verbose_name=_('Run regular availability checks'),
help_text=_('Run an availability check every 5 minutes'))
notification_delays = models.TextField(
default='0',
blank=False,
validators=[validate_notification_delays],
help_text=_('Increasing delay between error notifications in minutes, ex.: 0,5,10'))
def has_zero_delay(self):
return 0 in parse_notification_delays(self.notification_delays)
def notification_delays_generator(self):
notification_delays = parse_notification_delays(self.notification_delays)
last_notification_delay = notification_delays[-1]
if last_notification_delay > 1:
notification_delays = itertools.chain(
notification_delays,
itertools.count(2 * last_notification_delay, last_notification_delay))
return notification_delays
class Meta:
unique_together = (('resource_type', 'resource_pk'))
@ -608,6 +688,7 @@ class ResourceStatus(models.Model):
start_timestamp = models.DateTimeField(auto_now_add=True)
status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='unknown')
message = models.CharField(max_length=500, blank=True)
last_notification_timestamp = models.DateTimeField(blank=True, null=True)
class Meta:
ordering = ['-start_timestamp']
@ -634,7 +715,8 @@ class ProxyLogger(object):
return self._logger.getEffectiveLevel()
def _log(self, levelname, message, *args, **kwargs):
if self.connector.down():
force = kwargs.pop('force', False)
if self.connector.down() and not force:
# don't log if the connector is known to be down
return
levelno = getattr(logging, levelname)

View File

@ -15,15 +15,17 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from django.contrib.contenttypes.models import ContentType
from django.contrib import messages
from django.core.urlresolvers import reverse
from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
from django.forms import models as model_forms
from django.views.generic import (
DetailView, ListView, CreateView, UpdateView, DeleteView, FormView)
from django.http import Http404
from django.utils.translation import ugettext_lazy as _
from .models import ApiUser, AccessRight, LoggingParameters, AvailabilityParameters, ResourceStatus
from .forms import ApiUserForm, AccessRightForm
from .models import ApiUser, AccessRight, LoggingParameters, ResourceStatus
from .forms import ApiUserForm, AccessRightForm, AvailabilityParametersForm
from ..utils import get_trusted_services
@ -155,42 +157,33 @@ class LoggingParametersUpdateView(FormView):
return super(LoggingParametersUpdateView, self).form_valid(form)
class ManageAvailabilityView(FormView):
class ManageAvailabilityView(UpdateView):
template_name = 'passerelle/manage/manage_availability_form.html'
form_class = AvailabilityParametersForm
def get_resource(self):
if not hasattr(self, '_resource'):
content_type = ContentType.objects.get_for_id(self.kwargs['resource_type'])
self._resource = content_type.model_class().objects.get(pk=self.kwargs['resource_pk'])
return self._resource
def get_object(self, queryset=None):
return self.get_resource().availability_parameters
def get_context_data(self, **kwargs):
context = super(ManageAvailabilityView, self).get_context_data(**kwargs)
connector = self.get_resource()
context['connector'] = connector
context['availability_status'] = connector.get_availability_status()
context['connector'] = self.get_resource()
context['availability_status'] = self.get_resource().get_availability_status()
return context
def get_form_class(self):
form_class = model_forms.modelform_factory(
AvailabilityParameters,
fields=['run_check'])
return form_class
def get_initial(self):
d = self.initial.copy()
d['resource_type'] = self.kwargs['resource_type']
d['resource_pk'] = self.kwargs['resource_pk']
d['run_check'] = self.get_resource().availability_parameters.run_check
return d
def get_resource(self):
content_type = ContentType.objects.get_for_id(self.kwargs['resource_type'])
return content_type.model_class().objects.get(pk=self.kwargs['resource_pk'])
def get_success_url(self):
return self.get_resource().get_absolute_url()
def form_valid(self, form):
resource = self.get_resource()
parameters = resource.availability_parameters
run_check = form.cleaned_data['run_check']
if not run_check and resource.down():
# if availability check is disabled resource is forced to up to activate logs
if not form.instance.run_check and resource.down():
resource_type = ContentType.objects.get_for_model(resource)
ResourceStatus(
resource_type=resource_type,
@ -198,9 +191,14 @@ class ManageAvailabilityView(FormView):
status='up',
message='').save()
if parameters.run_check != run_check:
parameters.run_check = run_check
parameters.save()
resource.logger.info(u'availability checks %s', 'enabled' if run_check else 'disabled')
# log changes to notification delays
if 'notification_delays' in form.changed_data:
resource.logger.info(u'availability checks delays set to %s', form.instance.notification_delays)
# log changes to run_check, if enabled immediately check for availability
if 'run_check' in form.changed_data:
resource.logger.info(u'availability checks %s', 'enabled' if form.instance.run_check else 'disabled')
if form.instance.run_check:
resource.availability()
return super(ManageAvailabilityView, self).form_valid(form)

View File

@ -248,3 +248,18 @@ def test_availability_parameters(app, admin_user, monkeypatch):
assert not csv.down()
status = csv.get_availability_status()
assert status.status == 'up'
# Notification delays
resp = app.get(csv.get_absolute_url())
resp = resp.click('availability check')
assert resp.form['notification_delays'].value == '0'
resp.form['notification_delays'] = '0,5,100'
resp = resp.form.submit().follow()
assert not resp.pyquery('.messages .warning')
assert csv.availability_parameters.notification_delays == '0,5,100'
resp = app.get(csv.get_absolute_url())
resp = resp.click('availability check')
resp.form['notification_delays'] = 'x'
resp = resp.form.submit()
assert len(resp.pyquery('#id_notification_delays_p .error'))

View File

@ -1,9 +1,13 @@
# -*- coding: utf-8 -*-
import logging
import datetime
import itertools
import pytest
from httmock import all_requests, HTTMock
from httmock import HTTMock
from django.core.exceptions import ValidationError
from passerelle.base.models import ProxyLogger, ResourceLog
from passerelle.apps.feeds.models import Feed
@ -66,20 +70,121 @@ def test_proxy_logger_ignore_when_down(db, connector):
pr.debug(u'some message')
assert len(ResourceLog.objects.all()) == 0
def test_log_on_connector_availability_change(db, connector):
def test_validate_notification_delays(db, connector):
def take(iterator, count):
return list(itertools.compress(iterator, range(1, count + 1)))
availability_parameters = connector.availability_parameters
assert availability_parameters.notification_delays == '0'
availability_parameters.full_clean()
assert availability_parameters.has_zero_delay()
assert list(availability_parameters.notification_delays_generator()) == [0]
availability_parameters.notification_delays = '0,5,100,1000'
availability_parameters.full_clean()
assert availability_parameters.has_zero_delay()
assert take(availability_parameters.notification_delays_generator(), 6) == [0, 5, 100, 1000, 2000, 3000]
availability_parameters.notification_delays = '5,100,1000'
availability_parameters.full_clean()
assert not availability_parameters.has_zero_delay()
assert take(availability_parameters.notification_delays_generator(), 6) == [5, 100, 1000, 2000, 3000, 4000]
availability_parameters.notification_delays = '5'
availability_parameters.full_clean()
assert not availability_parameters.has_zero_delay()
assert take(availability_parameters.notification_delays_generator(), 3) == [5, 10, 15]
# validation errors
availability_parameters.notification_delays = '5,100,100'
with pytest.raises(ValidationError):
availability_parameters.full_clean()
availability_parameters.notification_delays = ''
with pytest.raises(ValidationError):
availability_parameters.full_clean()
availability_parameters.notification_delays = '1,x'
with pytest.raises(ValidationError):
availability_parameters.full_clean()
availability_parameters.notification_delays = '10,1'
with pytest.raises(ValidationError):
availability_parameters.full_clean()
@pytest.mark.parametrize('notification_delays', ['0', '0,5,100', '5,100'])
def test_log_on_connector_availability_change(db, connector, freezer, notification_delays):
connector.title = u'éléphant'
with HTTMock(up_mock): # set connector as up
connector.availability()
ResourceLog.objects.all().delete()
with HTTMock(down_mock): # set connector as down
connector.availability()
assert len(ResourceLog.objects.all()) == 2
assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)'
assert ResourceLog.objects.all()[1].level == 'error'
assert ResourceLog.objects.all()[1].message == u'connector "éléphant" (Feed) is now down'
availability_parameters = connector.availability_parameters
availability_parameters.notification_delays = notification_delays
availability_parameters.save()
with HTTMock(up_mock): # set connector as up
connector.availability()
assert len(ResourceLog.objects.all()) == 3
assert ResourceLog.objects.all()[2].level == 'info'
assert ResourceLog.objects.all()[2].message == u'connector "éléphant" (Feed) is back up'
assert ResourceLog.objects.count() == 1
assert ResourceLog.objects.latest('id').level == 'info'
assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)'
ResourceLog.objects.all().delete()
# move 5 minutes in the future
freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1))
with HTTMock(down_mock): # set connector as down
connector.availability()
assert connector.down()
last_count1 = ResourceLog.objects.count()
assert last_count1 == 2
assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)'
assert ResourceLog.objects.all()[1].level == 'error' if notification_delays.startswith('0') else 'warning'
assert (u'connector "éléphant" (Feed) is now down: 404 Client'
in ResourceLog.objects.all()[1].message)
# move 5 minutes in the future
freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1))
# second time log as error
with HTTMock(down_mock): # connector is still down
connector.availability()
assert connector.down()
last_count2 = ResourceLog.objects.count()
if notification_delays == '0':
assert last_count2 == last_count1
else:
assert last_count2 == last_count1 + 1
assert ResourceLog.objects.all()[2].level == 'error' if notification_delays != '0' else 'warning'
assert (u'connector "éléphant" (Feed) is still down for 5 minutes: 404'
in ResourceLog.objects.all()[2].message)
# move 3 minutes in the future
freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
# third time no log
with HTTMock(down_mock): # connector is still down
connector.availability()
assert ResourceLog.objects.count() == last_count2
# move 3 minutes in the future
freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
with HTTMock(up_mock): # set connector as up
connector.availability()
assert not connector.down()
last_count3 = ResourceLog.objects.count()
assert last_count3 == last_count2 + 1
assert ResourceLog.objects.latest('id').level == 'info'
assert ResourceLog.objects.latest('id').message == u'connector "éléphant" (Feed) is back up'
# move 3 minutes in the future
freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
# last message is a GET
with HTTMock(up_mock): # set connector as up
connector.availability()
assert not connector.down()
last_count4 = ResourceLog.objects.count()
assert last_count4 == last_count3 + 1
assert ResourceLog.objects.latest('id').level == 'info'
assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)'

View File

@ -35,6 +35,7 @@ deps =
http://quixote.python.ca/releases/Quixote-2.7b2.tar.gz
vobject
django-ratelimit
pyquery
commands =
./get_wcs.sh
django111: py.test {posargs: {env:FAST:} --junitxml=test_{envname}_results.xml --cov-report xml --cov-report html --cov=passerelle/ --cov-config .coveragerc tests/}