log errors for down connectors when it persists (#35380)
This commit is contained in:
parent
2374fed9d9
commit
3d98363db7
|
@ -1,6 +1,7 @@
|
|||
from django import forms
|
||||
|
||||
from .models import ApiUser, AccessRight
|
||||
from .models import ApiUser, AccessRight, AvailabilityParameters
|
||||
|
||||
|
||||
class ApiUserForm(forms.ModelForm):
|
||||
class Meta:
|
||||
|
@ -17,3 +18,12 @@ class AccessRightForm(forms.ModelForm):
|
|||
'resource_type': forms.HiddenInput(),
|
||||
'resource_pk': forms.HiddenInput(),
|
||||
}
|
||||
|
||||
|
||||
class AvailabilityParametersForm(forms.ModelForm):
|
||||
class Meta:
|
||||
model = AvailabilityParameters
|
||||
fields = ['run_check', 'notification_delays']
|
||||
widgets = {
|
||||
'notification_delays': forms.TextInput,
|
||||
}
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.11.20 on 2019-08-20 14:14
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
import passerelle.base.models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('base', '0013_delete_templatevar'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='availabilityparameters',
|
||||
name='notification_delays',
|
||||
field=models.TextField(default=b'0', validators=[passerelle.base.models.validate_notification_delays]),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='resourcestatus',
|
||||
name='last_notification_timestamp',
|
||||
field=models.DateTimeField(blank=True, null=True),
|
||||
),
|
||||
]
|
|
@ -8,6 +8,7 @@ import re
|
|||
import sys
|
||||
import traceback
|
||||
import base64
|
||||
import itertools
|
||||
|
||||
from django.apps import apps
|
||||
from django.conf import settings
|
||||
|
@ -19,6 +20,7 @@ from django.test import override_settings
|
|||
from django.utils.text import slugify
|
||||
from django.utils import timezone, six
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.utils.timezone import now
|
||||
from django.core.files.base import ContentFile
|
||||
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
|
@ -400,7 +402,8 @@ class BaseResource(models.Model):
|
|||
if not self.availability_parameters.run_check:
|
||||
return
|
||||
|
||||
currently_down = self.down()
|
||||
availability_parameters = self.availability_parameters
|
||||
|
||||
try:
|
||||
self.check_status()
|
||||
status = 'up'
|
||||
|
@ -408,26 +411,66 @@ class BaseResource(models.Model):
|
|||
except NotImplementedError:
|
||||
return
|
||||
except Exception as e:
|
||||
from passerelle.utils.conversion import exception_to_text
|
||||
|
||||
status = 'down'
|
||||
message = repr(e)[:500]
|
||||
message = exception_to_text(e)[:500]
|
||||
|
||||
resource_type = ContentType.objects.get_for_model(self)
|
||||
current_status = ResourceStatus.objects.filter(
|
||||
resource_type=resource_type,
|
||||
resource_pk=self.pk).first()
|
||||
resource_type=resource_type,
|
||||
resource_pk=self.pk).first()
|
||||
if not current_status or status != current_status.status:
|
||||
if status == 'down' and not currently_down:
|
||||
self.logger.error(u'connector "%s" (%s) is now down', self, self.__class__.__name__)
|
||||
if status == 'down' and not self.down(): # new downtime
|
||||
if availability_parameters.has_zero_delay():
|
||||
self.logger.error(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message)
|
||||
else:
|
||||
self.logger.warning(u'connector "%s" (%s) is now down: %s', self, self.__class__.__name__, message)
|
||||
ResourceStatus(
|
||||
resource_type=resource_type,
|
||||
resource_pk=self.pk,
|
||||
status=status,
|
||||
message=message).save()
|
||||
if status == 'up' and currently_down:
|
||||
if status == 'up' and current_status:
|
||||
self.logger.info(u'connector "%s" (%s) is back up', self, self.__class__.__name__)
|
||||
elif status == 'down':
|
||||
current_status.message = message
|
||||
current_status.save()
|
||||
# check last_notification_downtime and current downtime to see if it matches a new notification delay
|
||||
last_notification_timestamp = current_status.last_notification_timestamp or current_status.start_timestamp
|
||||
|
||||
current_time = now()
|
||||
downtime = (current_time - current_status.start_timestamp).total_seconds() // 60
|
||||
last_notification_downtime = (
|
||||
last_notification_timestamp - current_status.start_timestamp).total_seconds() // 60
|
||||
|
||||
for delay in availability_parameters.notification_delays_generator():
|
||||
if not delay:
|
||||
continue
|
||||
if downtime >= delay:
|
||||
if last_notification_downtime < delay:
|
||||
days = downtime // 1440
|
||||
hours = downtime // 60
|
||||
if days > 1:
|
||||
human_duration = 'for %d days' % days
|
||||
elif hours > 1:
|
||||
human_duration = 'for %d hours' % hours
|
||||
else:
|
||||
human_duration = 'for %d minutes' % downtime
|
||||
self.logger.error(u'connector "%s" (%s) has been down %s: %s',
|
||||
self, self.__class__.__name__,
|
||||
human_duration,
|
||||
message,
|
||||
# when connector is down, logging is shutdown
|
||||
force=True)
|
||||
ResourceStatus.objects.filter(pk=current_status.pk).update(
|
||||
message=message, last_notification_timestamp=current_time)
|
||||
break
|
||||
else:
|
||||
ResourceStatus.objects.filter(pk=current_status.pk).update(
|
||||
message=message)
|
||||
break
|
||||
else:
|
||||
ResourceStatus.objects.filter(pk=current_status.pk).update(
|
||||
message=message)
|
||||
|
||||
def hourly(self):
|
||||
pass
|
||||
|
@ -536,6 +579,26 @@ class LoggingParameters(models.Model):
|
|||
unique_together = (('resource_type', 'resource_pk'))
|
||||
|
||||
|
||||
def parse_notification_delays(value):
|
||||
delays = [int(v.strip()) for v in value.split(',')]
|
||||
if not all(delay >= 0 for delay in delays):
|
||||
raise ValueError
|
||||
if delays != sorted(delays):
|
||||
raise ValueError
|
||||
if len(set(delays)) != len(delays):
|
||||
raise ValueError
|
||||
return delays
|
||||
|
||||
|
||||
def validate_notification_delays(value):
|
||||
try:
|
||||
if not value:
|
||||
raise ValueError
|
||||
parse_notification_delays(value)
|
||||
except ValueError:
|
||||
raise ValidationError(_('You must provide a list of increasing minutes delays'))
|
||||
|
||||
|
||||
class AvailabilityParameters(models.Model):
|
||||
resource_type = models.ForeignKey(ContentType)
|
||||
resource_pk = models.PositiveIntegerField()
|
||||
|
@ -543,6 +606,23 @@ class AvailabilityParameters(models.Model):
|
|||
run_check = models.BooleanField(
|
||||
default=True, verbose_name=_('Run regular availability checks'),
|
||||
help_text=_('Run an availability check every 5 minutes'))
|
||||
notification_delays = models.TextField(
|
||||
default='0',
|
||||
blank=False,
|
||||
validators=[validate_notification_delays],
|
||||
help_text=_('Increasing delay between error notifications in minutes, ex.: 0,5,10'))
|
||||
|
||||
def has_zero_delay(self):
|
||||
return 0 in parse_notification_delays(self.notification_delays)
|
||||
|
||||
def notification_delays_generator(self):
|
||||
notification_delays = parse_notification_delays(self.notification_delays)
|
||||
last_notification_delay = notification_delays[-1]
|
||||
if last_notification_delay > 1:
|
||||
notification_delays = itertools.chain(
|
||||
notification_delays,
|
||||
itertools.count(2 * last_notification_delay, last_notification_delay))
|
||||
return notification_delays
|
||||
|
||||
class Meta:
|
||||
unique_together = (('resource_type', 'resource_pk'))
|
||||
|
@ -608,6 +688,7 @@ class ResourceStatus(models.Model):
|
|||
start_timestamp = models.DateTimeField(auto_now_add=True)
|
||||
status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='unknown')
|
||||
message = models.CharField(max_length=500, blank=True)
|
||||
last_notification_timestamp = models.DateTimeField(blank=True, null=True)
|
||||
|
||||
class Meta:
|
||||
ordering = ['-start_timestamp']
|
||||
|
@ -634,7 +715,8 @@ class ProxyLogger(object):
|
|||
return self._logger.getEffectiveLevel()
|
||||
|
||||
def _log(self, levelname, message, *args, **kwargs):
|
||||
if self.connector.down():
|
||||
force = kwargs.pop('force', False)
|
||||
if self.connector.down() and not force:
|
||||
# don't log if the connector is known to be down
|
||||
return
|
||||
levelno = getattr(logging, levelname)
|
||||
|
|
|
@ -15,15 +15,17 @@
|
|||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
from django.contrib import messages
|
||||
from django.core.urlresolvers import reverse
|
||||
from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
|
||||
from django.forms import models as model_forms
|
||||
from django.views.generic import (
|
||||
DetailView, ListView, CreateView, UpdateView, DeleteView, FormView)
|
||||
from django.http import Http404
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from .models import ApiUser, AccessRight, LoggingParameters, AvailabilityParameters, ResourceStatus
|
||||
from .forms import ApiUserForm, AccessRightForm
|
||||
from .models import ApiUser, AccessRight, LoggingParameters, ResourceStatus
|
||||
from .forms import ApiUserForm, AccessRightForm, AvailabilityParametersForm
|
||||
from ..utils import get_trusted_services
|
||||
|
||||
|
||||
|
@ -155,42 +157,33 @@ class LoggingParametersUpdateView(FormView):
|
|||
return super(LoggingParametersUpdateView, self).form_valid(form)
|
||||
|
||||
|
||||
class ManageAvailabilityView(FormView):
|
||||
class ManageAvailabilityView(UpdateView):
|
||||
template_name = 'passerelle/manage/manage_availability_form.html'
|
||||
form_class = AvailabilityParametersForm
|
||||
|
||||
def get_resource(self):
|
||||
if not hasattr(self, '_resource'):
|
||||
content_type = ContentType.objects.get_for_id(self.kwargs['resource_type'])
|
||||
self._resource = content_type.model_class().objects.get(pk=self.kwargs['resource_pk'])
|
||||
return self._resource
|
||||
|
||||
def get_object(self, queryset=None):
|
||||
return self.get_resource().availability_parameters
|
||||
|
||||
def get_context_data(self, **kwargs):
|
||||
context = super(ManageAvailabilityView, self).get_context_data(**kwargs)
|
||||
connector = self.get_resource()
|
||||
context['connector'] = connector
|
||||
context['availability_status'] = connector.get_availability_status()
|
||||
context['connector'] = self.get_resource()
|
||||
context['availability_status'] = self.get_resource().get_availability_status()
|
||||
return context
|
||||
|
||||
def get_form_class(self):
|
||||
form_class = model_forms.modelform_factory(
|
||||
AvailabilityParameters,
|
||||
fields=['run_check'])
|
||||
return form_class
|
||||
|
||||
def get_initial(self):
|
||||
d = self.initial.copy()
|
||||
d['resource_type'] = self.kwargs['resource_type']
|
||||
d['resource_pk'] = self.kwargs['resource_pk']
|
||||
d['run_check'] = self.get_resource().availability_parameters.run_check
|
||||
return d
|
||||
|
||||
def get_resource(self):
|
||||
content_type = ContentType.objects.get_for_id(self.kwargs['resource_type'])
|
||||
return content_type.model_class().objects.get(pk=self.kwargs['resource_pk'])
|
||||
|
||||
def get_success_url(self):
|
||||
return self.get_resource().get_absolute_url()
|
||||
|
||||
def form_valid(self, form):
|
||||
resource = self.get_resource()
|
||||
parameters = resource.availability_parameters
|
||||
run_check = form.cleaned_data['run_check']
|
||||
|
||||
if not run_check and resource.down():
|
||||
# if availability check is disabled resource is forced to up to activate logs
|
||||
if not form.instance.run_check and resource.down():
|
||||
resource_type = ContentType.objects.get_for_model(resource)
|
||||
ResourceStatus(
|
||||
resource_type=resource_type,
|
||||
|
@ -198,9 +191,14 @@ class ManageAvailabilityView(FormView):
|
|||
status='up',
|
||||
message='').save()
|
||||
|
||||
if parameters.run_check != run_check:
|
||||
parameters.run_check = run_check
|
||||
parameters.save()
|
||||
resource.logger.info(u'availability checks %s', 'enabled' if run_check else 'disabled')
|
||||
# log changes to notification delays
|
||||
if 'notification_delays' in form.changed_data:
|
||||
resource.logger.info(u'availability checks delays set to %s', form.instance.notification_delays)
|
||||
|
||||
# log changes to run_check, if enabled immediately check for availability
|
||||
if 'run_check' in form.changed_data:
|
||||
resource.logger.info(u'availability checks %s', 'enabled' if form.instance.run_check else 'disabled')
|
||||
if form.instance.run_check:
|
||||
resource.availability()
|
||||
|
||||
return super(ManageAvailabilityView, self).form_valid(form)
|
||||
|
|
|
@ -248,3 +248,18 @@ def test_availability_parameters(app, admin_user, monkeypatch):
|
|||
assert not csv.down()
|
||||
status = csv.get_availability_status()
|
||||
assert status.status == 'up'
|
||||
|
||||
# Notification delays
|
||||
resp = app.get(csv.get_absolute_url())
|
||||
resp = resp.click('availability check')
|
||||
assert resp.form['notification_delays'].value == '0'
|
||||
resp.form['notification_delays'] = '0,5,100'
|
||||
resp = resp.form.submit().follow()
|
||||
assert not resp.pyquery('.messages .warning')
|
||||
assert csv.availability_parameters.notification_delays == '0,5,100'
|
||||
|
||||
resp = app.get(csv.get_absolute_url())
|
||||
resp = resp.click('availability check')
|
||||
resp.form['notification_delays'] = 'x'
|
||||
resp = resp.form.submit()
|
||||
assert len(resp.pyquery('#id_notification_delays_p .error'))
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import datetime
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
from httmock import all_requests, HTTMock
|
||||
from httmock import HTTMock
|
||||
|
||||
from django.core.exceptions import ValidationError
|
||||
|
||||
from passerelle.base.models import ProxyLogger, ResourceLog
|
||||
from passerelle.apps.feeds.models import Feed
|
||||
|
@ -66,20 +70,121 @@ def test_proxy_logger_ignore_when_down(db, connector):
|
|||
pr.debug(u'some message')
|
||||
assert len(ResourceLog.objects.all()) == 0
|
||||
|
||||
def test_log_on_connector_availability_change(db, connector):
|
||||
|
||||
def test_validate_notification_delays(db, connector):
|
||||
def take(iterator, count):
|
||||
return list(itertools.compress(iterator, range(1, count + 1)))
|
||||
|
||||
availability_parameters = connector.availability_parameters
|
||||
assert availability_parameters.notification_delays == '0'
|
||||
|
||||
availability_parameters.full_clean()
|
||||
assert availability_parameters.has_zero_delay()
|
||||
assert list(availability_parameters.notification_delays_generator()) == [0]
|
||||
|
||||
availability_parameters.notification_delays = '0,5,100,1000'
|
||||
availability_parameters.full_clean()
|
||||
assert availability_parameters.has_zero_delay()
|
||||
assert take(availability_parameters.notification_delays_generator(), 6) == [0, 5, 100, 1000, 2000, 3000]
|
||||
|
||||
availability_parameters.notification_delays = '5,100,1000'
|
||||
availability_parameters.full_clean()
|
||||
assert not availability_parameters.has_zero_delay()
|
||||
assert take(availability_parameters.notification_delays_generator(), 6) == [5, 100, 1000, 2000, 3000, 4000]
|
||||
|
||||
availability_parameters.notification_delays = '5'
|
||||
availability_parameters.full_clean()
|
||||
assert not availability_parameters.has_zero_delay()
|
||||
assert take(availability_parameters.notification_delays_generator(), 3) == [5, 10, 15]
|
||||
|
||||
# validation errors
|
||||
availability_parameters.notification_delays = '5,100,100'
|
||||
with pytest.raises(ValidationError):
|
||||
availability_parameters.full_clean()
|
||||
|
||||
availability_parameters.notification_delays = ''
|
||||
with pytest.raises(ValidationError):
|
||||
availability_parameters.full_clean()
|
||||
|
||||
availability_parameters.notification_delays = '1,x'
|
||||
with pytest.raises(ValidationError):
|
||||
availability_parameters.full_clean()
|
||||
|
||||
availability_parameters.notification_delays = '10,1'
|
||||
with pytest.raises(ValidationError):
|
||||
availability_parameters.full_clean()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('notification_delays', ['0', '0,5,100', '5,100'])
|
||||
def test_log_on_connector_availability_change(db, connector, freezer, notification_delays):
|
||||
connector.title = u'éléphant'
|
||||
with HTTMock(up_mock): # set connector as up
|
||||
connector.availability()
|
||||
ResourceLog.objects.all().delete()
|
||||
with HTTMock(down_mock): # set connector as down
|
||||
connector.availability()
|
||||
assert len(ResourceLog.objects.all()) == 2
|
||||
assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)'
|
||||
assert ResourceLog.objects.all()[1].level == 'error'
|
||||
assert ResourceLog.objects.all()[1].message == u'connector "éléphant" (Feed) is now down'
|
||||
availability_parameters = connector.availability_parameters
|
||||
availability_parameters.notification_delays = notification_delays
|
||||
availability_parameters.save()
|
||||
|
||||
with HTTMock(up_mock): # set connector as up
|
||||
connector.availability()
|
||||
assert len(ResourceLog.objects.all()) == 3
|
||||
assert ResourceLog.objects.all()[2].level == 'info'
|
||||
assert ResourceLog.objects.all()[2].message == u'connector "éléphant" (Feed) is back up'
|
||||
assert ResourceLog.objects.count() == 1
|
||||
assert ResourceLog.objects.latest('id').level == 'info'
|
||||
assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)'
|
||||
ResourceLog.objects.all().delete()
|
||||
|
||||
# move 5 minutes in the future
|
||||
freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1))
|
||||
|
||||
with HTTMock(down_mock): # set connector as down
|
||||
connector.availability()
|
||||
assert connector.down()
|
||||
last_count1 = ResourceLog.objects.count()
|
||||
assert last_count1 == 2
|
||||
assert ResourceLog.objects.all()[0].message == 'GET http://example.net/ (=> 404)'
|
||||
assert ResourceLog.objects.all()[1].level == 'error' if notification_delays.startswith('0') else 'warning'
|
||||
assert (u'connector "éléphant" (Feed) is now down: 404 Client'
|
||||
in ResourceLog.objects.all()[1].message)
|
||||
|
||||
# move 5 minutes in the future
|
||||
freezer.move_to(datetime.timedelta(seconds=60 * 5 + 1))
|
||||
|
||||
# second time log as error
|
||||
with HTTMock(down_mock): # connector is still down
|
||||
connector.availability()
|
||||
assert connector.down()
|
||||
last_count2 = ResourceLog.objects.count()
|
||||
if notification_delays == '0':
|
||||
assert last_count2 == last_count1
|
||||
else:
|
||||
assert last_count2 == last_count1 + 1
|
||||
assert ResourceLog.objects.all()[2].level == 'error' if notification_delays != '0' else 'warning'
|
||||
assert (u'connector "éléphant" (Feed) is still down for 5 minutes: 404'
|
||||
in ResourceLog.objects.all()[2].message)
|
||||
|
||||
# move 3 minutes in the future
|
||||
freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
|
||||
|
||||
# third time no log
|
||||
with HTTMock(down_mock): # connector is still down
|
||||
connector.availability()
|
||||
assert ResourceLog.objects.count() == last_count2
|
||||
|
||||
# move 3 minutes in the future
|
||||
freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
|
||||
|
||||
with HTTMock(up_mock): # set connector as up
|
||||
connector.availability()
|
||||
assert not connector.down()
|
||||
last_count3 = ResourceLog.objects.count()
|
||||
assert last_count3 == last_count2 + 1
|
||||
assert ResourceLog.objects.latest('id').level == 'info'
|
||||
assert ResourceLog.objects.latest('id').message == u'connector "éléphant" (Feed) is back up'
|
||||
|
||||
# move 3 minutes in the future
|
||||
freezer.move_to(datetime.timedelta(seconds=60 * 3 + 1))
|
||||
|
||||
# last message is a GET
|
||||
with HTTMock(up_mock): # set connector as up
|
||||
connector.availability()
|
||||
assert not connector.down()
|
||||
last_count4 = ResourceLog.objects.count()
|
||||
assert last_count4 == last_count3 + 1
|
||||
assert ResourceLog.objects.latest('id').level == 'info'
|
||||
assert ResourceLog.objects.latest('id').message == 'GET http://example.net/ (=> 200)'
|
||||
|
|
1
tox.ini
1
tox.ini
|
@ -35,6 +35,7 @@ deps =
|
|||
http://quixote.python.ca/releases/Quixote-2.7b2.tar.gz
|
||||
vobject
|
||||
django-ratelimit
|
||||
pyquery
|
||||
commands =
|
||||
./get_wcs.sh
|
||||
django111: py.test {posargs: {env:FAST:} --junitxml=test_{envname}_results.xml --cov-report xml --cov-report html --cov=passerelle/ --cov-config .coveragerc tests/}
|
||||
|
|
Loading…
Reference in New Issue