snapshot: store a patch instead of serialization (#57299)

Lauréline Guérin 2021-10-05 11:38:28 +02:00
parent 897dc44bc3
commit 0565ffe5d3
4 changed files with 205 additions and 121 deletions


@@ -66,28 +66,55 @@ def test_snapshot_basics(pub):
    formdef.fields = []
    formdef.store()
    carddef = CardDef()
    carddef.name = 'testcard'
    carddef.fields = []
    carddef.store()
    # first occurrence, complete snapshot stored
    assert pub.snapshot_class.count() == 1
    snapshot1 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot1.serialization is not None
    assert '>testform<' in snapshot1.serialization
    assert snapshot1.patch is None
    assert snapshot1.instance # possible to restore
    # no changes
    formdef.store()
    assert pub.snapshot_class.count() == 1
    # patch only
    formdef.name = 'testform2'
    formdef.store()
    assert pub.snapshot_class.count() == 2
    carddef.name = 'testcard2'
    carddef.store()
    snapshot2 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot2.serialization is None
    assert '>testform2<' in snapshot2.patch
    assert snapshot2.instance # possible to restore
    data_source = NamedDataSource(name='foobar')
    data_source.data_source = {'type': 'formula', 'value': repr([('1', 'un'), ('2', 'deux')])}
    data_source.store()
    # no diff with latest snap but label is given
    pub.snapshot_class.snap(instance=formdef, label="foo bar")
    assert pub.snapshot_class.count() == 3
    snapshot3 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot3.serialization is None
    assert '>testform2<' in snapshot3.patch
    assert snapshot2.patch == snapshot3.patch
    assert snapshot3.instance # possible to restore
    # patch is longer than serialization, store serialization
    formdef.name = 'testform3'
    formdef.fields = [StringField(id=str(i), label='Test %s' % i, type='string') for i in range(0, 10)]
    formdef.store()
    assert pub.snapshot_class.count() == 4
    snapshot4 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot4.serialization is not None
    assert '>testform3<' in snapshot4.serialization
    assert snapshot4.patch is None
    assert snapshot4.instance # possible to restore
    # no diff with latest snap but label is given
    pub.snapshot_class.snap(instance=formdef, label="foo bar")
    assert pub.snapshot_class.count() == 5
    # check we got correct data in the serializations
    snapshot = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert '>testform2<' in snapshot.serialization
    snapshot = pub.snapshot_class.get_latest('carddef', carddef.id)
    assert '>testcard2<' in snapshot.serialization
    snapshot5 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot5.serialization is None
    assert snapshot5.patch == '' # no difference with latest snap, which has a serialization
    assert snapshot5.instance # possible to restore
def test_snapshot_instance(pub):
@@ -117,7 +144,8 @@ def test_snapshot_instance(pub):
    snapshots = pub.snapshot_class.select_object_history(formdef)
    assert len(snapshots) == 10
    for i in range(10):
        assert snapshots[i].serialization is None
        assert snapshots[i].serialization is None # not loaded
        assert snapshots[i].patch is None # not loaded
        assert pub.snapshot_class.get(snapshots[i].id).instance.name == 'testform %s' % (9 - i)
    snapshots = pub.snapshot_class.select_object_history(carddef)
@@ -703,88 +731,6 @@ def test_snapshot_workflow_variable(pub):
    assert 'sortable readonly' in resp.text
@pytest.fixture
def size_limit(pub):
    pub.snapshot_class.WCS_MAX_LEN = 100
    yield
    pub.snapshot_class.WCS_MAX_LEN = 1000000
def test_workflow_snapshot_max_len(pub, size_limit):
    formdef = FormDef()
    formdef.name = 'testform'
    formdef.fields = []
    formdef.store()
    Workflow.wipe()
    workflow = Workflow(name='test')
    workflow.store()
    another_workflow = Workflow(name='other test')
    another_workflow.store() # same object_type - check that other instances snapshots are not deleted
    assert formdef.id == workflow.id # same id - check other object_type snapshots are not deleted
    # first one: saved
    assert pub.snapshot_class.count() == 3
    first_id = pub.snapshot_class.select(order_by='id')[0].id
    assert pub.snapshot_class.get(first_id).object_type == 'formdef'
    assert pub.snapshot_class.get(first_id + 1).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 1).object_id == '1'
    old_timestamp = pub.snapshot_class.get(first_id + 1).timestamp
    assert pub.snapshot_class.get(first_id + 2).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 2).object_id == '2'
    # save snapshot
    pub.snapshot_class.snap(instance=workflow, label="snapshot !")
    assert pub.snapshot_class.count() == 4
    assert pub.snapshot_class.get(first_id).object_type == 'formdef'
    assert pub.snapshot_class.get(first_id + 1).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 1).object_id == '1'
    assert pub.snapshot_class.get(first_id + 1).label is None
    assert pub.snapshot_class.get(first_id + 1).timestamp == old_timestamp
    assert pub.snapshot_class.get(first_id + 1).instance.name == 'test'
    assert pub.snapshot_class.get(first_id + 2).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 2).object_id == '2'
    assert pub.snapshot_class.get(first_id + 3).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 3).object_id == '1'
    assert pub.snapshot_class.get(first_id + 3).label == "snapshot !"
    assert pub.snapshot_class.get(first_id + 3).instance.name == 'test'
    # no changes
    workflow.store()
    assert pub.snapshot_class.count() == 4
    assert pub.snapshot_class.get(first_id).object_type == 'formdef'
    assert pub.snapshot_class.get(first_id + 1).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 1).object_id == '1'
    assert pub.snapshot_class.get(first_id + 1).label is None
    assert pub.snapshot_class.get(first_id + 1).timestamp == old_timestamp
    assert pub.snapshot_class.get(first_id + 1).instance.name == 'test'
    assert pub.snapshot_class.get(first_id + 2).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 2).object_id == '2'
    assert pub.snapshot_class.get(first_id + 3).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 3).object_id == '1'
    assert pub.snapshot_class.get(first_id + 3).label == "snapshot !"
    assert pub.snapshot_class.get(first_id + 3).instance.name == 'test'
    # with changes
    workflow.name = 'foo bar'
    workflow.store()
    assert pub.snapshot_class.count() == 4
    assert pub.snapshot_class.get(first_id).object_type == 'formdef'
    assert pub.snapshot_class.get(first_id + 2).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 2).object_id == '2'
    assert pub.snapshot_class.get(first_id + 3).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 3).object_id == '1'
    assert pub.snapshot_class.get(first_id + 3).label == "snapshot !"
    assert pub.snapshot_class.get(first_id + 3).instance.name == 'test'
    assert pub.snapshot_class.get(first_id + 4).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 4).object_id == '1'
    assert pub.snapshot_class.get(first_id + 4).label is None
    assert pub.snapshot_class.get(first_id + 4).timestamp > old_timestamp
    assert pub.snapshot_class.get(first_id + 4).instance.name == 'foo bar'
def test_pickle_erroneous_snapshot_object(pub):
    # check snapshot object attribute is not restored
    formdef = FormDef()

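A condensed sketch (not part of the patch, reusing the pub publisher fixture and FormDef from the test module above) of the behaviour the updated test_snapshot_basics asserts: the first store() keeps a complete XML serialization, later stores keep only a unified-diff patch, and patch-only snapshots remain restorable.

    formdef = FormDef()
    formdef.name = 'testform'
    formdef.store()  # first snapshot: complete serialization, no patch
    snap1 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snap1.serialization is not None and snap1.patch is None

    formdef.name = 'testform2'
    formdef.store()  # second snapshot: patch only
    snap2 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snap2.serialization is None and '>testform2<' in snap2.patch
    assert snap2.instance.name == 'testform2'  # rebuilt through get_serialization()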

@@ -117,7 +117,7 @@ class SnapshotDirectory(Directory):
self.snapshot.timestamp.strftime('%Y%m%d-%H%M'),
),
)
return '<?xml version="1.0"?>\n' + self.snapshot.serialization
return '<?xml version="1.0"?>\n' + self.snapshot.get_serialization()
def restore(self):
form = Form(enctype='multipart/form-data')


@@ -14,13 +14,14 @@
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
import difflib
import re
import xml.etree.ElementTree as ET
from django.utils.timezone import now
from quixote import get_publisher, get_session
from wcs.qommon import _, misc
from wcs.qommon.storage import Null
class UnknownUser:
@@ -28,6 +29,97 @@ class UnknownUser:
        return str(_('unknown user'))
def indent(tree, space=" ", level=0):
    # backport from Lib/xml/etree/ElementTree.py python 3.9
    if isinstance(tree, ET.ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if len(tree) == 0:
        return
    # Reduce the memory consumption by reusing indentation strings.
    indentations = ["\n" + level * space]
    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)
        if not elem.text or not elem.text.strip():
            elem.text = child_indentation
        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation
        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]
    _indent_children(tree, 0)
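This helper mirrors xml.etree.ElementTree.indent() from Python 3.9's standard library. A tiny sketch of how it normalises a tree before diffing (the sample XML is made up; on Python 3.9 and later ET.indent() could be called directly):

    tree = ET.fromstring('<formdef><name>testform</name></formdef>')
    indent(tree, space=' ')  # or ET.indent(tree, space=' ') on Python >= 3.9
    assert ET.tostring(tree).decode('utf-8') == '<formdef>\n <name>testform</name>\n</formdef>'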
_no_eol = "\\ No newline at end of file"
_hdr_pat = re.compile(r"^@@ -(\d+),?(\d+)? \+(\d+),?(\d+)? @@$")
def make_patch(a, b):
    """
    Get unified string diff between two strings. Trims top two lines.
    Returns empty string if strings are identical.
    """
    diffs = difflib.unified_diff(a.splitlines(True), b.splitlines(True), n=0)
    try:
        _, _ = next(diffs), next(diffs)
    except StopIteration:
        pass
    return ''.join([d if d[-1] == '\n' else d + '\n' + _no_eol + '\n' for d in diffs])
def apply_patch(s, patch, revert=False):
    """
    Apply patch to string s to recover newer string.
    If revert is True, treat s as the newer string, recover older string.
    """
    s = s.splitlines(True)
    p = patch.splitlines(True)
    t = ''
    i = sl = 0
    (midx, sign) = (1, '+') if not revert else (3, '-')
    while i < len(p) and p[i].startswith(("---", "+++")):
        i += 1  # skip header lines
    while i < len(p):
        m = _hdr_pat.match(p[i])
        if not m:
            raise Exception("Bad patch -- regex mismatch [line " + str(i) + "]")
        _l = int(m.group(midx)) - 1 + (m.group(midx + 1) == '0')
        if sl > _l or _l > len(s):
            raise Exception("Bad patch -- bad line num [line " + str(i) + "]")
        t += ''.join(s[sl:_l])
        sl = _l
        i += 1
        while i < len(p) and p[i][0] != '@':
            if i + 1 < len(p) and p[i + 1][0] == '\\':
                line = p[i][:-1]
                i += 2
            else:
                line = p[i]
                i += 1
            if len(line) > 0:
                if line[0] == sign or line[0] == ' ':
                    t += line[1:]
                sl += line[0] != sign
    t += ''.join(s[sl:])
    return t
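A minimal round-trip sketch of these two helpers (the sample strings are made up): make_patch() returns a unified diff with the two file-header lines trimmed, and apply_patch() replays it forward, or backward with revert=True.

    old = '<formdef>\n <name>testform</name>\n</formdef>\n'
    new = '<formdef>\n <name>testform2</name>\n</formdef>\n'
    patch = make_patch(old, new)
    assert apply_patch(old, patch) == new               # old + patch -> new
    assert apply_patch(new, patch, revert=True) == old  # new - patch -> old
    assert make_patch(old, old) == ''                   # identical inputs: empty patch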
class Snapshot:
    id = None
    object_type = None # (formdef, carddef, blockdef, workflow, data_source, etc.)
@@ -36,14 +128,13 @@ class Snapshot:
    user_id = None
    comment = None
    serialization = None
    patch = None
    label = None # (named snapshot)
    # cache
    _instance = None
    _user = None
    WCS_MAX_LEN = 1000000
    @classmethod
    def snap(cls, instance, comment=None, label=None):
        obj = cls()
@@ -52,18 +143,43 @@
        obj.timestamp = now()
        if get_session():
            obj.user_id = get_session().user
        obj.serialization = ET.tostring(instance.export_to_xml(include_id=True)).decode('utf-8')
        tree = instance.export_to_xml(include_id=True)
        obj.serialization = ET.tostring(tree).decode('utf-8')
        obj.comment = str(comment) if comment else None
        obj.label = label
        latest = cls.get_latest(obj.object_type, obj.object_id)
        if label is not None or latest is None or obj.serialization != latest.serialization:
            # save snapshot if there are changes or an explicit label was
            # given.
            if label is None and len(obj.serialization) > cls.WCS_MAX_LEN:
                # keep only latest snapshot for big objects
                # (typically workflows with embedded documents)
                for old_snapshot in cls.select_object_history(instance, clause=[Null('label')]):
                    cls.remove_object(old_snapshot.id)
        latest_complete = cls.get_latest(obj.object_type, obj.object_id, complete=True)
        if latest_complete is None:
            # no complete snapshot, store it, with serialization and no patch
            obj.store()
            return
        # get patch between latest serialization and current instance
        # indent xml to minimize patch
        latest_tree = ET.fromstring(latest_complete.serialization)
        indent(tree)
        indent(latest_tree)
        patch = make_patch(ET.tostring(latest_tree).decode('utf-8'), ET.tostring(tree).decode('utf-8'))
        # should we store a snapshot?
        store_snapshot = True
        if label is None:
            # compare with patch of latest snapshot
            latest = cls.get_latest(obj.object_type, obj.object_id)
            if latest.patch and patch == latest.patch:
                # previous snapshot contains a patch (but no serialization)
                # and the current patch is the same as in the previous snapshot
                store_snapshot = False
            elif latest.serialization and not patch:
                # previous snapshot contains a serialization (but no patch)
                # and there is no difference (no patch)
                store_snapshot = False
        if store_snapshot:
            if len(obj.serialization) > len(patch):
                # serialization is bigger than patch, store patch
                obj.serialization = None
                obj.patch = patch
            # else: keep serialization and ignore patch
            obj.store()
    def get_object_class(self):
@@ -80,10 +196,22 @@
                return klass
        raise KeyError('no class for object type: %s' % self.object_type)
    def get_serialization(self):
        # there is a complete serialization
        if self.serialization:
            return self.serialization
        # get latest version with serialization
        latest_complete = self.__class__.get_latest(self.object_type, self.object_id, complete=True)
        latest_tree = ET.fromstring(latest_complete.serialization)
        indent(latest_tree)
        serialization = apply_patch(ET.tostring(latest_tree).decode('utf-8'), self.patch or '')
        return serialization
    @property
    def instance(self):
        if self._instance is None:
            tree = ET.fromstring(self.serialization)
            tree = ET.fromstring(self.get_serialization())
            self._instance = self.get_object_class().import_from_xml_tree(
                tree,
                include_id=True,

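In short, the restore path now works as sketched below (names from this file; the formdef and pub objects are assumed from the test fixtures): a snapshot either carries its own complete serialization or rebuilds one by applying its stored patch on top of the latest complete snapshot, and the instance property imports whichever XML results.

    snapshot = pub.snapshot_class.get_latest('formdef', formdef.id)
    # complete snapshots return their stored XML; patch-only snapshots rebuild it
    # from the latest complete serialization plus their patch
    xml_source = snapshot.get_serialization()
    assert '>%s<' % formdef.name in xml_source
    restored = snapshot.instance  # import_from_xml_tree() on get_serialization()
    assert restored.name == formdef.name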

@@ -1057,6 +1057,7 @@ def do_snapshots_table():
user_id VARCHAR,
comment TEXT,
serialization TEXT,
patch TEXT,
label VARCHAR
)'''
% table_name
@@ -1069,6 +1070,10 @@ def do_snapshots_table():
)
existing_fields = {x[0] for x in cur.fetchall()}
# migrations
if 'patch' not in existing_fields:
cur.execute('''ALTER TABLE %s ADD COLUMN patch TEXT''' % table_name)
needed_fields = {x[0] for x in Snapshot._table_static_fields}
# delete obsolete fields
@@ -2959,9 +2964,10 @@ class Snapshot(SqlMixin, wcs.snapshots.Snapshot):
('user_id', 'varchar'),
('comment', 'text'),
('serialization', 'text'),
('patch', 'text'),
('label', 'varchar'),
]
_table_select_skipped_fields = ['serialization']
_table_select_skipped_fields = ['serialization', 'patch']
@guard_postgres
@invalidate_substitution_cache
@@ -3018,13 +3024,16 @@ class Snapshot(SqlMixin, wcs.snapshots.Snapshot):
return []
@classmethod
def get_latest(cls, object_type, object_id):
def get_latest(cls, object_type, object_id, complete=False):
conn, cur = get_connection_and_cursor()
sql_statement = '''SELECT id FROM snapshots
WHERE object_type = %(object_type)s
AND object_id = %(object_id)s
WHERE object_type = %%(object_type)s
AND object_id = %%(object_id)s
%s
ORDER BY timestamp DESC
LIMIT 1'''
LIMIT 1''' % (
'AND serialization IS NOT NULL' if complete else ''
)
cur.execute(sql_statement, {'object_type': object_type, 'object_id': object_id})
row = cur.fetchone()
conn.commit()
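For illustration (not part of the patch), once the %-substitution above has been applied, a call with complete=True executes the following statement; the cursor and parameter values below are stand-ins:

    sql_statement = '''SELECT id FROM snapshots
        WHERE object_type = %(object_type)s
        AND object_id = %(object_id)s
        AND serialization IS NOT NULL
        ORDER BY timestamp DESC
        LIMIT 1'''
    cur.execute(sql_statement, {'object_type': 'formdef', 'object_id': '1'})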
@@ -3441,7 +3450,7 @@ def get_period_total(
# latest migration, number + description (description is not used
# programmatically but will make sure git conflicts if two migrations are
# separately added with the same number)
SQL_LEVEL = (53, 'add kind column on logged_errors table')
SQL_LEVEL = (54, 'add patch column on snapshot table')
def migrate_global_views(conn, cur):
@@ -3617,8 +3626,9 @@ def migrate():
continue
for formdata in formdef.data_class().select_iterator():
formdata._set_auto_fields(cur) # build digests
if sql_level < 42:
if sql_level < 54:
# 42: create snapshots table
# 54: add patch column
do_snapshots_table()
if sql_level < 53:
# 47: store LoggedErrors in SQL