snapshot: store a patch instead of serialization (#57299)

Lauréline Guérin 2021-10-05 11:38:28 +02:00
parent 897dc44bc3
commit 0565ffe5d3
4 changed files with 205 additions and 121 deletions


@@ -66,28 +66,55 @@ def test_snapshot_basics(pub):
    formdef.fields = []
    formdef.store()
    carddef = CardDef()
    carddef.name = 'testcard'
    carddef.fields = []
    carddef.store()
    # first occurrence, complete snapshot stored
    assert pub.snapshot_class.count() == 1
    snapshot1 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot1.serialization is not None
    assert '>testform<' in snapshot1.serialization
    assert snapshot1.patch is None
    assert snapshot1.instance # possible to restore
    # no changes
    formdef.store()
    assert pub.snapshot_class.count() == 1
    # patch only
    formdef.name = 'testform2'
    formdef.store()
    assert pub.snapshot_class.count() == 2
    carddef.name = 'testcard2'
    carddef.store()
    snapshot2 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot2.serialization is None
    assert '>testform2<' in snapshot2.patch
    assert snapshot2.instance # possible to restore
    data_source = NamedDataSource(name='foobar')
    data_source.data_source = {'type': 'formula', 'value': repr([('1', 'un'), ('2', 'deux')])}
    data_source.store()
    # no diff with latest snap but label is given
    pub.snapshot_class.snap(instance=formdef, label="foo bar")
    assert pub.snapshot_class.count() == 3
    snapshot3 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot3.serialization is None
    assert '>testform2<' in snapshot3.patch
    assert snapshot2.patch == snapshot3.patch
    assert snapshot3.instance # possible to restore
    # patch is longer than serialization, store serialization
    formdef.name = 'testform3'
    formdef.fields = [StringField(id=str(i), label='Test %s' % i, type='string') for i in range(0, 10)]
    formdef.store()
    assert pub.snapshot_class.count() == 4
    snapshot4 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot4.serialization is not None
    assert '>testform3<' in snapshot4.serialization
    assert snapshot4.patch is None
    assert snapshot4.instance # possible to restore
    # no diff with latest snap but label is given
    pub.snapshot_class.snap(instance=formdef, label="foo bar")
    assert pub.snapshot_class.count() == 5
    # check we got correct data in the serializations
    snapshot = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert '>testform2<' in snapshot.serialization
    snapshot = pub.snapshot_class.get_latest('carddef', carddef.id)
    assert '>testcard2<' in snapshot.serialization
    snapshot5 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snapshot5.serialization is None
    assert snapshot5.patch == '' # no difference with latest snap, which has a serialization
    assert snapshot5.instance # possible to restore
def test_snapshot_instance(pub):
@@ -117,7 +144,8 @@ def test_snapshot_instance(pub):
    snapshots = pub.snapshot_class.select_object_history(formdef)
    assert len(snapshots) == 10
    for i in range(10):
        assert snapshots[i].serialization is None
        assert snapshots[i].serialization is None # not loaded
        assert snapshots[i].patch is None # not loaded
        assert pub.snapshot_class.get(snapshots[i].id).instance.name == 'testform %s' % (9 - i)
    snapshots = pub.snapshot_class.select_object_history(carddef)
@@ -703,88 +731,6 @@ def test_snapshot_workflow_variable(pub):
    assert 'sortable readonly' in resp.text
@pytest.fixture
def size_limit(pub):
    pub.snapshot_class.WCS_MAX_LEN = 100
    yield
    pub.snapshot_class.WCS_MAX_LEN = 1000000
def test_workflow_snapshot_max_len(pub, size_limit):
    formdef = FormDef()
    formdef.name = 'testform'
    formdef.fields = []
    formdef.store()
    Workflow.wipe()
    workflow = Workflow(name='test')
    workflow.store()
    another_workflow = Workflow(name='other test')
    another_workflow.store() # same object_type - check that other instances snapshots are not deleted
    assert formdef.id == workflow.id # same id - check other object_type snapshots are not deleted
    # first one: saved
    assert pub.snapshot_class.count() == 3
    first_id = pub.snapshot_class.select(order_by='id')[0].id
    assert pub.snapshot_class.get(first_id).object_type == 'formdef'
    assert pub.snapshot_class.get(first_id + 1).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 1).object_id == '1'
    old_timestamp = pub.snapshot_class.get(first_id + 1).timestamp
    assert pub.snapshot_class.get(first_id + 2).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 2).object_id == '2'
    # save snapshot
    pub.snapshot_class.snap(instance=workflow, label="snapshot !")
    assert pub.snapshot_class.count() == 4
    assert pub.snapshot_class.get(first_id).object_type == 'formdef'
    assert pub.snapshot_class.get(first_id + 1).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 1).object_id == '1'
    assert pub.snapshot_class.get(first_id + 1).label is None
    assert pub.snapshot_class.get(first_id + 1).timestamp == old_timestamp
    assert pub.snapshot_class.get(first_id + 1).instance.name == 'test'
    assert pub.snapshot_class.get(first_id + 2).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 2).object_id == '2'
    assert pub.snapshot_class.get(first_id + 3).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 3).object_id == '1'
    assert pub.snapshot_class.get(first_id + 3).label == "snapshot !"
    assert pub.snapshot_class.get(first_id + 3).instance.name == 'test'
    # no changes
    workflow.store()
    assert pub.snapshot_class.count() == 4
    assert pub.snapshot_class.get(first_id).object_type == 'formdef'
    assert pub.snapshot_class.get(first_id + 1).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 1).object_id == '1'
    assert pub.snapshot_class.get(first_id + 1).label is None
    assert pub.snapshot_class.get(first_id + 1).timestamp == old_timestamp
    assert pub.snapshot_class.get(first_id + 1).instance.name == 'test'
    assert pub.snapshot_class.get(first_id + 2).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 2).object_id == '2'
    assert pub.snapshot_class.get(first_id + 3).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 3).object_id == '1'
    assert pub.snapshot_class.get(first_id + 3).label == "snapshot !"
    assert pub.snapshot_class.get(first_id + 3).instance.name == 'test'
    # with changes
    workflow.name = 'foo bar'
    workflow.store()
    assert pub.snapshot_class.count() == 4
    assert pub.snapshot_class.get(first_id).object_type == 'formdef'
    assert pub.snapshot_class.get(first_id + 2).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 2).object_id == '2'
    assert pub.snapshot_class.get(first_id + 3).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 3).object_id == '1'
    assert pub.snapshot_class.get(first_id + 3).label == "snapshot !"
    assert pub.snapshot_class.get(first_id + 3).instance.name == 'test'
    assert pub.snapshot_class.get(first_id + 4).object_type == 'workflow'
    assert pub.snapshot_class.get(first_id + 4).object_id == '1'
    assert pub.snapshot_class.get(first_id + 4).label is None
    assert pub.snapshot_class.get(first_id + 4).timestamp > old_timestamp
    assert pub.snapshot_class.get(first_id + 4).instance.name == 'foo bar'
def test_pickle_erroneous_snapshot_object(pub):
    # check snapshot object attribute is not restored
    formdef = FormDef()

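A condensed sketch (not part of the patch, reusing the pub publisher fixture and FormDef from the test module above) of the behaviour the updated test_snapshot_basics asserts: the first store() keeps a complete XML serialization, later stores keep only a unified-diff patch, and patch-only snapshots remain restorable.

    formdef = FormDef()
    formdef.name = 'testform'
    formdef.store()  # first snapshot: complete serialization, no patch
    snap1 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snap1.serialization is not None and snap1.patch is None

    formdef.name = 'testform2'
    formdef.store()  # second snapshot: patch only
    snap2 = pub.snapshot_class.get_latest('formdef', formdef.id)
    assert snap2.serialization is None and '>testform2<' in snap2.patch
    assert snap2.instance.name == 'testform2'  # rebuilt through get_serialization()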

@@ -117,7 +117,7 @@ class SnapshotDirectory(Directory):
self.snapshot.timestamp.strftime('%Y%m%d-%H%M'),
),
)
return '<?xml version="1.0"?>\n' + self.snapshot.serialization
return '<?xml version="1.0"?>\n' + self.snapshot.get_serialization()
def restore(self):
form = Form(enctype='multipart/form-data')


@@ -14,13 +14,14 @@
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
import difflib
import re
import xml.etree.ElementTree as ET
from django.utils.timezone import now
from quixote import get_publisher, get_session
from wcs.qommon import _, misc
from wcs.qommon.storage import Null
class UnknownUser:
@@ -28,6 +29,97 @@ class UnknownUser:
        return str(_('unknown user'))
def indent(tree, space=" ", level=0):
    # backport from Lib/xml/etree/ElementTree.py python 3.9
    if isinstance(tree, ET.ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if len(tree) == 0:
        return
    # Reduce the memory consumption by reusing indentation strings.
    indentations = ["\n" + level * space]
    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)
        if not elem.text or not elem.text.strip():
            elem.text = child_indentation
        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation
        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]
    _indent_children(tree, 0)
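This helper mirrors xml.etree.ElementTree.indent() from Python 3.9's standard library. A tiny sketch of how it normalises a tree before diffing (the sample XML is made up; on Python 3.9 and later ET.indent() could be called directly):

    tree = ET.fromstring('<formdef><name>testform</name></formdef>')
    indent(tree, space=' ')  # or ET.indent(tree, space=' ') on Python >= 3.9
    assert ET.tostring(tree).decode('utf-8') == '<formdef>\n <name>testform</name>\n</formdef>'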
_no_eol = "\\ No newline at end of file"
_hdr_pat = re.compile(r"^@@ -(\d+),?(\d+)? \+(\d+),?(\d+)? @@$")
def make_patch(a, b):
    """
    Get unified string diff between two strings. Trims top two lines.
    Returns empty string if strings are identical.
    """
    diffs = difflib.unified_diff(a.splitlines(True), b.splitlines(True), n=0)
    try:
        _, _ = next(diffs), next(diffs)
    except StopIteration:
        pass
    return ''.join([d if d[-1] == '\n' else d + '\n' + _no_eol + '\n' for d in diffs])
def apply_patch(s, patch, revert=False):
    """
    Apply patch to string s to recover newer string.
    If revert is True, treat s as the newer string, recover older string.
    """
    s = s.splitlines(True)
    p = patch.splitlines(True)
    t = ''
    i = sl = 0
    (midx, sign) = (1, '+') if not revert else (3, '-')
    while i < len(p) and p[i].startswith(("---", "+++")):
        i += 1  # skip header lines
    while i < len(p):
        m = _hdr_pat.match(p[i])
        if not m:
            raise Exception("Bad patch -- regex mismatch [line " + str(i) + "]")
        _l = int(m.group(midx)) - 1 + (m.group(midx + 1) == '0')
        if sl > _l or _l > len(s):
            raise Exception("Bad patch -- bad line num [line " + str(i) + "]")
        t += ''.join(s[sl:_l])
        sl = _l
        i += 1
        while i < len(p) and p[i][0] != '@':
            if i + 1 < len(p) and p[i + 1][0] == '\\':
                line = p[i][:-1]
                i += 2
            else:
                line = p[i]
                i += 1
            if len(line) > 0:
                if line[0] == sign or line[0] == ' ':
                    t += line[1:]
                sl += line[0] != sign
    t += ''.join(s[sl:])
    return t
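A minimal round-trip sketch of these two helpers (the sample strings are made up): make_patch() returns a unified diff with the two file-header lines trimmed, and apply_patch() replays it forward, or backward with revert=True.

    old = '<formdef>\n <name>testform</name>\n</formdef>\n'
    new = '<formdef>\n <name>testform2</name>\n</formdef>\n'
    patch = make_patch(old, new)
    assert apply_patch(old, patch) == new               # old + patch -> new
    assert apply_patch(new, patch, revert=True) == old  # new - patch -> old
    assert make_patch(old, old) == ''                   # identical inputs: empty patch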
class Snapshot:
    id = None
    object_type = None # (formdef, carddef, blockdef, workflow, data_source, etc.)
@@ -36,14 +128,13 @@ class Snapshot:
    user_id = None
    comment = None
    serialization = None
    patch = None
    label = None # (named snapshot)
    # cache
    _instance = None
    _user = None
    WCS_MAX_LEN = 1000000
    @classmethod
    def snap(cls, instance, comment=None, label=None):
        obj = cls()
@@ -52,18 +143,43 @@
        obj.timestamp = now()
        if get_session():
            obj.user_id = get_session().user
        obj.serialization = ET.tostring(instance.export_to_xml(include_id=True)).decode('utf-8')
        tree = instance.export_to_xml(include_id=True)
        obj.serialization = ET.tostring(tree).decode('utf-8')
        obj.comment = str(comment) if comment else None
        obj.label = label
        latest = cls.get_latest(obj.object_type, obj.object_id)
        if label is not None or latest is None or obj.serialization != latest.serialization:
            # save snapshot if there are changes or an explicit label was
            # given.
            if label is None and len(obj.serialization) > cls.WCS_MAX_LEN:
                # keep only latest snapshot for big objects
                # (typically workflows with embedded documents)
                for old_snapshot in cls.select_object_history(instance, clause=[Null('label')]):
                    cls.remove_object(old_snapshot.id)
        latest_complete = cls.get_latest(obj.object_type, obj.object_id, complete=True)
        if latest_complete is None:
            # no complete snapshot, store it, with serialization and no patch
            obj.store()
            return
        # get patch between latest serialization and current instance
        # indent xml to minimize patch
        latest_tree = ET.fromstring(latest_complete.serialization)
        indent(tree)
        indent(latest_tree)
        patch = make_patch(ET.tostring(latest_tree).decode('utf-8'), ET.tostring(tree).decode('utf-8'))
        # should we store a snapshot?
        store_snapshot = True
        if label is None:
            # compare with patch of latest snapshot
            latest = cls.get_latest(obj.object_type, obj.object_id)
            if latest.patch and patch == latest.patch:
                # previous snapshot contains a patch (but no serialization)
                # and the current patch is the same as in the previous snapshot
                store_snapshot = False
            elif latest.serialization and not patch:
                # previous snapshot contains a serialization (but no patch)
                # and there is no difference (no patch)
                store_snapshot = False
        if store_snapshot:
            if len(obj.serialization) > len(patch):
                # serialization is bigger than patch, store patch
                obj.serialization = None
                obj.patch = patch
            # else: keep serialization and ignore patch
            obj.store()
    def get_object_class(self):
@@ -80,10 +196,22 @@
                return klass
        raise KeyError('no class for object type: %s' % self.object_type)
    def get_serialization(self):
        # there is a complete serialization
        if self.serialization:
            return self.serialization
        # get latest version with serialization
        latest_complete = self.__class__.get_latest(self.object_type, self.object_id, complete=True)
        latest_tree = ET.fromstring(latest_complete.serialization)
        indent(latest_tree)
        serialization = apply_patch(ET.tostring(latest_tree).decode('utf-8'), self.patch or '')
        return serialization
    @property
    def instance(self):
        if self._instance is None:
            tree = ET.fromstring(self.serialization)
            tree = ET.fromstring(self.get_serialization())
            self._instance = self.get_object_class().import_from_xml_tree(
                tree,
                include_id=True,

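In short, the restore path now works as sketched below (names from this file; the formdef and pub objects are assumed from the test fixtures): a snapshot either carries its own complete serialization or rebuilds one by applying its stored patch on top of the latest complete snapshot, and the instance property imports whichever XML results.

    snapshot = pub.snapshot_class.get_latest('formdef', formdef.id)
    # complete snapshots return their stored XML; patch-only snapshots rebuild it
    # from the latest complete serialization plus their patch
    xml_source = snapshot.get_serialization()
    assert '>%s<' % formdef.name in xml_source
    restored = snapshot.instance  # import_from_xml_tree() on get_serialization()
    assert restored.name == formdef.name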

@@ -1057,6 +1057,7 @@ def do_snapshots_table():
user_id VARCHAR,
comment TEXT,
serialization TEXT,
patch TEXT,
label VARCHAR
)'''
% table_name
@@ -1069,6 +1070,10 @@ def do_snapshots_table():
)
existing_fields = {x[0] for x in cur.fetchall()}
# migrations
if 'patch' not in existing_fields:
cur.execute('''ALTER TABLE %s ADD COLUMN patch TEXT''' % table_name)
needed_fields = {x[0] for x in Snapshot._table_static_fields}
# delete obsolete fields
@@ -2959,9 +2964,10 @@ class Snapshot(SqlMixin, wcs.snapshots.Snapshot):
('user_id', 'varchar'),
('comment', 'text'),
('serialization', 'text'),
('patch', 'text'),
('label', 'varchar'),
]
_table_select_skipped_fields = ['serialization']
_table_select_skipped_fields = ['serialization', 'patch']
@guard_postgres
@invalidate_substitution_cache
@@ -3018,13 +3024,16 @@ class Snapshot(SqlMixin, wcs.snapshots.Snapshot):
return []
@classmethod
def get_latest(cls, object_type, object_id):
def get_latest(cls, object_type, object_id, complete=False):
conn, cur = get_connection_and_cursor()
sql_statement = '''SELECT id FROM snapshots
WHERE object_type = %(object_type)s
AND object_id = %(object_id)s
WHERE object_type = %%(object_type)s
AND object_id = %%(object_id)s
%s
ORDER BY timestamp DESC
LIMIT 1'''
LIMIT 1''' % (
'AND serialization IS NOT NULL' if complete else ''
)
cur.execute(sql_statement, {'object_type': object_type, 'object_id': object_id})
row = cur.fetchone()
conn.commit()
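For illustration (not part of the patch), once the %-substitution above has been applied, a call with complete=True executes the following statement; the cursor and parameter values below are stand-ins:

    sql_statement = '''SELECT id FROM snapshots
        WHERE object_type = %(object_type)s
        AND object_id = %(object_id)s
        AND serialization IS NOT NULL
        ORDER BY timestamp DESC
        LIMIT 1'''
    cur.execute(sql_statement, {'object_type': 'formdef', 'object_id': '1'})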
@@ -3441,7 +3450,7 @@ def get_period_total(
# latest migration, number + description (description is not used
# programmatically but will make sure git conflicts if two migrations are
# separately added with the same number)
SQL_LEVEL = (53, 'add kind column on logged_errors table')
SQL_LEVEL = (54, 'add patch column on snapshot table')
def migrate_global_views(conn, cur):
@@ -3617,8 +3626,9 @@ def migrate():
continue
for formdata in formdef.data_class().select_iterator():
formdata._set_auto_fields(cur) # build digests
if sql_level < 42:
if sql_level < 54:
# 42: create snapshots table
# 54: add patch column
do_snapshots_table()
if sql_level < 53:
# 47: store LoggedErrors in SQL