From 6af35222dbd00125e3627a23b2ebac9cd5983165 Mon Sep 17 00:00:00 2001 From: Benjamin Dauvergne Date: Fri, 28 Jun 2019 00:35:35 +0200 Subject: [PATCH] class for an indexable pickle object --- indexable-pickle/ram_pickle.py | 65 ++++++++++++++++++++++++++++++ indexable-pickle/test_rampickle.py | 35 ++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 indexable-pickle/ram_pickle.py create mode 100644 indexable-pickle/test_rampickle.py diff --git a/indexable-pickle/ram_pickle.py b/indexable-pickle/ram_pickle.py new file mode 100644 index 0000000..dec1439 --- /dev/null +++ b/indexable-pickle/ram_pickle.py @@ -0,0 +1,65 @@ +import struct +import pickle + + +class RamPickleWrite(object): + def __init__(self, sequence): + self.sequence = sequence + + def pickle(self, fd): + base_offset = fd.tell() + index = [] + fd.write(struct.pack('L', 0)) + i = 0 + while True: + batch = self.sequence[i:i + 100] + if not batch: + break + index.append(fd.tell()) + pickle.dump(batch, fd) + i += 100 + index_offset = fd.tell() + pickle.dump(index, fd) + fd.seek(base_offset) + fd.write(struct.pack('L', index_offset)) + + +class RamPickleRead(object): + def __init__(self, fd): + self.fd = fd + buf = fd.read(struct.calcsize('L')) + index_offset, = struct.unpack('L', buf) + fd.seek(index_offset) + self.index = pickle.load(fd) + self.batches = {} + + def load_batch(self, index): + page = index // 100 + if len(self.index) <= page: + return + if page not in self.batches: + self.fd.seek(self.index[page]) + self.batches[page] = pickle.load(self.fd) + return self.batches[page] + + def __getitem__(self, index): + if isinstance(index, (long, int)): + batch = self.load_batch(index) + if not batch: + raise IndexError(index) + offset = index % 100 + if len(batch) <= offset: + raise IndexError(index) + return batch[offset] + elif isinstance(index, slice): + l = [] + i = index.start or 0 + while index.stop is None or i < index.stop: + try: + l.append(self[i]) + except IndexError: + break + i += index.step or 1 + return l + else: + raise TypeError(index) diff --git a/indexable-pickle/test_rampickle.py b/indexable-pickle/test_rampickle.py new file mode 100644 index 0000000..7199a6b --- /dev/null +++ b/indexable-pickle/test_rampickle.py @@ -0,0 +1,35 @@ +import pytest + +from ram_pickle import RamPickleWrite, RamPickleRead + + +def test_ram(): + import io + + sequence = list(range(1000)) + fd = io.BytesIO() + write = RamPickleWrite(sequence) + write.pickle(fd) + + fd.seek(0) + read = RamPickleRead(fd) + for i in range(1000): + assert read[i] == i + + fd.seek(0) + read = RamPickleRead(fd) + for i in range(1000, 2000): + with pytest.raises(IndexError): + read[i] + + fd.seek(0) + read = RamPickleRead(fd) + assert read[100:] == range(100, 1000) + + fd.seek(0) + read = RamPickleRead(fd) + assert read[:100] == range(100) + + fd.seek(0) + read = RamPickleRead(fd) + assert read[100:200:2] == range(100, 200, 2)