bpo-31993: Do not allocate large temporary buffers in pickle dump. (#… · python/cpython@3cd7c6e
@@ -2042,21 +2042,40 @@ def test_setitems_on_non_dicts(self):
def check_frame_opcodes(self, pickled):
    """
    Check the arguments of FRAME opcodes in a protocol 4+ pickle.

    Every FRAME argument must equal the number of bytes between the end
    of that FRAME opcode and the start of the next frame (or the end of
    the pickle for the last one).

    Note that binary objects that are larger than FRAME_SIZE_TARGET are not
    framed by default and are therefore considered a frame by themselves in
    the following consistency check.  Their size is always measured in
    encoded bytes: pickletools hands BINUNICODE arguments back as decoded
    str objects, whose character count differs from the on-disk size as
    soon as the text is not pure ASCII.

    *pickled* is the pickle byte string to inspect.  The check fails if
    the pickle contains neither a FRAME opcode nor a large unframed object.
    """
    # Header size (1 opcode byte + length field) of each opcode that can
    # carry a large payload written outside of any frame.
    frameless_opcode_sizes = {
        'BINBYTES': 5,
        'BINUNICODE': 5,
        'BINBYTES8': 9,
        'BINUNICODE8': 9,
    }
    last_arg = last_pos = last_frame_opcode_size = None
    for op, arg, pos in pickletools.genops(pickled):
        if op.name == 'FRAME':
            # 1 opcode byte + 8-byte little-endian frame length.
            frame_opcode_size = 9
        elif op.name in frameless_opcode_sizes:
            if isinstance(arg, str):
                # Measure the payload as it is stored in the pickle.
                payload_size = len(arg.encode('utf-8', 'surrogatepass'))
            else:
                payload_size = len(arg)
            if payload_size <= self.FRAME_SIZE_TARGET:
                # Small objects live inside a regular frame.
                continue
            frame_opcode_size = frameless_opcode_sizes[op.name]
            arg = payload_size
        else:
            continue

        if last_pos is not None:
            # The previous frame's size should be equal to the number
            # of bytes up to the current frame.
            frame_size = pos - last_pos - last_frame_opcode_size
            self.assertEqual(frame_size, last_arg)
        last_arg, last_pos = arg, pos
        last_frame_opcode_size = frame_opcode_size

    # Fail with a readable message instead of a TypeError on None below.
    self.assertIsNotNone(
        last_pos,
        "no FRAME opcode or large unframed object found in pickle")
    # The last frame's size should be equal to the number of bytes up
    # to the pickle's end.
    frame_size = len(pickled) - last_pos - last_frame_opcode_size
    self.assertEqual(frame_size, last_arg)
2061208020622081def test_framing_many_objects(self):
@@ -2076,15 +2095,36 @@ def test_framing_many_objects(self):
def test_framing_large_objects(self):
    """Round-trip a list of 1 MiB bytes/str objects and check its frames.

    Each protocol from 4 upwards is exercised with the pickler's ``fast``
    flag both on and off.  When the test class provides no ``pickler``
    attribute only the ``fast=False`` case runs, through ``self.dumps``.
    """
    size = 1024 * 1024
    payload = [b'x' * size, b'y' * size, 'z' * size]
    for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
        for fast in (True, False):
            with self.subTest(proto=proto, fast=fast):
                if not hasattr(self, 'pickler'):
                    if fast:
                        # The fast flag cannot be set without a pickler.
                        continue
                    # Fallback to self.dumps when fast=False and
                    # self.pickler is not available.
                    data = self.dumps(payload, proto)
                else:
                    stream = io.BytesIO()
                    pickler = self.pickler(stream, protocol=proto)
                    pickler.fast = fast
                    pickler.dump(payload)
                    data = stream.getvalue()

                restored = self.loads(data)
                # Compare lengths first: a mismatch on megabyte-sized
                # items gives a far more readable failure message.
                self.assertEqual(list(map(len, payload)),
                                 list(map(len, restored)))
                # Perform full equality check if the lengths match.
                self.assertEqual(payload, restored)

                n_frames = count_opcode(pickle.FRAME, data)
                if fast:
                    # One frame at the beginning and one at the end.
                    self.assertGreaterEqual(n_frames, 2)
                else:
                    # One frame per memoize for each large object.
                    self.assertGreaterEqual(n_frames, len(payload))
                self.check_frame_opcodes(data)
2088212820892129def test_optional_frames(self):
20902130if pickle.HIGHEST_PROTOCOL < 4:
@@ -2125,6 +2165,71 @@ def remove_frames(pickled, keep_frame=None):
21252165count_opcode(pickle.FRAME, pickled))
21262166self.assertEqual(obj, self.loads(some_frames_pickle))
def test_framed_write_sizes_with_delayed_writer(self):
    """Check how the pickler splits its output across ``write`` calls.

    The writer only stores the chunks it receives and reads them after
    ``dump`` has returned, so the test also verifies that every chunk
    (possibly a memoryview) is still accessible at that point.
    """
    class ChunkAccumulator:
        """Accumulate pickler output in a list of raw chunks."""

        def __init__(self):
            self.chunks = []

        def write(self, chunk):
            self.chunks.append(chunk)

        def concatenate_chunks(self):
            # Chunks may be memoryview instances; those must be turned
            # into bytes before they can be joined.
            parts = []
            for chunk in self.chunks:
                if hasattr(chunk, 'tobytes'):
                    chunk = chunk.tobytes()
                parts.append(chunk)
            return b"".join(parts)

    small_objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
                     for i in range(10000)]
    target = self.FRAME_SIZE_TARGET

    for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
        # Protocol 4 packs groups of small objects into frames and issues
        # calls to write only once or twice per frame: the C pickler
        # issues one call per frame (header and contents) while the
        # Python pickler issues two, one for the frame header and one
        # for the frame binary contents.
        writer = ChunkAccumulator()
        self.pickler(writer, proto).dump(small_objects)

        # Read the chunk contents only now, after dump has returned: any
        # memoryview passed to write must not have been released,
        # otherwise this delayed access would not be possible.
        pickled = writer.concatenate_chunks()
        reconstructed = self.loads(pickled)
        self.assertEqual(reconstructed, small_objects)
        n_chunks = len(writer.chunks)
        self.assertGreater(n_chunks, 1)

        # Number of frames needed for the pickle, rounded up.
        n_frames = (len(pickled) + target - 1) // target

        # There should be at least one call to write per frame
        self.assertGreaterEqual(n_chunks, n_frames)

        # but not too many either: there can be one for the proto,
        # one per-frame header and one per frame for the actual contents.
        self.assertGreaterEqual(2 * n_frames + 1, n_chunks)

        *leading_chunks, final_chunk = writer.chunks
        leading_sizes = list(map(len, leading_chunks))

        # Large chunks should not be too large:
        for chunk_size in leading_sizes:
            if chunk_size >= target:
                self.assertGreater(2 * target, chunk_size)

        self.assertGreater(2 * target, len(final_chunk))

        # Small chunks (if any) should be very small
        # (only proto and frame headers)
        for chunk_size in leading_sizes:
            if chunk_size < target:
                self.assertGreaterEqual(9, chunk_size)
2232+21282233def test_nested_names(self):
21292234global Nested
21302235class Nested: