bpo-31993: Do not allocate large temporary buffers in pickle dump — python/cpython commit 3cd7c6e (diff of Lib/test/pickletester.py)

@@ -2042,21 +2042,40 @@ def test_setitems_on_non_dicts(self):

20422042

def check_frame_opcodes(self, pickled):

20432043

"""

20442044

Check the arguments of FRAME opcodes in a protocol 4+ pickle.

2045+2046+

Note that binary objects that are larger than FRAME_SIZE_TARGET are not

2047+

framed by default and are therefore considered a frame by themselves in

2048+

the following consistency check.

20452049

"""

2046-

frame_opcode_size = 9

2047-

last_arg = last_pos = None

2050+

last_arg = last_pos = last_frame_opcode_size = None

2051+

frameless_opcode_sizes = {

2052+

'BINBYTES': 5,

2053+

'BINUNICODE': 5,

2054+

'BINBYTES8': 9,

2055+

'BINUNICODE8': 9,

2056+

}

20482057

for op, arg, pos in pickletools.genops(pickled):

2049-

if op.name != 'FRAME':

2058+

if op.name in frameless_opcode_sizes:

2059+

if len(arg) > self.FRAME_SIZE_TARGET:

2060+

frame_opcode_size = frameless_opcode_sizes[op.name]

2061+

arg = len(arg)

2062+

else:

2063+

continue

2064+

elif op.name == 'FRAME':

2065+

frame_opcode_size = 9

2066+

else:

20502067

continue

2068+20512069

if last_pos is not None:

20522070

# The previous frame's size should be equal to the number

20532071

# of bytes up to the current frame.

2054-

frame_size = pos - last_pos - frame_opcode_size

2072+

frame_size = pos - last_pos - last_frame_opcode_size

20552073

self.assertEqual(frame_size, last_arg)

20562074

last_arg, last_pos = arg, pos

2075+

last_frame_opcode_size = frame_opcode_size

20572076

# The last frame's size should be equal to the number of bytes up

20582077

# to the pickle's end.

2059-

frame_size = len(pickled) - last_pos - frame_opcode_size

2078+

frame_size = len(pickled) - last_pos - last_frame_opcode_size

20602079

self.assertEqual(frame_size, last_arg)

2061208020622081

def test_framing_many_objects(self):

@@ -2076,15 +2095,36 @@ def test_framing_many_objects(self):

2076209520772096

def test_framing_large_objects(self):
    """Check frame placement around objects larger than FRAME_SIZE_TARGET.

    Runs with the memo enabled (fast=False) and disabled (fast=True);
    the frame-count expectations differ between the two modes.
    """
    N = 1024 * 1024
    # One large bytes, one large bytes, one large str: each exceeds
    # FRAME_SIZE_TARGET and is therefore written outside regular frames.
    obj = [b'x' * N, b'y' * N, 'z' * N]
    for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
        for fast in [True, False]:
            with self.subTest(proto=proto, fast=fast):
                if hasattr(self, 'pickler'):
                    # Pickle through the concrete Pickler class so the
                    # `fast` (no-memo) mode can be toggled.
                    buf = io.BytesIO()
                    pickler = self.pickler(buf, protocol=proto)
                    pickler.fast = fast
                    pickler.dump(obj)
                    pickled = buf.getvalue()
                elif fast:
                    continue
                else:
                    # Fallback to self.dumps when fast=False and
                    # self.pickler is not available.
                    pickled = self.dumps(obj, proto)
                unpickled = self.loads(pickled)
                # More informative error message in case of failure.
                self.assertEqual([len(x) for x in obj],
                                 [len(x) for x in unpickled])
                # Perform full equality check if the lengths match.
                self.assertEqual(obj, unpickled)
                n_frames = count_opcode(pickle.FRAME, pickled)
                if not fast:
                    # One frame per memoize for each large object.
                    self.assertGreaterEqual(n_frames, len(obj))
                else:
                    # One frame at the beginning and one at the end.
                    self.assertGreaterEqual(n_frames, 2)
                self.check_frame_opcodes(pickled)

2088212820892129

def test_optional_frames(self):

20902130

if pickle.HIGHEST_PROTOCOL < 4:

@@ -2125,6 +2165,71 @@ def remove_frames(pickled, keep_frame=None):

21252165

count_opcode(pickle.FRAME, pickled))

21262166

self.assertEqual(obj, self.loads(some_frames_pickle))

212721672168+

def test_framed_write_sizes_with_delayed_writer(self):
    """Check the size and count of the chunks the pickler writes out."""

    class _ChunkRecorder:
        """Record every raw chunk the pickler hands to write()."""

        def __init__(self):
            self.chunks = []

        def write(self, chunk):
            self.chunks.append(chunk)

        def concatenate_chunks(self):
            # Chunks may arrive as memoryview instances; convert those
            # to bytes so that b"".join can stitch the stream together.
            return b"".join(c.tobytes() if hasattr(c, 'tobytes') else c
                            for c in self.chunks)

    payload = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
               for i in range(int(1e4))]

    for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
        # Protocol 4 packs groups of small objects into frames and
        # issues calls to write only once or twice per frame: the C
        # pickler issues one write per frame (header and contents),
        # while the Python pickler issues two — one for the frame
        # header and one for the frame binary contents.
        recorder = _ChunkRecorder()
        self.pickler(recorder, proto).dump(payload)

        # Read the chunks only after dump() has returned: any
        # memoryview passed to write() must not have been released,
        # otherwise this delayed access would not be possible.
        pickled = recorder.concatenate_chunks()
        self.assertEqual(self.loads(pickled), payload)
        self.assertGreater(len(recorder.chunks), 1)

        n_frames, leftover = divmod(len(pickled), self.FRAME_SIZE_TARGET)
        if leftover > 0:
            n_frames += 1

        # There should be at least one call to write per frame...
        self.assertGreaterEqual(len(recorder.chunks), n_frames)

        # ...but not too many either: one for the proto bytes, plus up
        # to one header write and one contents write per frame.
        self.assertGreaterEqual(2 * n_frames + 1, len(recorder.chunks))

        all_but_last = [len(c) for c in recorder.chunks[:-1]]
        large = [s for s in all_but_last
                 if s >= self.FRAME_SIZE_TARGET]
        small = [s for s in all_but_last
                 if s < self.FRAME_SIZE_TARGET]

        # Large chunks should not be too large:
        for size in large:
            self.assertGreater(2 * self.FRAME_SIZE_TARGET, size)

        self.assertGreater(2 * self.FRAME_SIZE_TARGET,
                           len(recorder.chunks[-1]))

        # Small chunks (if any) should be very small
        # (only proto and frame headers).
        for size in small:
            self.assertGreaterEqual(9, size)

2232+21282233

def test_nested_names(self):

21292234

global Nested

21302235

class Nested: