From df380490bcb6c4141a58dbf85052210864f42182 Mon Sep 17 00:00:00 2001 From: "Gabriele N. Tornetta" Date: Thu, 28 May 2026 12:23:53 +0100 Subject: [PATCH] perf: cache has_jump flag and pass buffer in _pack_location MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two independent micro-optimisations on the roundtrip hot path, benchmarked together: 1. Cache `_is_jump` on BaseInstr at construction time: `has_jump()` is called on every instruction in ControlFlowGraph.from_bytecode, BasicBlock.append, and _StackSizeComputer.run — the profiler showed it at ~2.9% own time, spending that time on `opcode in HAS_JUMP` (a set lookup) on every call. Added `_is_jump: bool` to BaseInstr.__slots__ and computed it once in `_set()` (the canonical setter used by __init__ and the public `set()` method). All fast-path constructors that bypass `_set()` — `copy()`, `_from_trusted()` on both BaseInstr and ConcreteInstr, and `_from_opcode()` on ConcreteInstr — now copy or compute the flag directly. `has_jump()` becomes a single slot read. 2. Eliminate per-call bytearray allocation in _pack_location: `_assemble_locations` previously collected one `bytearray` per location group via `_push_locations -> _pack_location -> bytearray()`, then joined them with `b"".join(locations)` at the end. Each location entry is typically only 2-6 bytes, so the list of small bytearrays and the final join were measurable overhead (`_pack_location` at ~3.9% own time in the profiler). Changed the signatures of `_pack_location` and `_push_locations` to accept a shared `bytearray buf` and extend into it in place. `_assemble_locations` creates one `bytearray()` up-front and converts to `bytes` at the end -- no intermediate bytearray allocations. 
Benchmark (perf.py, Bytecode.from_code(dis).to_code(), 30 runs, p95 r/s): | | p95 (r/s) | 95% CI | |---|---|---| | Baseline | 188 | [187, 188] | | This PR | 196 | [195, 196] | Delta: +8 r/s (+4.3%), Mann-Whitney p~0 (significant, threshold: p<0.01 and |delta|>=2%) --- src/bytecode/concrete.py | 33 ++++++++++++++++----------------- src/bytecode/instr.py | 7 +++++-- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/bytecode/concrete.py b/src/bytecode/concrete.py index e99a7980..66a3054b 100644 --- a/src/bytecode/concrete.py +++ b/src/bytecode/concrete.py @@ -37,6 +37,7 @@ DUAL_ARG_OPCODES_SINGLE_OPS, EXTENDEDARG_OPCODE, FORMAT_VALUE_OPS, + HAS_JUMP, INTRINSIC, INTRINSIC_1OP, INTRINSIC_2OP, @@ -193,6 +194,7 @@ def _from_opcode( new = object.__new__(cls) new._name = name new._opcode = opcode + new._is_jump = opcode in HAS_JUMP new._arg = arg new._location = location new._extended_args = None @@ -211,6 +213,7 @@ def _from_trusted( new = object.__new__(cls) new._name = name new._opcode = opcode + new._is_jump = opcode in HAS_JUMP new._arg = arg new._location = location new._extended_args = None @@ -491,10 +494,8 @@ def _pack_location_header(code: int, size: int) -> int: return (1 << 7) + (code << 3) + (size - 1 if size <= 8 else 7) def _pack_location( - self, size: int, lineno: int, location: Optional[InstrLocation] - ) -> bytearray: - packed = bytearray() - + self, buf: bytearray, size: int, lineno: int, location: Optional[InstrLocation] + ) -> None: l_lineno: Optional[int] # The location was not set so we infer a line. 
if location is None: @@ -514,7 +515,7 @@ def _pack_location( # We have no location information so the code is 15 if l_lineno is None: - packed.append(self._pack_location_header(15, size)) + buf.append(self._pack_location_header(15, size)) # No column info, code 13 elif col_offset is None: @@ -523,7 +524,7 @@ def _pack_location( "An instruction cannot have no column offset and span " f"multiple lines (lineno: {l_lineno}, end lineno: {end_lineno}" ) - packed.extend( + buf.extend( ( self._pack_location_header(13, size), *self._encode_location_svarint(l_lineno - lineno), @@ -541,7 +542,7 @@ def _pack_location( and col_offset < 72 and (end_col_offset - col_offset) <= 15 ): - packed.extend( + buf.extend( ( self._pack_location_header(col_offset // 8, size), ((col_offset % 8) << 4) + (end_col_offset - col_offset), @@ -555,7 +556,7 @@ def _pack_location( and col_offset < 256 and end_col_offset < 256 ): - packed.extend( + buf.extend( ( self._pack_location_header(10 + l_lineno - lineno, size), col_offset, @@ -567,7 +568,7 @@ def _pack_location( else: assert end_lineno is not None - packed.extend( + buf.extend( ( self._pack_location_header(14, size), *self._encode_location_svarint(l_lineno - lineno), @@ -579,11 +580,9 @@ def _pack_location( ) ) - return packed - def _push_locations( self, - locations: List[bytearray], + buf: bytearray, size: int, lineno: int, location: InstrLocation, @@ -595,7 +594,7 @@ def _push_locations( # elements. We recompute each time since in practice we will # rarely loop. 
while True: - locations.append(self._pack_location(size, lineno, location)) + self._pack_location(buf, size, lineno, location) # Update the lineno since if we need more than one entry the # reference for the delta of the lineno change lineno = location.lineno if location.lineno is not None else lineno @@ -613,7 +612,7 @@ def _assemble_locations( if not linenos: return b"" - locations: List[bytearray] = [] + buf = bytearray() iter_in = iter(linenos) @@ -634,15 +633,15 @@ def _assemble_locations( size += i_size continue - lineno = self._push_locations(locations, size, lineno, old_location) + lineno = self._push_locations(buf, size, lineno, old_location) size = i_size old_location = location # Pack the line of the last instruction. - self._push_locations(locations, size, lineno, old_location) + self._push_locations(buf, size, lineno, old_location) - return b"".join(locations) + return bytes(buf) @staticmethod def _remove_extended_args( diff --git a/src/bytecode/instr.py b/src/bytecode/instr.py index 56240eac..c238ad2b 100644 --- a/src/bytecode/instr.py +++ b/src/bytecode/instr.py @@ -693,7 +693,7 @@ def copy(self) -> TryEnd: class BaseInstr(Generic[A]): """Abstract instruction.""" - __slots__ = ("_arg", "_location", "_name", "_opcode") + __slots__ = ("_arg", "_is_jump", "_location", "_name", "_opcode") # Work around an issue with the default value of arg def __init__( @@ -831,6 +831,7 @@ def copy(self: T) -> T: new = object.__new__(self.__class__) new._name = self._name new._opcode = self._opcode + new._is_jump = self._is_jump new._arg = self._arg new._location = self._location return new @@ -847,12 +848,13 @@ def _from_trusted( new = object.__new__(cls) new._name = name new._opcode = opcode + new._is_jump = opcode in HAS_JUMP new._arg = arg new._location = location return new def has_jump(self) -> bool: - return self._has_jump(self._opcode) + return self._is_jump def is_cond_jump(self) -> bool: """Is a conditional jump?""" @@ -916,6 +918,7 @@ def _set(self, name: 
str, arg: A) -> None: self._name = name self._opcode = opcode + self._is_jump = opcode in HAS_JUMP self._arg = arg @staticmethod