Skip to content

Commit 4eab90f

Browse files
authored
GH-140683: JIT: Improve machine code for loading smaller constants on AArch64. (GH-142511)
* Use movz and movk instructions for loading 16- and 32-bit operands and oparg.
* Loading of 64-bit operands is unchanged.
1 parent 469f191 commit 4eab90f

File tree

10 files changed

+320
-135
lines changed

10 files changed

+320
-135
lines changed

Python/bytecodes.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -810,7 +810,7 @@ dummy_func(
810810
assert(next_instr->op.code == STORE_FAST);
811811
next_oparg = next_instr->op.arg;
812812
#else
813-
next_oparg = (int)CURRENT_OPERAND0();
813+
next_oparg = (int)CURRENT_OPERAND0_16();
814814
#endif
815815
_PyStackRef *target_local = &GETLOCAL(next_oparg);
816816
assert(PyUnicode_CheckExact(left_o));

Python/ceval_macros.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -450,8 +450,12 @@ do { \
450450
} while (0)
451451

452452
#define CURRENT_OPARG() (next_uop[-1].oparg)
453-
#define CURRENT_OPERAND0() (next_uop[-1].operand0)
454-
#define CURRENT_OPERAND1() (next_uop[-1].operand1)
453+
#define CURRENT_OPERAND0_64() (next_uop[-1].operand0)
454+
#define CURRENT_OPERAND1_64() (next_uop[-1].operand1)
455+
#define CURRENT_OPERAND0_32() (next_uop[-1].operand0)
456+
#define CURRENT_OPERAND1_32() (next_uop[-1].operand1)
457+
#define CURRENT_OPERAND0_16() (next_uop[-1].operand0)
458+
#define CURRENT_OPERAND1_16() (next_uop[-1].operand1)
455459
#define CURRENT_TARGET() (next_uop[-1].target)
456460

457461
#define JUMP_TO_JUMP_TARGET() goto jump_to_jump_target

Python/executor_cases.c.h

Lines changed: 117 additions & 117 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/generated_cases.c.h

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Tools/cases_generator/tier2_generator.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,12 +222,13 @@ def write_uop(uop: Uop, emitter: Emitter, stack: Stack, offset_strs: dict[str, t
222222
idx = 0
223223
for cache in uop.caches:
224224
if cache.name != "unused":
225+
bits = cache.size*16
225226
if cache.size == 4:
226227
type = cast = "PyObject *"
227228
else:
228-
type = f"uint{cache.size*16}_t "
229-
cast = f"uint{cache.size*16}_t"
230-
emitter.emit(f"{type}{cache.name} = ({cast})CURRENT_OPERAND{idx}();\n")
229+
type = f"uint{bits}_t "
230+
cast = f"uint{bits}_t"
231+
emitter.emit(f"{type}{cache.name} = ({cast})CURRENT_OPERAND{idx}_{bits}();\n")
231232
idx += 1
232233
reachable, storage = emitter.emit_tokens(uop, storage, None, False)
233234
if reachable:

Tools/jit/_optimizers.py

Lines changed: 128 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ class InstructionKind(enum.Enum):
9696
LONG_BRANCH = enum.auto()
9797
SHORT_BRANCH = enum.auto()
9898
RETURN = enum.auto()
99+
SMALL_CONST_1 = enum.auto()
100+
SMALL_CONST_2 = enum.auto()
99101
OTHER = enum.auto()
100102

101103

@@ -172,6 +174,7 @@ class Optimizer:
172174
)
173175
# Override everything that follows in subclasses:
174176
_supports_external_relocations = True
177+
supports_small_constants = False
175178
_branches: typing.ClassVar[dict[str, tuple[str | None, str | None]]] = {}
176179
# Short branches are instructions that can branch within a micro-op,
177180
# but might not have the reach to branch anywhere within a trace.
@@ -184,6 +187,9 @@ class Optimizer:
184187
_re_return: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH
185188
text: str = ""
186189
globals: set[str] = dataclasses.field(default_factory=set)
190+
_re_small_const_1 = _RE_NEVER_MATCH
191+
_re_small_const_2 = _RE_NEVER_MATCH
192+
const_reloc = "<Not supported>"
187193

188194
def __post_init__(self) -> None:
189195
# Split the code into a linked list of basic blocks. A basic block is an
@@ -253,6 +259,14 @@ def _parse_instruction(self, line: str) -> Instruction:
253259
elif match := self._re_return.match(line):
254260
name = line
255261
kind = InstructionKind.RETURN
262+
elif match := self._re_small_const_1.match(line):
263+
target = match["value"]
264+
name = match["instruction"]
265+
kind = InstructionKind.SMALL_CONST_1
266+
elif match := self._re_small_const_2.match(line):
267+
target = match["value"]
268+
name = match["instruction"]
269+
kind = InstructionKind.SMALL_CONST_2
256270
else:
257271
name, *_ = line.split(" ")
258272
kind = InstructionKind.OTHER
@@ -385,7 +399,7 @@ def _remove_redundant_jumps(self) -> None:
385399
block.fallthrough = True
386400
block.instructions.pop()
387401
# Before:
388-
# br ? FOO:
402+
# branch FOO:
389403
# ...
390404
# FOO:
391405
# jump BAR
@@ -461,6 +475,70 @@ def _fixup_external_labels(self) -> None:
461475
)
462476
block.instructions.append(branch.update_target("0"))
463477

478+
def _make_temp_label(self, index: int) -> Instruction:
479+
marker = f"jit_temp_{index}:"
480+
return Instruction(InstructionKind.OTHER, "", marker, None)
481+
482+
def _fixup_constants(self) -> None:
483+
if not self.supports_small_constants:
484+
return
485+
index = 0
486+
for block in self._blocks():
487+
fixed: list[Instruction] = []
488+
small_const_index = -1
489+
for inst in block.instructions:
490+
if inst.kind == InstructionKind.SMALL_CONST_1:
491+
marker = f"jit_pending_{inst.target}{index}:"
492+
fixed.append(self._make_temp_label(index))
493+
index += 1
494+
small_const_index = len(fixed)
495+
fixed.append(inst)
496+
elif inst.kind == InstructionKind.SMALL_CONST_2:
497+
if small_const_index < 0:
498+
fixed.append(inst)
499+
continue
500+
small_const_1 = fixed[small_const_index]
501+
if not self._small_consts_match(small_const_1, inst):
502+
small_const_index = -1
503+
fixed.append(inst)
504+
continue
505+
assert small_const_1.target is not None
506+
if small_const_1.target.endswith("16"):
507+
fixed[small_const_index] = self._make_temp_label(index)
508+
index += 1
509+
else:
510+
assert small_const_1.target.endswith("32")
511+
patch_kind, replacement = self._small_const_1(small_const_1)
512+
if replacement is not None:
513+
label = f"{self.const_reloc}{patch_kind}_JIT_RELOCATION_CONST{small_const_1.target[:-3]}_JIT_RELOCATION_{index}:"
514+
index += 1
515+
fixed[small_const_index - 1] = Instruction(
516+
InstructionKind.OTHER, "", label, None
517+
)
518+
fixed[small_const_index] = replacement
519+
patch_kind, replacement = self._small_const_2(inst)
520+
if replacement is not None:
521+
assert inst.target is not None
522+
label = f"{self.const_reloc}{patch_kind}_JIT_RELOCATION_CONST{inst.target[:-3]}_JIT_RELOCATION_{index}:"
523+
index += 1
524+
fixed.append(
525+
Instruction(InstructionKind.OTHER, "", label, None)
526+
)
527+
fixed.append(replacement)
528+
small_const_index = -1
529+
else:
530+
fixed.append(inst)
531+
block.instructions = fixed
532+
533+
def _small_const_1(self, inst: Instruction) -> tuple[str, Instruction | None]:
534+
raise NotImplementedError()
535+
536+
def _small_const_2(self, inst: Instruction) -> tuple[str, Instruction | None]:
537+
raise NotImplementedError()
538+
539+
def _small_consts_match(self, inst1: Instruction, inst2: Instruction) -> bool:
540+
raise NotImplementedError()
541+
464542
def run(self) -> None:
465543
"""Run this optimizer."""
466544
self._insert_continue_label()
@@ -472,6 +550,7 @@ def run(self) -> None:
472550
self._remove_redundant_jumps()
473551
self._remove_unreachable()
474552
self._fixup_external_labels()
553+
self._fixup_constants()
475554
self.path.write_text(self._body())
476555

477556

@@ -492,6 +571,54 @@ class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods
492571
# https://developer.arm.com/documentation/ddi0602/2025-09/Base-Instructions/RET--Return-from-subroutine-
493572
_re_return = re.compile(r"\s*ret\b")
494573

574+
supports_small_constants = True
575+
_re_small_const_1 = re.compile(
576+
r"\s*(?P<instruction>adrp)\s+.*(?P<value>_JIT_OP(ARG|ERAND(0|1))_(16|32)).*"
577+
)
578+
_re_small_const_2 = re.compile(
579+
r"\s*(?P<instruction>ldr)\s+.*(?P<value>_JIT_OP(ARG|ERAND(0|1))_(16|32)).*"
580+
)
581+
const_reloc = "CUSTOM_AARCH64_CONST"
582+
583+
def _get_reg(self, inst: Instruction) -> str:
584+
_, rest = inst.text.split(inst.name)
585+
reg, *_ = rest.split(",")
586+
return reg.strip()
587+
588+
def _small_const_1(self, inst: Instruction) -> tuple[str, Instruction | None]:
589+
assert inst.kind is InstructionKind.SMALL_CONST_1
590+
assert inst.target is not None
591+
if "16" in inst.target:
592+
return "", None
593+
pre, _ = inst.text.split(inst.name)
594+
return "16a", Instruction(
595+
InstructionKind.OTHER, "movz", f"{pre}movz {self._get_reg(inst)}, 0", None
596+
)
597+
598+
def _small_const_2(self, inst: Instruction) -> tuple[str, Instruction | None]:
599+
assert inst.kind is InstructionKind.SMALL_CONST_2
600+
assert inst.target is not None
601+
pre, _ = inst.text.split(inst.name)
602+
if "16" in inst.target:
603+
return "16a", Instruction(
604+
InstructionKind.OTHER,
605+
"movz",
606+
f"{pre}movz {self._get_reg(inst)}, 0",
607+
None,
608+
)
609+
else:
610+
return "16b", Instruction(
611+
InstructionKind.OTHER,
612+
"movk",
613+
f"{pre}movk {self._get_reg(inst)}, 0, lsl #16",
614+
None,
615+
)
616+
617+
def _small_consts_match(self, inst1: Instruction, inst2: Instruction) -> bool:
618+
reg1 = self._get_reg(inst1)
619+
reg2 = self._get_reg(inst2)
620+
return reg1 == reg2
621+
495622

496623
class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods
497624
"""i686-pc-windows-msvc/x86_64-apple-darwin/x86_64-unknown-linux-gnu"""

Tools/jit/_schema.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
"ARM64_RELOC_PAGE21",
1010
"ARM64_RELOC_PAGEOFF12",
1111
"ARM64_RELOC_UNSIGNED",
12+
"CUSTOM_AARCH64_BRANCH19",
13+
"CUSTOM_AARCH64_CONST_16",
14+
"CUSTOM_AARCH64_CONST_32",
1215
"IMAGE_REL_AMD64_REL32",
1316
"IMAGE_REL_ARM64_BRANCH19",
1417
"IMAGE_REL_ARM64_BRANCH26",

Tools/jit/_stencils.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ class HoleValue(enum.Enum):
3232
# The current uop's operand0 on 32-bit platforms (exposed as _JIT_OPERAND0_HI/LO):
3333
OPERAND0_HI = enum.auto()
3434
OPERAND0_LO = enum.auto()
35+
# 16 and 32 bit versions of OPARG, OPERAND0 and OPERAND1
36+
OPARG_16 = enum.auto()
37+
OPERAND0_16 = enum.auto()
38+
OPERAND1_16 = enum.auto()
39+
OPERAND0_32 = enum.auto()
40+
OPERAND1_32 = enum.auto()
3541
# The current uop's operand1 on 64-bit platforms (exposed as _JIT_OPERAND1):
3642
OPERAND1 = enum.auto()
3743
# The current uop's operand1 on 32-bit platforms (exposed as _JIT_OPERAND1_HI/LO):
@@ -59,6 +65,8 @@ class HoleValue(enum.Enum):
5965
"ARM64_RELOC_PAGEOFF12": "patch_aarch64_12",
6066
"ARM64_RELOC_UNSIGNED": "patch_64",
6167
"CUSTOM_AARCH64_BRANCH19": "patch_aarch64_19r",
68+
"CUSTOM_AARCH64_CONST16a": "patch_aarch64_16a",
69+
"CUSTOM_AARCH64_CONST16b": "patch_aarch64_16b",
6270
# x86_64-pc-windows-msvc:
6371
"IMAGE_REL_AMD64_REL32": "patch_x86_64_32rx",
6472
# aarch64-pc-windows-msvc:
@@ -95,6 +103,7 @@ class HoleValue(enum.Enum):
95103
"X86_64_RELOC_SIGNED": "patch_32r",
96104
"X86_64_RELOC_UNSIGNED": "patch_64",
97105
}
106+
98107
# Translate HoleValues to C expressions:
99108
_HOLE_EXPRS = {
100109
HoleValue.CODE: "(uintptr_t)code",
@@ -103,10 +112,15 @@ class HoleValue(enum.Enum):
103112
HoleValue.GOT: "",
104113
# These should all have been turned into DATA values by process_relocations:
105114
HoleValue.OPARG: "instruction->oparg",
115+
HoleValue.OPARG_16: "instruction->oparg",
106116
HoleValue.OPERAND0: "instruction->operand0",
117+
HoleValue.OPERAND0_16: "instruction->operand0",
118+
HoleValue.OPERAND0_32: "instruction->operand0",
107119
HoleValue.OPERAND0_HI: "(instruction->operand0 >> 32)",
108120
HoleValue.OPERAND0_LO: "(instruction->operand0 & UINT32_MAX)",
109121
HoleValue.OPERAND1: "instruction->operand1",
122+
HoleValue.OPERAND1_16: "instruction->operand1",
123+
HoleValue.OPERAND1_32: "instruction->operand1",
110124
HoleValue.OPERAND1_HI: "(instruction->operand1 >> 32)",
111125
HoleValue.OPERAND1_LO: "(instruction->operand1 & UINT32_MAX)",
112126
HoleValue.TARGET: "instruction->target",
@@ -201,7 +215,10 @@ def as_c(self, where: str) -> str:
201215
if self.symbol:
202216
if value:
203217
value += " + "
204-
value += f"(uintptr_t)&{self.symbol}"
218+
if self.symbol.startswith("CONST"):
219+
value += f"instruction->{self.symbol[10:].lower()}"
220+
else:
221+
value += f"(uintptr_t)&{self.symbol}"
205222
if _signed(self.addend) or not value:
206223
if value:
207224
value += " + "

Tools/jit/_targets.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ async def _compile(
138138
f"--target={self.triple}",
139139
"-DPy_BUILD_CORE_MODULE",
140140
"-D_DEBUG" if self.debug else "-DNDEBUG",
141+
f"-DSUPPORTS_SMALL_CONSTS={1 if self.optimizer.supports_small_constants else 0}",
141142
f"-D_JIT_OPCODE={opname}",
142143
"-D_PyJIT_ACTIVE",
143144
"-D_Py_JIT",

Tools/jit/template.c

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,38 @@
3434

3535
#include "jit.h"
3636

37+
38+
#undef CURRENT_OPERAND0_64
39+
#define CURRENT_OPERAND0_64() (_operand0_64)
40+
41+
#undef CURRENT_OPERAND1_64
42+
#define CURRENT_OPERAND1_64() (_operand1_64)
43+
44+
3745
#undef CURRENT_OPARG
46+
#undef CURRENT_OPERAND0_16
47+
#undef CURRENT_OPERAND0_32
48+
#undef CURRENT_OPERAND1_16
49+
#undef CURRENT_OPERAND1_32
50+
51+
#if SUPPORTS_SMALL_CONSTS
52+
53+
#define CURRENT_OPARG() (_oparg_16)
54+
#define CURRENT_OPERAND0_32() (_operand0_32)
55+
#define CURRENT_OPERAND0_16() (_operand0_16)
56+
#define CURRENT_OPERAND1_32() (_operand1_32)
57+
#define CURRENT_OPERAND1_16() (_operand1_16)
58+
59+
#else
60+
3861
#define CURRENT_OPARG() (_oparg)
62+
#define CURRENT_OPERAND0_32() (_operand0_64)
63+
#define CURRENT_OPERAND0_16() (_operand0_64)
64+
#define CURRENT_OPERAND1_32() (_operand1_64)
65+
#define CURRENT_OPERAND1_16() (_operand1_64)
3966

40-
#undef CURRENT_OPERAND0
41-
#define CURRENT_OPERAND0() (_operand0)
67+
#endif
4268

43-
#undef CURRENT_OPERAND1
44-
#define CURRENT_OPERAND1() (_operand1)
4569

4670
#undef CURRENT_TARGET
4771
#define CURRENT_TARGET() (_target)
@@ -105,18 +129,26 @@ _JIT_ENTRY(
105129
int uopcode = _JIT_OPCODE;
106130
_Py_CODEUNIT *next_instr;
107131
// Other stuff we need handy:
108-
PATCH_VALUE(uint16_t, _oparg, _JIT_OPARG)
109132
#if SIZEOF_VOID_P == 8
110-
PATCH_VALUE(uint64_t, _operand0, _JIT_OPERAND0)
111-
PATCH_VALUE(uint64_t, _operand1, _JIT_OPERAND1)
133+
PATCH_VALUE(uint64_t, _operand0_64, _JIT_OPERAND0)
134+
PATCH_VALUE(uint64_t, _operand1_64, _JIT_OPERAND1)
112135
#else
113136
assert(SIZEOF_VOID_P == 4);
114137
PATCH_VALUE(uint32_t, _operand0_hi, _JIT_OPERAND0_HI)
115138
PATCH_VALUE(uint32_t, _operand0_lo, _JIT_OPERAND0_LO)
116-
uint64_t _operand0 = ((uint64_t)_operand0_hi << 32) | _operand0_lo;
139+
uint64_t _operand0_64 = ((uint64_t)_operand0_hi << 32) | _operand0_lo;
117140
PATCH_VALUE(uint32_t, _operand1_hi, _JIT_OPERAND1_HI)
118141
PATCH_VALUE(uint32_t, _operand1_lo, _JIT_OPERAND1_LO)
119-
uint64_t _operand1 = ((uint64_t)_operand1_hi << 32) | _operand1_lo;
142+
uint64_t _operand1_64 = ((uint64_t)_operand1_hi << 32) | _operand1_lo;
143+
#endif
144+
#if SUPPORTS_SMALL_CONSTS
145+
PATCH_VALUE(uint32_t, _operand0_32, _JIT_OPERAND0_32)
146+
PATCH_VALUE(uint32_t, _operand1_32, _JIT_OPERAND1_32)
147+
PATCH_VALUE(uint16_t, _operand0_16, _JIT_OPERAND0_16)
148+
PATCH_VALUE(uint16_t, _operand1_16, _JIT_OPERAND1_16)
149+
PATCH_VALUE(uint16_t, _oparg_16, _JIT_OPARG_16)
150+
#else
151+
PATCH_VALUE(uint16_t, _oparg, _JIT_OPARG)
120152
#endif
121153
PATCH_VALUE(uint32_t, _target, _JIT_TARGET)
122154
OPT_STAT_INC(uops_executed);

0 commit comments

Comments
 (0)