
Commit 5535482

[3.14] gh-136541: Fix several problems of perf trampolines in x86_64 and aarch64 (GH-136500) (#136544)
gh-136541: Fix several problems of perf trampolines in x86_64 and aarch64 (GH-136500)

This commit fixes the following problems:

* The x86_64 trampolines were not preserving frame pointers
* The hardcoded offsets to the code segment from the FDE only worked properly for x86_64
* The CIE data was not following the conventions of aarch64
* The eh_frame for aarch64 was not fully correct

(cherry picked from commit 236f733)

Co-authored-by: Pablo Galindo Salgado <[email protected]>
1 parent a464c4e commit 5535482

4 files changed: +147 additions, -41 deletions

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+Fix some issues with the perf trampolines on x86-64 and aarch64. The
+trampolines were not being generated correctly for some cases, which could
+lead to the perf integration not working correctly. Patch by Pablo Galindo.

Python/asm_trampoline.S

Lines changed: 4 additions & 3 deletions
@@ -12,9 +12,10 @@ _Py_trampoline_func_start:
 #if defined(__CET__) && (__CET__ & 1)
     endbr64
 #endif
-    sub $8, %rsp
-    call *%rcx
-    add $8, %rsp
+    push %rbp
+    mov %rsp, %rbp
+    call *%rcx
+    pop %rbp
     ret
 #endif // __x86_64__
 #if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)

Python/perf_jit_trampoline.c

Lines changed: 129 additions & 33 deletions
@@ -97,10 +97,9 @@
  * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
  * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding]
  *
- * The padding size (0x100) is chosen to accommodate typical unwind info sizes
- * while maintaining 16-byte alignment requirements.
+ * The padding size is now calculated automatically during initialization
+ * based on the actual unwind information requirements.
  */
-#define PERF_JIT_CODE_PADDING 0x100

 /* Convenient access to the global trampoline API state */
 #define trampoline_api _PyRuntime.ceval.perf.trampoline_api
@@ -401,10 +400,12 @@ enum {
     DWRF_CFA_nop = 0x0,                  // No operation
     DWRF_CFA_offset_extended = 0x5,      // Extended offset instruction
     DWRF_CFA_def_cfa = 0xc,              // Define CFA rule
+    DWRF_CFA_def_cfa_register = 0xd,     // Define CFA register
     DWRF_CFA_def_cfa_offset = 0xe,       // Define CFA offset
     DWRF_CFA_offset_extended_sf = 0x11,  // Extended signed offset
     DWRF_CFA_advance_loc = 0x40,         // Advance location counter
-    DWRF_CFA_offset = 0x80               // Simple offset instruction
+    DWRF_CFA_offset = 0x80,              // Simple offset instruction
+    DWRF_CFA_restore = 0xc0              // Restore register
 };

 /* DWARF Exception Handling pointer encodings */
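The last three values added or touched above are DWARF "primary" opcodes: the opcode lives in the high two bits of the byte and the operand (an advance delta or a register number) is packed into the low six bits, which is why DWRF_CFA_advance_loc is 0x40, DWRF_CFA_offset is 0x80 and DWRF_CFA_restore is 0xc0, and why DWRF_CFA_restore takes no separate operand. A minimal, self-contained sketch of that packing (the emit helpers, buffer and byte values here are illustrative stand-ins, not part of the patch):

#include <stdint.h>
#include <stdio.h>

static uint8_t buf[16];
static size_t  len;

static void emit_u8(uint8_t b) { buf[len++] = b; }

/* Unsigned LEB128, the same wire format elfctx_append_uleb128() produces */
static void emit_uleb128(uint32_t v) {
    do {
        uint8_t b = v & 0x7f;
        v >>= 7;
        emit_u8(v ? (uint8_t)(b | 0x80) : b);
    } while (v);
}

int main(void)
{
    emit_u8(0x40 | 4);    // DW_CFA_advance_loc: delta 4 packed into the low 6 bits
    emit_u8(0x80 | 6);    // DW_CFA_offset: register 6 packed into the low 6 bits...
    emit_uleb128(2);      // ...followed by a ULEB128 factored offset operand
    emit_u8(0xc0 | 30);   // DW_CFA_restore: register 30 in the low 6 bits, no operand at all

    for (size_t i = 0; i < len; i++) {
        printf("%02x ", buf[i]);   // prints: 44 86 02 de
    }
    printf("\n");
    return 0;
}
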
@@ -519,6 +520,7 @@ typedef struct ELFObjectContext {
     uint8_t* p;           // Current write position in buffer
     uint8_t* startp;      // Start of buffer (for offset calculations)
     uint8_t* eh_frame_p;  // Start of EH frame data (for relative offsets)
+    uint8_t* fde_p;       // Start of FDE data (for PC-relative calculations)
     uint32_t code_size;   // Size of the code being described
 } ELFObjectContext;

@@ -643,6 +645,8 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
 // DWARF EH FRAME GENERATION
 // =============================================================================

+static void elf_init_ehframe(ELFObjectContext* ctx);
+
 /*
  * Initialize DWARF .eh_frame section for a code region
  *
@@ -657,6 +661,23 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
  * Args:
  *   ctx: ELF object context containing code size and buffer pointers
  */
+static size_t calculate_eh_frame_size(void) {
+    /* Calculate the EH frame size for the trampoline function */
+    extern void *_Py_trampoline_func_start;
+    extern void *_Py_trampoline_func_end;
+
+    size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start;
+
+    ELFObjectContext ctx;
+    char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
+    ctx.code_size = code_size;
+    ctx.startp = ctx.p = (uint8_t*)buffer;
+    ctx.fde_p = NULL;
+
+    elf_init_ehframe(&ctx);
+    return ctx.p - ctx.startp;
+}
+
 static void elf_init_ehframe(ELFObjectContext* ctx) {
     uint8_t* p = ctx->p;
     uint8_t* framep = p;  // Remember start of frame data
@@ -784,7 +805,7 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
  *
  *     DWRF_SECTION(FDE,
  *         DWRF_U32((uint32_t)(p - framep));  // Offset to CIE (relative from here)
- *         DWRF_U32(-0x30);                   // Initial PC-relative location of the code
+ *         DWRF_U32(pc_relative_offset);      // PC-relative location of the code (calculated dynamically)
  *         DWRF_U32(ctx->code_size);          // Code range covered by this FDE
  *         DWRF_U8(0);                        // Augmentation data length (none)
  *
@@ -830,19 +851,31 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
     DWRF_U32(0);                           // CIE ID (0 indicates this is a CIE)
     DWRF_U8(DWRF_CIE_VERSION);             // CIE version (1)
     DWRF_STR("zR");                        // Augmentation string ("zR" = has LSDA)
-    DWRF_UV(1);                            // Code alignment factor
+#ifdef __x86_64__
+    DWRF_UV(1);                            // Code alignment factor (x86_64: 1 byte)
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+    DWRF_UV(4);                            // Code alignment factor (AArch64: 4 bytes per instruction)
+#endif
     DWRF_SV(-(int64_t)sizeof(uintptr_t));  // Data alignment factor (negative)
     DWRF_U8(DWRF_REG_RA);                  // Return address register number
     DWRF_UV(1);                            // Augmentation data length
     DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4);  // FDE pointer encoding

     /* Initial CFI instructions - describe default calling convention */
+#ifdef __x86_64__
+    /* x86_64 initial CFI state */
     DWRF_U8(DWRF_CFA_def_cfa);             // Define CFA (Call Frame Address)
     DWRF_UV(DWRF_REG_SP);                  // CFA = SP register
     DWRF_UV(sizeof(uintptr_t));            // CFA = SP + pointer_size
     DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA);  // Return address is saved
     DWRF_UV(1);                            // At offset 1 from CFA
-
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+    /* AArch64 initial CFI state */
+    DWRF_U8(DWRF_CFA_def_cfa);             // Define CFA (Call Frame Address)
+    DWRF_UV(DWRF_REG_SP);                  // CFA = SP register
+    DWRF_UV(0);                            // CFA = SP + 0 (AArch64 starts with offset 0)
+    // No initial register saves in AArch64 CIE
+#endif
     DWRF_ALIGNNOP(sizeof(uintptr_t));      // Align to pointer boundary
 )

@@ -853,11 +886,15 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
  *
  * The FDE describes unwinding information specific to this function.
  * It references the CIE and provides function-specific CFI instructions.
+ *
+ * The PC-relative offset is calculated after the entire EH frame is built
+ * to ensure accurate positioning relative to the synthesized DSO layout.
  */
 DWRF_SECTION(FDE,
     DWRF_U32((uint32_t)(p - framep));  // Offset to CIE (backwards reference)
-    DWRF_U32(-0x30);                   // Machine code offset relative to .text
-    DWRF_U32(ctx->code_size);          // Address range covered by this FDE (code lenght)
+    ctx->fde_p = p;                    // Remember where PC offset field is located for later calculation
+    DWRF_U32(0);                       // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe)
+    DWRF_U32(ctx->code_size);          // Address range covered by this FDE (code length)
     DWRF_U8(0);                        // Augmentation data length (none)

     /*
@@ -868,32 +905,36 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
  * conventions and register usage patterns.
  */
 #ifdef __x86_64__
-    /* x86_64 calling convention unwinding rules */
+    /* x86_64 calling convention unwinding rules with frame pointer */
 # if defined(__CET__) && (__CET__ & 1)
-    DWRF_U8(DWRF_CFA_advance_loc | 8);        // Advance location by 8 bytes when CET protection is enabled
-# else
-    DWRF_U8(DWRF_CFA_advance_loc | 4);        // Advance location by 4 bytes
+    DWRF_U8(DWRF_CFA_advance_loc | 4);        // Advance past endbr64 (4 bytes)
 # endif
-    DWRF_U8(DWRF_CFA_def_cfa_offset);         // Redefine CFA offset
+    DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance past push %rbp (1 byte)
+    DWRF_U8(DWRF_CFA_def_cfa_offset);         // def_cfa_offset 16
     DWRF_UV(16);                              // New offset: SP + 16
-    DWRF_U8(DWRF_CFA_advance_loc | 6);        // Advance location by 6 bytes
-    DWRF_U8(DWRF_CFA_def_cfa_offset);         // Redefine CFA offset
+    DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP);   // offset r6 at cfa-16
+    DWRF_UV(2);                               // Offset factor: 2 * 8 = 16 bytes
+    DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance past mov %rsp,%rbp (3 bytes)
+    DWRF_U8(DWRF_CFA_def_cfa_register);       // def_cfa_register r6
+    DWRF_UV(DWRF_REG_BP);                     // Use base pointer register
+    DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3
+    DWRF_U8(DWRF_CFA_def_cfa);                // def_cfa r7 ofs 8
+    DWRF_UV(DWRF_REG_SP);                     // Use stack pointer register
     DWRF_UV(8);                               // New offset: SP + 8
 #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
     /* AArch64 calling convention unwinding rules */
-    DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance location by 1 instruction (stp x29, x30)
-    DWRF_U8(DWRF_CFA_def_cfa_offset);         // Redefine CFA offset
-    DWRF_UV(16);                              // CFA = SP + 16 (stack pointer after push)
-    DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Frame pointer (x29) saved
-    DWRF_UV(2);                               // At offset 2 from CFA (2 * 8 = 16 bytes)
-    DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Link register (x30) saved
-    DWRF_UV(1);                               // At offset 1 from CFA (1 * 8 = 8 bytes)
-    DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...)
-    DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // Restore frame pointer (x29)
-    DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // Restore link register (x30)
-    DWRF_U8(DWRF_CFA_def_cfa_offset);         // Final CFA adjustment
-    DWRF_UV(0);                               // CFA = SP + 0 (stack restored)
-
+    DWRF_U8(DWRF_CFA_advance_loc | 1);        // Advance by 1 instruction (4 bytes)
+    DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 16
+    DWRF_UV(16);                              // Stack pointer moved by 16 bytes
+    DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP);   // x29 (frame pointer) saved
+    DWRF_UV(2);                               // At CFA-16 (2 * 8 = 16 bytes from CFA)
+    DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA);   // x30 (link register) saved
+    DWRF_UV(1);                               // At CFA-8 (1 * 8 = 8 bytes from CFA)
+    DWRF_U8(DWRF_CFA_advance_loc | 3);        // Advance by 3 instructions (12 bytes)
+    DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA);  // Restore x30 - NO DWRF_UV() after this!
+    DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP);  // Restore x29 - NO DWRF_UV() after this!
+    DWRF_U8(DWRF_CFA_def_cfa_offset);         // CFA = SP + 0 (stack restored)
+    DWRF_UV(0);                               // Back to original stack position
 #else
 # error "Unsupported target architecture"
 #endif
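The scaling in the comments above follows from the CIE alignment factors: DWRF_CFA_advance_loc deltas are multiplied by the code alignment factor (1 on x86_64, 4 on AArch64), and DWRF_CFA_offset operands are multiplied by the data alignment factor (-8 on these 64-bit targets), which is how an operand of 2 becomes "saved at CFA-16". A small worked example of that arithmetic (illustrative only, not code from the patch):

#include <stdio.h>

int main(void)
{
    /* CIE parameters used by this hunk (64-bit targets) */
    int code_align_x86_64  = 1;    // x86_64: locations advance in bytes
    int code_align_aarch64 = 4;    // AArch64: every instruction is 4 bytes
    int data_align         = -8;   // -(int64_t)sizeof(uintptr_t)

    /* DWRF_CFA_advance_loc deltas are scaled by the code alignment factor */
    printf("x86_64  advance_loc|3 -> %d bytes\n", 3 * code_align_x86_64);    // mov %rsp,%rbp is 3 bytes
    printf("AArch64 advance_loc|3 -> %d bytes\n", 3 * code_align_aarch64);   // 3 instructions = 12 bytes

    /* DWRF_CFA_offset operands are scaled by the data alignment factor */
    printf("offset factor 2 -> register saved at CFA%+d\n", 2 * data_align); // CFA-16
    printf("offset factor 1 -> register saved at CFA%+d\n", 1 * data_align); // CFA-8
    return 0;
}
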
@@ -902,6 +943,58 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
     )

     ctx->p = p;  // Update context pointer to end of generated data
+
+    /* Calculate and update the PC-relative offset in the FDE
+     *
+     * When perf processes the jitdump, it creates a synthesized DSO with this layout:
+     *
+     *   Synthesized DSO Memory Layout:
+     *   ┌───────────────────────────────────────────────┐ < code_start
+     *   │ Code Section                                  │
+     *   │ (round_up(code_size, 8) bytes)                │
+     *   ├───────────────────────────────────────────────┤ < start of EH frame data
+     *   │ EH Frame Data                                 │
+     *   │ ┌─────────────────────────────────────────┐   │
+     *   │ │ CIE data                                │   │
+     *   │ └─────────────────────────────────────────┘   │
+     *   │ ┌─────────────────────────────────────────┐   │
+     *   │ │ FDE Header:                             │   │
+     *   │ │  - CIE offset (4 bytes)                 │   │
+     *   │ │  - PC offset (4 bytes) <─ fde_offset_in_frame ──> points to code_start
+     *   │ │  - address range (4 bytes)              │   │    (this specific field)
+     *   │ │ CFI Instructions...                     │   │
+     *   │ └─────────────────────────────────────────┘   │
+     *   ├───────────────────────────────────────────────┤ < reference_point
+     *   │ EhFrameHeader                                 │
+     *   │ (navigation metadata)                         │
+     *   └───────────────────────────────────────────────┘
+     *
+     * The PC offset field in the FDE must contain the distance from itself to code_start:
+     *
+     *   distance = code_start - fde_pc_field
+     *
+     * Where:
+     *   fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame
+     *   code_start_location   = reference_point - eh_frame_size - round_up(code_size, 8)
+     *
+     * Therefore:
+     *   distance = code_start_location - fde_pc_field_location
+     *            = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame)
+     *            = -rounded_code_size - fde_offset_in_frame
+     *            = -(round_up(code_size, 8) + fde_offset_in_frame)
+     *
+     * Note: fde_offset_in_frame is the offset from the EH frame start to the PC offset field.
+     */
+    if (ctx->fde_p != NULL) {
+        int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp);
+        int32_t rounded_code_size = round_up(ctx->code_size, 8);
+        int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame);
+
+        // Update the PC-relative offset in the FDE
+        *(int32_t*)ctx->fde_p = pc_relative_offset;
+    }
 }

 // =============================================================================
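Following the derivation in the comment above, the placeholder written into the FDE is patched with -(round_up(code_size, 8) + fde_offset_in_frame). A self-contained sketch of that arithmetic with made-up sizes (the local round_up() is a stand-in for the helper used by perf_jit_trampoline.c):

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the round_up() helper: round v up to the next multiple of a
 * power-of-two alignment. */
static uint32_t round_up(uint32_t v, uint32_t align) {
    return (v + align - 1) & ~(align - 1);
}

int main(void)
{
    /* Hypothetical sizes, chosen only to make the arithmetic visible */
    uint32_t code_size           = 30;  // trampoline machine code size
    uint32_t fde_offset_in_frame = 28;  // where the FDE's PC field sits inside the EH frame

    int32_t pc_relative_offset = -(int32_t)(round_up(code_size, 8) + fde_offset_in_frame);

    /* -(32 + 28) = -60: the code starts 60 bytes before the PC offset field */
    printf("pc_relative_offset = %d\n", pc_relative_offset);
    return 0;
}
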
@@ -1002,8 +1095,10 @@ static void* perf_map_jit_init(void) {
     /* Initialize code ID counter */
     perf_jit_map_state.code_id = 0;

-    /* Configure trampoline API with padding information */
-    trampoline_api.code_padding = PERF_JIT_CODE_PADDING;
+    /* Calculate padding size based on actual unwind info requirements */
+    size_t eh_frame_size = calculate_eh_frame_size();
+    size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
+    trampoline_api.code_padding = round_up(unwind_data_size, 16);

     return &perf_jit_map_state;
 }
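With this change the padding is no longer a fixed 0x100 bytes; it is the measured EH frame size plus the EhFrameHeader, rounded up to 16 bytes. A sketch of that computation with hypothetical sizes (the header size and round_up() below are stand-ins, not the real definitions):

#include <stdio.h>
#include <stddef.h>

static size_t round_up(size_t v, size_t align) {   // stand-in for CPython's round_up()
    return (v + align - 1) & ~(align - 1);
}

int main(void)
{
    size_t eh_frame_size   = 123;  // hypothetical: result of calculate_eh_frame_size()
    size_t eh_frame_header = 20;   // hypothetical: stands in for sizeof(EhFrameHeader)

    size_t code_padding = round_up(eh_frame_header + eh_frame_size, 16);
    printf("code_padding = %zu\n", code_padding);   // 143 rounded up to 144
    return 0;
}
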
@@ -1092,6 +1187,7 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
     char buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
     ctx.code_size = code_size;
     ctx.startp = ctx.p = (uint8_t*)buffer;
+    ctx.fde_p = NULL;  // Initialize to NULL, will be set when FDE is written

     /* Generate EH frame (Exception Handling frame) data */
     elf_init_ehframe(&ctx);
@@ -1110,7 +1206,7 @@
     ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;

     /* Verify we don't exceed our padding budget */
-    assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING);
+    assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);

     ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
     ev2.mapped_size = round_up(ev2.unwind_data_size, 16);  // 16-byte alignment
@@ -1262,4 +1358,4 @@ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
     &perf_map_jit_fini,  // Cleanup function
 };

-#endif /* PY_HAVE_PERF_TRAMPOLINE */
+#endif /* PY_HAVE_PERF_TRAMPOLINE */

Python/perf_trampoline.c

Lines changed: 11 additions & 5 deletions
@@ -162,6 +162,8 @@ static void invalidate_icache(char* begin, char*end) {
 }
 #endif

+#define CODE_ALIGNMENT 32
+
 /* The function pointer is passed as last argument. The other three arguments
  * are passed in the same order as the function requires. This results in
  * shorter, more efficient ASM code for trampoline.
@@ -291,7 +293,9 @@
     void *start = &_Py_trampoline_func_start;
     void *end = &_Py_trampoline_func_end;
     size_t code_size = end - start;
-    size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
+    size_t unaligned_size = code_size + trampoline_api.code_padding;
+    size_t chunk_size = round_up(unaligned_size, CODE_ALIGNMENT);
+    assert(chunk_size % CODE_ALIGNMENT == 0);
     // TODO: Check the effect of alignment of the code chunks. Initial investigation
     // showed that this has no effect on performance in x86-64 or aarch64 and the current
     // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
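Each arena slot is therefore sized to the trampoline code plus the per-trampoline padding, rounded up to the new 32-byte CODE_ALIGNMENT. A quick illustration with hypothetical sizes (round_up() here is a stand-in for CPython's helper):

#include <assert.h>
#include <stdio.h>
#include <stddef.h>

#define CODE_ALIGNMENT 32

static size_t round_up(size_t v, size_t align) {   // stand-in for CPython's round_up()
    return (v + align - 1) & ~(align - 1);
}

int main(void)
{
    size_t code_size    = 46;    // hypothetical trampoline size in bytes
    size_t code_padding = 144;   // hypothetical padding chosen by perf_map_jit_init()

    size_t chunk_size = round_up(code_size + code_padding, CODE_ALIGNMENT);
    assert(chunk_size % CODE_ALIGNMENT == 0);
    printf("chunk_size = %zu\n", chunk_size);   // 190 rounded up to 192
    return 0;
}
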
@@ -356,7 +360,9 @@
 code_arena_new_code(code_arena_t *code_arena)
 {
     py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
-    size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding, 16);
+    size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding,
+                                      CODE_ALIGNMENT);
+    assert(total_code_size % CODE_ALIGNMENT == 0);
     code_arena->size_left -= total_code_size;
     code_arena->current_addr += total_code_size;
     return trampoline;
@@ -489,16 +495,16 @@ _PyPerfTrampoline_Init(int activate)
     }
     else {
         _PyInterpreterState_SetEvalFrameFunc(tstate->interp, py_trampoline_evaluator);
-        if (new_code_arena() < 0) {
-            return -1;
-        }
         extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
         if (extra_code_index == -1) {
             return -1;
         }
         if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
             trampoline_api.state = trampoline_api.init_state();
         }
+        if (new_code_arena() < 0) {
+            return -1;
+        }
         perf_status = PERF_STATUS_OK;
     }
 #endif
