97
97
* /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
98
98
* /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding]
99
99
*
100
- * The padding size (0x100) is chosen to accommodate typical unwind info sizes
101
- * while maintaining 16-byte alignment requirements.
100
+ * The padding size is now calculated automatically during initialization
101
+ * based on the actual unwind information requirements.
102
102
*/
103
- #define PERF_JIT_CODE_PADDING 0x100
104
103
105
104
/* Convenient access to the global trampoline API state */
106
105
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
@@ -401,10 +400,12 @@ enum {
401
400
DWRF_CFA_nop = 0x0 , // No operation
402
401
DWRF_CFA_offset_extended = 0x5 , // Extended offset instruction
403
402
DWRF_CFA_def_cfa = 0xc , // Define CFA rule
403
+ DWRF_CFA_def_cfa_register = 0xd , // Define CFA register
404
404
DWRF_CFA_def_cfa_offset = 0xe , // Define CFA offset
405
405
DWRF_CFA_offset_extended_sf = 0x11 , // Extended signed offset
406
406
DWRF_CFA_advance_loc = 0x40 , // Advance location counter
407
- DWRF_CFA_offset = 0x80 // Simple offset instruction
407
+ DWRF_CFA_offset = 0x80 , // Simple offset instruction
408
+ DWRF_CFA_restore = 0xc0 // Restore register
408
409
};
409
410
410
411
/* DWARF Exception Handling pointer encodings */
@@ -519,6 +520,7 @@ typedef struct ELFObjectContext {
519
520
uint8_t * p ; // Current write position in buffer
520
521
uint8_t * startp ; // Start of buffer (for offset calculations)
521
522
uint8_t * eh_frame_p ; // Start of EH frame data (for relative offsets)
523
+ uint8_t * fde_p ; // Start of FDE data (for PC-relative calculations)
522
524
uint32_t code_size ; // Size of the code being described
523
525
} ELFObjectContext ;
524
526
@@ -643,6 +645,8 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
643
645
// DWARF EH FRAME GENERATION
644
646
// =============================================================================
645
647
648
+ static void elf_init_ehframe (ELFObjectContext * ctx );
649
+
646
650
/*
647
651
* Initialize DWARF .eh_frame section for a code region
648
652
*
@@ -657,6 +661,23 @@ static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
657
661
* Args:
658
662
* ctx: ELF object context containing code size and buffer pointers
659
663
*/
664
+ static size_t calculate_eh_frame_size (void ) {
665
+ /* Calculate the EH frame size for the trampoline function */
666
+ extern void * _Py_trampoline_func_start ;
667
+ extern void * _Py_trampoline_func_end ;
668
+
669
+ size_t code_size = (char * )& _Py_trampoline_func_end - (char * )& _Py_trampoline_func_start ;
670
+
671
+ ELFObjectContext ctx ;
672
+ char buffer [1024 ]; // Buffer for DWARF data (1KB should be sufficient)
673
+ ctx .code_size = code_size ;
674
+ ctx .startp = ctx .p = (uint8_t * )buffer ;
675
+ ctx .fde_p = NULL ;
676
+
677
+ elf_init_ehframe (& ctx );
678
+ return ctx .p - ctx .startp ;
679
+ }
680
+
660
681
static void elf_init_ehframe (ELFObjectContext * ctx ) {
661
682
uint8_t * p = ctx -> p ;
662
683
uint8_t * framep = p ; // Remember start of frame data
@@ -784,7 +805,7 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
784
805
*
785
806
* DWRF_SECTION(FDE,
786
807
* DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here)
787
- * DWRF_U32(-0x30 ); // Initial PC-relative location of the code
808
+ * DWRF_U32(pc_relative_offset ); // PC-relative location of the code (calculated dynamically)
788
809
* DWRF_U32(ctx->code_size); // Code range covered by this FDE
789
810
* DWRF_U8(0); // Augmentation data length (none)
790
811
*
@@ -830,19 +851,31 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
830
851
DWRF_U32 (0 ); // CIE ID (0 indicates this is a CIE)
831
852
DWRF_U8 (DWRF_CIE_VERSION ); // CIE version (1)
832
853
DWRF_STR ("zR" ); // Augmentation string ("zR" = augmentation data present, holding the FDE pointer encoding)
833
- DWRF_UV (1 ); // Code alignment factor
854
+ #ifdef __x86_64__
855
+ DWRF_UV (1 ); // Code alignment factor (x86_64: 1 byte)
856
+ #elif defined(__aarch64__ ) && defined(__AARCH64EL__ ) && !defined(__ILP32__ )
857
+ DWRF_UV (4 ); // Code alignment factor (AArch64: 4 bytes per instruction)
858
+ #endif
834
859
DWRF_SV (- (int64_t )sizeof (uintptr_t )); // Data alignment factor (negative)
835
860
DWRF_U8 (DWRF_REG_RA ); // Return address register number
836
861
DWRF_UV (1 ); // Augmentation data length
837
862
DWRF_U8 (DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4 ); // FDE pointer encoding
838
863
839
864
/* Initial CFI instructions - describe default calling convention */
865
+ #ifdef __x86_64__
866
+ /* x86_64 initial CFI state */
840
867
DWRF_U8 (DWRF_CFA_def_cfa ); // Define CFA (Call Frame Address)
841
868
DWRF_UV (DWRF_REG_SP ); // CFA = SP register
842
869
DWRF_UV (sizeof (uintptr_t )); // CFA = SP + pointer_size
843
870
DWRF_U8 (DWRF_CFA_offset |DWRF_REG_RA ); // Return address is saved
844
871
DWRF_UV (1 ); // At offset 1 from CFA
845
-
872
+ #elif defined(__aarch64__ ) && defined(__AARCH64EL__ ) && !defined(__ILP32__ )
873
+ /* AArch64 initial CFI state */
874
+ DWRF_U8 (DWRF_CFA_def_cfa ); // Define CFA (Call Frame Address)
875
+ DWRF_UV (DWRF_REG_SP ); // CFA = SP register
876
+ DWRF_UV (0 ); // CFA = SP + 0 (AArch64 starts with offset 0)
877
+ // No initial register saves in AArch64 CIE
878
+ #endif
846
879
DWRF_ALIGNNOP (sizeof (uintptr_t )); // Align to pointer boundary
847
880
)
848
881
@@ -853,11 +886,15 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
853
886
*
854
887
* The FDE describes unwinding information specific to this function.
855
888
* It references the CIE and provides function-specific CFI instructions.
889
+ *
890
+ * The PC-relative offset is calculated after the entire EH frame is built
891
+ * to ensure accurate positioning relative to the synthesized DSO layout.
856
892
*/
857
893
DWRF_SECTION (FDE ,
858
894
DWRF_U32 ((uint32_t )(p - framep )); // Offset to CIE (backwards reference)
859
- DWRF_U32 (-0x30 ); // Machine code offset relative to .text
860
- DWRF_U32 (ctx -> code_size ); // Address range covered by this FDE (code lenght)
895
+ ctx -> fde_p = p ; // Remember where PC offset field is located for later calculation
896
+ DWRF_U32 (0 ); // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe)
897
+ DWRF_U32 (ctx -> code_size ); // Address range covered by this FDE (code length)
861
898
DWRF_U8 (0 ); // Augmentation data length (none)
862
899
863
900
/*
@@ -868,32 +905,36 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
868
905
* conventions and register usage patterns.
869
906
*/
870
907
#ifdef __x86_64__
871
- /* x86_64 calling convention unwinding rules */
908
+ /* x86_64 calling convention unwinding rules with frame pointer */
872
909
# if defined(__CET__ ) && (__CET__ & 1 )
873
- DWRF_U8 (DWRF_CFA_advance_loc | 8 ); // Advance location by 8 bytes when CET protection is enabled
874
- # else
875
- DWRF_U8 (DWRF_CFA_advance_loc | 4 ); // Advance location by 4 bytes
910
+ DWRF_U8 (DWRF_CFA_advance_loc | 4 ); // Advance past endbr64 (4 bytes)
876
911
# endif
877
- DWRF_U8 (DWRF_CFA_def_cfa_offset ); // Redefine CFA offset
912
+ DWRF_U8 (DWRF_CFA_advance_loc | 1 ); // Advance past push %rbp (1 byte)
913
+ DWRF_U8 (DWRF_CFA_def_cfa_offset ); // def_cfa_offset 16
878
914
DWRF_UV (16 ); // New offset: SP + 16
879
- DWRF_U8 (DWRF_CFA_advance_loc | 6 ); // Advance location by 6 bytes
880
- DWRF_U8 (DWRF_CFA_def_cfa_offset ); // Redefine CFA offset
915
+ DWRF_U8 (DWRF_CFA_offset | DWRF_REG_BP ); // offset r6 at cfa-16
916
+ DWRF_UV (2 ); // Offset factor: 2 * 8 = 16 bytes
917
+ DWRF_U8 (DWRF_CFA_advance_loc | 3 ); // Advance past mov %rsp,%rbp (3 bytes)
918
+ DWRF_U8 (DWRF_CFA_def_cfa_register ); // def_cfa_register r6
919
+ DWRF_UV (DWRF_REG_BP ); // Use base pointer register
920
+ DWRF_U8 (DWRF_CFA_advance_loc | 3 ); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3
921
+ DWRF_U8 (DWRF_CFA_def_cfa ); // def_cfa r7 ofs 8
922
+ DWRF_UV (DWRF_REG_SP ); // Use stack pointer register
881
923
DWRF_UV (8 ); // New offset: SP + 8
882
924
#elif defined(__aarch64__ ) && defined(__AARCH64EL__ ) && !defined(__ILP32__ )
883
925
/* AArch64 calling convention unwinding rules */
884
- DWRF_U8 (DWRF_CFA_advance_loc | 1 ); // Advance location by 1 instruction (stp x29, x30)
885
- DWRF_U8 (DWRF_CFA_def_cfa_offset ); // Redefine CFA offset
886
- DWRF_UV (16 ); // CFA = SP + 16 (stack pointer after push)
887
- DWRF_U8 (DWRF_CFA_offset | DWRF_REG_FP ); // Frame pointer (x29) saved
888
- DWRF_UV (2 ); // At offset 2 from CFA (2 * 8 = 16 bytes)
889
- DWRF_U8 (DWRF_CFA_offset | DWRF_REG_RA ); // Link register (x30) saved
890
- DWRF_UV (1 ); // At offset 1 from CFA (1 * 8 = 8 bytes)
891
- DWRF_U8 (DWRF_CFA_advance_loc | 3 ); // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...)
892
- DWRF_U8 (DWRF_CFA_offset | DWRF_REG_FP ); // Restore frame pointer (x29)
893
- DWRF_U8 (DWRF_CFA_offset | DWRF_REG_RA ); // Restore link register (x30)
894
- DWRF_U8 (DWRF_CFA_def_cfa_offset ); // Final CFA adjustment
895
- DWRF_UV (0 ); // CFA = SP + 0 (stack restored)
896
-
926
+ DWRF_U8 (DWRF_CFA_advance_loc | 1 ); // Advance by 1 instruction (4 bytes)
927
+ DWRF_U8 (DWRF_CFA_def_cfa_offset ); // CFA = SP + 16
928
+ DWRF_UV (16 ); // Stack pointer moved by 16 bytes
929
+ DWRF_U8 (DWRF_CFA_offset | DWRF_REG_FP ); // x29 (frame pointer) saved
930
+ DWRF_UV (2 ); // At CFA-16 (2 * 8 = 16 bytes from CFA)
931
+ DWRF_U8 (DWRF_CFA_offset | DWRF_REG_RA ); // x30 (link register) saved
932
+ DWRF_UV (1 ); // At CFA-8 (1 * 8 = 8 bytes from CFA)
933
+ DWRF_U8 (DWRF_CFA_advance_loc | 3 ); // Advance by 3 instructions (12 bytes)
934
+ DWRF_U8 (DWRF_CFA_restore | DWRF_REG_RA ); // Restore x30 - NO DWRF_UV() after this!
935
+ DWRF_U8 (DWRF_CFA_restore | DWRF_REG_FP ); // Restore x29 - NO DWRF_UV() after this!
936
+ DWRF_U8 (DWRF_CFA_def_cfa_offset ); // CFA = SP + 0 (stack restored)
937
+ DWRF_UV (0 ); // Back to original stack position
897
938
#else
898
939
# error "Unsupported target architecture"
899
940
#endif
@@ -902,6 +943,58 @@ static void elf_init_ehframe(ELFObjectContext* ctx) {
902
943
)
903
944
904
945
ctx -> p = p ; // Update context pointer to end of generated data
946
+
947
+ /* Calculate and update the PC-relative offset in the FDE
948
+ *
949
+ * When perf processes the jitdump, it creates a synthesized DSO with this layout:
950
+ *
951
+ * Synthesized DSO Memory Layout:
952
+ * ┌─────────────────────────────────────────────────────────────┐ < code_start
953
+ * │ Code Section │
954
+ * │ (round_up(code_size, 8) bytes) │
955
+ * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data
956
+ * │ EH Frame Data │
957
+ * │ ┌─────────────────────────────────────────────────────┐ │
958
+ * │ │ CIE data │ │
959
+ * │ └─────────────────────────────────────────────────────┘ │
960
+ * │ ┌─────────────────────────────────────────────────────┐ │
961
+ * │ │ FDE Header: │ │
962
+ * │ │ - CIE offset (4 bytes) │ │
963
+ * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start
964
+ * │ │ - address range (4 bytes) │ │ (this specific field)
965
+ * │ │ CFI Instructions... │ │
966
+ * │ └─────────────────────────────────────────────────────┘ │
967
+ * ├─────────────────────────────────────────────────────────────┤ < reference_point
968
+ * │ EhFrameHeader │
969
+ * │ (navigation metadata) │
970
+ * └─────────────────────────────────────────────────────────────┘
971
+ *
972
+ * The PC offset field in the FDE must contain the distance from itself to code_start:
973
+ *
974
+ * distance = code_start - fde_pc_field
975
+ *
976
+ * Where:
977
+ * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame
978
+ * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8)
979
+ *
980
+ * Therefore:
981
+ * distance = code_start_location - fde_pc_field_location
982
+ * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame)
983
+ * = -rounded_code_size - fde_offset_in_frame
984
+ * = -(round_up(code_size, 8) + fde_offset_in_frame)
985
+ *
986
+ * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field (ctx->fde_p - ctx->startp).
987
+ *
988
+ */
989
+ if (ctx -> fde_p != NULL ) {
990
+ int32_t fde_offset_in_frame = (ctx -> fde_p - ctx -> startp );
991
+ int32_t rounded_code_size = round_up (ctx -> code_size , 8 );
992
+ int32_t pc_relative_offset = - (rounded_code_size + fde_offset_in_frame );
993
+
994
+
995
+ // Update the PC-relative offset in the FDE
996
+ * (int32_t * )ctx -> fde_p = pc_relative_offset ;
997
+ }
905
998
}
906
999
907
1000
// =============================================================================
@@ -1002,8 +1095,10 @@ static void* perf_map_jit_init(void) {
1002
1095
/* Initialize code ID counter */
1003
1096
perf_jit_map_state .code_id = 0 ;
1004
1097
1005
- /* Configure trampoline API with padding information */
1006
- trampoline_api .code_padding = PERF_JIT_CODE_PADDING ;
1098
+ /* Calculate padding size based on actual unwind info requirements */
1099
+ size_t eh_frame_size = calculate_eh_frame_size ();
1100
+ size_t unwind_data_size = sizeof (EhFrameHeader ) + eh_frame_size ;
1101
+ trampoline_api .code_padding = round_up (unwind_data_size , 16 );
1007
1102
1008
1103
return & perf_jit_map_state ;
1009
1104
}
@@ -1092,6 +1187,7 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
1092
1187
char buffer [1024 ]; // Buffer for DWARF data (1KB should be sufficient)
1093
1188
ctx .code_size = code_size ;
1094
1189
ctx .startp = ctx .p = (uint8_t * )buffer ;
1190
+ ctx .fde_p = NULL ; // Initialize to NULL, will be set when FDE is written
1095
1191
1096
1192
/* Generate EH frame (Exception Handling frame) data */
1097
1193
elf_init_ehframe (& ctx );
@@ -1110,7 +1206,7 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
1110
1206
ev2 .unwind_data_size = sizeof (EhFrameHeader ) + eh_frame_size ;
1111
1207
1112
1208
/* Verify we don't exceed our padding budget */
1113
- assert (ev2 .unwind_data_size <= PERF_JIT_CODE_PADDING );
1209
+ assert (ev2 .unwind_data_size <= ( uint64_t ) trampoline_api . code_padding );
1114
1210
1115
1211
ev2 .eh_frame_hdr_size = sizeof (EhFrameHeader );
1116
1212
ev2 .mapped_size = round_up (ev2 .unwind_data_size , 16 ); // 16-byte alignment
@@ -1262,4 +1358,4 @@ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
1262
1358
& perf_map_jit_fini , // Cleanup function
1263
1359
};
1264
1360
1265
- #endif /* PY_HAVE_PERF_TRAMPOLINE */
1361
+ #endif /* PY_HAVE_PERF_TRAMPOLINE */
0 commit comments