@@ -134,18 +134,20 @@ mark_executable(unsigned char *memory, size_t size)
134134
135135// JIT compiler stuff: /////////////////////////////////////////////////////////
136136
// Size of one Global Offset Table (GOT) entry: a single pointer.
#define GOT_SLOT_SIZE sizeof(uintptr_t)
// Number of 32-bit words in a symbol bitmap (widened from 4 to 8 to make
// room for the GOT symbol ordinals alongside the trampoline ordinals).
#define SYMBOL_MASK_WORDS 8

// Bitmap with one bit per symbol ordinal.
typedef uint32_t symbol_mask[SYMBOL_MASK_WORDS];

// A packed region of per-symbol slots (trampolines or GOT entries).
// `mask` records which ordinals have a slot; slots live contiguously in
// `mem`, and `size` is the total byte size of the region.
typedef struct {
    unsigned char *mem;
    symbol_mask mask;
    size_t size;
} symbol_state;

// Per-compilation JIT state: the trampoline region, the GOT region, and
// the code offset of each uop instruction in the trace.
typedef struct {
    symbol_state trampolines;
    symbol_state got_symbols;
    uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH];
} jit_state;
151153
@@ -210,6 +212,33 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
210212// - x86_64-unknown-linux-gnu:
211213// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp
212214
215+
216+ // Get the symbol slot memory location for a given symbol ordinal.
217+ static unsigned char *
218+ get_symbol_slot (int ordinal , symbol_state * state , int size )
219+ {
220+ const uint32_t symbol_mask = 1U << (ordinal % 32 );
221+ const uint32_t state_mask = state -> mask [ordinal / 32 ];
222+ assert (symbol_mask & state_mask );
223+
224+ // Count the number of set bits in the symbol mask lower than ordinal
225+ size_t index = _Py_popcount32 (state_mask & (symbol_mask - 1 ));
226+ for (int i = 0 ; i < ordinal / 32 ; i ++ ) {
227+ index += _Py_popcount32 (state -> mask [i ]);
228+ }
229+
230+ unsigned char * slot = state -> mem + index * size ;
231+ assert ((size_t )(index + 1 ) * size <= state -> size );
232+ return slot ;
233+ }
234+
235+ // Return the address of the GOT slot for the requested symbol ordinal.
236+ static uintptr_t
237+ got_symbol_address (int ordinal , jit_state * state )
238+ {
239+ return (uintptr_t )get_symbol_slot (ordinal , & state -> got_symbols , GOT_SLOT_SIZE );
240+ }
241+
213242// Many of these patches are "relaxing", meaning that they can rewrite the
214243// code they're patching to be more efficient (like turning a 64-bit memory
215244// load into a 32-bit immediate load). These patches have an "x" in their name.
@@ -452,6 +481,7 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value)
452481 patch_32r (location , value );
453482}
454483
484+ void patch_got_symbol (jit_state * state , int ordinal );
455485void patch_aarch64_trampoline (unsigned char * location , int ordinal , jit_state * state );
456486void patch_x86_64_trampoline (unsigned char * location , int ordinal , jit_state * state );
457487
@@ -470,23 +500,13 @@ void patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *st
470500 #define DATA_ALIGN 1
471501#endif
472502
473- // Get the trampoline memory location for a given symbol ordinal.
474- static unsigned char *
475- get_trampoline_slot ( int ordinal , jit_state * state )
503+ // Populate the GOT entry for the given symbol ordinal with its resolved address .
504+ void
505+ patch_got_symbol ( jit_state * state , int ordinal )
476506{
477- const uint32_t symbol_mask = 1 << (ordinal % 32 );
478- const uint32_t trampoline_mask = state -> trampolines .mask [ordinal / 32 ];
479- assert (symbol_mask & trampoline_mask );
480-
481- // Count the number of set bits in the trampoline mask lower than ordinal
482- int index = _Py_popcount32 (trampoline_mask & (symbol_mask - 1 ));
483- for (int i = 0 ; i < ordinal / 32 ; i ++ ) {
484- index += _Py_popcount32 (state -> trampolines .mask [i ]);
485- }
486-
487- unsigned char * trampoline = state -> trampolines .mem + index * TRAMPOLINE_SIZE ;
488- assert ((size_t )(index + 1 ) * TRAMPOLINE_SIZE <= state -> trampolines .size );
489- return trampoline ;
507+ uint64_t value = (uintptr_t )symbols_map [ordinal ];
508+ unsigned char * location = (unsigned char * )get_symbol_slot (ordinal , & state -> got_symbols , GOT_SLOT_SIZE );
509+ patch_64 (location , value );
490510}
491511
492512// Generate and patch AArch64 trampolines. The symbols to jump to are stored
@@ -506,8 +526,7 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state)
506526 }
507527
508528 // Out of range - need a trampoline
509- uint32_t * p = (uint32_t * )get_trampoline_slot (ordinal , state );
510-
529+ uint32_t * p = (uint32_t * )get_symbol_slot (ordinal , & state -> trampolines , TRAMPOLINE_SIZE );
511530
512531 /* Generate the trampoline
513532 0: 58000048 ldr x8, 8
@@ -537,7 +556,7 @@ patch_x86_64_trampoline(unsigned char *location, int ordinal, jit_state *state)
537556 }
538557
539558 // Out of range - need a trampoline
540- unsigned char * trampoline = get_trampoline_slot (ordinal , state );
559+ unsigned char * trampoline = get_symbol_slot (ordinal , & state -> trampolines , TRAMPOLINE_SIZE );
541560
542561 /* Generate the trampoline (14 bytes, padded to 16):
543562 0: ff 25 00 00 00 00 jmp *(%rip)
@@ -579,21 +598,26 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
579598 code_size += group -> code_size ;
580599 data_size += group -> data_size ;
581600 combine_symbol_mask (group -> trampoline_mask , state .trampolines .mask );
601+ combine_symbol_mask (group -> got_mask , state .got_symbols .mask );
582602 }
583603 group = & stencil_groups [_FATAL_ERROR ];
584604 code_size += group -> code_size ;
585605 data_size += group -> data_size ;
586606 combine_symbol_mask (group -> trampoline_mask , state .trampolines .mask );
607+ combine_symbol_mask (group -> got_mask , state .got_symbols .mask );
587608 // Calculate the size of the trampolines required by the whole trace
588609 for (size_t i = 0 ; i < Py_ARRAY_LENGTH (state .trampolines .mask ); i ++ ) {
589610 state .trampolines .size += _Py_popcount32 (state .trampolines .mask [i ]) * TRAMPOLINE_SIZE ;
590611 }
612+ for (size_t i = 0 ; i < Py_ARRAY_LENGTH (state .got_symbols .mask ); i ++ ) {
613+ state .got_symbols .size += _Py_popcount32 (state .got_symbols .mask [i ]) * GOT_SLOT_SIZE ;
614+ }
591615 // Round up to the nearest page:
592616 size_t page_size = get_page_size ();
593617 assert ((page_size & (page_size - 1 )) == 0 );
594618 size_t code_padding = DATA_ALIGN - ((code_size + state .trampolines .size ) & (DATA_ALIGN - 1 ));
595- size_t padding = page_size - ((code_size + state .trampolines .size + code_padding + data_size ) & (page_size - 1 ));
596- size_t total_size = code_size + state .trampolines .size + code_padding + data_size + padding ;
619+ size_t padding = page_size - ((code_size + state .trampolines .size + code_padding + data_size + state . got_symbols . size ) & (page_size - 1 ));
620+ size_t total_size = code_size + state .trampolines .size + code_padding + data_size + state . got_symbols . size + padding ;
597621 unsigned char * memory = jit_alloc (total_size );
598622 if (memory == NULL ) {
599623 return -1 ;
@@ -603,6 +627,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
603627 OPT_STAT_ADD (jit_code_size , code_size );
604628 OPT_STAT_ADD (jit_trampoline_size , state .trampolines .size );
605629 OPT_STAT_ADD (jit_data_size , data_size );
630+ OPT_STAT_ADD (jit_got_size , state .got_symbols .size );
606631 OPT_STAT_ADD (jit_padding_size , padding );
607632 OPT_HIST (total_size , trace_total_memory_hist );
608633 // Update the offsets of each instruction:
@@ -613,6 +638,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
613638 unsigned char * code = memory ;
614639 state .trampolines .mem = memory + code_size ;
615640 unsigned char * data = memory + code_size + state .trampolines .size + code_padding ;
641+ state .got_symbols .mem = data + data_size ;
616642 assert (trace [0 ].opcode == _START_EXECUTOR || trace [0 ].opcode == _COLD_EXIT || trace [0 ].opcode == _COLD_DYNAMIC_EXIT );
617643 for (size_t i = 0 ; i < length ; i ++ ) {
618644 const _PyUOpInstruction * instruction = & trace [i ];
@@ -654,19 +680,21 @@ compile_trampoline(void)
654680 code_size += group -> code_size ;
655681 data_size += group -> data_size ;
656682 combine_symbol_mask (group -> trampoline_mask , state .trampolines .mask );
683+ combine_symbol_mask (group -> got_mask , state .got_symbols .mask );
657684 // Round up to the nearest page:
658685 size_t page_size = get_page_size ();
659686 assert ((page_size & (page_size - 1 )) == 0 );
660687 size_t code_padding = DATA_ALIGN - ((code_size + state .trampolines .size ) & (DATA_ALIGN - 1 ));
661- size_t padding = page_size - ((code_size + state .trampolines .size + code_padding + data_size ) & (page_size - 1 ));
662- size_t total_size = code_size + state .trampolines .size + code_padding + data_size + padding ;
688+ size_t padding = page_size - ((code_size + state .trampolines .size + code_padding + data_size + state . got_symbols . size ) & (page_size - 1 ));
689+ size_t total_size = code_size + state .trampolines .size + code_padding + data_size + state . got_symbols . size + padding ;
663690 unsigned char * memory = jit_alloc (total_size );
664691 if (memory == NULL ) {
665692 return NULL ;
666693 }
667694 unsigned char * code = memory ;
668695 state .trampolines .mem = memory + code_size ;
669696 unsigned char * data = memory + code_size + state .trampolines .size + code_padding ;
697+ state .got_symbols .mem = data + data_size ;
670698 // Compile the shim, which handles converting between the native
671699 // calling convention and the calling convention used by jitted code
672700 // (which may be different for efficiency reasons).
0 commit comments