From cbe48868eb090743e18bb1d3b6edbb17f4f89fb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Wed, 3 Jun 2026 02:15:34 +0200 Subject: [PATCH] gh-150723: Fix perf jitdump files on macOS (GH-150728) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The perf jitdump format defines the thread id field of the JR_CODE_LOAD record as a 32-bit value, but on macOS it was declared as a uint64_t (since pthread_threadid_np() returns a uint64_t). Those extra 8 bytes plus alignment padding shifted every following field, so parsers reading the file by the spec misread code_size as the code address and failed to resolve any Python frames. Declare thread_id as uint32_t on all platforms and truncate the macOS thread id when writing the record. The value is only informational. Symbols are resolved by address, not by thread id, so truncation is safe here. * Use mach_absolute_time for macOS jitdump timestamps On macOS the jitdump file is consumed by profilers such as samply, which timestamp their samples using mach_absolute_time(). The jitdump events were stamped with clock_gettime(CLOCK_MONOTONIC), a different clock domain that keeps advancing while the system is asleep, so the JIT code mappings could be off by days relative to the samples and no Python frame would resolve. Stamp jitdump events with mach_absolute_time() on macOS so they share the sampler's clock domain. Linux continues to use CLOCK_MONOTONIC to stay aligned with perf. Exercise the -Xperf_jit (jitdump) backend through samply and assert that Python frames resolve, exercising the binary jitdump path end to end. Skipped when samply is not installed. 
(cherry picked from commit 494f2e3c92cc1b7774cca16fca5c7d1ff18c0de2) Co-authored-by: Nazım Can Altınova --- Lib/test/test_samply_profiler.py | 24 +++++++++++++++ ...-06-01-19-21-01.gh-issue-150723.Hb3JDG.rst | 4 +++ ...-06-01-19-24-12.gh-issue-150723.WlcL_-.rst | 4 +++ Python/perf_jit_trampoline.c | 29 +++++++++++++++---- 4 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-06-01-19-21-01.gh-issue-150723.Hb3JDG.rst create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-06-01-19-24-12.gh-issue-150723.WlcL_-.rst diff --git a/Lib/test/test_samply_profiler.py b/Lib/test/test_samply_profiler.py index ec0ed37ffd047b7..f9ab9207c3c23d8 100644 --- a/Lib/test/test_samply_profiler.py +++ b/Lib/test/test_samply_profiler.py @@ -240,5 +240,29 @@ def compile_trampolines_for_all_functions(): self.assertIn(line, child_perf_file_contents) +@unittest.skipUnless(samply_command_works(), "samply command doesn't work") +class TestSamplyProfilerWithJitDump(unittest.TestCase, TestSamplyProfilerMixin): + # Regression test for gh-150723: exercises the binary jitdump backend + # (-Xperf_jit) end to end through samply, unlike TestSamplyProfiler which + # uses the textual perf-map backend (-Xperf). 
+ def run_samply(self, script_dir, script, activate_trampoline=True): + if activate_trampoline: + return run_samply(script_dir, sys.executable, "-Xperf_jit", script) + return run_samply(script_dir, sys.executable, script) + + def setUp(self): + super().setUp() + self.jit_files = set(pathlib.Path("/tmp/").glob("jit-*.dump")) + self.jit_files |= set(pathlib.Path("/tmp/").glob("jitted-*.so")) + + def tearDown(self) -> None: + super().tearDown() + files_to_delete = set(pathlib.Path("/tmp/").glob("jit-*.dump")) + files_to_delete |= set(pathlib.Path("/tmp/").glob("jitted-*.so")) + files_to_delete -= self.jit_files + for file in files_to_delete: + file.unlink() + + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-06-01-19-21-01.gh-issue-150723.Hb3JDG.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-06-01-19-21-01.gh-issue-150723.Hb3JDG.rst new file mode 100644 index 000000000000000..1920c8cdfce4f4c --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-06-01-19-21-01.gh-issue-150723.Hb3JDG.rst @@ -0,0 +1,4 @@ +Fix malformed perf jitdump thread ids on macOS. The ``thread_id`` field of the +``JR_CODE_LOAD`` record was written as a 64-bit value instead of the 32-bit +value required by the jitdump format, which shifted every following field and +prevented profilers from resolving Python frames. diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-06-01-19-24-12.gh-issue-150723.WlcL_-.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-06-01-19-24-12.gh-issue-150723.WlcL_-.rst new file mode 100644 index 000000000000000..78c896b669c2393 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-06-01-19-24-12.gh-issue-150723.WlcL_-.rst @@ -0,0 +1,4 @@ +Fix perf jitdump timestamps on macOS. Events were stamped using +``CLOCK_MONOTONIC``, but macOS profilers timestamp their samples with +``mach_absolute_time()``. The mismatch prevented the JIT code mappings from +lining up with the samples, so no Python frame could be resolved. 
diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 0c460282feceef9..32b147199544cfc 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -82,6 +82,9 @@ #if defined(__linux__) # include <sys/syscall.h> // System call interface #endif +#if defined(__APPLE__) +# include <mach/mach_time.h> // mach_absolute_time, mach_timebase_info +#endif // ============================================================================= // CONSTANTS AND CONFIGURATION @@ -217,11 +220,7 @@ struct BaseEvent { typedef struct { struct BaseEvent base; // Common event header uint32_t process_id; // Process ID where code was generated -#if defined(__APPLE__) - uint64_t thread_id; // Thread ID where code was generated -#else uint32_t thread_id; // Thread ID where code was generated -#endif uint64_t vma; // Virtual memory address where code is loaded uint64_t code_address; // Address of the actual machine code uint64_t code_size; // Size of the machine code in bytes @@ -295,7 +294,9 @@ static PerfMapJitState perf_jit_map_state; // ============================================================================= /* Time conversion constant */ +#if !defined(__APPLE__) static const intptr_t nanoseconds_per_second = 1000000000; +#endif /* * Get current monotonic time in nanoseconds @@ -307,6 +308,18 @@ static const intptr_t nanoseconds_per_second = 1000000000; * Returns: Current monotonic time in nanoseconds since an arbitrary epoch */ static int64_t get_current_monotonic_ticks(void) { +#if defined(__APPLE__) + // On macOS the jitdump file is consumed by profilers (such as samply) that + // timestamp their samples using mach_absolute_time(). The jitdump event + // timestamps must use the same clock domain, otherwise the JIT code + // mappings cannot be lined up with the samples. 
+ static mach_timebase_info_data_t timebase = {0, 0}; + if (timebase.denom == 0) { + (void)mach_timebase_info(&timebase); + } + uint64_t ticks = mach_absolute_time(); + return (int64_t)(ticks * timebase.numer / timebase.denom); +#else struct timespec ts; if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { Py_UNREACHABLE(); // Should never fail on supported systems @@ -318,6 +331,7 @@ static int64_t get_current_monotonic_ticks(void) { result *= nanoseconds_per_second; result += ts.tv_nsec; return result; +#endif } /* @@ -652,7 +666,12 @@ static void perf_map_jit_write_entry_with_name( ev.base.time_stamp = get_current_monotonic_ticks(); ev.process_id = getpid(); #if defined(__APPLE__) - pthread_threadid_np(NULL, &ev.thread_id); + // The jitdump format defines the thread id field as a 32-bit value, but + // pthread_threadid_np() returns a 64-bit id. Truncate it to 32 bits to + // keep the record layout identical to other platforms. + uint64_t thread_id = 0; + pthread_threadid_np(NULL, &thread_id); + ev.thread_id = (uint32_t)thread_id; #else ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call #endif