diff --git a/docs/design/coreclr/botr/xplat-minidump-generation.md b/docs/design/coreclr/botr/xplat-minidump-generation.md index 20380fe2220107..d2bfeb20921862 100644 --- a/docs/design/coreclr/botr/xplat-minidump-generation.md +++ b/docs/design/coreclr/botr/xplat-minidump-generation.md @@ -60,8 +60,9 @@ Environment variables supported: - `COMPlus_DbgEnableMiniDump`: if set to "1", enables this core dump generation. The default is NOT to generate a dump. - `COMPlus_DbgMiniDumpType`: See below. Default is "2" MiniDumpWithPrivateReadWriteMemory. -- `COMPlus_DbgMiniDumpName`: if set, use as the template to create the dump path and file name. The pid can be placed in the name with %d. The default is _/tmp/coredump.%d_. +- `COMPlus_DbgMiniDumpName`: if set, use as the template to create the dump path and file name. See "Dump name formatting" for how the dump name can be formatted. The default is _/tmp/coredump.%p_. - `COMPlus_CreateDumpDiagnostics`: if set to "1", enables the _createdump_ utilities diagnostic messages (TRACE macro). +- `COMPlus_EnableCrashReport`: In .NET 6.0 or greater, if set to "1", createdump also generates a json formatted crash report which includes information about the threads and stack frames of the crashing application. The crash report name is the dump path/name with _.crashreport.json_ appended. COMPlus_DbgMiniDumpType values: @@ -79,15 +80,23 @@ COMPlus_DbgMiniDumpType values: The createdump utility can also be run from the command line on arbitrary .NET Core processes. The type of dump can be controlled with the below command switches. The default is a "minidump" which contains the majority the memory and managed state needed. Unless you have ptrace (CAP_SYS_PTRACE) administrative privilege, you need to run with sudo or su. The same as if you were attaching with lldb or other native debugger. -`sudo createdump ` - - -f, --name - dump path and file name. The %p, %e, %h %t format characters are supported. The default is '/tmp/coredump.%p' - -n, --normal - create minidump. - -h, --withheap - create minidump with heap (default). - -t, --triage - create triage minidump. - -u, --full - create full core dump. - -d, --diag - enable diagnostic messages. - +``` +createdump [options] pid +-f, --name - dump path and file name. The default is '/tmp/coredump.%p'. These specifiers are substituted with following values: + %p PID of dumped process. + %e The process executable filename. + %h Hostname return by gethostname(). + %t Time of dump, expressed as seconds since the Epoch, 1970-01-01 00:00:00 +0000 (UTC). +-n, --normal - create minidump. +-h, --withheap - create minidump with heap (default). +-t, --triage - create triage minidump. +-u, --full - create full core dump. +-d, --diag - enable diagnostic messages. +-v, --verbose - enable verbose diagnostic messages. +--crashreport - write crash report file. +--crashthread - the thread id of the crashing thread. +--signal - the signal code of the crash. +``` **Dump name formatting** @@ -103,12 +112,3 @@ As of .NET 5.0, the following subset of the core pattern (see [core](https://man # Testing # The test plan is to modify the SOS tests in the (still) private debuggertests repo to trigger and use the core minidumps generated. Debugging managed core dumps on Linux is not supported by _mdbg_ at this time until we have a ELF core dump reader so only the SOS tests (which use _lldb_ on Linux) will be modified. - -# Open Issues # - -- May need more than just the pid for decorating dump names for docker containers because I think the _pid_ is always 1. -- Do we need all the memory mappings from `/proc/$pid/maps` in the PT\_LOAD sections even though the memory is not actually in the dump? They have a file offset/size of 0. Full dumps generated by the system or _gdb_ do have these un-backed regions. -- There is no way to get the signal number, etc. that causes the abort from the _createdump_ utility using _ptrace_ or a /proc file. It would have to be passed from CoreCLR on the command line. -- Do we need the "dynamic" sections of each shared module in the core dump? It is part of the "link_map" entry enumerated when gathering the _DSO_ information. -- There may be more versioning and/or build id information needed to be added to the dump. -- It is unclear exactly what cases stack overflow does not get control in the signal handler and when the OS just aborts the process. diff --git a/src/coreclr/debug/createdump/crashreportwriter.cpp b/src/coreclr/debug/createdump/crashreportwriter.cpp index e72651e4f824be..1fb0865cf5d5f7 100644 --- a/src/coreclr/debug/createdump/crashreportwriter.cpp +++ b/src/coreclr/debug/createdump/crashreportwriter.cpp @@ -148,12 +148,23 @@ CrashReportWriter::WriteCrashReport() WriteValue64("BP", thread->GetFramePointer()); CloseObject(); // ctx - OpenArray("unmanaged_frames"); - for (const StackFrame& frame : thread->StackFrames()) + OpenArray("stack_frames"); + for (auto iterator = thread->StackFrames().cbegin(); iterator != thread->StackFrames().cend(); ++iterator) { - WriteStackFrame(frame); + if (thread->IsBeginRepeat(iterator)) + { + OpenObject(); + WriteValue32("repeated", thread->NumRepeatedFrames()); + OpenArray("repeated_frames"); + } + if (thread->IsEndRepeat(iterator)) + { + CloseArray(); // repeated_frames + CloseObject(); + } + WriteStackFrame(*iterator); } - CloseArray(); // unmanaged_frames + CloseArray(); // stack_frames CloseObject(); } CloseArray(); // threads diff --git a/src/coreclr/debug/createdump/main.cpp b/src/coreclr/debug/createdump/main.cpp index 4ed6366d4675ad..f91a81ca66ccb8 100644 --- a/src/coreclr/debug/createdump/main.cpp +++ b/src/coreclr/debug/createdump/main.cpp @@ -95,10 +95,13 @@ int __cdecl main(const int argc, const char* argv[]) { dumpType = "triage minidump"; minidumpType = (MINIDUMP_TYPE)(MiniDumpFilterTriage | - MiniDumpWithDataSegs | - MiniDumpWithHandleData | + MiniDumpIgnoreInaccessibleMemory | + MiniDumpWithoutOptionalData | + MiniDumpWithProcessThreadData | MiniDumpFilterModulePaths | - MiniDumpWithThreadInfo); + MiniDumpWithUnloadedModules | + MiniDumpFilterMemory | + MiniDumpWithHandleData); } else if ((strcmp(*argv, "-u") == 0) || (strcmp(*argv, "--full") == 0)) { diff --git a/src/coreclr/debug/createdump/threadinfo.cpp b/src/coreclr/debug/createdump/threadinfo.cpp index 2107c6c1bafb73..82509f52750653 100644 --- a/src/coreclr/debug/createdump/threadinfo.cpp +++ b/src/coreclr/debug/createdump/threadinfo.cpp @@ -141,7 +141,12 @@ ThreadInfo::UnwindThread(IXCLRDataProcess* pClrDataProcess, ISOSDacInterface* pS m_managed = true; ReleaseHolder pException; - if (SUCCEEDED(pTask->GetCurrentExceptionState(&pException))) + HRESULT hr = pTask->GetCurrentExceptionState(&pException); + if (FAILED(hr)) + { + hr = pTask->GetLastExceptionState(&pException); + } + if (SUCCEEDED(hr)) { TRACE("Unwind: found managed exception\n"); @@ -270,18 +275,77 @@ ThreadInfo::GatherStackFrames(CONTEXT* pContext, IXCLRDataStackWalk* pStackwalk) AddStackFrame(frame); } +// This function deals with two types of frames: duplicate stack frames (SP is equal) and repeated frames (IP is +// equal) because of a stack overflow. +// +// The list of constraints: +// +// 1) The StackFrame is immutable i.e. can't add some kind of repeat count to the frame. Making it mutable is big hassle. +// 2) The native unwinding can repeat the same frame SP/IP. These frames are not counted as repeated stack overflow ones. +// 3) Only add the repeated stack overflow frames once to frames set. This saves time and memory. void ThreadInfo::AddStackFrame(const StackFrame& frame) { - std::pair::iterator,bool> result = m_frames.insert(frame); - if (result.second) + // This filters out the duplicate stack frames that are the result the native + // unwinding happening between each managed frame. If the SP matches a frame + // already in the set, skip it. + const std::set::iterator& found = m_frames.find(frame); + if (found == m_frames.end()) { + // Aggregated the repeated stack frames only for stack overflow exceptions + if (m_exceptionHResult == STACK_OVERFLOW_EXCEPTION) + { + // Check for repeats through all the stack frames so far until we find one + if (m_beginRepeat == m_frames.end()) + { + for (auto iterator = m_frames.cbegin(); iterator != m_frames.cend(); ++iterator) + { + if (frame.InstructionPointer() == iterator->InstructionPointer()) + { + m_repeatedFrames++; + m_beginRepeat = iterator; + TRACE("Unwind: begin repeat sp %p ip %p\n", (void*)frame.StackPointer(), (void*)frame.InstructionPointer()); + return; + } + } + } + + // Check for repeats until we stop find them + if (m_endRepeat == m_frames.end()) + { + for (auto iterator = m_beginRepeat; iterator != m_endRepeat; ++iterator) + { + if (frame.InstructionPointer() == iterator->InstructionPointer()) + { + m_repeatedFrames++; + return; + } + } + } + } + + // Add the non-duplicate and (if stack overflow) non-repeating frames to set + std::pair::iterator, bool> result = m_frames.insert(frame); + assert(result.second); + TRACE("Unwind: sp %p ip %p off %08x mod %p%c\n", - (void*)frame.StackPointer(), - (void*)frame.InstructionPointer(), - frame.NativeOffset(), - (void*)frame.ModuleAddress(), - frame.IsManaged() ? '*' : ' '); + (void*)frame.StackPointer(), (void*)frame.InstructionPointer(), frame.NativeOffset(), (void*)frame.ModuleAddress(), frame.IsManaged() ? '*' : ' '); + + // Don't start tracking the end of the repeated frames until there is a start + if (m_beginRepeat != m_frames.end() && m_endRepeat == m_frames.end()) + { + TRACE("Unwind: end repeat sp %p ip %p\n", (void*)frame.StackPointer(), (void*)frame.InstructionPointer()); + m_endRepeat = result.first; + + // Count the number of frames in the repeating sequence and calculate how many times the sequence was repeated + int framesRepeated = 0; + for (auto iterator = m_beginRepeat; iterator != m_endRepeat; ++iterator) + { + framesRepeated++; + } + // The total number of individually repeated frames has to be greater than the number of frames in the repeating sequence + m_repeatedFrames = framesRepeated > 0 && m_repeatedFrames >= framesRepeated ? (m_repeatedFrames / framesRepeated) + 1 : 0; + } } } diff --git a/src/coreclr/debug/createdump/threadinfo.h b/src/coreclr/debug/createdump/threadinfo.h index 7ce0df5f1ec18c..4142b55e333dd7 100644 --- a/src/coreclr/debug/createdump/threadinfo.h +++ b/src/coreclr/debug/createdump/threadinfo.h @@ -33,6 +33,8 @@ struct user_vfpregs_struct } __attribute__((__packed__)); #endif +#define STACK_OVERFLOW_EXCEPTION 0x800703e9 + class ThreadInfo { private: @@ -43,8 +45,11 @@ class ThreadInfo bool m_managed; // if true, thread has managed code running uint64_t m_exceptionObject; // exception object address std::string m_exceptionType; // exception type - int32_t m_exceptionHResult; // exception HRESULT + uint32_t m_exceptionHResult; // exception HRESULT std::set m_frames; // stack frames + int m_repeatedFrames; // number of repeated frames + std::set::const_iterator m_beginRepeat; // beginning of stack overflow repeated frame sequence + std::set::const_iterator m_endRepeat; // end of repeated frame sequence #ifdef __APPLE__ mach_port_t m_port; // MacOS thread port @@ -88,9 +93,12 @@ class ThreadInfo inline bool IsManaged() const { return m_managed; } inline uint64_t ManagedExceptionObject() const { return m_exceptionObject; } - inline int32_t ManagedExceptionHResult() const { return m_exceptionHResult; } + inline uint32_t ManagedExceptionHResult() const { return m_exceptionHResult; } inline std::string ManagedExceptionType() const { return m_exceptionType; } - inline const std::set StackFrames() const { return m_frames; } + inline const std::set& StackFrames() const { return m_frames; } + inline int NumRepeatedFrames() const { return m_repeatedFrames; } + inline bool IsBeginRepeat(std::set::const_iterator& iterator) const { return m_repeatedFrames > 0 && iterator == m_beginRepeat; } + inline bool IsEndRepeat(std::set::const_iterator& iterator) const { return m_repeatedFrames > 0 && iterator == m_endRepeat; } #ifdef __APPLE__ #if defined(__x86_64__) diff --git a/src/coreclr/debug/createdump/threadinfomac.cpp b/src/coreclr/debug/createdump/threadinfomac.cpp index 92b9339088fcab..918f0ae2310b5a 100644 --- a/src/coreclr/debug/createdump/threadinfomac.cpp +++ b/src/coreclr/debug/createdump/threadinfomac.cpp @@ -6,11 +6,16 @@ ThreadInfo::ThreadInfo(CrashInfo& crashInfo, pid_t tid, mach_port_t port) : m_crashInfo(crashInfo), m_tid(tid), + m_ppid(0), + m_tgid(0), m_managed(false), m_exceptionObject(0), m_exceptionHResult(0), + m_repeatedFrames(0), m_port(port) { + m_beginRepeat = m_frames.end(); + m_endRepeat = m_frames.end(); } ThreadInfo::~ThreadInfo() diff --git a/src/coreclr/debug/createdump/threadinfounix.cpp b/src/coreclr/debug/createdump/threadinfounix.cpp index 90a4c8ab9ffe50..856dfbb2993793 100644 --- a/src/coreclr/debug/createdump/threadinfounix.cpp +++ b/src/coreclr/debug/createdump/threadinfounix.cpp @@ -25,10 +25,15 @@ bool GetStatus(pid_t pid, pid_t* ppid, pid_t* tgid, std::string* name); ThreadInfo::ThreadInfo(CrashInfo& crashInfo, pid_t tid) : m_crashInfo(crashInfo), m_tid(tid), + m_ppid(0), + m_tgid(0), m_managed(false), m_exceptionObject(0), - m_exceptionHResult(0) + m_exceptionHResult(0), + m_repeatedFrames(0) { + m_beginRepeat = m_frames.end(); + m_endRepeat = m_frames.end(); } ThreadInfo::~ThreadInfo()