Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core: add core_mutex_debug to aid debugging deadlocks #18620

Merged
merged 1 commit into from
Apr 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
core: add core_mutex_debug to aid debugging deadlocks
Adding `USEMODULE += core_mutex_debug` to your `Makefile` results in
on log messages such as

    [mutex] waiting for thread 1 (pc = 0x800024d)

being added whenever `mutex_lock()` blocks. This makes tracing down
deadlocks easier.
  • Loading branch information
Marian Buschsieweke committed Apr 25, 2023
commit 3e86e39646f3ff765210a1f14f166b6fc7434add
3 changes: 3 additions & 0 deletions core/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ config MODULE_CORE_MSG_BUS
help
Messaging Bus API for inter process message broadcast.

config MODULE_CORE_MUTEX_DEBUG
bool "Aid debugging deadlocks by printing on whom mutex_lock() is waiting"

config MODULE_CORE_MUTEX_PRIORITY_INHERITANCE
bool "Use priority inheritance to mitigate priority inversion for mutexes"

Expand Down
35 changes: 34 additions & 1 deletion core/include/mutex.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,25 @@
* `MUTEX_LOCK`.
* - The scheduler is run, so that if the unblocked waiting thread can
* run now, in case it has a higher priority than the running thread.
*
* Debugging deadlocks
* -------------------
*
* The module `core_mutex_debug` can be used to print on whom `mutex_lock()`
* is waiting. This information includes the thread ID of the owner and the
* program counter (PC) from where `mutex_lock()` was called. Note that the
* information is only valid if:
*
* - The mutex was locked by a thread, and not e.g. by `MUTEX_INIT_LOCKED`
* - The function `cpu_get_caller_pc()` is implemented for the target
* architecture. (The thread ID will remain valid, though.)
* - The caller PC is briefly 0 when the current owner passes over ownership
* to the next thread, but that thread didn't get CPU time yet to write its
* PC into the data structure. Even worse, on architectures where an aligned
* function-pointer-sized write is not atomic, the value may briefly be
* bogus. Chances are close to zero this ever hits and since this only
* effects debug output, the ostrich algorithm was chosen here.
*
* @{
*
* @file
Expand All @@ -112,6 +131,7 @@
#include <stdint.h>
#include <stdbool.h>

#include "architecture.h"
#include "kernel_defines.h"
#include "list.h"
#include "thread.h"
Expand All @@ -130,7 +150,8 @@ typedef struct {
* @internal
*/
list_node_t queue;
#if defined(DOXYGEN) || defined(MODULE_CORE_MUTEX_PRIORITY_INHERITANCE)
#if defined(DOXYGEN) || defined(MODULE_CORE_MUTEX_PRIORITY_INHERITANCE) \
|| defined(MODULE_CORE_MUTEX_DEBUG)
/**
* @brief The current owner of the mutex or `NULL`
* @note Only available if module core_mutex_priority_inheritance
Expand All @@ -141,6 +162,18 @@ typedef struct {
* this will have the value of `NULL`.
*/
kernel_pid_t owner;
#endif
#if defined(DOXYGEN) || defined(MODULE_CORE_MUTEX_DEBUG)
/**
* @brief Program counter of the call to @ref mutex_lock that most
* recently acquired this mutex
*
* This is used when the module `core_mutex_debug` is used to debug
* deadlocks and is non-existing otherwise
*/
uinttxtptr_t owner_calling_pc;
#endif
#if defined(DOXYGEN) || defined(MODULE_CORE_MUTEX_PRIORITY_INHERITANCE)
/**
* @brief Original priority of the owner
* @note Only available if module core_mutex_priority_inheritance
Expand Down
49 changes: 42 additions & 7 deletions core/mutex.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,18 @@
* function into both @ref mutex_lock and @ref mutex_lock_cancelable is,
* therefore, beneficial for the majority of applications.
*/
static inline __attribute__((always_inline)) void _block(mutex_t *mutex,
unsigned irq_state)
static inline __attribute__((always_inline))
void _block(mutex_t *mutex,
unsigned irq_state,
uinttxtptr_t pc)
{
/* pc is only used when MODULE_CORE_MUTEX_DEBUG */
(void)pc;
#if IS_USED(MODULE_CORE_MUTEX_DEBUG)
printf("[mutex] waiting for thread %" PRIkernel_pid " (pc = 0x%" PRIxTXTPTR
fabian18 marked this conversation as resolved.
Show resolved Hide resolved
")\n",
mutex->owner, mutex->owner_calling_pc);
#endif
thread_t *me = thread_get_active();

/* Fail visibly even if a blocking action is called from somewhere where
Expand Down Expand Up @@ -80,10 +89,17 @@ static inline __attribute__((always_inline)) void _block(mutex_t *mutex,
irq_restore(irq_state);
thread_yield_higher();
/* We were woken up by scheduler. Waker removed us from queue. */
#if IS_USED(MODULE_CORE_MUTEX_DEBUG)
mutex->owner_calling_pc = pc;
#endif
}

bool mutex_lock_internal(mutex_t *mutex, bool block)
maribu marked this conversation as resolved.
Show resolved Hide resolved
{
uinttxtptr_t pc = 0;
#if IS_USED(MODULE_CORE_MUTEX_DEBUG)
pc = cpu_get_caller_pc();
#endif
unsigned irq_state = irq_disable();

DEBUG("PID[%" PRIkernel_pid "] mutex_lock_internal(block=%u).\n",
Expand All @@ -92,9 +108,15 @@ bool mutex_lock_internal(mutex_t *mutex, bool block)
if (mutex->queue.next == NULL) {
/* mutex is unlocked. */
mutex->queue.next = MUTEX_LOCKED;
#ifdef MODULE_CORE_MUTEX_PRIORITY_INHERITANCE
#if IS_USED(MODULE_CORE_MUTEX_PRIORITY_INHERITANCE) \
|| IS_USED(MODULE_CORE_MUTEX_DEBUG)
thread_t *me = thread_get_active();
mutex->owner = me->pid;
#endif
#if IS_USED(MODULE_CORE_MUTEX_DEBUG)
mutex->owner_calling_pc = pc;
#endif
#if IS_USED(MODULE_CORE_MUTEX_PRIORITY_INHERITANCE)
mutex->owner_original_priority = me->priority;
#endif
DEBUG("PID[%" PRIkernel_pid "] mutex_lock(): early out.\n",
Expand All @@ -106,14 +128,18 @@ bool mutex_lock_internal(mutex_t *mutex, bool block)
irq_restore(irq_state);
return false;
}
_block(mutex, irq_state);
_block(mutex, irq_state, pc);
}

return true;
}

int mutex_lock_cancelable(mutex_cancel_t *mc)
{
uinttxtptr_t pc = 0;
#if IS_USED(MODULE_CORE_MUTEX_DEBUG)
pc = cpu_get_caller_pc();
#endif
unsigned irq_state = irq_disable();

DEBUG("PID[%" PRIkernel_pid "] mutex_lock_cancelable()\n",
Expand All @@ -131,9 +157,15 @@ int mutex_lock_cancelable(mutex_cancel_t *mc)
if (mutex->queue.next == NULL) {
/* mutex is unlocked. */
mutex->queue.next = MUTEX_LOCKED;
#ifdef MODULE_CORE_MUTEX_PRIORITY_INHERITANCE
#if IS_USED(MODULE_CORE_MUTEX_PRIORITY_INHERITANCE) \
|| IS_USED(MODULE_CORE_MUTEX_DEBUG)
thread_t *me = thread_get_active();
mutex->owner = me->pid;
#endif
#if IS_USED(MODULE_CORE_MUTEX_DEBUG)
mutex->owner_calling_pc = pc;
#endif
#if IS_USED(MODULE_CORE_MUTEX_PRIORITY_INHERITANCE)
mutex->owner_original_priority = me->priority;
#endif
DEBUG("PID[%" PRIkernel_pid "] mutex_lock_cancelable() early out.\n",
Expand All @@ -142,7 +174,7 @@ int mutex_lock_cancelable(mutex_cancel_t *mc)
return 0;
}
else {
_block(mutex, irq_state);
_block(mutex, irq_state, pc);
if (mc->cancelled) {
DEBUG("PID[%" PRIkernel_pid "] mutex_lock_cancelable() "
"cancelled.\n", thread_getpid());
Expand Down Expand Up @@ -185,7 +217,7 @@ void mutex_unlock(mutex_t *mutex)

uint16_t process_priority = process->priority;

#ifdef MODULE_CORE_MUTEX_PRIORITY_INHERITANCE
#if IS_USED(MODULE_CORE_MUTEX_PRIORITY_INHERITANCE)
thread_t *owner = thread_get(mutex->owner);
if ((owner) && (owner->priority != mutex->owner_original_priority)) {
DEBUG("PID[%" PRIkernel_pid "] prio %u --> %u\n",
Expand All @@ -194,6 +226,9 @@ void mutex_unlock(mutex_t *mutex)
sched_change_priority(owner, mutex->owner_original_priority);
}
#endif
#if IS_USED(MODULE_CORE_MUTEX_DEBUG)
mutex->owner_calling_pc = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah no - if another thread was waiting on the mutex, it will be locked again here, but we don't have the PC of that one.

Not sure if

Suggested change
mutex->owner_calling_pc = 0;
mutex->owner_calling_pc = cpu_get_caller_pc();

would be any better of if we should just document that. (Or can we get the PC of the blocked thread?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be fixed now. There is still a brief time window between clearing this and when the new owner updates this in which theoretically a higher priority thread could wake up and try to take the mutex and find a value of 0 here, but that should barely ever happen. I think for a debug print this is good enough and the issue is now documented.

#endif

irq_restore(irqstate);
sched_switch(process_priority);
Expand Down