* [PATCH v17 13/47] dept: apply sdt_might_sleep_{start,end}() to wait_for_completion()/complete()
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
Make dept able to track dependencies by wait_for_completion()/complete().
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
include/linux/completion.h | 30 +++++++++++++++++++++++++-----
1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/include/linux/completion.h b/include/linux/completion.h
index fb2915676574..bd2c207481d6 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -10,6 +10,7 @@
*/
#include <linux/swait.h>
+#include <linux/dept_sdt.h>
/*
* struct completion - structure used to maintain state for a "completion"
@@ -26,14 +27,33 @@
struct completion {
unsigned int done;
struct swait_queue_head wait;
+ struct dept_map dmap;
};
+#define init_completion(x) \
+do { \
+ sdt_map_init(&(x)->dmap); \
+ __init_completion(x); \
+} while (0)
+
+/*
+ * XXX: No use cases for now. Fill the body when needed.
+ */
#define init_completion_map(x, m) init_completion(x)
-static inline void complete_acquire(struct completion *x) {}
-static inline void complete_release(struct completion *x) {}
+
+static inline void complete_acquire(struct completion *x)
+{
+ sdt_might_sleep_start(&x->dmap);
+}
+
+static inline void complete_release(struct completion *x)
+{
+ sdt_might_sleep_end();
+}
#define COMPLETION_INITIALIZER(work) \
- { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
+ { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
+ .dmap = DEPT_MAP_INITIALIZER(work, NULL), }
#define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \
(*({ init_completion_map(&(work), &(map)); &(work); }))
@@ -75,13 +95,13 @@ static inline void complete_release(struct completion *x) {}
#endif
/**
- * init_completion - Initialize a dynamically allocated completion
+ * __init_completion - Initialize a dynamically allocated completion
* @x: pointer to completion structure that is to be initialized
*
* This inline function will initialize a dynamically created completion
* structure.
*/
-static inline void init_completion(struct completion *x)
+static inline void __init_completion(struct completion *x)
{
x->done = 0;
init_swait_queue_head(&x->wait);
--
2.17.1
^ permalink raw reply related
* [PATCH v17 11/47] dept: add a mechanism to refill the internal memory pools on running out
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
dept engine works in a constrained environment. For example, dept
cannot make use of dynamic allocation e.g. kmalloc(). So dept has been
using static pools to keep memory chunks dept uses.
However, dept would barely work once any of the pools runs out. So
implement a mechanism to refill a pool when it runs out, using irq work
and a workqueue, which fits the constrained environment.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
kernel/dependency/dept.c | 108 +++++++++++++++++++++++++-----
kernel/dependency/dept_internal.h | 19 ++++--
kernel/dependency/dept_object.h | 10 +--
kernel/dependency/dept_proc.c | 8 +--
4 files changed, 116 insertions(+), 29 deletions(-)
diff --git a/kernel/dependency/dept.c b/kernel/dependency/dept.c
index 953e1b81a81f..1b16a6095b3c 100644
--- a/kernel/dependency/dept.c
+++ b/kernel/dependency/dept.c
@@ -75,6 +75,9 @@
#include <linux/dept.h>
#include <linux/utsname.h>
#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/irq_work.h>
+#include <linux/vmalloc.h>
#include "dept_internal.h"
static int dept_stop;
@@ -143,9 +146,11 @@ static inline struct dept_task *dept_task(void)
} \
})
-#define DEPT_INFO_ONCE(s...) pr_warn_once("DEPT_INFO_ONCE: " s)
+#define DEPT_INFO_ONCE(s...) pr_warn_once("DEPT_INFO_ONCE: " s)
+#define DEPT_INFO(s...) pr_warn("DEPT_INFO: " s)
static arch_spinlock_t dept_spin = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t dept_pool_spin = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
/*
* DEPT internal engine should be cautious in using outside functions
@@ -268,6 +273,7 @@ static bool valid_key(struct dept_key *k)
#define OBJECT(id, nr) \
static struct dept_##id spool_##id[nr]; \
+static struct dept_##id rpool_##id[nr]; \
static DEFINE_PER_CPU(struct llist_head, lpool_##id);
#include "dept_object.h"
#undef OBJECT
@@ -276,14 +282,74 @@ struct dept_pool dept_pool[OBJECT_NR] = {
#define OBJECT(id, nr) { \
.name = #id, \
.obj_sz = sizeof(struct dept_##id), \
- .obj_nr = ATOMIC_INIT(nr), \
+ .obj_nr = nr, \
+ .tot_nr = nr, \
+ .acc_sz = ATOMIC_INIT(sizeof(spool_##id) + sizeof(rpool_##id)), \
.node_off = offsetof(struct dept_##id, pool_node), \
.spool = spool_##id, \
+ .rpool = rpool_##id, \
.lpool = &lpool_##id, },
#include "dept_object.h"
#undef OBJECT
};
+static void dept_wq_work_fn(struct work_struct *work)
+{
+ int i;
+
+ for (i = 0; i < OBJECT_NR; i++) {
+ struct dept_pool *p = dept_pool + i;
+ int sz = p->tot_nr * p->obj_sz;
+ void *rpool;
+ bool need;
+
+ local_irq_disable();
+ arch_spin_lock(&dept_pool_spin);
+ need = !p->rpool;
+ arch_spin_unlock(&dept_pool_spin);
+ local_irq_enable();
+
+ if (!need)
+ continue;
+
+ rpool = vmalloc(sz);
+
+ if (!rpool) {
+ DEPT_STOP("Failed to extend internal resources.\n");
+ break;
+ }
+
+ local_irq_disable();
+ arch_spin_lock(&dept_pool_spin);
+ if (!p->rpool) {
+ p->rpool = rpool;
+ rpool = NULL;
+ atomic_add(sz, &p->acc_sz);
+ }
+ arch_spin_unlock(&dept_pool_spin);
+ local_irq_enable();
+
+ if (rpool)
+ vfree(rpool);
+ else
+ DEPT_INFO("Dept object(%s) just got refilled successfully.\n", p->name);
+ }
+}
+
+static DECLARE_WORK(dept_wq_work, dept_wq_work_fn);
+
+static void dept_irq_work_fn(struct irq_work *w)
+{
+ schedule_work(&dept_wq_work);
+}
+
+static DEFINE_IRQ_WORK(dept_irq_work, dept_irq_work_fn);
+
+static void request_rpool_refill(void)
+{
+ irq_work_queue(&dept_irq_work);
+}
+
/*
* Can use llist no matter whether CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG is
* enabled or not because NMI and other contexts in the same CPU never
@@ -319,19 +385,31 @@ static void *from_pool(enum object_t t)
/*
* Try static pool.
*/
- if (atomic_read(&p->obj_nr) > 0) {
- int idx = atomic_dec_return(&p->obj_nr);
+ arch_spin_lock(&dept_pool_spin);
+
+ if (!p->obj_nr) {
+ p->spool = p->rpool;
+ p->obj_nr = p->rpool ? p->tot_nr : 0;
+ p->rpool = NULL;
+ request_rpool_refill();
+ }
+
+ if (p->obj_nr) {
+ void *ret;
+
+ p->obj_nr--;
+ ret = p->spool + (p->obj_nr * p->obj_sz);
+ arch_spin_unlock(&dept_pool_spin);
- if (idx >= 0)
- return p->spool + (idx * p->obj_sz);
+ return ret;
}
+ arch_spin_unlock(&dept_pool_spin);
- DEPT_INFO_ONCE("---------------------------------------------\n"
- " Some of Dept internal resources are run out.\n"
- " Dept might still work if the resources get freed.\n"
- " However, the chances are Dept will suffer from\n"
- " the lack from now. Needs to extend the internal\n"
- " resource pools. Ask max.byungchul.park@gmail.com\n");
+ DEPT_INFO("------------------------------------------\n"
+ " Dept object(%s) is run out.\n"
+ " Dept is trying to refill the object.\n"
+ " Nevertheless, if it fails, Dept will stop.\n",
+ p->name);
return NULL;
}
@@ -2957,8 +3035,8 @@ void __init dept_init(void)
pr_info("... DEPT_MAX_ECXT_HELD : %d\n", DEPT_MAX_ECXT_HELD);
pr_info("... DEPT_MAX_SUBCLASSES : %d\n", DEPT_MAX_SUBCLASSES);
#define OBJECT(id, nr) \
- pr_info("... memory used by %s: %zu KB\n", \
- #id, B2KB(sizeof(struct dept_##id) * nr));
+ pr_info("... memory initially used by %s: %zu KB\n", \
+ #id, B2KB(sizeof(spool_##id) + sizeof(rpool_##id)));
#include "dept_object.h"
#undef OBJECT
#define HASH(id, bits) \
@@ -2966,6 +3044,6 @@ void __init dept_init(void)
#id, B2KB(sizeof(struct hlist_head) * (1 << (bits))));
#include "dept_hash.h"
#undef HASH
- pr_info("... total memory used by objects and hashs: %zu KB\n", B2KB(mem_total));
+ pr_info("... total memory initially used by objects and hashs: %zu KB\n", B2KB(mem_total));
pr_info("... per task memory footprint: %zu bytes\n", sizeof(struct dept_task));
}
diff --git a/kernel/dependency/dept_internal.h b/kernel/dependency/dept_internal.h
index 6b39e5a2a830..b2a44632ee4d 100644
--- a/kernel/dependency/dept_internal.h
+++ b/kernel/dependency/dept_internal.h
@@ -23,9 +23,19 @@ struct dept_pool {
size_t obj_sz;
/*
- * the number of the static array
+ * the remaining number of the object in spool
*/
- atomic_t obj_nr;
+ int obj_nr;
+
+ /*
+ * the number of the object in spool
+ */
+ int tot_nr;
+
+ /*
+ * accumulated amount of memory used by the object in byte
+ */
+ atomic_t acc_sz;
/*
* offset of ->pool_node
@@ -35,9 +45,10 @@ struct dept_pool {
/*
* pointer to the pool
*/
- void *spool;
+ void *spool; /* static pool */
+ void *rpool; /* reserved pool */
struct llist_head boot_pool;
- struct llist_head __percpu *lpool;
+ struct llist_head __percpu *lpool; /* local pool */
};
enum object_t {
diff --git a/kernel/dependency/dept_object.h b/kernel/dependency/dept_object.h
index 0b7eb16fe9fb..4f936adfa8ee 100644
--- a/kernel/dependency/dept_object.h
+++ b/kernel/dependency/dept_object.h
@@ -6,8 +6,8 @@
* nr: # of the object that should be kept in the pool.
*/
-OBJECT(dep, 1024 * 8)
-OBJECT(class, 1024 * 8)
-OBJECT(stack, 1024 * 32)
-OBJECT(ecxt, 1024 * 16)
-OBJECT(wait, 1024 * 32)
+OBJECT(dep, 1024 * 4 * 2)
+OBJECT(class, 1024 * 4)
+OBJECT(stack, 1024 * 4 * 8)
+OBJECT(ecxt, 1024 * 4 * 2)
+OBJECT(wait, 1024 * 4 * 4)
diff --git a/kernel/dependency/dept_proc.c b/kernel/dependency/dept_proc.c
index 97beaf397715..f28992834588 100644
--- a/kernel/dependency/dept_proc.c
+++ b/kernel/dependency/dept_proc.c
@@ -74,12 +74,10 @@ static int dept_stats_show(struct seq_file *m, void *v)
{
int r;
- seq_puts(m, "Availability in the static pools:\n\n");
+ seq_puts(m, "Accumulated amount of memory used by pools:\n\n");
#define OBJECT(id, nr) \
- r = atomic_read(&dept_pool[OBJECT_##id].obj_nr); \
- if (r < 0) \
- r = 0; \
- seq_printf(m, "%s\t%d/%d(%d%%)\n", #id, r, nr, (r * 100) / (nr));
+ r = atomic_read(&dept_pool[OBJECT_##id].acc_sz); \
+ seq_printf(m, "%s\t%d KB\n", #id, r / 1024);
#include "dept_object.h"
#undef OBJECT
--
2.17.1
^ permalink raw reply related
* [PATCH v17 12/47] dept: record the latest one out of consecutive waits of the same class
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
The current code records all the waits for later use to track relation
between waits and events within each context. However, since the same
class is handled the same way, it'd be okay to record only one on behalf
of the others if they all have the same class.
Ideally, the whole history buffer would be searched for that. However,
since that would cost too much, let's instead keep only the latest one
when waits of the same class appear consecutively.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
kernel/dependency/dept.c | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/kernel/dependency/dept.c b/kernel/dependency/dept.c
index 1b16a6095b3c..f4c08758f8db 100644
--- a/kernel/dependency/dept.c
+++ b/kernel/dependency/dept.c
@@ -1486,9 +1486,28 @@ static struct dept_wait_hist *new_hist(void)
return wh;
}
+static struct dept_wait_hist *last_hist(void)
+{
+ int pos_n = hist_pos_next();
+ struct dept_wait_hist *wh_n = hist(pos_n);
+
+ /*
+ * This is the first try.
+ */
+ if (!pos_n && !wh_n->wait)
+ return NULL;
+
+ return hist(pos_n + DEPT_MAX_WAIT_HIST - 1);
+}
+
static void add_hist(struct dept_wait *w, unsigned int wg, unsigned int ctxt_id)
{
- struct dept_wait_hist *wh = new_hist();
+ struct dept_wait_hist *wh;
+
+ wh = last_hist();
+
+ if (!wh || wh->wait->class != w->class || wh->ctxt_id != ctxt_id)
+ wh = new_hist();
if (likely(wh->wait))
put_wait(wh->wait);
--
2.17.1
^ permalink raw reply related
* [PATCH v17 10/47] dept: distinguish each work from another
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
Workqueue already provides concurrency control. Because of that, any wait
in a work doesn't prevent events in other works with the control enabled.
Thus, each work should be considered a different context.
So let dept assign a different context id to each work.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
kernel/workqueue.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6b79b3675c3..0e05648b4501 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -55,6 +55,7 @@
#include <linux/kvm_para.h>
#include <linux/delay.h>
#include <linux/irq_work.h>
+#include <linux/dept.h>
#include "workqueue_internal.h"
@@ -3153,6 +3154,8 @@ __acquires(&pool->lock)
lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
+ dept_update_cxt();
+
/* ensure we're on the correct CPU */
WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
--
2.17.1
^ permalink raw reply related
* [PATCH v17 09/47] arm64, dept: add support CONFIG_ARCH_HAS_DEPT_SUPPORT to arm64
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
dept needs to notice every entrance from user to kernel mode to treat
every kernel context independently when tracking wait-event dependencies.
Roughly, system calls and user-originated faults are the cases.
Make dept aware of these entrances on arm64 and add support for
CONFIG_ARCH_HAS_DEPT_SUPPORT to arm64.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/syscall.c | 7 +++++++
arch/arm64/mm/fault.c | 7 +++++++
3 files changed, 15 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e9bbfacc35a6..a8fab2c052dc 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -281,6 +281,7 @@ config ARM64
select USER_STACKTRACE_SUPPORT
select VDSO_GETRANDOM
select VMAP_STACK
+ select ARCH_HAS_DEPT_SUPPORT
help
ARM 64-bit (AArch64) Linux support.
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index c442fcec6b9e..bbd306335179 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -7,6 +7,7 @@
#include <linux/ptrace.h>
#include <linux/randomize_kstack.h>
#include <linux/syscalls.h>
+#include <linux/dept.h>
#include <asm/debug-monitors.h>
#include <asm/exception.h>
@@ -96,6 +97,12 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
* (Similarly for HVC and SMC elsewhere.)
*/
+ /*
+ * This is a system call from user mode. Make dept work with a
+ * new kernel mode context.
+ */
+ dept_update_cxt();
+
if (flags & _TIF_MTE_ASYNC_FAULT) {
/*
* Process the asynchronous tag check fault before the actual
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index d816ff44faff..96827b999d18 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -26,6 +26,7 @@
#include <linux/pkeys.h>
#include <linux/preempt.h>
#include <linux/hugetlb.h>
+#include <linux/dept.h>
#include <asm/acpi.h>
#include <asm/bug.h>
@@ -622,6 +623,12 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
if (!(mm_flags & FAULT_FLAG_USER))
goto lock_mmap;
+ /*
+ * This fault comes from user mode. Make dept work with a new
+ * kernel mode context.
+ */
+ dept_update_cxt();
+
vma = lock_vma_under_rcu(mm, addr);
if (!vma)
goto lock_mmap;
--
2.17.1
^ permalink raw reply related
* [PATCH v17 08/47] x86_64, dept: add support CONFIG_ARCH_HAS_DEPT_SUPPORT to x86_64
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
dept needs to notice every entrance from user to kernel mode to treat
every kernel context independently when tracking wait-event dependencies.
Roughly, system calls and user-originated faults are the cases.
Make dept aware of these entrances on x86_64 and add support for
CONFIG_ARCH_HAS_DEPT_SUPPORT to x86_64.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
arch/x86/Kconfig | 1 +
arch/x86/entry/syscall_64.c | 7 +++++++
arch/x86/mm/fault.c | 7 +++++++
3 files changed, 15 insertions(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 05880301212e..46021cf5934b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -38,6 +38,7 @@ config X86_64
select ZONE_DMA32
select EXECMEM if DYNAMIC_FTRACE
select ACPI_MRRM if ACPI
+ select ARCH_HAS_DEPT_SUPPORT
config FORCE_DYNAMIC_FTRACE
def_bool y
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index b6e68ea98b83..66bd5af5aff1 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -8,6 +8,7 @@
#include <linux/entry-common.h>
#include <linux/nospec.h>
#include <asm/syscall.h>
+#include <linux/dept.h>
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
@@ -86,6 +87,12 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
/* Returns true to return using SYSRET, or false to use IRET */
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
+ /*
+ * This is a system call from user mode. Make dept work with a
+ * new kernel mode context.
+ */
+ dept_update_cxt();
+
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 998bd807fc7b..017edb75f0a0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -19,6 +19,7 @@
#include <linux/mm_types.h>
#include <linux/mm.h> /* find_and_lock_vma() */
#include <linux/vmalloc.h>
+#include <linux/dept.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -1219,6 +1220,12 @@ void do_user_addr_fault(struct pt_regs *regs,
tsk = current;
mm = tsk->mm;
+ /*
+ * This fault comes from user mode. Make dept work with a new
+ * kernel mode context.
+ */
+ dept_update_cxt();
+
if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
/*
* Whoops, this is kernel mode code trying to execute from
--
2.17.1
^ permalink raw reply related
* [PATCH v17 07/47] dept: distinguish each kernel context from another
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
Each unique kernel context, from dept's point of view, should be
identified on every entrance to kernel mode, e.g. a system call or a
user-originated fault. Otherwise, dept may track meaningless dependencies
across different kernel contexts.
Plus, in order to update the kernel context id at the very beginning of
each entrance, arch code support is required, which can be configured by
CONFIG_ARCH_HAS_DEPT_SUPPORT.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
include/linux/dept.h | 29 ++++++++++-------
include/linux/sched.h | 10 +++---
kernel/dependency/dept.c | 67 ++++++++++++++++++++--------------------
lib/Kconfig.debug | 5 ++-
4 files changed, 61 insertions(+), 50 deletions(-)
diff --git a/include/linux/dept.h b/include/linux/dept.h
index 5f0d2d8c8cbe..cb1b1beea077 100644
--- a/include/linux/dept.h
+++ b/include/linux/dept.h
@@ -26,11 +26,16 @@ struct task_struct;
#define DEPT_MAX_SUBCLASSES_USR (DEPT_MAX_SUBCLASSES / DEPT_MAX_SUBCLASSES_EVT)
#define DEPT_MAX_SUBCLASSES_CACHE 2
-#define DEPT_SIRQ 0
-#define DEPT_HIRQ 1
-#define DEPT_IRQS_NR 2
-#define DEPT_SIRQF (1UL << DEPT_SIRQ)
-#define DEPT_HIRQF (1UL << DEPT_HIRQ)
+enum {
+ DEPT_CXT_SIRQ = 0,
+ DEPT_CXT_HIRQ,
+ DEPT_CXT_IRQS_NR,
+ DEPT_CXT_PROCESS = DEPT_CXT_IRQS_NR,
+ DEPT_CXTS_NR
+};
+
+#define DEPT_SIRQF (1UL << DEPT_CXT_SIRQ)
+#define DEPT_HIRQF (1UL << DEPT_CXT_HIRQ)
struct dept_ecxt;
struct dept_iecxt {
@@ -95,8 +100,8 @@ struct dept_class {
/*
* for tracking IRQ dependencies
*/
- struct dept_iecxt iecxt[DEPT_IRQS_NR];
- struct dept_iwait iwait[DEPT_IRQS_NR];
+ struct dept_iecxt iecxt[DEPT_CXT_IRQS_NR];
+ struct dept_iwait iwait[DEPT_CXT_IRQS_NR];
/*
* classified by a map embedded in task_struct,
@@ -208,8 +213,8 @@ struct dept_ecxt {
/*
* where the IRQ-enabled happened
*/
- unsigned long enirq_ip[DEPT_IRQS_NR];
- struct dept_stack *enirq_stack[DEPT_IRQS_NR];
+ unsigned long enirq_ip[DEPT_CXT_IRQS_NR];
+ struct dept_stack *enirq_stack[DEPT_CXT_IRQS_NR];
/*
* where the event context started
@@ -253,8 +258,8 @@ struct dept_wait {
/*
* where the IRQ wait happened
*/
- unsigned long irq_ip[DEPT_IRQS_NR];
- struct dept_stack *irq_stack[DEPT_IRQS_NR];
+ unsigned long irq_ip[DEPT_CXT_IRQS_NR];
+ struct dept_stack *irq_stack[DEPT_CXT_IRQS_NR];
/*
* where the wait happened
@@ -384,6 +389,7 @@ extern void dept_event(struct dept_map *m, unsigned long e_f, unsigned long ip,
extern void dept_ecxt_exit(struct dept_map *m, unsigned long e_f, unsigned long ip);
extern void dept_sched_enter(void);
extern void dept_sched_exit(void);
+extern void dept_update_cxt(void);
static inline void dept_ecxt_enter_nokeep(struct dept_map *m)
{
@@ -431,6 +437,7 @@ struct dept_map { };
#define dept_ecxt_exit(m, e_f, ip) do { } while (0)
#define dept_sched_enter() do { } while (0)
#define dept_sched_exit() do { } while (0)
+#define dept_update_cxt() do { } while (0)
#define dept_ecxt_enter_nokeep(m) do { } while (0)
#define dept_key_init(k) do { (void)(k); } while (0)
#define dept_key_destroy(k) do { (void)(k); } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ddb162201ba1..05c3f8a45405 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -830,19 +830,19 @@ struct dept_task {
int wait_hist_pos;
/*
- * sequential id to identify each IRQ context
+ * sequential id to identify each context
*/
- unsigned int irq_id[DEPT_IRQS_NR];
+ unsigned int cxt_id[DEPT_CXTS_NR];
/*
* for tracking IRQ-enabled points with cross-event
*/
- unsigned int wgen_enirq[DEPT_IRQS_NR];
+ unsigned int wgen_enirq[DEPT_CXT_IRQS_NR];
/*
* for keeping up-to-date IRQ-enabled points
*/
- unsigned long enirq_ip[DEPT_IRQS_NR];
+ unsigned long enirq_ip[DEPT_CXT_IRQS_NR];
/*
* for reserving a current stack instance at each operation
@@ -896,7 +896,7 @@ struct dept_task {
.wait_hist = { { .wait = NULL, } }, \
.ecxt_held_pos = 0, \
.wait_hist_pos = 0, \
- .irq_id = { 0U }, \
+ .cxt_id = { 0U }, \
.wgen_enirq = { 0U }, \
.enirq_ip = { 0UL }, \
.stack = NULL, \
diff --git a/kernel/dependency/dept.c b/kernel/dependency/dept.c
index dfe9dfdb6991..953e1b81a81f 100644
--- a/kernel/dependency/dept.c
+++ b/kernel/dependency/dept.c
@@ -230,9 +230,9 @@ static struct dept_class *dep_tc(struct dept_dep *d)
static const char *irq_str(int irq)
{
- if (irq == DEPT_SIRQ)
+ if (irq == DEPT_CXT_SIRQ)
return "softirq";
- if (irq == DEPT_HIRQ)
+ if (irq == DEPT_CXT_HIRQ)
return "hardirq";
return "(unknown)";
}
@@ -410,7 +410,7 @@ static void initialize_class(struct dept_class *c)
{
int i;
- for (i = 0; i < DEPT_IRQS_NR; i++) {
+ for (i = 0; i < DEPT_CXT_IRQS_NR; i++) {
struct dept_iecxt *ie = &c->iecxt[i];
struct dept_iwait *iw = &c->iwait[i];
@@ -436,7 +436,7 @@ static void initialize_ecxt(struct dept_ecxt *e)
{
int i;
- for (i = 0; i < DEPT_IRQS_NR; i++) {
+ for (i = 0; i < DEPT_CXT_IRQS_NR; i++) {
e->enirq_stack[i] = NULL;
e->enirq_ip[i] = 0UL;
}
@@ -452,7 +452,7 @@ static void initialize_wait(struct dept_wait *w)
{
int i;
- for (i = 0; i < DEPT_IRQS_NR; i++) {
+ for (i = 0; i < DEPT_CXT_IRQS_NR; i++) {
w->irq_stack[i] = NULL;
w->irq_ip[i] = 0UL;
}
@@ -491,7 +491,7 @@ static void destroy_ecxt(struct dept_ecxt *e)
{
int i;
- for (i = 0; i < DEPT_IRQS_NR; i++)
+ for (i = 0; i < DEPT_CXT_IRQS_NR; i++)
if (e->enirq_stack[i])
put_stack(e->enirq_stack[i]);
if (e->class)
@@ -507,7 +507,7 @@ static void destroy_wait(struct dept_wait *w)
{
int i;
- for (i = 0; i < DEPT_IRQS_NR; i++)
+ for (i = 0; i < DEPT_CXT_IRQS_NR; i++)
if (w->irq_stack[i])
put_stack(w->irq_stack[i]);
if (w->class)
@@ -665,7 +665,7 @@ static void print_diagram(struct dept_dep *d)
const char *tc_n = tc->sched_map ? "<sched>" : (tc->name ?: "(unknown)");
irqf = e->enirqf & w->irqf;
- for_each_set_bit(irq, &irqf, DEPT_IRQS_NR) {
+ for_each_set_bit(irq, &irqf, DEPT_CXT_IRQS_NR) {
if (!firstline)
pr_warn("\nor\n\n");
firstline = false;
@@ -698,7 +698,7 @@ static void print_dep(struct dept_dep *d)
const char *tc_n = tc->sched_map ? "<sched>" : (tc->name ?: "(unknown)");
irqf = e->enirqf & w->irqf;
- for_each_set_bit(irq, &irqf, DEPT_IRQS_NR) {
+ for_each_set_bit(irq, &irqf, DEPT_CXT_IRQS_NR) {
pr_warn("%s has been enabled:\n", irq_str(irq));
print_ip_stack(e->enirq_ip[irq], e->enirq_stack[irq]);
pr_warn("\n");
@@ -866,7 +866,7 @@ static void bfs(void *root, struct bfs_ops *ops, void *in, void **out)
*/
static unsigned long cur_enirqf(void);
-static int cur_irq(void);
+static int cur_cxt(void);
static unsigned int cur_ctxt_id(void);
static struct dept_iecxt *iecxt(struct dept_class *c, int irq)
@@ -1443,7 +1443,7 @@ static void add_dep(struct dept_ecxt *e, struct dept_wait *w)
if (d) {
check_dl_bfs(d);
- for (i = 0; i < DEPT_IRQS_NR; i++) {
+ for (i = 0; i < DEPT_CXT_IRQS_NR; i++) {
struct dept_iwait *fiw = iwait(fc, i);
struct dept_iecxt *found_ie;
struct dept_iwait *found_iw;
@@ -1487,7 +1487,7 @@ static void add_wait(struct dept_class *c, unsigned long ip,
struct dept_task *dt = dept_task();
struct dept_wait *w;
unsigned int wg;
- int irq;
+ int cxt;
int i;
if (DEPT_WARN_ON(!valid_class(c)))
@@ -1503,9 +1503,9 @@ static void add_wait(struct dept_class *c, unsigned long ip,
w->wait_stack = get_current_stack();
w->sched_sleep = sched_sleep;
- irq = cur_irq();
- if (irq < DEPT_IRQS_NR)
- add_iwait(c, irq, w);
+ cxt = cur_cxt();
+ if (cxt == DEPT_CXT_HIRQ || cxt == DEPT_CXT_SIRQ)
+ add_iwait(c, cxt, w);
/*
* Avoid adding dependency between user aware nested ecxt and
@@ -1579,7 +1579,7 @@ static struct dept_ecxt_held *add_ecxt(struct dept_map *m,
eh->sub_l = sub_l;
irqf = cur_enirqf();
- for_each_set_bit(irq, &irqf, DEPT_IRQS_NR)
+ for_each_set_bit(irq, &irqf, DEPT_CXT_IRQS_NR)
add_iecxt(c, irq, e, false);
del_ecxt(e);
@@ -1728,7 +1728,7 @@ static void do_event(struct dept_map *m, struct dept_map *real_m,
add_dep(eh->ecxt, wh->wait);
}
- for (i = 0; i < DEPT_IRQS_NR; i++) {
+ for (i = 0; i < DEPT_CXT_IRQS_NR; i++) {
struct dept_ecxt *e;
if (before(dt->wgen_enirq[i], wg))
@@ -1775,7 +1775,7 @@ static void disconnect_class(struct dept_class *c)
call_rcu(&d->rh, del_dep_rcu);
}
- for (i = 0; i < DEPT_IRQS_NR; i++) {
+ for (i = 0; i < DEPT_CXT_IRQS_NR; i++) {
stale_iecxt(iecxt(c, i));
stale_iwait(iwait(c, i));
}
@@ -1800,27 +1800,21 @@ static unsigned long cur_enirqf(void)
return 0UL;
}
-static int cur_irq(void)
+static int cur_cxt(void)
{
if (lockdep_softirq_context(current))
- return DEPT_SIRQ;
+ return DEPT_CXT_SIRQ;
if (lockdep_hardirq_context())
- return DEPT_HIRQ;
- return DEPT_IRQS_NR;
+ return DEPT_CXT_HIRQ;
+ return DEPT_CXT_PROCESS;
}
static unsigned int cur_ctxt_id(void)
{
struct dept_task *dt = dept_task();
- int irq = cur_irq();
+ int cxt = cur_cxt();
- /*
- * Normal process context
- */
- if (irq == DEPT_IRQS_NR)
- return 0U;
-
- return dt->irq_id[irq] | (1UL << irq);
+ return dt->cxt_id[cxt] | (1UL << cxt);
}
static void enirq_transition(int irq)
@@ -1877,7 +1871,7 @@ static void dept_enirq(unsigned long ip)
flags = dept_enter();
- for_each_set_bit(irq, &irqf, DEPT_IRQS_NR) {
+ for_each_set_bit(irq, &irqf, DEPT_CXT_IRQS_NR) {
dt->enirq_ip[irq] = ip;
enirq_transition(irq);
}
@@ -1923,6 +1917,13 @@ void noinstr dept_hardirqs_off(void)
dept_task()->hardirqs_enabled = false;
}
+void noinstr dept_update_cxt(void)
+{
+ struct dept_task *dt = dept_task();
+
+ dt->cxt_id[DEPT_CXT_PROCESS] += 1UL << DEPT_CXTS_NR;
+}
+
/*
* Ensure it's the outmost softirq context.
*/
@@ -1930,7 +1931,7 @@ void dept_softirq_enter(void)
{
struct dept_task *dt = dept_task();
- dt->irq_id[DEPT_SIRQ] += 1UL << DEPT_IRQS_NR;
+ dt->cxt_id[DEPT_CXT_SIRQ] += 1UL << DEPT_CXTS_NR;
}
/*
@@ -1940,7 +1941,7 @@ void noinstr dept_hardirq_enter(void)
{
struct dept_task *dt = dept_task();
- dt->irq_id[DEPT_HIRQ] += 1UL << DEPT_IRQS_NR;
+ dt->cxt_id[DEPT_CXT_HIRQ] += 1UL << DEPT_CXTS_NR;
}
void dept_sched_enter(void)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b9cff0bec6f2..3669b069337b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1365,9 +1365,12 @@ config DEBUG_PREEMPT
menu "Lock Debugging (spinlocks, mutexes, etc...)"
+config ARCH_HAS_DEPT_SUPPORT
+ bool
+
config DEPT
bool "Dependency tracking (EXPERIMENTAL)"
- depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
+ depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT && ARCH_HAS_DEPT_SUPPORT
select DEBUG_SPINLOCK
select DEBUG_MUTEXES if !PREEMPT_RT
select DEBUG_RT_MUTEXES if RT_MUTEXES
--
2.17.1
^ permalink raw reply related
* [PATCH v17 06/47] dept: add proc knobs to show stats and dependency graph
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
It would be useful to expose dept's internal stats and dependency graph at
runtime via procfs. Introduce the knobs, /proc/dept_stats and /proc/dept_deps.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
kernel/dependency/Makefile | 1 +
kernel/dependency/dept.c | 50 +++-------------
kernel/dependency/dept_internal.h | 54 +++++++++++++++++
kernel/dependency/dept_proc.c | 96 +++++++++++++++++++++++++++++++
4 files changed, 160 insertions(+), 41 deletions(-)
create mode 100644 kernel/dependency/dept_internal.h
create mode 100644 kernel/dependency/dept_proc.c
diff --git a/kernel/dependency/Makefile b/kernel/dependency/Makefile
index b5cfb8a03c0c..92f165400187 100644
--- a/kernel/dependency/Makefile
+++ b/kernel/dependency/Makefile
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_DEPT) += dept.o
+obj-$(CONFIG_DEPT) += dept_proc.o
diff --git a/kernel/dependency/dept.c b/kernel/dependency/dept.c
index cbb036e8cc1d..dfe9dfdb6991 100644
--- a/kernel/dependency/dept.c
+++ b/kernel/dependency/dept.c
@@ -75,6 +75,7 @@
#include <linux/dept.h>
#include <linux/utsname.h>
#include <linux/kernel.h>
+#include "dept_internal.h"
static int dept_stop;
static int dept_per_cpu_ready;
@@ -265,46 +266,13 @@ static bool valid_key(struct dept_key *k)
* have been freed will be placed.
*/
-enum object_t {
-#define OBJECT(id, nr) OBJECT_##id,
- #include "dept_object.h"
-#undef OBJECT
- OBJECT_NR,
-};
-
#define OBJECT(id, nr) \
static struct dept_##id spool_##id[nr]; \
static DEFINE_PER_CPU(struct llist_head, lpool_##id);
#include "dept_object.h"
#undef OBJECT
-struct dept_pool {
- const char *name;
-
- /*
- * object size
- */
- size_t obj_sz;
-
- /*
- * the number of the static array
- */
- atomic_t obj_nr;
-
- /*
- * offset of ->pool_node
- */
- size_t node_off;
-
- /*
- * pointer to the pool
- */
- void *spool;
- struct llist_head boot_pool;
- struct llist_head __percpu *lpool;
-};
-
-static struct dept_pool pool[OBJECT_NR] = {
+struct dept_pool dept_pool[OBJECT_NR] = {
#define OBJECT(id, nr) { \
.name = #id, \
.obj_sz = sizeof(struct dept_##id), \
@@ -334,7 +302,7 @@ static void *from_pool(enum object_t t)
if (DEPT_WARN_ON(!irqs_disabled()))
return NULL;
- p = &pool[t];
+ p = &dept_pool[t];
/*
* Try local pool first.
@@ -369,7 +337,7 @@ static void *from_pool(enum object_t t)
static void to_pool(void *o, enum object_t t)
{
- struct dept_pool *p = &pool[t];
+ struct dept_pool *p = &dept_pool[t];
struct llist_head *h;
preempt_disable();
@@ -2108,7 +2076,7 @@ void dept_map_copy(struct dept_map *to, struct dept_map *from)
clean_classes_cache(&to->map_key);
}
-static LIST_HEAD(classes);
+LIST_HEAD(dept_classes);
static bool within(const void *addr, void *start, unsigned long size)
{
@@ -2140,7 +2108,7 @@ void dept_free_range(void *start, unsigned int sz)
while (unlikely(!dept_lock()))
cpu_relax();
- list_for_each_entry_safe(c, n, &classes, all_node) {
+ list_for_each_entry_safe(c, n, &dept_classes, all_node) {
if (!within((void *)c->key, start, sz) &&
!within(c->name, start, sz))
continue;
@@ -2216,7 +2184,7 @@ static struct dept_class *check_new_class(struct dept_key *local,
c->sub_id = sub_id;
c->key = (unsigned long)(k->base + sub_id);
hash_add_class(c);
- list_add(&c->all_node, &classes);
+ list_add(&c->all_node, &dept_classes);
unlock:
dept_unlock();
caching:
@@ -2951,8 +2919,8 @@ static void migrate_per_cpu_pool(void)
struct llist_head *from;
struct llist_head *to;
- from = &pool[i].boot_pool;
- to = per_cpu_ptr(pool[i].lpool, boot_cpu);
+ from = &dept_pool[i].boot_pool;
+ to = per_cpu_ptr(dept_pool[i].lpool, boot_cpu);
move_llist(to, from);
}
}
diff --git a/kernel/dependency/dept_internal.h b/kernel/dependency/dept_internal.h
new file mode 100644
index 000000000000..6b39e5a2a830
--- /dev/null
+++ b/kernel/dependency/dept_internal.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Dept(DEPendency Tracker) - runtime dependency tracker internal header
+ *
+ * Started by Byungchul Park <max.byungchul.park@gmail.com>:
+ *
+ * Copyright (c) 2020 LG Electronics, Inc., Byungchul Park
+ * Copyright (c) 2024 SK hynix, Inc., Byungchul Park
+ */
+
+#ifndef __DEPT_INTERNAL_H
+#define __DEPT_INTERNAL_H
+
+#ifdef CONFIG_DEPT
+#include <linux/percpu.h>
+
+struct dept_pool {
+ const char *name;
+
+ /*
+ * object size
+ */
+ size_t obj_sz;
+
+ /*
+ * the number of the static array
+ */
+ atomic_t obj_nr;
+
+ /*
+ * offset of ->pool_node
+ */
+ size_t node_off;
+
+ /*
+ * pointer to the pool
+ */
+ void *spool;
+ struct llist_head boot_pool;
+ struct llist_head __percpu *lpool;
+};
+
+enum object_t {
+#define OBJECT(id, nr) OBJECT_##id,
+ #include "dept_object.h"
+#undef OBJECT
+ OBJECT_NR,
+};
+
+extern struct list_head dept_classes;
+extern struct dept_pool dept_pool[];
+
+#endif
+#endif /* __DEPT_INTERNAL_H */
diff --git a/kernel/dependency/dept_proc.c b/kernel/dependency/dept_proc.c
new file mode 100644
index 000000000000..97beaf397715
--- /dev/null
+++ b/kernel/dependency/dept_proc.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Procfs knobs for Dept(DEPendency Tracker)
+ *
+ * Started by Byungchul Park <max.byungchul.park@gmail.com>:
+ *
+ * Copyright (C) 2021 LG Electronics, Inc. , Byungchul Park
+ * Copyright (C) 2024 SK hynix, Inc. , Byungchul Park
+ */
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/dept.h>
+#include "dept_internal.h"
+
+static void *l_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ /*
+ * XXX: Serialize list traversal if needed. The following might
+ * give a wrong information on contention.
+ */
+ return seq_list_next(v, &dept_classes, pos);
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ /*
+ * XXX: Serialize list traversal if needed. The following might
+ * give a wrong information on contention.
+ */
+ return seq_list_start_head(&dept_classes, *pos);
+}
+
+static void l_stop(struct seq_file *m, void *v)
+{
+}
+
+static int l_show(struct seq_file *m, void *v)
+{
+ struct dept_class *fc = list_entry(v, struct dept_class, all_node);
+ struct dept_dep *d;
+ const char *prefix;
+
+ if (v == &dept_classes) {
+ seq_puts(m, "All classes:\n\n");
+ return 0;
+ }
+
+ prefix = fc->sched_map ? "<sched> " : "";
+ seq_printf(m, "[%p] %s%s\n", (void *)fc->key, prefix, fc->name);
+
+ /*
+ * XXX: Serialize list traversal if needed. The following might
+ * give a wrong information on contention.
+ */
+ list_for_each_entry(d, &fc->dep_head, dep_node) {
+ struct dept_class *tc = d->wait->class;
+
+ prefix = tc->sched_map ? "<sched> " : "";
+ seq_printf(m, " -> [%p] %s%s\n", (void *)tc->key, prefix, tc->name);
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations dept_deps_ops = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show,
+};
+
+static int dept_stats_show(struct seq_file *m, void *v)
+{
+ int r;
+
+ seq_puts(m, "Availability in the static pools:\n\n");
+#define OBJECT(id, nr) \
+ r = atomic_read(&dept_pool[OBJECT_##id].obj_nr); \
+ if (r < 0) \
+ r = 0; \
+ seq_printf(m, "%s\t%d/%d(%d%%)\n", #id, r, nr, (r * 100) / (nr));
+ #include "dept_object.h"
+#undef OBJECT
+
+ return 0;
+}
+
+static int __init dept_proc_init(void)
+{
+ proc_create_seq("dept_deps", S_IRUSR, NULL, &dept_deps_ops);
+ proc_create_single("dept_stats", S_IRUSR, NULL, dept_stats_show);
+ return 0;
+}
+
+__initcall(dept_proc_init);
--
2.17.1
^ permalink raw reply related
* [PATCH v17 05/47] dept: tie to lockdep and IRQ tracing
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
The way dept is hooked into lockdep and IRQ tracing here looks ugly. However,
it is unavoidable for now. The approach should be improved gradually.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
include/linux/irqflags.h | 7 +-
include/linux/local_lock_internal.h | 1 +
include/linux/lockdep.h | 102 ++++++++++++++++++++++------
include/linux/lockdep_types.h | 3 +
include/linux/mutex.h | 1 +
include/linux/percpu-rwsem.h | 2 +-
include/linux/rtmutex.h | 1 +
include/linux/rwlock_types.h | 1 +
include/linux/rwsem.h | 1 +
include/linux/seqlock.h | 2 +-
include/linux/spinlock_types_raw.h | 3 +
include/linux/srcu.h | 2 +-
kernel/dependency/dept.c | 8 +--
kernel/locking/lockdep.c | 22 ++++++
14 files changed, 127 insertions(+), 29 deletions(-)
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 57b074e0cfbb..d8b9cf093f83 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -15,6 +15,7 @@
#include <linux/irqflags_types.h>
#include <linux/typecheck.h>
#include <linux/cleanup.h>
+#include <linux/dept.h>
#include <asm/irqflags.h>
#include <asm/percpu.h>
@@ -55,8 +56,10 @@ extern void trace_hardirqs_off(void);
# define lockdep_softirqs_enabled(p) ((p)->softirqs_enabled)
# define lockdep_hardirq_enter() \
do { \
- if (__this_cpu_inc_return(hardirq_context) == 1)\
+ if (__this_cpu_inc_return(hardirq_context) == 1) { \
current->hardirq_threaded = 0; \
+ dept_hardirq_enter(); \
+ } \
} while (0)
# define lockdep_hardirq_threaded() \
do { \
@@ -131,6 +134,8 @@ do { \
# define lockdep_softirq_enter() \
do { \
current->softirq_context++; \
+ if (current->softirq_context == 1) \
+ dept_softirq_enter(); \
} while (0)
# define lockdep_softirq_exit() \
do { \
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index d80b5306a2c0..3b74da2fec50 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -27,6 +27,7 @@ typedef struct {
.name = #lockname, \
.wait_type_inner = LD_WAIT_CONFIG, \
.lock_type = LD_LOCK_PERCPU, \
+ .dmap = DEPT_MAP_INITIALIZER(lockname, NULL),\
}, \
.owner = NULL,
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 67964dc4db95..ef03d8808c10 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -12,6 +12,7 @@
#include <linux/lockdep_types.h>
#include <linux/smp.h>
+#include <linux/dept_ldt.h>
#include <asm/percpu.h>
struct task_struct;
@@ -39,6 +40,8 @@ static inline void lockdep_copy_map(struct lockdep_map *to,
*/
for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
to->class_cache[i] = NULL;
+
+ dept_map_copy(&to->dmap, &from->dmap);
}
/*
@@ -428,7 +431,8 @@ enum xhlock_context_t {
* Note that _name must not be NULL.
*/
#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
- { .name = (_name), .key = (void *)(_key), }
+ { .name = (_name), .key = (void *)(_key), \
+ .dmap = DEPT_MAP_INITIALIZER(_name, _key) }
static inline void lockdep_invariant_state(bool force) {}
static inline void lockdep_free_task(struct task_struct *task) {}
@@ -510,33 +514,89 @@ extern bool read_lock_is_recursive(void);
#define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i)
#define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i)
-#define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
-#define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i)
-#define spin_release(l, i) lock_release(l, i)
-
-#define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
+#define spin_acquire(l, s, t, i) \
+do { \
+ ldt_lock(&(l)->dmap, s, t, NULL, i); \
+ lock_acquire_exclusive(l, s, t, NULL, i); \
+} while (0)
+#define spin_acquire_nest(l, s, t, n, i) \
+do { \
+ ldt_lock(&(l)->dmap, s, t, n, i); \
+ lock_acquire_exclusive(l, s, t, n, i); \
+} while (0)
+#define spin_release(l, i) \
+do { \
+ ldt_unlock(&(l)->dmap, i); \
+ lock_release(l, i); \
+} while (0)
+#define rwlock_acquire(l, s, t, i) \
+do { \
+ ldt_wlock(&(l)->dmap, s, t, NULL, i); \
+ lock_acquire_exclusive(l, s, t, NULL, i); \
+} while (0)
#define rwlock_acquire_read(l, s, t, i) \
do { \
+ ldt_rlock(&(l)->dmap, s, t, NULL, i, !read_lock_is_recursive());\
if (read_lock_is_recursive()) \
lock_acquire_shared_recursive(l, s, t, NULL, i); \
else \
lock_acquire_shared(l, s, t, NULL, i); \
} while (0)
-
-#define rwlock_release(l, i) lock_release(l, i)
-
-#define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
-#define seqcount_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i)
-#define seqcount_release(l, i) lock_release(l, i)
-
-#define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
-#define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i)
-#define mutex_release(l, i) lock_release(l, i)
-
-#define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
-#define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i)
-#define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i)
-#define rwsem_release(l, i) lock_release(l, i)
+#define rwlock_release(l, i) \
+do { \
+ ldt_unlock(&(l)->dmap, i); \
+ lock_release(l, i); \
+} while (0)
+#define seqcount_acquire(l, s, t, i) \
+do { \
+ ldt_wlock(&(l)->dmap, s, t, NULL, i); \
+ lock_acquire_exclusive(l, s, t, NULL, i); \
+} while (0)
+#define seqcount_acquire_read(l, s, t, i) \
+do { \
+ ldt_rlock(&(l)->dmap, s, t, NULL, i, false); \
+ lock_acquire_shared_recursive(l, s, t, NULL, i); \
+} while (0)
+#define seqcount_release(l, i) \
+do { \
+ ldt_unlock(&(l)->dmap, i); \
+ lock_release(l, i); \
+} while (0)
+#define mutex_acquire(l, s, t, i) \
+do { \
+ ldt_lock(&(l)->dmap, s, t, NULL, i); \
+ lock_acquire_exclusive(l, s, t, NULL, i); \
+} while (0)
+#define mutex_acquire_nest(l, s, t, n, i) \
+do { \
+ ldt_lock(&(l)->dmap, s, t, n, i); \
+ lock_acquire_exclusive(l, s, t, n, i); \
+} while (0)
+#define mutex_release(l, i) \
+do { \
+ ldt_unlock(&(l)->dmap, i); \
+ lock_release(l, i); \
+} while (0)
+#define rwsem_acquire(l, s, t, i) \
+do { \
+ ldt_lock(&(l)->dmap, s, t, NULL, i); \
+ lock_acquire_exclusive(l, s, t, NULL, i); \
+} while (0)
+#define rwsem_acquire_nest(l, s, t, n, i) \
+do { \
+ ldt_lock(&(l)->dmap, s, t, n, i); \
+ lock_acquire_exclusive(l, s, t, n, i); \
+} while (0)
+#define rwsem_acquire_read(l, s, t, i) \
+do { \
+ ldt_lock(&(l)->dmap, s, t, NULL, i); \
+ lock_acquire_shared(l, s, t, NULL, i); \
+} while (0)
+#define rwsem_release(l, i) \
+do { \
+ ldt_unlock(&(l)->dmap, i); \
+ lock_release(l, i); \
+} while (0)
#define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index eae115a26488..0c3389ed26b6 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -11,6 +11,7 @@
#define __LINUX_LOCKDEP_TYPES_H
#include <linux/types.h>
+#include <linux/dept.h>
#define MAX_LOCKDEP_SUBCLASSES 8UL
@@ -77,6 +78,7 @@ struct lock_class_key {
struct hlist_node hash_entry;
struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES];
};
+ struct dept_key dkey;
};
extern struct lock_class_key __lockdep_no_validate__;
@@ -195,6 +197,7 @@ struct lockdep_map {
int cpu;
unsigned long ip;
#endif
+ struct dept_map dmap;
};
struct pin_cookie { unsigned int val; };
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 847b81ca6436..f8d7f02be04d 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -29,6 +29,7 @@ struct device;
, .dep_map = { \
.name = #lockname, \
.wait_type_inner = LD_WAIT_SLEEP, \
+ .dmap = DEPT_MAP_INITIALIZER(lockname, NULL),\
}
#else
# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 288f5235649a..11eece738f1f 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -22,7 +22,7 @@ struct percpu_rw_semaphore {
};
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname },
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname, .dmap = DEPT_MAP_INITIALIZER(lockname, NULL) },
#else
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
#endif
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index fa9f1021541e..4dc7f046b0a6 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -81,6 +81,7 @@ do { \
.dep_map = { \
.name = #mutexname, \
.wait_type_inner = LD_WAIT_SLEEP, \
+ .dmap = DEPT_MAP_INITIALIZER(mutexname, NULL),\
}
#else
#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
index 1948442e7750..6e58dfc84997 100644
--- a/include/linux/rwlock_types.h
+++ b/include/linux/rwlock_types.h
@@ -10,6 +10,7 @@
.dep_map = { \
.name = #lockname, \
.wait_type_inner = LD_WAIT_CONFIG, \
+ .dmap = DEPT_MAP_INITIALIZER(lockname, NULL), \
}
#else
# define RW_DEP_MAP_INIT(lockname)
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index f1aaf676a874..0f349c83a7dc 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -22,6 +22,7 @@
.dep_map = { \
.name = #lockname, \
.wait_type_inner = LD_WAIT_SLEEP, \
+ .dmap = DEPT_MAP_INITIALIZER(lockname, NULL),\
},
#else
# define __RWSEM_DEP_MAP_INIT(lockname)
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 5ce48eab7a2a..5f3447449fe0 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -51,7 +51,7 @@ static inline void __seqcount_init(seqcount_t *s, const char *name,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define SEQCOUNT_DEP_MAP_INIT(lockname) \
- .dep_map = { .name = #lockname }
+ .dep_map = { .name = #lockname, .dmap = DEPT_MAP_INITIALIZER(lockname, NULL) }
/**
* seqcount_init() - runtime initializer for seqcount_t
diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
index 91cb36b65a17..3dcc551ded25 100644
--- a/include/linux/spinlock_types_raw.h
+++ b/include/linux/spinlock_types_raw.h
@@ -31,11 +31,13 @@ typedef struct raw_spinlock {
.dep_map = { \
.name = #lockname, \
.wait_type_inner = LD_WAIT_SPIN, \
+ .dmap = DEPT_MAP_INITIALIZER(lockname, NULL),\
}
# define SPIN_DEP_MAP_INIT(lockname) \
.dep_map = { \
.name = #lockname, \
.wait_type_inner = LD_WAIT_CONFIG, \
+ .dmap = DEPT_MAP_INITIALIZER(lockname, NULL),\
}
# define LOCAL_SPIN_DEP_MAP_INIT(lockname) \
@@ -43,6 +45,7 @@ typedef struct raw_spinlock {
.name = #lockname, \
.wait_type_inner = LD_WAIT_CONFIG, \
.lock_type = LD_LOCK_PERCPU, \
+ .dmap = DEPT_MAP_INITIALIZER(lockname, NULL),\
}
#else
# define RAW_SPIN_DEP_MAP_INIT(lockname)
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index f179700fecaf..f1961554ed1a 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -35,7 +35,7 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
__init_srcu_struct((ssp), #ssp, &__srcu_key); \
})
-#define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name },
+#define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name, .dmap = DEPT_MAP_INITIALIZER(srcu_name, NULL) },
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
int init_srcu_struct(struct srcu_struct *ssp);
diff --git a/kernel/dependency/dept.c b/kernel/dependency/dept.c
index 712b7f79a095..cbb036e8cc1d 100644
--- a/kernel/dependency/dept.c
+++ b/kernel/dependency/dept.c
@@ -249,10 +249,10 @@ static bool dept_working(void)
* Even k == NULL is considered as a valid key because it would use
* &->map_key as the key in that case.
*/
-struct dept_key __dept_no_validate__;
+extern struct lock_class_key __lockdep_no_validate__;
static bool valid_key(struct dept_key *k)
{
- return &__dept_no_validate__ != k;
+ return &__lockdep_no_validate__.dkey != k;
}
/*
@@ -1946,7 +1946,7 @@ void dept_softirqs_off(void)
dept_task()->softirqs_enabled = false;
}
-void dept_hardirqs_off(void)
+void noinstr dept_hardirqs_off(void)
{
/*
* Assumes that it's called with IRQ disabled so that accessing
@@ -1968,7 +1968,7 @@ void dept_softirq_enter(void)
/*
* Ensure it's the outmost hardirq context.
*/
-void dept_hardirq_enter(void)
+void noinstr dept_hardirq_enter(void)
{
struct dept_task *dt = dept_task();
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 2d4c5bab5af8..dc97f2753ef8 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1224,6 +1224,8 @@ void lockdep_register_key(struct lock_class_key *key)
struct lock_class_key *k;
unsigned long flags;
+ dept_key_init(&key->dkey);
+
if (WARN_ON_ONCE(static_obj(key)))
return;
hash_head = keyhashentry(key);
@@ -4361,6 +4363,8 @@ static void __trace_hardirqs_on_caller(void)
*/
void lockdep_hardirqs_on_prepare(void)
{
+ dept_hardirqs_on();
+
if (unlikely(!debug_locks))
return;
@@ -4481,6 +4485,8 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
*/
void noinstr lockdep_hardirqs_off(unsigned long ip)
{
+ dept_hardirqs_off();
+
if (unlikely(!debug_locks))
return;
@@ -4525,6 +4531,8 @@ void lockdep_softirqs_on(unsigned long ip)
{
struct irqtrace_events *trace = &current->irqtrace;
+ dept_softirqs_on_ip(ip);
+
if (unlikely(!lockdep_enabled()))
return;
@@ -4563,6 +4571,8 @@ void lockdep_softirqs_on(unsigned long ip)
*/
void lockdep_softirqs_off(unsigned long ip)
{
+ dept_softirqs_off();
+
if (unlikely(!lockdep_enabled()))
return;
@@ -4940,6 +4950,8 @@ void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
{
int i;
+ ldt_init(&lock->dmap, &key->dkey, subclass, name);
+
for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
lock->class_cache[i] = NULL;
@@ -5736,6 +5748,12 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
{
unsigned long flags;
+ /*
+ * dept_map_(re)init() might be called twice redundantly. But
+ * there's no choice as long as Dept relies on Lockdep.
+ */
+ ldt_set_class(&lock->dmap, name, &key->dkey, subclass, ip);
+
if (unlikely(!lockdep_enabled()))
return;
@@ -5753,6 +5771,8 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
+ ldt_downgrade(&lock->dmap, ip);
+
if (unlikely(!lockdep_enabled()))
return;
@@ -6588,6 +6608,8 @@ void lockdep_unregister_key(struct lock_class_key *key)
bool found = false;
bool need_callback = false;
+ dept_key_destroy(&key->dkey);
+
might_sleep();
if (WARN_ON_ONCE(static_obj(key)))
--
2.17.1
^ permalink raw reply related
* [PATCH v17 03/47] dept: add single event dependency tracker APIs
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
Wrapped the base APIs for easier annotation on wait and event. Start
with supporting waiters on each single event. More general support for
multiple events is a future work. Do more when the need arises.
How to annotate:
1. Initialize a map for the interesting wait.
/*
* Place along with the wait instance.
*/
struct dept_map my_wait;
/*
* Place in the initialization code.
*/
sdt_map_init(&my_wait);
2. Place the following at the wait code.
sdt_wait(&my_wait);
3. Place the following at the event code.
sdt_event(&my_wait);
That's it!
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
include/linux/dept_sdt.h | 65 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 65 insertions(+)
create mode 100644 include/linux/dept_sdt.h
diff --git a/include/linux/dept_sdt.h b/include/linux/dept_sdt.h
new file mode 100644
index 000000000000..0535f763b21b
--- /dev/null
+++ b/include/linux/dept_sdt.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Single-event Dependency Tracker
+ *
+ * Started by Byungchul Park <max.byungchul.park@gmail.com>:
+ *
+ * Copyright (c) 2020 LG Electronics, Inc., Byungchul Park
+ * Copyright (c) 2024 SK hynix, Inc., Byungchul Park
+ */
+
+#ifndef __LINUX_DEPT_SDT_H
+#define __LINUX_DEPT_SDT_H
+
+#include <linux/kernel.h>
+#include <linux/dept.h>
+
+#ifdef CONFIG_DEPT
+#define sdt_map_init(m) \
+ do { \
+ static struct dept_key __key; \
+ dept_map_init(m, &__key, 0, #m); \
+ } while (0)
+
+#define sdt_map_init_key(m, k) dept_map_init(m, k, 0, #m)
+
+#define sdt_wait(m) \
+ do { \
+ dept_request_event(m); \
+ dept_wait(m, 1UL, _THIS_IP_, __func__, 0); \
+ } while (0)
+
+/*
+ * sdt_might_sleep() and its family will be committed in __schedule()
+ * when it actually gets to __schedule(). Both dept_request_event() and
+ * dept_wait() will be performed on the commit.
+ */
+
+/*
+ * Use the code location as the class key if an explicit map is not used.
+ */
+#define sdt_might_sleep_start(m) \
+ do { \
+ struct dept_map *__m = m; \
+ static struct dept_key __key; \
+ dept_stage_wait(__m, __m ? NULL : &__key, _THIS_IP_, __func__);\
+ } while (0)
+
+#define sdt_might_sleep_end() dept_clean_stage()
+
+#define sdt_ecxt_enter(m) dept_ecxt_enter(m, 1UL, _THIS_IP_, "start", "event", 0)
+#define sdt_event(m) dept_event(m, 1UL, _THIS_IP_, __func__)
+#define sdt_ecxt_exit(m) dept_ecxt_exit(m, 1UL, _THIS_IP_)
+#define sdt_request_event(m) dept_request_event(m)
+#else /* !CONFIG_DEPT */
+#define sdt_map_init(m) do { } while (0)
+#define sdt_map_init_key(m, k) do { (void)(k); } while (0)
+#define sdt_wait(m) do { } while (0)
+#define sdt_might_sleep_start(m) do { } while (0)
+#define sdt_might_sleep_end() do { } while (0)
+#define sdt_ecxt_enter(m) do { } while (0)
+#define sdt_event(m) do { } while (0)
+#define sdt_ecxt_exit(m) do { } while (0)
+#define sdt_request_event(m) do { } while (0)
+#endif
+#endif /* __LINUX_DEPT_SDT_H */
--
2.17.1
^ permalink raw reply related
* [PATCH v17 02/47] dept: implement DEPT(DEPendency Tracker)
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
CURRENT STATUS
--------------
lockdep tracks acquisition order of locks in order to detect deadlock,
and IRQ and IRQ enable/disable state as well to take accidental
acquisitions into account.
lockdep should be turned off once it detects and reports a deadlock
since the data structure and algorithm are not reusable after detection
because of the complex design.
PROBLEM
-------
*Waits* and their *events* that never reach eventually cause deadlock.
However, lockdep is only interested in lock acquisition order, forcing
to emulate lock acquisition even for just waits and events that have
nothing to do with real lock.
Even worse, no one likes lockdep's false positive detection because that
prevents further one that might be more valuable. That's why all the
kernel developers are sensitive to lockdep's false positive.
Besides those, by tracking acquisition order, it cannot correctly deal
with read lock and cross-event e.g. wait_for_completion()/complete() for
deadlock detection. lockdep is no longer a good tool for that purpose.
SOLUTION
--------
Again, *waits* and their *events* that never reach eventually cause
deadlock. The new solution, DEPT(DEPendency Tracker), focuses on waits
and events themselves. dept tracks waits and events and reports it if
any event would never be reachable.
dept does:
. Works with read lock in the right way.
. Works with any wait and event i.e. cross-event.
. Continues to work even after reporting multiple times.
. Provides simple and intuitive APIs.
. Does exactly what dependency checker should do.
Q & A
-----
Q. Is this the first try ever to address the problem?
A. No, cross-release feature (b09be676e0ff2 locking/lockdep: Implement
the 'crossrelease' feature) addressed it that was a lockdep extension
and merged but reverted shortly because:
cross-release started to report valuable hidden problems but started
to give false positive reports as well. For sure, no one
likes lockdep's false positive reports since it makes lockdep stop,
preventing reporting further real problems.
Q. Why was dept not developed as an extension of lockdep?
A. lockdep definitely includes all the efforts great developers have
made for a long time so as to be quite stable enough. But I had to
design and implement newly because of the following:
1) lockdep was designed to track lock acquisition order. The APIs
and implementation do not fit on wait-event model.
2) lockdep is turned off on detection including false positive.
Which is terrible and prevents developing any extension for
stronger detection.
Q. Do you intend to totally replace lockdep?
A. No, lockdep also checks if lock usage is correct. Of course, the
dependency check routine should be replaced but the other functions
should be still there.
Q. Do you mean the dependency check routine should be replaced right
away?
A. No, I admit lockdep is stable enough thanks to great efforts kernel
developers have made. lockdep and dept, both should be in the kernel
until dept gets considered stable.
Q. Stronger detection capability would give more false positive reports.
Which was a big problem when cross-release was introduced. Is it ok
with dept?
A. It's ok. dept allows multiple reporting thanks to simple and quite
generalized design. Of course, false positive reports should be
fixed anyway but it's no longer as critical a problem as it was.
Signed-off-by: Byungchul Park <byungchul@sk.com>
Tested-by: Yeoreum Yun <yeoreum.yun@arm.com>
Tested-by: Yunseong Kim <yskelg@gmail.com>
---
include/linux/dept.h | 446 +++++
include/linux/hardirq.h | 3 +
include/linux/sched.h | 108 ++
init/init_task.c | 2 +
init/main.c | 2 +
kernel/Makefile | 1 +
kernel/dependency/Makefile | 3 +
kernel/dependency/dept.c | 3002 +++++++++++++++++++++++++++++++
kernel/dependency/dept_hash.h | 10 +
kernel/dependency/dept_object.h | 13 +
kernel/exit.c | 1 +
kernel/fork.c | 2 +
kernel/module/main.c | 4 +
kernel/sched/core.c | 9 +
lib/Kconfig.debug | 26 +
lib/locking-selftest.c | 2 +
16 files changed, 3634 insertions(+)
create mode 100644 include/linux/dept.h
create mode 100644 kernel/dependency/Makefile
create mode 100644 kernel/dependency/dept.c
create mode 100644 kernel/dependency/dept_hash.h
create mode 100644 kernel/dependency/dept_object.h
diff --git a/include/linux/dept.h b/include/linux/dept.h
new file mode 100644
index 000000000000..5f0d2d8c8cbe
--- /dev/null
+++ b/include/linux/dept.h
@@ -0,0 +1,446 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DEPT(DEPendency Tracker) - runtime dependency tracker
+ *
+ * Started by Byungchul Park <max.byungchul.park@gmail.com>:
+ *
+ * Copyright (c) 2020 LG Electronics, Inc., Byungchul Park
+ * Copyright (c) 2024 SK hynix, Inc., Byungchul Park
+ */
+
+#ifndef __LINUX_DEPT_H
+#define __LINUX_DEPT_H
+
+#ifdef CONFIG_DEPT
+
+#include <linux/types.h>
+
+struct task_struct;
+
+#define DEPT_MAX_STACK_ENTRY 16
+#define DEPT_MAX_WAIT_HIST 64
+#define DEPT_MAX_ECXT_HELD 48
+
+#define DEPT_MAX_SUBCLASSES 16
+#define DEPT_MAX_SUBCLASSES_EVT 2
+#define DEPT_MAX_SUBCLASSES_USR (DEPT_MAX_SUBCLASSES / DEPT_MAX_SUBCLASSES_EVT)
+#define DEPT_MAX_SUBCLASSES_CACHE 2
+
+#define DEPT_SIRQ 0
+#define DEPT_HIRQ 1
+#define DEPT_IRQS_NR 2
+#define DEPT_SIRQF (1UL << DEPT_SIRQ)
+#define DEPT_HIRQF (1UL << DEPT_HIRQ)
+
+struct dept_ecxt;
+struct dept_iecxt {
+ struct dept_ecxt *ecxt;
+ int enirq;
+ /*
+ * for preventing to add a new ecxt
+ */
+ bool staled;
+};
+
+struct dept_wait;
+struct dept_iwait {
+ struct dept_wait *wait;
+ int irq;
+ /*
+ * for preventing to add a new wait
+ */
+ bool staled;
+ bool touched;
+};
+
+struct dept_class {
+ union {
+ struct llist_node pool_node;
+ struct {
+ /*
+ * reference counter for object management
+ */
+ atomic_t ref;
+
+ /*
+ * unique information about the class
+ */
+ const char *name;
+ unsigned long key;
+ int sub_id;
+
+ /*
+ * for BFS
+ */
+ unsigned int bfs_gen;
+ struct dept_class *bfs_parent;
+ struct list_head bfs_node;
+
+ /*
+ * for hashing this object
+ */
+ struct hlist_node hash_node;
+
+ /*
+ * for linking all classes
+ */
+ struct list_head all_node;
+
+ /*
+ * for associating its dependencies
+ */
+ struct list_head dep_head;
+ struct list_head dep_rev_head;
+
+ /*
+ * for tracking IRQ dependencies
+ */
+ struct dept_iecxt iecxt[DEPT_IRQS_NR];
+ struct dept_iwait iwait[DEPT_IRQS_NR];
+
+ /*
+ * classified by a map embedded in task_struct,
+ * not an explicit map
+ */
+ bool sched_map;
+ };
+ };
+};
+
+struct dept_key {
+ union {
+ /*
+ * Each byte-wise address will be used as its key.
+ */
+ char base[DEPT_MAX_SUBCLASSES];
+
+ /*
+ * for caching the main class pointer
+ */
+ struct dept_class *classes[DEPT_MAX_SUBCLASSES_CACHE];
+ };
+};
+
+struct dept_map {
+ const char *name;
+ struct dept_key *keys;
+
+ /*
+ * subclass that can be set from user
+ */
+ int sub_u;
+
+ /*
+ * It's local copy for fast access to the associated classes.
+ * Also used for dept_key for static maps.
+ */
+ struct dept_key map_key;
+
+ /*
+ * wait timestamp associated to this map
+ */
+ unsigned int wgen;
+
+ /*
+ * whether this map should be going to be checked or not
+ */
+ bool nocheck;
+};
+
+#define DEPT_MAP_INITIALIZER(n, k) \
+{ \
+ .name = #n, \
+ .keys = (struct dept_key *)(k), \
+ .sub_u = 0, \
+ .map_key = { .classes = { NULL, } }, \
+ .wgen = 0U, \
+ .nocheck = false, \
+}
+
+struct dept_stack {
+ union {
+ struct llist_node pool_node;
+ struct {
+ /*
+ * reference counter for object management
+ */
+ atomic_t ref;
+
+ /*
+ * backtrace entries
+ */
+ unsigned long raw[DEPT_MAX_STACK_ENTRY];
+ int nr;
+ };
+ };
+};
+
+struct dept_ecxt {
+ union {
+ struct llist_node pool_node;
+ struct {
+ /*
+ * reference counter for object management
+ */
+ atomic_t ref;
+
+ /*
+ * function that entered to this ecxt
+ */
+ const char *ecxt_fn;
+
+ /*
+ * event function
+ */
+ const char *event_fn;
+
+ /*
+ * associated class
+ */
+ struct dept_class *class;
+
+ /*
+ * flag indicating which IRQ has been
+ * enabled within the event context
+ */
+ unsigned long enirqf;
+
+ /*
+ * where the IRQ-enabled happened
+ */
+ unsigned long enirq_ip[DEPT_IRQS_NR];
+ struct dept_stack *enirq_stack[DEPT_IRQS_NR];
+
+ /*
+ * where the event context started
+ */
+ unsigned long ecxt_ip;
+ struct dept_stack *ecxt_stack;
+
+ /*
+ * where the event triggered
+ */
+ unsigned long event_ip;
+ struct dept_stack *event_stack;
+ };
+ };
+};
+
+struct dept_wait {
+ union {
+ struct llist_node pool_node;
+ struct {
+ /*
+ * reference counter for object management
+ */
+ atomic_t ref;
+
+ /*
+ * function causing this wait
+ */
+ const char *wait_fn;
+
+ /*
+ * the associated class
+ */
+ struct dept_class *class;
+
+ /*
+ * which IRQ the wait was placed in
+ */
+ unsigned long irqf;
+
+ /*
+ * where the IRQ wait happened
+ */
+ unsigned long irq_ip[DEPT_IRQS_NR];
+ struct dept_stack *irq_stack[DEPT_IRQS_NR];
+
+ /*
+ * where the wait happened
+ */
+ unsigned long wait_ip;
+ struct dept_stack *wait_stack;
+
+ /*
+ * whether this wait is for commit in scheduler
+ */
+ bool sched_sleep;
+ };
+ };
+};
+
+struct dept_dep {
+ union {
+ struct llist_node pool_node;
+ struct {
+ /*
+ * reference counter for object management
+ */
+ atomic_t ref;
+
+ /*
+ * key data of dependency
+ */
+ struct dept_ecxt *ecxt;
+ struct dept_wait *wait;
+
+ /*
+ * This object can be referred without dept_lock
+ * held but with IRQ disabled, e.g. for hash
+ * lookup. So deferred deletion is needed.
+ */
+ struct rcu_head rh;
+
+ /*
+ * for hashing this object
+ */
+ struct hlist_node hash_node;
+
+ /*
+ * for linking to a class object
+ */
+ struct list_head dep_node;
+ struct list_head dep_rev_node;
+ };
+ };
+};
+
+struct dept_hash {
+ /*
+ * hash table
+ */
+ struct hlist_head *table;
+
+ /*
+ * size of the table i.e. 2^bits
+ */
+ int bits;
+};
+
+struct dept_ecxt_held {
+ /*
+ * associated event context
+ */
+ struct dept_ecxt *ecxt;
+
+ /*
+ * unique key for this dept_ecxt_held
+ */
+ struct dept_map *map;
+
+ /*
+ * class of the ecxt of this dept_ecxt_held
+ */
+ struct dept_class *class;
+
+ /*
+ * the wgen when the event context started
+ */
+ unsigned int wgen;
+
+ /*
+ * subclass that only works in the local context
+ */
+ int sub_l;
+};
+
+struct dept_wait_hist {
+ /*
+ * associated wait
+ */
+ struct dept_wait *wait;
+
+ /*
+ * unique id of all waits system-wise until wrapped
+ */
+ unsigned int wgen;
+
+ /*
+ * local context id to identify IRQ context
+ */
+ unsigned int ctxt_id;
+};
+
+extern void dept_on(void);
+extern void dept_off(void);
+extern void dept_init(void);
+extern void dept_task_init(struct task_struct *t);
+extern void dept_task_exit(struct task_struct *t);
+extern void dept_free_range(void *start, unsigned int sz);
+
+extern void dept_map_init(struct dept_map *m, struct dept_key *k, int sub_u, const char *n);
+extern void dept_map_reinit(struct dept_map *m, struct dept_key *k, int sub_u, const char *n);
+extern void dept_map_copy(struct dept_map *to, struct dept_map *from);
+extern void dept_wait(struct dept_map *m, unsigned long w_f, unsigned long ip, const char *w_fn, int sub_l);
+extern void dept_stage_wait(struct dept_map *m, struct dept_key *k, unsigned long ip, const char *w_fn);
+extern void dept_request_event_wait_commit(void);
+extern void dept_clean_stage(void);
+extern void dept_ttwu_stage_wait(struct task_struct *t, unsigned long ip);
+extern void dept_ecxt_enter(struct dept_map *m, unsigned long e_f, unsigned long ip, const char *c_fn, const char *e_fn, int sub_l);
+extern bool dept_ecxt_holding(struct dept_map *m, unsigned long e_f);
+extern void dept_request_event(struct dept_map *m);
+extern void dept_event(struct dept_map *m, unsigned long e_f, unsigned long ip, const char *e_fn);
+extern void dept_ecxt_exit(struct dept_map *m, unsigned long e_f, unsigned long ip);
+extern void dept_sched_enter(void);
+extern void dept_sched_exit(void);
+
+static inline void dept_ecxt_enter_nokeep(struct dept_map *m)
+{
+ dept_ecxt_enter(m, 0UL, 0UL, NULL, NULL, 0);
+}
+
+/*
+ * for users who want to manage external keys
+ */
+extern void dept_key_init(struct dept_key *k);
+extern void dept_key_destroy(struct dept_key *k);
+extern void dept_map_ecxt_modify(struct dept_map *m, unsigned long e_f, struct dept_key *new_k, unsigned long new_e_f, unsigned long new_ip, const char *new_c_fn, const char *new_e_fn, int new_sub_l);
+
+extern void dept_softirq_enter(void);
+extern void dept_hardirq_enter(void);
+extern void dept_softirqs_on_ip(unsigned long ip);
+extern void dept_hardirqs_on(void);
+extern void dept_softirqs_off(void);
+extern void dept_hardirqs_off(void);
+#else /* !CONFIG_DEPT */
+struct dept_key { };
+struct dept_map { };
+
+#define DEPT_MAP_INITIALIZER(n, k) { }
+
+#define dept_on() do { } while (0)
+#define dept_off() do { } while (0)
+#define dept_init() do { } while (0)
+#define dept_task_init(t) do { } while (0)
+#define dept_task_exit(t) do { } while (0)
+#define dept_free_range(s, sz) do { } while (0)
+
+#define dept_map_init(m, k, su, n) do { (void)(n); (void)(k); } while (0)
+#define dept_map_reinit(m, k, su, n) do { (void)(n); (void)(k); } while (0)
+#define dept_map_copy(t, f) do { } while (0)
+#define dept_wait(m, w_f, ip, w_fn, sl) do { (void)(w_fn); } while (0)
+#define dept_stage_wait(m, k, ip, w_fn) do { (void)(k); (void)(w_fn); } while (0)
+#define dept_request_event_wait_commit() do { } while (0)
+#define dept_clean_stage() do { } while (0)
+#define dept_ttwu_stage_wait(t, ip) do { } while (0)
+#define dept_ecxt_enter(m, e_f, ip, c_fn, e_fn, sl) do { (void)(c_fn); (void)(e_fn); } while (0)
+#define dept_ecxt_holding(m, e_f) false
+#define dept_request_event(m) do { } while (0)
+#define dept_event(m, e_f, ip, e_fn) do { (void)(e_fn); } while (0)
+#define dept_ecxt_exit(m, e_f, ip) do { } while (0)
+#define dept_sched_enter() do { } while (0)
+#define dept_sched_exit() do { } while (0)
+#define dept_ecxt_enter_nokeep(m) do { } while (0)
+#define dept_key_init(k) do { (void)(k); } while (0)
+#define dept_key_destroy(k) do { (void)(k); } while (0)
+#define dept_map_ecxt_modify(m, e_f, n_k, n_e_f, n_ip, n_c_fn, n_e_fn, n_sl) do { (void)(n_k); (void)(n_c_fn); (void)(n_e_fn); } while (0)
+
+#define dept_softirq_enter() do { } while (0)
+#define dept_hardirq_enter() do { } while (0)
+#define dept_softirqs_on_ip(ip) do { } while (0)
+#define dept_hardirqs_on() do { } while (0)
+#define dept_softirqs_off() do { } while (0)
+#define dept_hardirqs_off() do { } while (0)
+#endif
+#endif /* __LINUX_DEPT_H */
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d57cab4d4c06..bb279dbbe748 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -5,6 +5,7 @@
#include <linux/context_tracking_state.h>
#include <linux/preempt.h>
#include <linux/lockdep.h>
+#include <linux/dept.h>
#include <linux/ftrace_irq.h>
#include <linux/sched.h>
#include <linux/vtime.h>
@@ -106,6 +107,7 @@ void irq_exit_rcu(void);
*/
#define __nmi_enter() \
do { \
+ dept_off(); \
lockdep_off(); \
arch_nmi_enter(); \
BUG_ON(in_nmi() == NMI_MASK); \
@@ -128,6 +130,7 @@ void irq_exit_rcu(void);
__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
arch_nmi_exit(); \
lockdep_on(); \
+ dept_on(); \
} while (0)
#define nmi_exit() \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e4ce0a76831e..ddb162201ba1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -49,6 +49,8 @@
#include <linux/tracepoint-defs.h>
#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
+#include <linux/spinlock.h>
+#include <linux/dept.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -813,6 +815,110 @@ struct kmap_ctrl {
#endif
};
+#ifdef CONFIG_DEPT
+struct dept_task {
+ /*
+ * all event contexts that have entered and before exiting
+ */
+ struct dept_ecxt_held ecxt_held[DEPT_MAX_ECXT_HELD];
+ int ecxt_held_pos;
+
+ /*
+ * ring buffer holding all waits that have happened
+ */
+ struct dept_wait_hist wait_hist[DEPT_MAX_WAIT_HIST];
+ int wait_hist_pos;
+
+ /*
+ * sequential id to identify each IRQ context
+ */
+ unsigned int irq_id[DEPT_IRQS_NR];
+
+ /*
+ * for tracking IRQ-enabled points with cross-event
+ */
+ unsigned int wgen_enirq[DEPT_IRQS_NR];
+
+ /*
+ * for keeping up-to-date IRQ-enabled points
+ */
+ unsigned long enirq_ip[DEPT_IRQS_NR];
+
+ /*
+ * for reserving a current stack instance at each operation
+ */
+ struct dept_stack *stack;
+
+ /*
+ * for preventing recursive call into DEPT engine
+ */
+ int recursive;
+
+ /*
+ * for preventing reentrance to WARN*() while warning
+ */
+ int in_warning;
+
+ /*
+ * for staging data to commit a wait
+ */
+ struct dept_map stage_m;
+ struct dept_map *stage_real_m;
+ bool stage_sched_map;
+ const char *stage_w_fn;
+ unsigned long stage_ip;
+ arch_spinlock_t stage_lock;
+
+ /*
+ * the number of missing ecxts
+ */
+ int missing_ecxt;
+
+ /*
+ * for tracking IRQ-enable state
+ */
+ bool hardirqs_enabled;
+ bool softirqs_enabled;
+
+ /*
+ * whether the current is on do_exit()
+ */
+ bool task_exit;
+
+ /*
+ * whether the current is running __schedule()
+ */
+ bool in_sched;
+};
+
+#define DEPT_TASK_INITIALIZER(t) \
+{ \
+ .wait_hist = { { .wait = NULL, } }, \
+ .ecxt_held_pos = 0, \
+ .wait_hist_pos = 0, \
+ .irq_id = { 0U }, \
+ .wgen_enirq = { 0U }, \
+ .enirq_ip = { 0UL }, \
+ .stack = NULL, \
+ .recursive = 0, \
+ .in_warning = 0, \
+ .stage_m = DEPT_MAP_INITIALIZER((t)->stage_m, NULL), \
+ .stage_real_m = NULL, \
+ .stage_sched_map = false, \
+ .stage_w_fn = NULL, \
+ .stage_ip = 0UL, \
+ .stage_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,\
+ .missing_ecxt = 0, \
+ .hardirqs_enabled = false, \
+ .softirqs_enabled = false, \
+ .task_exit = false, \
+ .in_sched = false, \
+}
+#else
+struct dept_task { };
+#define DEPT_TASK_INITIALIZER(t) { }
+#endif
+
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
@@ -1266,6 +1372,8 @@ struct task_struct {
struct held_lock held_locks[MAX_LOCK_DEPTH];
#endif
+ struct dept_task dept_task;
+
#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
unsigned int in_ubsan;
#endif
diff --git a/init/init_task.c b/init/init_task.c
index e557f622bd90..84da2464c390 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -14,6 +14,7 @@
#include <linux/numa.h>
#include <linux/scs.h>
#include <linux/plist.h>
+#include <linux/dept.h>
#include <linux/uaccess.h>
@@ -204,6 +205,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.curr_chain_key = INITIAL_CHAIN_KEY,
.lockdep_recursion = 0,
#endif
+ .dept_task = DEPT_TASK_INITIALIZER(init_task),
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.ret_stack = NULL,
.tracing_graph_pause = ATOMIC_INIT(0),
diff --git a/init/main.c b/init/main.c
index 5753e9539ae6..8a9b289c58e4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -66,6 +66,7 @@
#include <linux/debug_locks.h>
#include <linux/debugobjects.h>
#include <linux/lockdep.h>
+#include <linux/dept.h>
#include <linux/kmemleak.h>
#include <linux/padata.h>
#include <linux/pid_namespace.h>
@@ -1038,6 +1039,7 @@ void start_kernel(void)
panic_param);
lockdep_init();
+ dept_init();
/*
* Need to run this when irqs are enabled, because it wants
diff --git a/kernel/Makefile b/kernel/Makefile
index c60623448235..72c0d9767c89 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -56,6 +56,7 @@ obj-y += dma/
obj-y += entry/
obj-y += unwind/
obj-$(CONFIG_MODULES) += module/
+obj-y += dependency/
obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
diff --git a/kernel/dependency/Makefile b/kernel/dependency/Makefile
new file mode 100644
index 000000000000..b5cfb8a03c0c
--- /dev/null
+++ b/kernel/dependency/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_DEPT) += dept.o
diff --git a/kernel/dependency/dept.c b/kernel/dependency/dept.c
new file mode 100644
index 000000000000..712b7f79a095
--- /dev/null
+++ b/kernel/dependency/dept.c
@@ -0,0 +1,3002 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DEPT(DEPendency Tracker) - Runtime dependency tracker
+ *
+ * Started by Byungchul Park <max.byungchul.park@gmail.com>:
+ *
+ * Copyright (c) 2020 LG Electronics, Inc., Byungchul Park
+ * Copyright (c) 2024 SK hynix, Inc., Byungchul Park
+ *
+ * DEPT provides a general way to detect potential deadlocks at runtime.
+ * Its scope is not limited to typical locks; it covers every
+ * synchronization primitive.
+ *
+ * The following ideas were borrowed from LOCKDEP:
+ *
+ * 1) Use a graph to track relationship between classes.
+ * 2) Prevent performance regression using hash.
+ *
+ * The following items were enhanced from LOCKDEP:
+ *
+ * 1) Cover more deadlock cases.
+ * 2) Allow multiple reports.
+ *
+ * TODO: Both LOCKDEP and DEPT should co-exist until DEPT is considered
+ * stable. After that, LOCKDEP's dependency check routine should be
+ * replaced with DEPT. It should finally look like:
+ *
+ *
+ *
+ * As is:
+ *
+ * LOCKDEP
+ * +-----------------------------------------+
+ * | Lock usage correctness check | <-> locks
+ * | |
+ * | |
+ * | +-------------------------------------+ |
+ * | | Dependency check | |
+ * | | (by tracking lock acquisition order)| |
+ * | +-------------------------------------+ |
+ * | |
+ * +-----------------------------------------+
+ *
+ * DEPT
+ * +-----------------------------------------+
+ * | Dependency check | <-> waits/events
+ * | (by tracking wait and event context) |
+ * +-----------------------------------------+
+ *
+ *
+ *
+ * To be:
+ *
+ * LOCKDEP
+ * +-----------------------------------------+
+ * | Lock usage correctness check | <-> locks
+ * | |
+ * | |
+ * | (Request dependency check) |
+ * | T |
+ * +--------------------|--------------------+
+ * |
+ * DEPT V
+ * +-----------------------------------------+
+ * | Dependency check | <-> waits/events
+ * | (by tracking wait and event context) |
+ * +-----------------------------------------+
+ */
+
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/hash.h>
+#include <linux/dept.h>
+#include <linux/utsname.h>
+#include <linux/kernel.h>
+
+/* Set to 1 by DEPT_STOP(); dept_working() returns false once it is set. */
+static int dept_stop;
+/* Non-zero selects the per-cpu local pools; otherwise ->boot_pool is used. */
+static int dept_per_cpu_ready;
+
+/*
+ * Return the DEPT bookkeeping embedded in the currently running task,
+ * i.e. the ->dept_task member of 'current'.
+ */
+static inline struct dept_task *dept_task(void)
+{
+	return &current->dept_task;
+}
+
+/*
+ * True when a DEPT warning may be emitted: no oops is in progress and
+ * the current task is not already inside one of the DEPT warning macros
+ * (tracked by ->in_warning).
+ */
+#define DEPT_READY_WARN (!oops_in_progress && !dept_task()->in_warning)
+
+/*
+ * Evaluate @c exactly once and yield its truth value so that callers
+ * can bail out on it.  The warning itself is suppressed while an oops
+ * is in progress or while this task is already inside a DEPT warning;
+ * the yielded value is not affected by that.
+ *
+ * The stringified condition is passed as a "%s" argument instead of
+ * being spliced into the format string, so a '%' appearing in @c can
+ * never be taken for a conversion specifier.
+ */
+#define DEPT_WARN_ON_ONCE(c)						\
+	({								\
+		int __ret = !!(c);					\
+									\
+		if (likely(DEPT_READY_WARN)) {				\
+			++dept_task()->in_warning;			\
+			WARN_ONCE(__ret, "DEPT_WARN_ON_ONCE: %s", #c);	\
+			--dept_task()->in_warning;			\
+		}							\
+		__ret;							\
+	})
+
+/*
+ * Unconditional WARN_ONCE() with a caller supplied format, prefixed
+ * with "DEPT_WARN_ONCE: ".  Nothing is printed when DEPT_READY_WARN is
+ * false.  ->in_warning is raised around the WARN so that nested DEPT
+ * warnings from the same task are suppressed.
+ */
+#define DEPT_WARN_ONCE(s...) \
+ ({ \
+ if (likely(DEPT_READY_WARN)) { \
+ ++dept_task()->in_warning; \
+ WARN_ONCE(1, "DEPT_WARN_ONCE: " s); \
+ --dept_task()->in_warning; \
+ } \
+ })
+
+/*
+ * Like WARN_ON(): evaluate @c exactly once, warn if it is true and
+ * yield its truth value.  The warning (not the yielded value) is
+ * suppressed when DEPT_READY_WARN is false.
+ *
+ * The stringified condition goes through "%s" so that a '%' in @c is
+ * printed literally rather than parsed as a conversion specifier.
+ */
+#define DEPT_WARN_ON(c)							\
+	({								\
+		int __ret = !!(c);					\
+									\
+		if (likely(DEPT_READY_WARN)) {				\
+			++dept_task()->in_warning;			\
+			WARN(__ret, "DEPT_WARN_ON: %s", #c);		\
+			--dept_task()->in_warning;			\
+		}							\
+		__ret;							\
+	})
+
+/*
+ * Unconditional WARN() with a caller supplied format, prefixed with
+ * "DEPT_WARN: ".  Nothing is printed when DEPT_READY_WARN is false.
+ */
+#define DEPT_WARN(s...) \
+ ({ \
+ if (likely(DEPT_READY_WARN)) { \
+ ++dept_task()->in_warning; \
+ WARN(1, "DEPT_WARN: " s); \
+ --dept_task()->in_warning; \
+ } \
+ })
+
+/*
+ * Turn DEPT off by setting dept_stop (checked by dept_working()), then
+ * emit a WARN() with the given format if DEPT_READY_WARN allows it.
+ * dept_stop is set even when the warning is suppressed.
+ */
+#define DEPT_STOP(s...) \
+ ({ \
+ WRITE_ONCE(dept_stop, 1); \
+ if (likely(DEPT_READY_WARN)) { \
+ ++dept_task()->in_warning; \
+ WARN(1, "DEPT_STOP: " s); \
+ --dept_task()->in_warning; \
+ } \
+ })
+
+/* pr_warn_once() with the message prefixed by "DEPT_INFO_ONCE: ". */
+#define DEPT_INFO_ONCE(s...) pr_warn_once("DEPT_INFO_ONCE: " s)
+
+/* Arch-level spinlock taken by dept_lock() and released by dept_unlock(). */
+static arch_spinlock_t dept_spin = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+/*
+ * DEPT internal engine should be cautious in using outside functions
+ * e.g. printk at reporting since that kind of usage might cause
+ * untrackable deadlock.
+ *
+ * dept_outworld counts the sections currently bracketed by
+ * dept_outworld_enter()/dept_outworld_exit(); dept_lock() stops
+ * spinning and fails while the count is non-zero.
+ */
+static atomic_t dept_outworld = ATOMIC_INIT(0);
+
+/* Mark the start of a section that calls functions outside of DEPT. */
+static void dept_outworld_enter(void)
+{
+ atomic_inc(&dept_outworld);
+}
+
+/* Mark the end of a section opened with dept_outworld_enter(). */
+static void dept_outworld_exit(void)
+{
+ atomic_dec(&dept_outworld);
+}
+
+/* Return true while at least one outworld section is open. */
+static bool dept_outworld_entered(void)
+{
+ return atomic_read(&dept_outworld);
+}
+
+/*
+ * Spin on dept_spin until it is acquired.  Give up and return false as
+ * soon as a failed attempt finds an outworld section open; return true
+ * with the lock held otherwise.
+ */
+static bool dept_lock(void)
+{
+	for (;;) {
+		if (arch_spin_trylock(&dept_spin))
+			return true;
+
+		if (unlikely(dept_outworld_entered()))
+			return false;
+	}
+}
+
+/* Release dept_spin taken by a successful dept_lock(). */
+static void dept_unlock(void)
+{
+ arch_spin_unlock(&dept_spin);
+}
+
+/*
+ * Verdict a bfs() callback returns for the node it was handed.
+ */
+enum bfs_ret {
+ BFS_CONTINUE, /* keep searching and extend the queue from this node */
+ BFS_DONE, /* stop: no further callback is invoked */
+ BFS_SKIP, /* keep searching but do not extend from this node */
+};
+
+/*
+ * Wrap-around aware comparison of two unsigned counters: true when the
+ * difference a - b, reinterpreted as a signed int, is negative.
+ */
+static bool before(unsigned int a, unsigned int b)
+{
+	int delta = (int)(a - b);
+
+	return delta < 0;
+}
+
+/* A stack is valid when it exists and holds at least one entry. */
+static bool valid_stack(struct dept_stack *s)
+{
+	if (!s)
+		return false;
+
+	return s->nr > 0;
+}
+
+/* A class is valid while its key is non-zero; see invalidate_class(). */
+static bool valid_class(struct dept_class *c)
+{
+	return c->key != 0;
+}
+
+/* Clear the key so that valid_class() reports false for @c. */
+static void invalidate_class(struct dept_class *c)
+{
+ c->key = 0UL;
+}
+
+/* Event context side of dependency @d. */
+static struct dept_ecxt *dep_e(struct dept_dep *d)
+{
+ return d->ecxt;
+}
+
+/* Wait side of dependency @d. */
+static struct dept_wait *dep_w(struct dept_dep *d)
+{
+ return d->wait;
+}
+
+/* "From" class of @d: the class of its event context. */
+static struct dept_class *dep_fc(struct dept_dep *d)
+{
+	struct dept_ecxt *e = dep_e(d);
+
+	return e->class;
+}
+
+/* "To" class of @d: the class of its wait. */
+static struct dept_class *dep_tc(struct dept_dep *d)
+{
+	struct dept_wait *w = dep_w(d);
+
+	return w->class;
+}
+
+/* Printable name of a DEPT irq type; "(unknown)" for anything else. */
+static const char *irq_str(int irq)
+{
+	return irq == DEPT_SIRQ ? "softirq" :
+	       irq == DEPT_HIRQ ? "hardirq" : "(unknown)";
+}
+
+/*
+ * DEPT is not working once it has been stopped by DEPT_STOP(), nor
+ * while running in NMI context.
+ */
+static bool dept_working(void)
+{
+	if (READ_ONCE(dept_stop))
+		return false;
+
+	return !in_nmi();
+}
+
+/*
+ * __dept_no_validate__ is a sentinel key: only a pointer to it is
+ * treated as invalid.  Even k == NULL is considered as a valid key
+ * because it would use &->map_key as the key in that case.
+ */
+struct dept_key __dept_no_validate__;
+static bool valid_key(struct dept_key *k)
+{
+	return k != &__dept_no_validate__;
+}
+
+/*
+ * Pool
+ * =====================================================================
+ * DEPT maintains pools to provide objects in a safe way.
+ *
+ * 1) Static pool is used at the beginning of booting time.
+ * 2) Local pool is tried first before the static pool. Objects that
+ * have been freed are placed in the local pool.
+ */
+
+/*
+ * One enumerator OBJECT_<id> per OBJECT(id, nr) entry listed in
+ * dept_object.h, followed by OBJECT_NR which is the number of entries.
+ * Used as the index into pool[].
+ */
+enum object_t {
+#define OBJECT(id, nr) OBJECT_##id,
+ #include "dept_object.h"
+#undef OBJECT
+ OBJECT_NR,
+};
+
+/*
+ * For every object type in dept_object.h, define its static array
+ * spool_<id> of @nr objects and its per-cpu free list lpool_<id>.
+ */
+#define OBJECT(id, nr) \
+static struct dept_##id spool_##id[nr]; \
+static DEFINE_PER_CPU(struct llist_head, lpool_##id);
+ #include "dept_object.h"
+#undef OBJECT
+
+/*
+ * Per object type allocation state used by from_pool() and to_pool().
+ */
+struct dept_pool {
+ /*
+ * stringified object id
+ */
+ const char *name;
+
+ /*
+ * object size
+ */
+ size_t obj_sz;
+
+ /*
+ * the number of objects in the static array that have not been
+ * handed out yet
+ */
+ atomic_t obj_nr;
+
+ /*
+ * offset of ->pool_node
+ */
+ size_t node_off;
+
+ /*
+ * spool: the static array of objects
+ * boot_pool: free list used while dept_per_cpu_ready is not set
+ * lpool: per-cpu free lists used once dept_per_cpu_ready is set
+ */
+ void *spool;
+ struct llist_head boot_pool;
+ struct llist_head __percpu *lpool;
+};
+
+/*
+ * One dept_pool per object type, indexed by enum object_t and filled
+ * in from the OBJECT(id, nr) entries of dept_object.h.  ->boot_pool is
+ * left zero-initialized.
+ */
+static struct dept_pool pool[OBJECT_NR] = {
+#define OBJECT(id, nr) { \
+ .name = #id, \
+ .obj_sz = sizeof(struct dept_##id), \
+ .obj_nr = ATOMIC_INIT(nr), \
+ .node_off = offsetof(struct dept_##id, pool_node), \
+ .spool = spool_##id, \
+ .lpool = &lpool_##id, },
+ #include "dept_object.h"
+#undef OBJECT
+};
+
+/*
+ * Take one object of type @t out of its pool.  Must be called with
+ * IRQs disabled.  Returns a pointer to the object, or NULL when IRQs
+ * are enabled or when both the free list and the static array are
+ * exhausted.
+ *
+ * Can use llist no matter whether CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG is
+ * enabled or not because NMI and other contexts in the same CPU never
+ * run inside of DEPT concurrently by preventing reentrance.
+ */
+static void *from_pool(enum object_t t)
+{
+ struct dept_pool *p;
+ struct llist_head *h;
+ struct llist_node *n;
+
+ /*
+ * llist_del_first() doesn't allow concurrent access e.g.
+ * between process and IRQ context.
+ */
+ if (DEPT_WARN_ON(!irqs_disabled()))
+ return NULL;
+
+ p = &pool[t];
+
+ /*
+ * Try local pool first.
+ */
+ if (likely(dept_per_cpu_ready))
+ h = this_cpu_ptr(p->lpool);
+ else
+ h = &p->boot_pool;
+
+ n = llist_del_first(h);
+ /* Step back from the embedded ->pool_node to the object start. */
+ if (n)
+ return (void *)n - p->node_off;
+
+ /*
+ * Try static pool.  Slots are handed out from the highest index
+ * down; a racing decrement that yields a negative index gets
+ * nothing.
+ */
+ if (atomic_read(&p->obj_nr) > 0) {
+ int idx = atomic_dec_return(&p->obj_nr);
+
+ if (idx >= 0)
+ return p->spool + (idx * p->obj_sz);
+ }
+
+ DEPT_INFO_ONCE("---------------------------------------------\n"
+ " Some of Dept internal resources are run out.\n"
+ " Dept might still work if the resources get freed.\n"
+ " However, the chances are Dept will suffer from\n"
+ " the lack from now. Needs to extend the internal\n"
+ " resource pools. Ask max.byungchul.park@gmail.com\n");
+ return NULL;
+}
+
+/*
+ * Give object @o of type @t back: onto this CPU's local pool once
+ * dept_per_cpu_ready is set, onto ->boot_pool before that.  The object
+ * is linked through its ->pool_node, located at ->node_off.
+ */
+static void to_pool(void *o, enum object_t t)
+{
+	struct dept_pool *p = &pool[t];
+	struct llist_node *node = o + p->node_off;
+	struct llist_head *head;
+
+	/* Preemption stays off while the chosen list head is in use. */
+	preempt_disable();
+	head = likely(dept_per_cpu_ready) ? this_cpu_ptr(p->lpool) :
+					    &p->boot_pool;
+	llist_add(node, head);
+	preempt_enable();
+}
+
+/*
+ * Generate the reference-counted object API for every type listed in
+ * dept_object.h:
+ *
+ * new_<id>() - take an object from the pool, set ->ref to 1 and run
+ * ctor_<id> if one is set; NULL when the pool is empty
+ * get_<id>() - take an additional reference
+ * put_<id>() - drop a reference; on the last one run dtor_<id> if
+ * set and return the object to the pool
+ * del_<id>() - same as put_<id>()
+ * <id>_consumed() - true when @a is non-NULL and ->ref is above 1
+ *
+ * ctor_<id>/dtor_<id> are only declared here; see SET_CONSTRUCTOR()
+ * and SET_DESTRUCTOR().
+ */
+#define OBJECT(id, nr) \
+static void (*ctor_##id)(struct dept_##id *a); \
+static void (*dtor_##id)(struct dept_##id *a); \
+static struct dept_##id *new_##id(void) \
+{ \
+ struct dept_##id *a; \
+ \
+ a = (struct dept_##id *)from_pool(OBJECT_##id); \
+ if (unlikely(!a)) \
+ return NULL; \
+ \
+ atomic_set(&a->ref, 1); \
+ \
+ if (ctor_##id) \
+ ctor_##id(a); \
+ \
+ return a; \
+} \
+ \
+static struct dept_##id *get_##id(struct dept_##id *a) \
+{ \
+ atomic_inc(&a->ref); \
+ return a; \
+} \
+ \
+static void put_##id(struct dept_##id *a) \
+{ \
+ if (!atomic_dec_return(&a->ref)) { \
+ if (dtor_##id) \
+ dtor_##id(a); \
+ to_pool(a, OBJECT_##id); \
+ } \
+} \
+ \
+static void del_##id(struct dept_##id *a) \
+{ \
+ put_##id(a); \
+} \
+ \
+static bool __maybe_unused id##_consumed(struct dept_##id *a) \
+{ \
+ return a && atomic_read(&a->ref) > 1; \
+}
+#include "dept_object.h"
+#undef OBJECT
+
+/* Define ctor_<id>, the hook new_<id>() runs on a fresh object, as @f. */
+#define SET_CONSTRUCTOR(id, f) \
+static void (*ctor_##id)(struct dept_##id *a) = f
+
+/* Constructor of struct dept_dep: start with both graph links empty. */
+static void initialize_dep(struct dept_dep *d)
+{
+	INIT_LIST_HEAD(&d->dep_rev_node);
+	INIT_LIST_HEAD(&d->dep_node);
+}
+SET_CONSTRUCTOR(dep, initialize_dep);
+
+/*
+ * Constructor of struct dept_class: empty list heads, bfs generation 0
+ * and, for every irq type, an unset, non-staled iecxt and an unset,
+ * non-staled, untouched iwait labelled with that irq type.
+ */
+static void initialize_class(struct dept_class *c)
+{
+	int irq;
+
+	INIT_LIST_HEAD(&c->all_node);
+	INIT_LIST_HEAD(&c->dep_head);
+	INIT_LIST_HEAD(&c->dep_rev_head);
+	INIT_LIST_HEAD(&c->bfs_node);
+	c->bfs_gen = 0U;
+
+	for (irq = 0; irq < DEPT_IRQS_NR; irq++) {
+		c->iecxt[irq].ecxt = NULL;
+		c->iecxt[irq].enirq = irq;
+		c->iecxt[irq].staled = false;
+
+		c->iwait[irq].wait = NULL;
+		c->iwait[irq].irq = irq;
+		c->iwait[irq].staled = false;
+		c->iwait[irq].touched = false;
+	}
+}
+SET_CONSTRUCTOR(class, initialize_class);
+
+/*
+ * Constructor of struct dept_ecxt: no ips, no stacks, no irq-enabled
+ * flags.  ->class is not touched here.
+ */
+static void initialize_ecxt(struct dept_ecxt *e)
+{
+	int irq;
+
+	e->ecxt_ip = 0UL;
+	e->ecxt_stack = NULL;
+	e->event_ip = 0UL;
+	e->event_stack = NULL;
+	e->enirqf = 0UL;
+
+	for (irq = 0; irq < DEPT_IRQS_NR; irq++) {
+		e->enirq_ip[irq] = 0UL;
+		e->enirq_stack[irq] = NULL;
+	}
+}
+SET_CONSTRUCTOR(ecxt, initialize_ecxt);
+
+/*
+ * Constructor of struct dept_wait: no ips, no stacks, no irq flags.
+ * ->class is not touched here.
+ */
+static void initialize_wait(struct dept_wait *w)
+{
+	int irq;
+
+	w->wait_ip = 0UL;
+	w->wait_stack = NULL;
+	w->irqf = 0UL;
+
+	for (irq = 0; irq < DEPT_IRQS_NR; irq++) {
+		w->irq_ip[irq] = 0UL;
+		w->irq_stack[irq] = NULL;
+	}
+}
+SET_CONSTRUCTOR(wait, initialize_wait);
+
+/* Constructor of struct dept_stack: no entries, so valid_stack() is false. */
+static void initialize_stack(struct dept_stack *s)
+{
+ s->nr = 0;
+}
+SET_CONSTRUCTOR(stack, initialize_stack);
+
+/*
+ * Declare ctor_<id> once more for every object type.  For a type that
+ * had no SET_CONSTRUCTOR() this leaves only uninitialized declarations,
+ * so its ctor_<id> is NULL and new_<id>() skips the constructor call.
+ */
+#define OBJECT(id, nr) \
+static void (*ctor_##id)(struct dept_##id *a);
+ #include "dept_object.h"
+#undef OBJECT
+
+#undef SET_CONSTRUCTOR
+
+/* Define dtor_<id>, the hook put_<id>() runs on the last reference, as @f. */
+#define SET_DESTRUCTOR(id, f) \
+static void (*dtor_##id)(struct dept_##id *a) = f
+
+/* Destructor of struct dept_dep: drop the references on both ends. */
+static void destroy_dep(struct dept_dep *d)
+{
+	struct dept_ecxt *e = dep_e(d);
+	struct dept_wait *w = dep_w(d);
+
+	if (e)
+		put_ecxt(e);
+	if (w)
+		put_wait(w);
+}
+SET_DESTRUCTOR(dep, destroy_dep);
+
+/*
+ * Destructor of struct dept_ecxt: drop the references it holds on its
+ * per-irq stacks, its class, and its ecxt and event stacks.
+ */
+static void destroy_ecxt(struct dept_ecxt *e)
+{
+	int irq;
+
+	for (irq = 0; irq < DEPT_IRQS_NR; irq++) {
+		struct dept_stack *s = e->enirq_stack[irq];
+
+		if (s)
+			put_stack(s);
+	}
+
+	if (e->class)
+		put_class(e->class);
+
+	if (e->ecxt_stack)
+		put_stack(e->ecxt_stack);
+
+	if (e->event_stack)
+		put_stack(e->event_stack);
+}
+SET_DESTRUCTOR(ecxt, destroy_ecxt);
+
+/*
+ * Destructor of struct dept_wait: drop the references it holds on its
+ * per-irq stacks, its class and its wait stack.
+ */
+static void destroy_wait(struct dept_wait *w)
+{
+	int irq;
+
+	for (irq = 0; irq < DEPT_IRQS_NR; irq++) {
+		struct dept_stack *s = w->irq_stack[irq];
+
+		if (s)
+			put_stack(s);
+	}
+
+	if (w->class)
+		put_class(w->class);
+
+	if (w->wait_stack)
+		put_stack(w->wait_stack);
+}
+SET_DESTRUCTOR(wait, destroy_wait);
+
+/*
+ * Declare dtor_<id> once more for every object type.  For a type that
+ * had no SET_DESTRUCTOR() this leaves only uninitialized declarations,
+ * so its dtor_<id> is NULL and put_<id>() skips the destructor call.
+ */
+#define OBJECT(id, nr) \
+static void (*dtor_##id)(struct dept_##id *a);
+ #include "dept_object.h"
+#undef OBJECT
+
+#undef SET_DESTRUCTOR
+
+/*
+ * Caching and hashing
+ * =====================================================================
+ * DEPT makes use of caching and hashing to improve performance. Each
+ * object can be obtained in O(1) with its key.
+ *
+ * NOTE: Currently we assume all the objects in the hashes will never be
+ * removed. Implement removal when needed.
+ */
+
+/*
+ * Build a hashing key from two values: the low half of @a goes to the
+ * upper half of the result, the low half of @b to the lower half.  The
+ * upper halves of both inputs are lost, which is fine for a hash key.
+ */
+static unsigned long mix(unsigned long a, unsigned long b)
+{
+	const int shift = sizeof(unsigned long) * 8 / 2;
+	const unsigned long low = b & ((1UL << shift) - 1UL);
+
+	return (a << shift) | low;
+}
+
+/* Two dependencies match when both their from- and to-class keys match. */
+static bool cmp_dep(struct dept_dep *d1, struct dept_dep *d2)
+{
+	if (dep_fc(d1)->key != dep_fc(d2)->key)
+		return false;
+
+	return dep_tc(d1)->key == dep_tc(d2)->key;
+}
+
+/* Hash key of a dependency: mix() of its from- and to-class keys. */
+static unsigned long key_dep(struct dept_dep *d)
+{
+ return mix(dep_fc(d)->key, dep_tc(d)->key);
+}
+
+/* Two classes match when their keys are equal. */
+static bool cmp_class(struct dept_class *c1, struct dept_class *c2)
+{
+ return c1->key == c2->key;
+}
+
+/* Hash key of a class: its ->key as is. */
+static unsigned long key_class(struct dept_class *c)
+{
+ return c->key;
+}
+
+/*
+ * Generate a hash table and its helpers for every HASH(id, bits) entry
+ * in dept_hash.h, built on key_<id>() and cmp_<id>():
+ *
+ * table_<id>[] - 1 << bits buckets
+ * head_<id>() - bucket that @a hashes to
+ * hash_lookup_<id>() - first entry in that bucket matching @a, or NULL
+ * hash_add_<id>() - take a reference on @a and insert it
+ * hash_del_<id>() - unlink @a and drop that reference
+ */
+#define HASH(id, bits) \
+static struct hlist_head table_##id[1 << (bits)]; \
+ \
+static struct hlist_head *head_##id(struct dept_##id *a) \
+{ \
+ return table_##id + hash_long(key_##id(a), bits); \
+} \
+ \
+static struct dept_##id *hash_lookup_##id(struct dept_##id *a) \
+{ \
+ struct dept_##id *b; \
+ \
+ hlist_for_each_entry_rcu(b, head_##id(a), hash_node) \
+ if (cmp_##id(a, b)) \
+ return b; \
+ return NULL; \
+} \
+ \
+static void hash_add_##id(struct dept_##id *a) \
+{ \
+ get_##id(a); \
+ hlist_add_head_rcu(&a->hash_node, head_##id(a)); \
+} \
+ \
+static void hash_del_##id(struct dept_##id *a) \
+{ \
+ hlist_del_rcu(&a->hash_node); \
+ put_##id(a); \
+}
+#include "dept_hash.h"
+#undef HASH
+
+/*
+ * Find the dependency from class @fc to class @tc in the hash, or
+ * return NULL.  A throw-away dep/ecxt/wait triple on the stack carries
+ * just the two classes, which is all key_dep() and cmp_dep() look at.
+ */
+static struct dept_dep *lookup_dep(struct dept_class *fc,
+ struct dept_class *tc)
+{
+ struct dept_ecxt onetime_e = { .class = fc };
+ struct dept_wait onetime_w = { .class = tc };
+ struct dept_dep onetime_d = { .ecxt = &onetime_e,
+ .wait = &onetime_w };
+ return hash_lookup_dep(&onetime_d);
+}
+
+/*
+ * Find the class with the given @key in the hash, or return NULL.  A
+ * throw-away class on the stack carries the key for the lookup.
+ */
+static struct dept_class *lookup_class(unsigned long key)
+{
+ struct dept_class onetime_c = { .key = key };
+
+ return hash_lookup_class(&onetime_c);
+}
+
+/*
+ * Report
+ * =====================================================================
+ * DEPT prints useful information to help debugging on detection of
+ * problematic dependency.
+ */
+
+/*
+ * Print where something happened: the symbol of @ip when it is
+ * non-zero and the saved stacktrace @s when it holds entries.  When
+ * neither is available, print "(N/A)".
+ */
+static void print_ip_stack(unsigned long ip, struct dept_stack *s)
+{
+	bool has_trace = valid_stack(s);
+
+	if (ip)
+		print_ip_sym(KERN_WARNING, ip);
+
+#ifdef CONFIG_DEPT_DEBUG
+	if (s) {
+		if (!s->nr)
+			pr_warn("stack->nr is 0.\n");
+		pr_warn("stack ref is %d.\n", atomic_read(&s->ref));
+	} else {
+		pr_warn("stack is NULL.\n");
+	}
+#endif
+
+	if (has_trace) {
+		pr_warn("stacktrace:\n");
+		stack_trace_print(s->raw, s->nr, 5);
+	} else if (!ip) {
+		pr_warn("(N/A)\n");
+	}
+}
+
+/* pr_warn() the message padded on the left to a width of 3 * @spc. */
+#define print_spc(spc, fmt, ...) \
+ pr_warn("%*c" fmt, (spc) * 3, ' ', ##__VA_ARGS__)
+
+/*
+ * Print the short [S]/[W]/[E] diagram of dependency @d.
+ *
+ * For every irq type that is set both in the event context's ->enirqf
+ * and in the wait's ->irqf, one diagram with an "<... interrupt>" line
+ * is printed, alternatives being separated by "or".  When there is no
+ * such irq type, a single plain three-line diagram is printed.
+ */
+static void print_diagram(struct dept_dep *d)
+{
+ struct dept_ecxt *e = dep_e(d);
+ struct dept_wait *w = dep_w(d);
+ struct dept_class *fc = dep_fc(d);
+ struct dept_class *tc = dep_tc(d);
+ unsigned long irqf;
+ int irq;
+ bool firstline = true;
+ int spc = 1;
+ /* Missing function or class names are shown as "(unknown)". */
+ const char *w_fn = w->wait_fn ?: "(unknown)";
+ const char *e_fn = e->event_fn ?: "(unknown)";
+ const char *c_fn = e->ecxt_fn ?: "(unknown)";
+ const char *fc_n = fc->sched_map ? "<sched>" : (fc->name ?: "(unknown)");
+ const char *tc_n = tc->sched_map ? "<sched>" : (tc->name ?: "(unknown)");
+
+ irqf = e->enirqf & w->irqf;
+ for_each_set_bit(irq, &irqf, DEPT_IRQS_NR) {
+ if (!firstline)
+ pr_warn("\nor\n\n");
+ firstline = false;
+
+ print_spc(spc, "[S] %s(%s:%d)\n", c_fn, fc_n, fc->sub_id);
+ print_spc(spc, " <%s interrupt>\n", irq_str(irq));
+ print_spc(spc + 1, "[W] %s(%s:%d)\n", w_fn, tc_n, tc->sub_id);
+ print_spc(spc, "[E] %s(%s:%d)\n", e_fn, fc_n, fc->sub_id);
+ }
+
+ if (!irqf) {
+ print_spc(spc, "[S] %s(%s:%d)\n", c_fn, fc_n, fc->sub_id);
+ print_spc(spc, "[W] %s(%s:%d)\n", w_fn, tc_n, tc->sub_id);
+ print_spc(spc, "[E] %s(%s:%d)\n", e_fn, fc_n, fc->sub_id);
+ }
+}
+
+/*
+ * Print the details of dependency @d: the ip and stacktrace of its
+ * [S]tart of event context, its [W]ait and its [E]vent.
+ *
+ * For every irq type set both in the event context's ->enirqf and in
+ * the wait's ->irqf, the place where that irq got enabled is printed
+ * first and the wait is reported from its per-irq ip/stack.  When no
+ * such irq type exists, the plain wait ip/stack is used instead.
+ */
+static void print_dep(struct dept_dep *d)
+{
+ struct dept_ecxt *e = dep_e(d);
+ struct dept_wait *w = dep_w(d);
+ struct dept_class *fc = dep_fc(d);
+ struct dept_class *tc = dep_tc(d);
+ unsigned long irqf;
+ int irq;
+ const char *w_fn = w->wait_fn ?: "(unknown)";
+ const char *e_fn = e->event_fn ?: "(unknown)";
+ const char *c_fn = e->ecxt_fn ?: "(unknown)";
+ const char *fc_n = fc->sched_map ? "<sched>" : (fc->name ?: "(unknown)");
+ const char *tc_n = tc->sched_map ? "<sched>" : (tc->name ?: "(unknown)");
+
+ irqf = e->enirqf & w->irqf;
+ for_each_set_bit(irq, &irqf, DEPT_IRQS_NR) {
+ pr_warn("%s has been enabled:\n", irq_str(irq));
+ print_ip_stack(e->enirq_ip[irq], e->enirq_stack[irq]);
+ pr_warn("\n");
+
+ pr_warn("[S] %s(%s:%d):\n", c_fn, fc_n, fc->sub_id);
+ print_ip_stack(e->ecxt_ip, e->ecxt_stack);
+ pr_warn("\n");
+
+ pr_warn("[W] %s(%s:%d) in %s context:\n",
+ w_fn, tc_n, tc->sub_id, irq_str(irq));
+ print_ip_stack(w->irq_ip[irq], w->irq_stack[irq]);
+ pr_warn("\n");
+
+ pr_warn("[E] %s(%s:%d):\n", e_fn, fc_n, fc->sub_id);
+ print_ip_stack(e->event_ip, e->event_stack);
+ }
+
+ if (!irqf) {
+ pr_warn("[S] %s(%s:%d):\n", c_fn, fc_n, fc->sub_id);
+ print_ip_stack(e->ecxt_ip, e->ecxt_stack);
+ pr_warn("\n");
+
+ pr_warn("[W] %s(%s:%d):\n", w_fn, tc_n, tc->sub_id);
+ print_ip_stack(w->wait_ip, w->wait_stack);
+ pr_warn("\n");
+
+ pr_warn("[E] %s(%s:%d):\n", e_fn, fc_n, fc->sub_id);
+ print_ip_stack(e->event_ip, e->event_stack);
+ }
+}
+
+static void save_current_stack(int skip);
+
+/*
+ * Print all classes in a circle.
+ *
+ * Starting with the dependency (@c->bfs_parent -> @c), follow the
+ * ->bfs_parent links until the walk comes back to @c.  The walk is
+ * done twice: first to print the summary diagrams, then to print the
+ * details of every dependency.  The whole report runs as an outworld
+ * section and ends with dump_stack().
+ *
+ * NOTE(review): the result of lookup_dep() is used without a NULL
+ * check, i.e. every (parent, child) pair on the chain is assumed to
+ * have a dependency in the hash - confirm.
+ */
+static void print_circle(struct dept_class *c)
+{
+ struct dept_class *fc = c->bfs_parent;
+ struct dept_class *tc = c;
+ int i;
+
+ dept_outworld_enter();
+ save_current_stack(6);
+
+ pr_warn("===================================================\n");
+ pr_warn("DEPT: Circular dependency has been detected.\n");
+ pr_warn("%s %.*s %s\n", init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version,
+ print_tainted());
+ pr_warn("---------------------------------------------------\n");
+ pr_warn("summary\n");
+ pr_warn("---------------------------------------------------\n");
+
+ /* A class depending on itself is reported as an AA deadlock. */
+ if (fc == tc)
+ pr_warn("*** AA DEADLOCK ***\n\n");
+ else
+ pr_warn("*** DEADLOCK ***\n\n");
+
+ i = 0;
+ do {
+ struct dept_dep *d = lookup_dep(fc, tc);
+
+ pr_warn("context %c\n", 'A' + (i++));
+ print_diagram(d);
+ if (fc != c)
+ pr_warn("\n");
+
+ tc = fc;
+ fc = fc->bfs_parent;
+ } while (tc != c);
+
+ pr_warn("\n");
+ pr_warn("[S]: start of the event context\n");
+ pr_warn("[W]: the wait blocked\n");
+ pr_warn("[E]: the event not reachable\n");
+
+ /* The first walk ended where it started, so fc/tc can be reused. */
+ i = 0;
+ do {
+ struct dept_dep *d = lookup_dep(fc, tc);
+
+ pr_warn("---------------------------------------------------\n");
+ pr_warn("context %c's detail\n", 'A' + i);
+ pr_warn("---------------------------------------------------\n");
+ pr_warn("context %c\n", 'A' + (i++));
+ print_diagram(d);
+ pr_warn("\n");
+ print_dep(d);
+
+ tc = fc;
+ fc = fc->bfs_parent;
+ } while (tc != c);
+
+ pr_warn("---------------------------------------------------\n");
+ pr_warn("information that might be helpful\n");
+ pr_warn("---------------------------------------------------\n");
+ dump_stack();
+
+ dept_outworld_exit();
+}
+
+/*
+ * BFS(Breadth First Search)
+ * =====================================================================
+ * Whenever a new dependency is added into the graph, search the graph
+ * for a new circular dependency.
+ */
+
+/*
+ * Callbacks that specialize bfs() for a particular search.  All four
+ * must be set.
+ */
+struct bfs_ops {
+ /* called once with (root, in, out) before the search starts */
+ void (*bfs_init)(void *, void *, void **);
+ /* add the nodes to visit after the given node to the queue */
+ void (*extend)(struct list_head *, void *);
+ /* remove and return the next node from the queue */
+ void *(*dequeue)(struct list_head *);
+ /* visit a node with (node, in, out) and return a verdict */
+ enum bfs_ret (*callback)(void *, void *, void **);
+};
+
+/* Generation number of the current search; bumped by every bfs() call. */
+static unsigned int bfs_gen;
+
+/*
+ * Generic breadth first search starting at @root.  @ops supplies the
+ * graph specific parts, @in and @out are passed through to them
+ * untouched.  Returns silently, with a warning, if @ops is incomplete.
+ *
+ * NOTE: Must be called with dept_lock held.
+ */
+static void bfs(void *root, struct bfs_ops *ops, void *in, void **out)
+{
+ LIST_HEAD(q);
+ enum bfs_ret ret;
+
+ if (DEPT_WARN_ON(!ops || !ops->bfs_init || !ops->extend ||
+ !ops->dequeue || !ops->callback))
+ return;
+
+ /*
+ * Avoid zero bfs_gen.
+ */
+ bfs_gen = bfs_gen + 1 ?: 1;
+ ops->bfs_init(root, in, out);
+
+ ret = ops->callback(root, in, out);
+ if (ret != BFS_CONTINUE)
+ return;
+
+ ops->extend(&q, root);
+ while (!list_empty(&q)) {
+ void *node = ops->dequeue(&q);
+
+ /*
+ * Once a callback said BFS_DONE, only drain the queue so
+ * that no node is left linked on the on-stack list head.
+ */
+ if (ret == BFS_DONE)
+ continue;
+
+ ret = ops->callback(node, in, out);
+ if (ret == BFS_CONTINUE)
+ ops->extend(&q, node);
+ }
+}
+
+/*
+ * Main operations
+ * =====================================================================
+ * Add dependencies - Each new dependency is added into the graph and
+ * checked if it forms a circular dependency.
+ *
+ * Track waits - Waits are queued into the ring buffer for later use to
+ * generate appropriate dependencies with cross-event.
+ *
+ * Track event contexts(ecxt) - Event contexts are pushed into local
+ * stack for later use to generate appropriate dependencies with waits.
+ */
+
+static unsigned long cur_enirqf(void);
+static int cur_irq(void);
+static unsigned int cur_ctxt_id(void);
+
+/* Per-irq event context slot of class @c for irq type @irq. */
+static struct dept_iecxt *iecxt(struct dept_class *c, int irq)
+{
+ return &c->iecxt[irq];
+}
+
+/* Per-irq wait slot of class @c for irq type @irq. */
+static struct dept_iwait *iwait(struct dept_class *c, int irq)
+{
+ return &c->iwait[irq];
+}
+
+/*
+ * Retire @ie: drop its reference on the event context, if any, clear
+ * ->ecxt and mark the slot staled so that add_iecxt() skips it from
+ * now on.
+ */
+static void stale_iecxt(struct dept_iecxt *ie)
+{
+ if (ie->ecxt)
+ put_ecxt(ie->ecxt);
+
+ WRITE_ONCE(ie->ecxt, NULL);
+ WRITE_ONCE(ie->staled, true);
+}
+
+/*
+ * Install @e in @ie, taking a reference on it.  ->ecxt will never be
+ * updated once getting set until the class gets removed, so finding it
+ * already set only triggers a warning and leaves it as it is.
+ */
+static void set_iecxt(struct dept_iecxt *ie, struct dept_ecxt *e)
+{
+	if (!ie->ecxt) {
+		WRITE_ONCE(ie->ecxt, get_ecxt(e));
+		return;
+	}
+
+	DEPT_WARN_ON(1);
+}
+
+/*
+ * Retire @iw: drop its reference on the wait, if any, clear ->wait and
+ * mark the slot staled so that add_iwait() skips it from now on.
+ */
+static void stale_iwait(struct dept_iwait *iw)
+{
+ if (iw->wait)
+ put_wait(iw->wait);
+
+ WRITE_ONCE(iw->wait, NULL);
+ WRITE_ONCE(iw->staled, true);
+}
+
+/*
+ * Install @w in @iw, taking a reference on it, and mark the slot
+ * touched.  ->wait will never be updated once getting set until the
+ * class gets removed, so finding it already set only triggers a
+ * warning; the slot is marked touched in either case.
+ */
+static void set_iwait(struct dept_iwait *iw, struct dept_wait *w)
+{
+	if (!iw->wait)
+		WRITE_ONCE(iw->wait, get_wait(w));
+	else
+		DEPT_WARN_ON(1);
+
+	iw->touched = true;
+}
+
+/* Mark @iw touched; cb_find_iw() skips slots that are not. */
+static void touch_iwait(struct dept_iwait *iw)
+{
+ iw->touched = true;
+}
+
+/* Clear the touched mark of @iw. */
+static void untouch_iwait(struct dept_iwait *iw)
+{
+ iw->touched = false;
+}
+
+/*
+ * Take a reference on the current task's pending stack object and
+ * return it, or return NULL when the task has none.
+ */
+static struct dept_stack *get_current_stack(void)
+{
+	struct dept_stack *s = dept_task()->stack;
+
+	if (!s)
+		return NULL;
+
+	return get_stack(s);
+}
+
+/*
+ * Attach a fresh, still empty stack object to the current task.  Warns
+ * if one is already attached.  ->stack becomes NULL when new_stack()
+ * has nothing to hand out.
+ */
+static void prepare_current_stack(void)
+{
+ DEPT_WARN_ON(dept_task()->stack);
+
+ dept_task()->stack = new_stack();
+}
+
+/*
+ * Fill the current task's stack object with a stacktrace, skipping the
+ * first @skip entries.  Nothing is done when the task has no stack
+ * object or when the object already holds a trace.
+ */
+static void save_current_stack(int skip)
+{
+	struct dept_stack *s = dept_task()->stack;
+
+	if (!s || valid_stack(s))
+		return;
+
+	s->nr = stack_trace_save(s->raw, DEPT_MAX_STACK_ENTRY, skip);
+}
+
+/*
+ * Detach the stack object prepared by prepare_current_stack() from the
+ * current task and drop the task's reference on it.
+ */
+static void finish_current_stack(void)
+{
+ struct dept_stack *s = dept_task()->stack;
+
+ /*
+ * Fill the struct dept_stack with a valid stacktrace if it has
+ * been referred at least once.
+ */
+ if (stack_consumed(s))
+ save_current_stack(2);
+
+ dept_task()->stack = NULL;
+
+ /*
+ * Actual deletion will happen at put_stack() if the stack has
+ * been referred.
+ */
+ if (s)
+ del_stack(s);
+}
+
+/*
+ * FIXME: For now, disable LOCKDEP while DEPT is working.
+ *
+ * Both LOCKDEP and DEPT report it on a deadlock detection using
+ * printk taking the risk of another deadlock that might be caused by
+ * locks of console or printk between inside and outside of them.
+ *
+ * For DEPT, it's no problem since multiple reports are allowed. But it
+ * would be a bad idea for LOCKDEP since it will stop even on a single
+ * report. So we need to prevent LOCKDEP from reporting the risk that
+ * DEPT takes while it is reporting something.
+ */
+#include <linux/lockdep.h>
+
+/*
+ * Bump the current task's ->recursive count and call lockdep_off().
+ * Paired with dept_on().
+ */
+void noinstr dept_off(void)
+{
+ dept_task()->recursive++;
+ lockdep_off();
+}
+
+/*
+ * Undo dept_off() in reverse order: call lockdep_on(), then lower the
+ * current task's ->recursive count.
+ */
+void noinstr dept_on(void)
+{
+ lockdep_on();
+ dept_task()->recursive--;
+}
+
+/*
+ * Enter the DEPT engine: save and disable local IRQs, dept_off(), and
+ * attach a fresh stack object to the current task.  Returns the saved
+ * IRQ flags, to be handed to dept_exit().
+ */
+static unsigned long dept_enter(void)
+{
+ unsigned long flags;
+
+ flags = arch_local_irq_save();
+ dept_off();
+ prepare_current_stack();
+ return flags;
+}
+
+/*
+ * Leave the DEPT engine, undoing dept_enter() in reverse order: detach
+ * the stack object, dept_on(), and restore the IRQ state in @flags.
+ */
+static void dept_exit(unsigned long flags)
+{
+ finish_current_stack();
+ dept_on();
+ arch_local_irq_restore(flags);
+}
+
+/*
+ * Lightweight variant of dept_enter(): only saves and disables local
+ * IRQs.  Returns the saved flags for dept_exit_recursive().
+ */
+static unsigned long dept_enter_recursive(void)
+{
+	return arch_local_irq_save();
+}
+
+/* Restore the IRQ state saved by dept_enter_recursive(). */
+static void dept_exit_recursive(unsigned long flags)
+{
+ arch_local_irq_restore(flags);
+}
+
+/*
+ * Create the dependency from @e's class to @w's class and link it into
+ * the hash and into both classes' dependency lists.  The new dep holds
+ * a reference on @e and on @w.
+ *
+ * Returns the new dependency, or NULL when either class is invalid,
+ * the dependency already exists, or no dept_dep object is available.
+ *
+ * NOTE: Must be called with dept_lock held.
+ */
+static struct dept_dep *__add_dep(struct dept_ecxt *e,
+ struct dept_wait *w)
+{
+ struct dept_dep *d;
+
+ if (DEPT_WARN_ON(!valid_class(e->class)))
+ return NULL;
+
+ if (DEPT_WARN_ON(!valid_class(w->class)))
+ return NULL;
+
+ if (lookup_dep(e->class, w->class))
+ return NULL;
+
+ d = new_dep();
+ if (unlikely(!d))
+ return NULL;
+
+ d->ecxt = get_ecxt(e);
+ d->wait = get_wait(w);
+
+ /*
+ * Add the dependency into hash and graph.
+ */
+ hash_add_dep(d);
+ list_add(&d->dep_node, &dep_fc(d)->dep_head);
+ list_add(&d->dep_rev_node, &dep_tc(d)->dep_rev_head);
+ return d;
+}
+
+/*
+ * bfs_init hook of the deadlock check: mark the root class visited in
+ * this generation and record the new dependency's from-class as the
+ * parent of its to-class, so that print_circle() can close the circle.
+ */
+static void bfs_init_check_dl(void *node, void *in, void **out)
+{
+ struct dept_class *root = (struct dept_class *)node;
+ struct dept_dep *new = (struct dept_dep *)in;
+
+ root->bfs_gen = bfs_gen;
+ dep_tc(new)->bfs_parent = dep_fc(new);
+}
+
+/*
+ * extend hook walking dependencies forward: queue on @h the to-class
+ * of every dependency that starts at @node, unless that class has
+ * already been visited in the current generation.  Each queued class
+ * remembers @node as its parent.
+ */
+static void bfs_extend_dep(struct list_head *h, void *node)
+{
+	struct dept_class *from = (struct dept_class *)node;
+	struct dept_dep *dep;
+
+	list_for_each_entry(dep, &from->dep_head, dep_node) {
+		struct dept_class *to = dep_tc(dep);
+
+		if (to->bfs_gen != bfs_gen) {
+			to->bfs_parent = from;
+			to->bfs_gen = bfs_gen;
+			list_add_tail(&to->bfs_node, h);
+		}
+	}
+}
+
+/*
+ * dequeue hook: unlink and return the class at the front of @h.
+ *
+ * NOTE(review): an empty @h only triggers a warning and the function
+ * carries on; bfs() never calls this with an empty queue.
+ */
+static void *bfs_dequeue_dep(struct list_head *h)
+{
+ struct dept_class *c;
+
+ DEPT_WARN_ON(list_empty(h));
+
+ c = list_first_entry(h, struct dept_class, bfs_node);
+ list_del(&c->bfs_node);
+ return c;
+}
+
+/*
+ * callback hook of the deadlock check: reaching the from-class of the
+ * new dependency @in means the graph now contains a circle, which is
+ * reported before the search is ended.
+ */
+static enum bfs_ret cb_check_dl(void *node, void *in, void **out)
+{
+	struct dept_class *visited = (struct dept_class *)node;
+	struct dept_dep *new = (struct dept_dep *)in;
+
+	if (visited != dep_fc(new))
+		return BFS_CONTINUE;
+
+	print_circle(dep_tc(new));
+	return BFS_DONE;
+}
+
+/*
+ * This function is actually in charge of reporting.
+ *
+ * Search forward from the to-class of the new dependency @d; if the
+ * from-class of @d is reachable, cb_check_dl() prints the circle.
+ */
+static void check_dl_bfs(struct dept_dep *d)
+{
+ struct bfs_ops ops = {
+ .bfs_init = bfs_init_check_dl,
+ .extend = bfs_extend_dep,
+ .dequeue = bfs_dequeue_dep,
+ .callback = cb_check_dl,
+ };
+
+ bfs((void *)dep_tc(d), &ops, (void *)d, NULL);
+}
+
+/* bfs_init hook: mark the root class visited in the current generation. */
+static void bfs_init_dep(void *node, void *in, void **out)
+{
+ struct dept_class *root = (struct dept_class *)node;
+
+ root->bfs_gen = bfs_gen;
+}
+
+/*
+ * extend hook walking dependencies backward: queue on @h the
+ * from-class of every dependency that ends at @node, unless that class
+ * has already been visited in the current generation.  Each queued
+ * class remembers @node as its parent.
+ */
+static void bfs_extend_dep_rev(struct list_head *h, void *node)
+{
+	struct dept_class *to = (struct dept_class *)node;
+	struct dept_dep *dep;
+
+	list_for_each_entry(dep, &to->dep_rev_head, dep_rev_node) {
+		struct dept_class *from = dep_fc(dep);
+
+		if (from->bfs_gen != bfs_gen) {
+			from->bfs_parent = to;
+			from->bfs_gen = bfs_gen;
+			list_add_tail(&from->bfs_node, h);
+		}
+	}
+}
+
+/*
+ * callback hook of find_iw_bfs(): @in points to the irq type, @out
+ * receives the first iwait found with ->wait set.
+ */
+static enum bfs_ret cb_find_iw(void *node, void *in, void **out)
+{
+ struct dept_class *cur = (struct dept_class *)node;
+ int irq = *(int *)in;
+ struct dept_iwait *iw;
+
+ if (DEPT_WARN_ON(!out))
+ return BFS_DONE;
+
+ iw = iwait(cur, irq);
+
+ /*
+ * If any parent's ->wait was set, then the children would've
+ * been touched.
+ */
+ if (!iw->touched)
+ return BFS_SKIP;
+
+ /* Touched but no wait of its own: keep looking further back. */
+ if (!iw->wait)
+ return BFS_CONTINUE;
+
+ *out = iw;
+ return BFS_DONE;
+}
+
+/*
+ * Search backward from class @c, @c included, for an iwait of irq type
+ * @irq that has ->wait set.  Returns it, or NULL when there is none,
+ * in which case @c's own iwait gets untouched.
+ */
+static struct dept_iwait *find_iw_bfs(struct dept_class *c, int irq)
+{
+	struct dept_iwait *result = NULL;
+	struct bfs_ops ops = {
+		.bfs_init = bfs_init_dep,
+		.extend = bfs_extend_dep_rev,
+		.dequeue = bfs_dequeue_dep,
+		.callback = cb_find_iw,
+	};
+
+	bfs((void *)c, &ops, (void *)&irq, (void **)&result);
+
+	if (!result)
+		untouch_iwait(iwait(c, irq));
+
+	return result;
+}
+
+/*
+ * callback hook of touch_iw_find_ie_bfs(): touch the iwait of every
+ * visited class and remember in @out the first iecxt met that has
+ * ->ecxt set.  The search is never cut short, so that touch_iwait() is
+ * done all the way.
+ */
+static enum bfs_ret cb_touch_iw_find_ie(void *node, void *in, void **out)
+{
+	struct dept_class *visited = (struct dept_class *)node;
+	int irq = *(int *)in;
+	struct dept_iecxt *ie = iecxt(visited, irq);
+	struct dept_iwait *iw = iwait(visited, irq);
+
+	if (DEPT_WARN_ON(!out))
+		return BFS_DONE;
+
+	touch_iwait(iw);
+
+	if (ie->ecxt && !*out)
+		*out = ie;
+
+	return BFS_CONTINUE;
+}
+
+/*
+ * Search forward from class @c, @c included, touching the iwait of
+ * irq type @irq of every class reached.  Returns the first iecxt met
+ * that has ->ecxt set, or NULL when there is none.
+ */
+static struct dept_iecxt *touch_iw_find_ie_bfs(struct dept_class *c,
+ int irq)
+{
+ struct dept_iecxt *found = NULL;
+ struct bfs_ops ops = {
+ .bfs_init = bfs_init_dep,
+ .extend = bfs_extend_dep,
+ .dequeue = bfs_dequeue_dep,
+ .callback = cb_touch_iw_find_ie,
+ };
+
+ bfs((void *)c, &ops, (void *)&irq, (void **)&found);
+ return found;
+}
+
+/*
+ * Add the dependency between the event context held by @ie and the
+ * wait held by @iw, and check it for a deadlock.
+ *
+ * Should be called with dept_lock held.
+ */
+static void __add_idep(struct dept_iecxt *ie, struct dept_iwait *iw)
+{
+ struct dept_dep *new;
+
+ /*
+ * There's nothing to do.
+ */
+ if (!ie || !iw || !ie->ecxt || !iw->wait)
+ return;
+
+ new = __add_dep(ie->ecxt, iw->wait);
+
+ /*
+ * A new dependency has been added. Let check_dl_bfs() look for
+ * a circle it closes and report it, then retire both slots.
+ */
+ if (new) {
+ check_dl_bfs(new);
+ stale_iecxt(ie);
+ stale_iwait(iw);
+ }
+
+ /*
+ * If !new, the dependency already exists, a class is invalid or
+ * there is a lack of object resource. Just let it go and get
+ * checked by other chances. Retrying is meaningless in that
+ * case.
+ */
+}
+
+/*
+ * Install @e as the event context of class @c for @irq, then try to
+ * pair it with a wait found by searching backward from @c.
+ */
+static void set_check_iecxt(struct dept_class *c, int irq,
+ struct dept_ecxt *e)
+{
+ struct dept_iecxt *ie = iecxt(c, irq);
+
+ set_iecxt(ie, e);
+ __add_idep(ie, find_iw_bfs(c, irq));
+}
+
+/*
+ * Install @w as the wait of class @c for @irq, then try to pair it
+ * with an event context found by searching forward from @c.
+ */
+static void set_check_iwait(struct dept_class *c, int irq,
+ struct dept_wait *w)
+{
+ struct dept_iwait *iw = iwait(c, irq);
+
+ set_iwait(iw, w);
+ __add_idep(touch_iw_find_ie_bfs(c, irq), iw);
+}
+
+/*
+ * Record @e as the event context of class @c for irq type @irq, at
+ * most once per slot.  @stack selects whether a reference on the
+ * current task's stack object is stored along with the irq-enable ip.
+ *
+ * The slot is checked without the lock first and once more with
+ * dept_lock held.  Nothing is done if the lock cannot be taken.
+ */
+static void add_iecxt(struct dept_class *c, int irq, struct dept_ecxt *e,
+ bool stack)
+{
+ /*
+ * This access is safe since we ensure e->class has set locally.
+ */
+ struct dept_task *dt = dept_task();
+ struct dept_iecxt *ie = iecxt(c, irq);
+
+ if (DEPT_WARN_ON(!valid_class(c)))
+ return;
+
+ if (unlikely(READ_ONCE(ie->staled)))
+ return;
+
+ /*
+ * Skip add_iecxt() if ie->ecxt has ever been set at least once.
+ * Which means it has a valid ->ecxt or been staled.
+ */
+ if (READ_ONCE(ie->ecxt))
+ return;
+
+ if (unlikely(!dept_lock()))
+ return;
+
+ /* Re-check under the lock. */
+ if (unlikely(ie->staled))
+ goto unlock;
+ if (ie->ecxt)
+ goto unlock;
+
+ e->enirqf |= (1UL << irq);
+
+ /*
+ * Should be NULL since it's the first time that these
+ * enirq_{ip,stack}[irq] have ever set.
+ */
+ DEPT_WARN_ON(e->enirq_ip[irq]);
+ DEPT_WARN_ON(e->enirq_stack[irq]);
+
+ e->enirq_ip[irq] = dt->enirq_ip[irq];
+ e->enirq_stack[irq] = stack ? get_current_stack() : NULL;
+
+ set_check_iecxt(c, irq, e);
+unlock:
+ dept_unlock();
+}
+
+/*
+ * Record @w as the wait of class @c for irq type @irq, at most once
+ * per slot.  The wait's own ip and a reference on the current task's
+ * stack object are stored as the per-irq ip and stack.
+ *
+ * The slot is checked without the lock first and once more with
+ * dept_lock held.  Nothing is done if the lock cannot be taken.
+ */
+static void add_iwait(struct dept_class *c, int irq, struct dept_wait *w)
+{
+ struct dept_iwait *iw = iwait(c, irq);
+
+ if (DEPT_WARN_ON(!valid_class(c)))
+ return;
+
+ if (unlikely(READ_ONCE(iw->staled)))
+ return;
+
+ /*
+ * Skip add_iwait() if iw->wait has ever been set at least once.
+ * Which means it has a valid ->wait or been staled.
+ */
+ if (READ_ONCE(iw->wait))
+ return;
+
+ if (unlikely(!dept_lock()))
+ return;
+
+ /* Re-check under the lock. */
+ if (unlikely(iw->staled))
+ goto unlock;
+ if (iw->wait)
+ goto unlock;
+
+ w->irqf |= (1UL << irq);
+
+ /*
+ * Should be NULL since it's the first time that these
+ * irq_{ip,stack}[irq] have ever set.
+ */
+ DEPT_WARN_ON(w->irq_ip[irq]);
+ DEPT_WARN_ON(w->irq_stack[irq]);
+
+ w->irq_ip[irq] = w->wait_ip;
+ w->irq_stack[irq] = get_current_stack();
+
+ set_check_iwait(c, irq, w);
+unlock:
+ dept_unlock();
+}
+
+/*
+ * Entry of the current task's wait history at position @pos, taken
+ * modulo DEPT_MAX_WAIT_HIST.
+ */
+static struct dept_wait_hist *hist(int pos)
+{
+	return &dept_task()->wait_hist[pos % DEPT_MAX_WAIT_HIST];
+}
+
+/* Position in the current task's wait history that is written next. */
+static int hist_pos_next(void)
+{
+ struct dept_task *dt = dept_task();
+
+ return dt->wait_hist_pos % DEPT_MAX_WAIT_HIST;
+}
+
+/* Advance the write position by one, wrapping at DEPT_MAX_WAIT_HIST. */
+static void hist_advance(void)
+{
+ struct dept_task *dt = dept_task();
+
+ dt->wait_hist_pos++;
+ dt->wait_hist_pos %= DEPT_MAX_WAIT_HIST;
+}
+
+/*
+ * Claim the next wait history slot of the current task: return the slot
+ * at the current position and advance the position by one.
+ */
+static struct dept_wait_hist *new_hist(void)
+{
+	int pos = hist_pos_next();
+	struct dept_wait_hist *slot = hist(pos);
+
+	hist_advance();
+	return slot;
+}
+
+/*
+ * Store @w in the next wait history slot of the current task together
+ * with its wait generation @wg and context id @ctxt_id.
+ */
+static void add_hist(struct dept_wait *w, unsigned int wg, unsigned int ctxt_id)
+{
+ struct dept_wait_hist *wh = new_hist();
+
+ /*
+ * The slot is being reused. Put the wait it has kept so far.
+ */
+ if (likely(wh->wait))
+ put_wait(wh->wait);
+
+ wh->wait = get_wait(w);
+ wh->wgen = wg;
+ wh->ctxt_id = ctxt_id;
+}
+
+/*
+ * Add a dependency from e's class to w's class unless it already exists.
+ * For a newly added dependency, run check_dl_bfs() on it and, for each
+ * IRQ type whose iwait of e's class is ->touched, look for an iwait with
+ * find_iw_bfs() and hand it to __add_idep() along with the iecxt found
+ * by touch_iw_find_ie_bfs() from w's class.
+ *
+ * Should be called after setting up e's iecxt and w's iwait.
+ */
+static void add_dep(struct dept_ecxt *e, struct dept_wait *w)
+{
+ struct dept_class *fc = e->class;
+ struct dept_class *tc = w->class;
+ struct dept_dep *d;
+ int i;
+
+ /*
+ * Lockless fast path for the dependency that already exists.
+ */
+ if (lookup_dep(fc, tc))
+ return;
+
+ if (unlikely(!dept_lock()))
+ return;
+
+ /*
+ * __add_dep() will lookup_dep() again with lock held.
+ */
+ d = __add_dep(e, w);
+ if (d) {
+ check_dl_bfs(d);
+
+ for (i = 0; i < DEPT_IRQS_NR; i++) {
+ struct dept_iwait *fiw = iwait(fc, i);
+ struct dept_iecxt *found_ie;
+ struct dept_iwait *found_iw;
+
+ /*
+ * '->touched == false' guarantees there's no
+ * parent that has been set ->wait.
+ */
+ if (!fiw->touched)
+ continue;
+
+ /*
+ * find_iw_bfs() will untouch the iwait if
+ * not found.
+ */
+ found_iw = find_iw_bfs(fc, i);
+
+ if (!found_iw)
+ continue;
+
+ found_ie = touch_iw_find_ie_bfs(tc, i);
+ __add_idep(found_ie, found_iw);
+ }
+ }
+ dept_unlock();
+}
+
+/*
+ * Global wait generation counter. Zero is never handed out by
+ * next_wgen().
+ */
+static atomic_t wgen = ATOMIC_INIT(1);
+
+/*
+ * Return the next wait generation.
+ */
+static int next_wgen(void)
+{
+	int gen = atomic_inc_return(&wgen);
+
+	/*
+	 * Avoid zero wgen. Take one more step if the counter has just
+	 * wrapped to zero.
+	 */
+	if (!gen)
+		gen = atomic_inc_return(&wgen);
+
+	return gen;
+}
+
+/*
+ * Create a wait of class @c observed at @ip and:
+ *
+ *    1. pass it to add_iwait() if in {hard,soft}-IRQ context,
+ *    2. add dependencies from the ecxts currently held to it,
+ *    3. record it in the wait history with a new wgen.
+ *
+ * @w_fn is kept as the name of the waiting function, @sub_l is compared
+ * against the held ecxts' ->sub_l and @sched_sleep is kept in the wait.
+ */
+static void add_wait(struct dept_class *c, unsigned long ip,
+ const char *w_fn, int sub_l, bool sched_sleep)
+{
+ struct dept_task *dt = dept_task();
+ struct dept_wait *w;
+ unsigned int wg;
+ int irq;
+ int i;
+
+ if (DEPT_WARN_ON(!valid_class(c)))
+ return;
+
+ w = new_wait();
+ if (unlikely(!w))
+ return;
+
+ WRITE_ONCE(w->class, get_class(c));
+ w->wait_ip = ip;
+ w->wait_fn = w_fn;
+ w->wait_stack = get_current_stack();
+ w->sched_sleep = sched_sleep;
+
+ /*
+ * cur_irq() returns DEPT_IRQS_NR for process context.
+ */
+ irq = cur_irq();
+ if (irq < DEPT_IRQS_NR)
+ add_iwait(c, irq, w);
+
+ /*
+ * Avoid adding dependency between user aware nested ecxt and
+ * wait. That is, a held ecxt of the same class is skipped when
+ * its ->sub_l differs from this wait's sub_l.
+ */
+ for (i = dt->ecxt_held_pos - 1; i >= 0; i--) {
+ struct dept_ecxt_held *eh;
+
+ eh = dt->ecxt_held + i;
+
+ /*
+ * the case of invalid key'ed one
+ */
+ if (!eh->ecxt)
+ continue;
+
+ if (eh->ecxt->class != c || eh->sub_l == sub_l)
+ add_dep(eh->ecxt, w);
+ }
+
+ wg = next_wgen();
+ add_hist(w, wg, cur_ctxt_id());
+
+ /*
+ * NOTE(review): presumably pairs with new_wait() above while the
+ * history keeps the wait through get_wait() - confirm.
+ */
+ del_wait(w);
+}
+
+/*
+ * Push a new entry for map @m and class @c onto the current task's held
+ * ecxt stack and return it. Returns NULL if @c is invalid, the stack is
+ * full or no ecxt object is available.
+ *
+ * For a ->nocheck map, the entry is pushed without an ecxt, that is,
+ * ->ecxt is NULL.
+ */
+static struct dept_ecxt_held *add_ecxt(struct dept_map *m,
+ struct dept_class *c, unsigned long ip, const char *c_fn,
+ const char *e_fn, int sub_l)
+{
+ struct dept_task *dt = dept_task();
+ struct dept_ecxt_held *eh;
+ struct dept_ecxt *e;
+ unsigned long irqf;
+ unsigned int wg;
+ int irq;
+
+ if (DEPT_WARN_ON(!valid_class(c)))
+ return NULL;
+
+ if (DEPT_WARN_ON_ONCE(dt->ecxt_held_pos >= DEPT_MAX_ECXT_HELD))
+ return NULL;
+
+ wg = next_wgen();
+ if (m->nocheck) {
+ eh = dt->ecxt_held + (dt->ecxt_held_pos++);
+ eh->ecxt = NULL;
+ eh->map = m;
+ eh->class = get_class(c);
+ eh->wgen = wg;
+ eh->sub_l = sub_l;
+
+ return eh;
+ }
+
+ e = new_ecxt();
+ if (unlikely(!e))
+ return NULL;
+
+ e->class = get_class(c);
+ e->ecxt_ip = ip;
+ /*
+ * No stack is saved when ip is 0UL, which is what do_event()
+ * passes for its temporary ecxt.
+ */
+ e->ecxt_stack = ip ? get_current_stack() : NULL;
+ e->event_fn = e_fn;
+ e->ecxt_fn = c_fn;
+
+ eh = dt->ecxt_held + (dt->ecxt_held_pos++);
+ eh->ecxt = get_ecxt(e);
+ eh->map = m;
+ eh->class = get_class(c);
+ eh->wgen = wg;
+ eh->sub_l = sub_l;
+
+ /*
+ * Let the ecxt go through add_iecxt() for every IRQ type that is
+ * enabled at the moment.
+ */
+ irqf = cur_enirqf();
+ for_each_set_bit(irq, &irqf, DEPT_IRQS_NR)
+ add_iecxt(c, irq, e, false);
+
+ /*
+ * NOTE(review): presumably pairs with new_ecxt() above while the
+ * held entry keeps the ecxt through get_ecxt() - confirm.
+ */
+ del_ecxt(e);
+ return eh;
+}
+
+/*
+ * Look for an entry matching both @m and @c in the current task's held
+ * ecxt stack and return its index, or -1 if there's none. The stack is
+ * scanned from the newest entry if @newfirst, from the oldest otherwise.
+ */
+static int find_ecxt_pos(struct dept_map *m, struct dept_class *c,
+			 bool newfirst)
+{
+	struct dept_task *dt = dept_task();
+	int nr = dt->ecxt_held_pos;
+	int step = newfirst ? -1 : 1;
+	int i = newfirst ? nr - 1 : 0;
+
+	for (; i >= 0 && i < nr; i += step) {
+		struct dept_ecxt_held *eh = &dt->ecxt_held[i];
+
+		if (eh->map == m && eh->class == c)
+			return i;
+	}
+	return -1;
+}
+
+/*
+ * Remove the most recently added held ecxt entry matching @m and @c from
+ * the current task, putting its class and ecxt. Returns false if there
+ * is no such entry.
+ */
+static bool pop_ecxt(struct dept_map *m, struct dept_class *c)
+{
+ struct dept_task *dt = dept_task();
+ int pos;
+ int i;
+
+ pos = find_ecxt_pos(m, c, true);
+ if (pos == -1)
+ return false;
+
+ if (dt->ecxt_held[pos].class)
+ put_class(dt->ecxt_held[pos].class);
+
+ if (dt->ecxt_held[pos].ecxt)
+ put_ecxt(dt->ecxt_held[pos].ecxt);
+
+ dt->ecxt_held_pos--;
+
+ /*
+ * Close the gap by shifting the later entries down by one.
+ */
+ for (i = pos; i < dt->ecxt_held_pos; i++)
+ dt->ecxt_held[i] = dt->ecxt_held[i + 1];
+ return true;
+}
+
+/*
+ * A history entry is good for @wg if it holds a wait and @wg is before
+ * the entry's wgen.
+ */
+static bool good_hist(struct dept_wait_hist *wh, unsigned int wg)
+{
+	if (wh->wait == NULL)
+		return false;
+
+	return before(wg, wh->wgen);
+}
+
+/*
+ * Binary-search the ring buffer for the earliest valid wait, that is,
+ * the oldest entry for which good_hist() holds with @wg. Returns its
+ * index in the ring buffer, or -1 if there's no such entry.
+ */
+static int find_hist_pos(unsigned int wg)
+{
+ int oldest;
+ int l;
+ int r;
+ int pos;
+
+ /*
+ * The slot to be written next is the oldest one. If even that one
+ * is good, start from it.
+ */
+ oldest = hist_pos_next();
+ if (unlikely(good_hist(hist(oldest), wg))) {
+ DEPT_INFO_ONCE("Need to expand the ring buffer.\n");
+ return oldest;
+ }
+
+ /*
+ * l, r and pos are kept unwrapped, hist() wraps them. Look for the
+ * position where the previous entry is not good but the entry
+ * itself is.
+ */
+ l = oldest + 1;
+ r = oldest + DEPT_MAX_WAIT_HIST - 1;
+ for (pos = (l + r) / 2; l <= r; pos = (l + r) / 2) {
+ struct dept_wait_hist *p = hist(pos - 1);
+ struct dept_wait_hist *wh = hist(pos);
+
+ if (!good_hist(p, wg) && good_hist(wh, wg))
+ return pos % DEPT_MAX_WAIT_HIST;
+ if (good_hist(wh, wg))
+ r = pos - 1;
+ else
+ l = pos + 1;
+ }
+ return -1;
+}
+
+/*
+ * Handle an event of class @c on map @m. @wg is the wait generation kept
+ * in the map, @ip and @e_fn tell where the event was triggered.
+ *
+ * A temporary ecxt is added for the event, dependencies are added from
+ * it to the waits in the current task's history starting at
+ * find_hist_pos(@wg) that have the current context id, and then the
+ * ecxt is popped. @real_m is only used to check if an ecxt is already
+ * held for the map.
+ */
+static void do_event(struct dept_map *m, struct dept_map *real_m,
+ struct dept_class *c, unsigned int wg, unsigned long ip,
+ const char *e_fn)
+{
+ struct dept_task *dt = dept_task();
+ struct dept_wait_hist *wh;
+ struct dept_ecxt_held *eh;
+ unsigned int ctxt_id;
+ int end;
+ int pos;
+ int i;
+
+ if (DEPT_WARN_ON(!valid_class(c)))
+ return;
+
+ if (m->nocheck)
+ return;
+
+ /*
+ * The event was triggered before wait.
+ */
+ if (!wg)
+ return;
+
+ /*
+ * If an ecxt for this map exists, let the ecxt work for this
+ * event and do not proceed it in do_event().
+ */
+ if (find_ecxt_pos(real_m, c, false) != -1)
+ return;
+ eh = add_ecxt(m, c, 0UL, NULL, e_fn, 0);
+
+ if (!eh)
+ return;
+
+ if (DEPT_WARN_ON(!eh->ecxt))
+ goto out;
+
+ eh->ecxt->event_ip = ip;
+ eh->ecxt->event_stack = get_current_stack();
+
+ pos = find_hist_pos(wg);
+ if (pos == -1)
+ goto out;
+
+ /*
+ * Unwrap end so that pos can simply be increased up to it. hist()
+ * wraps pos.
+ */
+ ctxt_id = cur_ctxt_id();
+ end = hist_pos_next();
+ end = end > pos ? end : end + DEPT_MAX_WAIT_HIST;
+ for (wh = hist(pos); pos < end; wh = hist(++pos)) {
+ /*
+ * Waits marked ->sched_sleep are skipped while the task is
+ * ->in_sched.
+ */
+ if (dt->in_sched && wh->wait->sched_sleep)
+ continue;
+
+ if (wh->ctxt_id == ctxt_id)
+ add_dep(eh->ecxt, wh->wait);
+ }
+
+ /*
+ * Let the ecxt go through add_iecxt() for each IRQ type unless its
+ * ->wgen_enirq is before wg.
+ */
+ for (i = 0; i < DEPT_IRQS_NR; i++) {
+ struct dept_ecxt *e;
+
+ if (before(dt->wgen_enirq[i], wg))
+ continue;
+
+ e = eh->ecxt;
+ add_iecxt(e->class, i, e, false);
+ }
+out:
+ /*
+ * Pop ecxt that temporarily has been added to handle this event.
+ */
+ pop_ecxt(m, c);
+}
+
+/*
+ * RCU callback queued by disconnect_class(). Deletes the dependency
+ * embedding @rh with preemption disabled.
+ */
+static void del_dep_rcu(struct rcu_head *rh)
+{
+ struct dept_dep *d = container_of(rh, struct dept_dep, rh);
+
+ preempt_disable();
+ del_dep(d);
+ preempt_enable();
+}
+
+/*
+ * Unlink every dependency on @c's ->dep_head and ->dep_rev_head lists,
+ * deferring its deletion to del_dep_rcu(), and stale @c's iecxt and
+ * iwait of every IRQ type.
+ *
+ * NOTE: Must be called with dept_lock held.
+ */
+static void disconnect_class(struct dept_class *c)
+{
+ struct dept_dep *d, *n;
+ int i;
+
+ /*
+ * Dependencies linked to @c through ->dep_node.
+ */
+ list_for_each_entry_safe(d, n, &c->dep_head, dep_node) {
+ list_del_rcu(&d->dep_node);
+ list_del_rcu(&d->dep_rev_node);
+ hash_del_dep(d);
+ call_rcu(&d->rh, del_dep_rcu);
+ }
+
+ /*
+ * Dependencies linked to @c through ->dep_rev_node.
+ */
+ list_for_each_entry_safe(d, n, &c->dep_rev_head, dep_rev_node) {
+ list_del_rcu(&d->dep_node);
+ list_del_rcu(&d->dep_rev_node);
+ hash_del_dep(d);
+ call_rcu(&d->rh, del_dep_rcu);
+ }
+
+ for (i = 0; i < DEPT_IRQS_NR; i++) {
+ stale_iecxt(iecxt(c, i));
+ stale_iwait(iwait(c, i));
+ }
+}
+
+/*
+ * Context control
+ * =====================================================================
+ * Whether a wait is in {hard,soft}-IRQ context or whether
+ * {hard,soft}-IRQ has been enabled on the way to an event is very
+ * important to check dependency. All those things should be tracked.
+ */
+
+/*
+ * Return the flags of the IRQ types currently enabled for the current
+ * task. Softirq is only reported together with hardirq; nothing is
+ * reported while hardirqs are disabled.
+ */
+static unsigned long cur_enirqf(void)
+{
+	struct dept_task *dt = dept_task();
+	unsigned long irqf;
+
+	if (!dt->hardirqs_enabled)
+		return 0UL;
+
+	irqf = DEPT_HIRQF;
+	if (dt->softirqs_enabled)
+		irqf |= DEPT_SIRQF;
+
+	return irqf;
+}
+
+/*
+ * Return DEPT_SIRQ or DEPT_HIRQ according to the IRQ context lockdep
+ * reports, or DEPT_IRQS_NR if neither is reported.
+ *
+ * NOTE(review): softirq is tested first, so DEPT_SIRQ is returned if both
+ * are reported at once - confirm that is the intention.
+ */
+static int cur_irq(void)
+{
+ if (lockdep_softirq_context(current))
+ return DEPT_SIRQ;
+ if (lockdep_hardirq_context())
+ return DEPT_HIRQ;
+ return DEPT_IRQS_NR;
+}
+
+/*
+ * Return the id of the context the current task is running in: 0 for
+ * normal process context, otherwise the task's ->irq_id[] of the current
+ * IRQ type with the bit of the IRQ type set.
+ */
+static unsigned int cur_ctxt_id(void)
+{
+	struct dept_task *dt = dept_task();
+	int irq = cur_irq();
+
+	if (irq != DEPT_IRQS_NR)
+		return dt->irq_id[irq] | (1UL << irq);
+
+	/*
+	 * Normal process context
+	 */
+	return 0U;
+}
+
+/*
+ * Handle the IRQ type @irq getting enabled: stamp the current task's
+ * ->wgen_enirq[@irq] with a new wgen and let every ecxt currently held
+ * go through add_iecxt() for @irq.
+ */
+static void enirq_transition(int irq)
+{
+ struct dept_task *dt = dept_task();
+ int i;
+
+ /*
+ * IRQ can cut in on the way to the event. Used for cross-event
+ * detection.
+ *
+ * wait context event context(ecxt)
+ * ------------ -------------------
+ * wait event
+ * UPDATE wgen
+ * observe IRQ enabled
+ * UPDATE wgen
+ * keep the wgen locally
+ *
+ * on the event
+ * check the wgen kept
+ */
+
+ dt->wgen_enirq[irq] = next_wgen();
+
+ for (i = dt->ecxt_held_pos - 1; i >= 0; i--) {
+ struct dept_ecxt_held *eh;
+ struct dept_ecxt *e;
+
+ eh = dt->ecxt_held + i;
+ e = eh->ecxt;
+ /*
+ * Entries without an ecxt, e.g. ->nocheck ones, are skipped.
+ */
+ if (e)
+ add_iecxt(e->class, irq, e, true);
+ }
+}
+
+/*
+ * Common path of dept_softirqs_on_ip() and dept_hardirqs_on(). For each
+ * IRQ type that is enabled now, record @ip in ->enirq_ip[] of the
+ * current task and process the transition.
+ */
+static void dept_enirq(unsigned long ip)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long irqf = cur_enirqf();
+ int irq;
+ unsigned long flags;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ /*
+ * IRQ ON/OFF transition might happen while Dept is working.
+ * We cannot handle recursive entrance. Just ignore it.
+ * Only transitions outside of Dept will be considered.
+ */
+ if (dt->recursive)
+ return;
+
+ flags = dept_enter();
+
+ for_each_set_bit(irq, &irqf, DEPT_IRQS_NR) {
+ dt->enirq_ip[irq] = ip;
+ enirq_transition(irq);
+ }
+
+ dept_exit(flags);
+}
+
+/*
+ * Tell Dept that softirqs get enabled at @ip for the current task.
+ */
+void dept_softirqs_on_ip(unsigned long ip)
+{
+ /*
+ * Assumes that it's called with IRQ disabled so that accessing
+ * current's fields is not racy.
+ */
+ dept_task()->softirqs_enabled = true;
+ dept_enirq(ip);
+}
+
+/*
+ * Tell Dept that hardirqs get enabled for the current task. The return
+ * address of this function is recorded as the place of enabling.
+ */
+void dept_hardirqs_on(void)
+{
+ /*
+ * Assumes that it's called with IRQ disabled so that accessing
+ * current's fields is not racy.
+ */
+ dept_task()->hardirqs_enabled = true;
+ dept_enirq(_RET_IP_);
+}
+
+/*
+ * Tell Dept that softirqs get disabled for the current task.
+ */
+void dept_softirqs_off(void)
+{
+ /*
+ * Assumes that it's called with IRQ disabled so that accessing
+ * current's fields is not racy.
+ */
+ dept_task()->softirqs_enabled = false;
+}
+
+/*
+ * Tell Dept that hardirqs get disabled for the current task.
+ */
+void dept_hardirqs_off(void)
+{
+ /*
+ * Assumes that it's called with IRQ disabled so that accessing
+ * current's fields is not racy.
+ */
+ dept_task()->hardirqs_enabled = false;
+}
+
+/*
+ * Give the softirq context being entered a new id by advancing the
+ * current task's ->irq_id[DEPT_SIRQ]. The increment leaves the low
+ * DEPT_IRQS_NR bits alone.
+ *
+ * Ensure it's the outmost softirq context.
+ */
+void dept_softirq_enter(void)
+{
+ struct dept_task *dt = dept_task();
+
+ dt->irq_id[DEPT_SIRQ] += 1UL << DEPT_IRQS_NR;
+}
+
+/*
+ * Give the hardirq context being entered a new id by advancing the
+ * current task's ->irq_id[DEPT_HIRQ]. The increment leaves the low
+ * DEPT_IRQS_NR bits alone.
+ *
+ * Ensure it's the outmost hardirq context.
+ */
+void dept_hardirq_enter(void)
+{
+ struct dept_task *dt = dept_task();
+
+ dt->irq_id[DEPT_HIRQ] += 1UL << DEPT_IRQS_NR;
+}
+
+/*
+ * Mark the current task as being in the scheduler. While the mark is
+ * set, do_event() skips the waits that have ->sched_sleep set.
+ */
+void dept_sched_enter(void)
+{
+ dept_task()->in_sched = true;
+}
+
+/*
+ * Clear the mark set by dept_sched_enter().
+ */
+void dept_sched_exit(void)
+{
+ dept_task()->in_sched = false;
+}
+
+/*
+ * Exposed APIs
+ * =====================================================================
+ */
+
+/*
+ * Reset every class pointer cached in @k to NULL. Entries that are
+ * already NULL are not written to.
+ */
+static void clean_classes_cache(struct dept_key *k)
+{
+	int i;
+
+	for (i = 0; i < DEPT_MAX_SUBCLASSES_CACHE; i++) {
+		if (READ_ONCE(k->classes[i]))
+			WRITE_ONCE(k->classes[i], NULL);
+	}
+}
+
+/*
+ * Initialize the map @m with the key @k, the user subclass @sub_u and
+ * the name @n. The map is set to ->nocheck if Dept is not working,
+ * @sub_u is out of [0, DEPT_MAX_SUBCLASSES_USR) or @k is not a valid
+ * key.
+ *
+ * Assume we don't have to consider race with the map when
+ * dept_map_init() is called.
+ */
+void dept_map_init(struct dept_map *m, struct dept_key *k, int sub_u,
+ const char *n)
+{
+ unsigned long flags;
+
+ if (unlikely(!dept_working())) {
+ m->nocheck = true;
+ return;
+ }
+
+ if (DEPT_WARN_ON(sub_u < 0)) {
+ m->nocheck = true;
+ return;
+ }
+
+ if (DEPT_WARN_ON(sub_u >= DEPT_MAX_SUBCLASSES_USR)) {
+ m->nocheck = true;
+ return;
+ }
+
+ /*
+ * Allow recursive entrance.
+ */
+ flags = dept_enter_recursive();
+
+ clean_classes_cache(&m->map_key);
+
+ m->keys = k;
+ m->sub_u = sub_u;
+ m->name = n;
+ /*
+ * Zero wgen means no event has been requested on the map yet.
+ */
+ m->wgen = 0U;
+ m->nocheck = !valid_key(k);
+
+ dept_exit_recursive(flags);
+}
+EXPORT_SYMBOL_GPL(dept_map_init);
+
+/*
+ * Re-initialize the map @m. Only the properties given are updated:
+ *
+ *    @k    : the new key, or NULL to keep the current one. The class
+ *            cache of the map is cleaned and ->nocheck is re-evaluated
+ *            only if a key is given.
+ *    @sub_u: the new user subclass, or a value out of
+ *            [0, DEPT_MAX_SUBCLASSES_USR), e.g. -1, to keep the current
+ *            one.
+ *    @n    : the new name, or NULL to keep the current one.
+ *
+ * ->wgen is always reset to 0, which keeps the map disabled until an
+ * event is requested again. The map is just set to ->nocheck if Dept is
+ * not working.
+ *
+ * Assume we don't have to consider race with the map when
+ * dept_map_reinit() is called.
+ */
+void dept_map_reinit(struct dept_map *m, struct dept_key *k, int sub_u,
+		     const char *n)
+{
+	unsigned long flags;
+
+	if (unlikely(!dept_working())) {
+		m->nocheck = true;
+		return;
+	}
+
+	/*
+	 * Allow recursive entrance.
+	 */
+	flags = dept_enter_recursive();
+
+	if (k) {
+		clean_classes_cache(&m->map_key);
+		m->keys = k;
+		m->nocheck = !valid_key(k);
+	}
+
+	if (sub_u >= 0 && sub_u < DEPT_MAX_SUBCLASSES_USR)
+		m->sub_u = sub_u;
+
+	if (n)
+		m->name = n;
+
+	/*
+	 * ->wgen is read with READ_ONCE() and written with WRITE_ONCE()
+	 * everywhere else, and dept_stage_wait() relies on this store
+	 * being a WRITE_ONCE(). Do it that way here too.
+	 */
+	WRITE_ONCE(m->wgen, 0U);
+
+	dept_exit_recursive(flags);
+}
+EXPORT_SYMBOL_GPL(dept_map_reinit);
+
+/*
+ * Copy the map @from to @to. The copy is set to ->nocheck if Dept is not
+ * working or @from has no ->keys. The class cache of the copy is
+ * cleaned.
+ */
+void dept_map_copy(struct dept_map *to, struct dept_map *from)
+{
+ if (unlikely(!dept_working())) {
+ to->nocheck = true;
+ return;
+ }
+
+ *to = *from;
+
+ /*
+ * XXX: 'to' might be in a stack or something. Using the address
+ * in a stack segment as a key is meaningless. Just ignore the
+ * case for now.
+ */
+ if (!to->keys) {
+ to->nocheck = true;
+ return;
+ }
+
+ /*
+ * Since the class cache can be modified concurrently we could
+ * observe half pointers (64bit arch using 32bit copy
+ * instructions). Therefore clear the caches and take the
+ * performance hit.
+ */
+ clean_classes_cache(&to->map_key);
+}
+
+static LIST_HEAD(classes);
+
+/*
+ * Return true if @addr lies in [@start, @start + @size).
+ */
+static bool within(const void *addr, void *start, unsigned long size)
+{
+	const void *end = start + size;
+
+	return start <= addr && addr < end;
+}
+
+/*
+ * Remove every class whose key or name lies in [@start, @start + @sz),
+ * e.g. on freeing module memory, and then wait for an RCU grace period.
+ * Might sleep. Dept is stopped if it's called on recursive entrance.
+ */
+void dept_free_range(void *start, unsigned int sz)
+{
+ struct dept_task *dt = dept_task();
+ struct dept_class *c, *n;
+ unsigned long flags;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (dt->recursive) {
+ DEPT_STOP("Failed to successfully free Dept objects.\n");
+ return;
+ }
+
+ flags = dept_enter();
+
+ /*
+ * dept_free_range() should not fail.
+ *
+ * FIXME: Should be fixed if dept_free_range() causes deadlock
+ * with dept_lock().
+ */
+ while (unlikely(!dept_lock()))
+ cpu_relax();
+
+ list_for_each_entry_safe(c, n, &classes, all_node) {
+ if (!within((void *)c->key, start, sz) &&
+ !within(c->name, start, sz))
+ continue;
+
+ hash_del_class(c);
+ disconnect_class(c);
+ list_del(&c->all_node);
+ invalidate_class(c);
+
+ /*
+ * Actual deletion will happen on the rcu callback
+ * that has been added in disconnect_class().
+ */
+ del_class(c);
+ }
+ dept_unlock();
+ dept_exit(flags);
+
+ /*
+ * Wait until even lockless hash_lookup_class() for the class
+ * returns NULL.
+ */
+ might_sleep();
+ synchronize_rcu();
+}
+
+/*
+ * Return the subclass id for the event subclass @e of the map @m, which
+ * combines the map's user subclass, 0 if @m is NULL, with @e.
+ */
+static int sub_id(struct dept_map *m, int e)
+{
+	int sub_u = m ? m->sub_u : 0;
+
+	return sub_u + e * DEPT_MAX_SUBCLASSES_USR;
+}
+
+/*
+ * Return the class for the key @k and @sub_id, creating it with the name
+ * @n and @sched_map if it doesn't exist yet. @local, if not NULL, is
+ * used as a cache of the result for @sub_id less than
+ * DEPT_MAX_SUBCLASSES_CACHE.
+ *
+ * Returns NULL if @sub_id is too big, @k is NULL, dept_lock() fails or
+ * no class object is available.
+ */
+static struct dept_class *check_new_class(struct dept_key *local,
+ struct dept_key *k, int sub_id,
+ const char *n, bool sched_map)
+{
+ struct dept_class *c = NULL;
+
+ if (DEPT_WARN_ON(sub_id >= DEPT_MAX_SUBCLASSES))
+ return NULL;
+
+ if (DEPT_WARN_ON(!k))
+ return NULL;
+
+ /*
+ * XXX: Assume that users prevent the map from using if any of
+ * the cached keys has been invalidated. If not, the cache,
+ * local->classes should not be used because it would be racy
+ * with class deletion.
+ */
+ if (local && sub_id < DEPT_MAX_SUBCLASSES_CACHE)
+ c = READ_ONCE(local->classes[sub_id]);
+
+ if (c)
+ return c;
+
+ /*
+ * Cache miss. Look it up without the lock first.
+ */
+ c = lookup_class((unsigned long)k->base + sub_id);
+ if (c)
+ goto caching;
+
+ if (unlikely(!dept_lock()))
+ return NULL;
+
+ /*
+ * Look it up again with the lock held before creating a new one.
+ */
+ c = lookup_class((unsigned long)k->base + sub_id);
+ if (unlikely(c))
+ goto unlock;
+
+ c = new_class();
+ if (unlikely(!c))
+ goto unlock;
+
+ c->name = n;
+ c->sched_map = sched_map;
+ c->sub_id = sub_id;
+ c->key = (unsigned long)(k->base + sub_id);
+ hash_add_class(c);
+ list_add(&c->all_node, &classes);
+unlock:
+ dept_unlock();
+caching:
+ if (local && sub_id < DEPT_MAX_SUBCLASSES_CACHE)
+ WRITE_ONCE(local->classes[sub_id], c);
+
+ return c;
+}
+
+/*
+ * Add a wait on the map @m for each event subclass set in @w_f. The
+ * events whose class cannot be obtained are skipped.
+ *
+ * Called between dept_enter() and dept_exit().
+ */
+static void __dept_wait(struct dept_map *m, unsigned long w_f,
+ unsigned long ip, const char *w_fn, int sub_l,
+ bool sched_sleep, bool sched_map)
+{
+ int e;
+
+ /*
+ * Be as conservative as possible. In case of multiple waits for
+ * a single dept_map, we are going to keep only the last wait's
+ * wgen for simplicity - keeping all wgens seems overengineering.
+ *
+ * Of course, it might cause missing some dependencies that
+ * would rarely, probably never, happen but it helps avoid
+ * false positive reports.
+ */
+ for_each_set_bit(e, &w_f, DEPT_MAX_SUBCLASSES_EVT) {
+ struct dept_class *c;
+ struct dept_key *k;
+
+ /*
+ * The map's own ->map_key is used if no key has been given.
+ */
+ k = m->keys ?: &m->map_key;
+ c = check_new_class(&m->map_key, k,
+ sub_id(m, e), m->name, sched_map);
+ if (!c)
+ continue;
+
+ add_wait(c, ip, w_fn, sub_l, sched_sleep);
+ }
+}
+
+/*
+ * Handle the event corresponding to the first bit set in @e_f on the map
+ * @m with the wgen currently kept in the map. @real_m is passed to
+ * do_event() as is.
+ *
+ * Called between dept_enter() and dept_exit().
+ */
+static void __dept_event(struct dept_map *m, struct dept_map *real_m,
+ unsigned long e_f, unsigned long ip, const char *e_fn,
+ bool sched_map)
+{
+ struct dept_class *c;
+ struct dept_key *k;
+ int e;
+
+ e = find_first_bit(&e_f, DEPT_MAX_SUBCLASSES_EVT);
+
+ if (DEPT_WARN_ON(e >= DEPT_MAX_SUBCLASSES_EVT))
+ return;
+
+ /*
+ * An event is an event. If the caller passed more than single
+ * event, then warn it and handle the event corresponding to
+ * the first bit anyway.
+ */
+ DEPT_WARN_ON(1UL << e != e_f);
+
+ /*
+ * The map's own ->map_key is used if no key has been given.
+ */
+ k = m->keys ?: &m->map_key;
+ c = check_new_class(&m->map_key, k, sub_id(m, e), m->name, sched_map);
+
+ if (c)
+ do_event(m, real_m, c, READ_ONCE(m->wgen), ip, e_fn);
+}
+
+/*
+ * Tell Dept that a wait on the map @m for the events in @w_f takes place
+ * at @ip in the function named @w_fn. Ignored if Dept is not working, on
+ * recursive entrance or for a ->nocheck map.
+ */
+void dept_wait(struct dept_map *m, unsigned long w_f,
+ unsigned long ip, const char *w_fn, int sub_l)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (dt->recursive)
+ return;
+
+ if (m->nocheck)
+ return;
+
+ flags = dept_enter();
+
+ __dept_wait(m, w_f, ip, w_fn, sub_l, false, false);
+
+ dept_exit(flags);
+}
+EXPORT_SYMBOL_GPL(dept_wait);
+
+/*
+ * Stage a wait in the current task without adding it yet. The staged
+ * wait is committed by dept_request_event_wait_commit(), consumed by
+ * dept_ttwu_stage_wait() or dropped by dept_clean_stage().
+ *
+ * @m is the map to wait on and @k is the key to apply to the staged
+ * copy. With @m == NULL, the task's own stage map is used with @k and
+ * @w_fn as its name.
+ */
+void dept_stage_wait(struct dept_map *m, struct dept_key *k,
+ unsigned long ip, const char *w_fn)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (m && m->nocheck)
+ return;
+
+ /*
+ * Either m or k should be passed. Which means Dept relies on
+ * either its own map or the caller's position in the code when
+ * determining its class.
+ */
+ if (DEPT_WARN_ON(!m && !k))
+ return;
+
+ /*
+ * Allow recursive entrance.
+ */
+ flags = dept_enter_recursive();
+
+ /*
+ * Ensure the outmost dept_stage_wait() works.
+ */
+ if (dt->stage_m.keys)
+ goto exit;
+
+ arch_spin_lock(&dt->stage_lock);
+ if (m) {
+ dt->stage_m = *m;
+ dt->stage_real_m = m;
+
+ /*
+ * Ensure dt->stage_m.keys != NULL and it works with the
+ * map's map_key, not stage_m's one when ->keys == NULL.
+ */
+ if (!m->keys)
+ dt->stage_m.keys = &m->map_key;
+ } else {
+ dt->stage_m.name = w_fn;
+ dt->stage_sched_map = true;
+ dt->stage_real_m = &dt->stage_m;
+ }
+
+ /*
+ * dept_map_reinit() includes WRITE_ONCE(->wgen, 0U) that
+ * effectively disables the map just in case real sleep won't
+ * happen. dept_request_event_wait_commit() will enable it.
+ */
+ dept_map_reinit(&dt->stage_m, k, -1, NULL);
+
+ dt->stage_w_fn = w_fn;
+ dt->stage_ip = ip;
+ arch_spin_unlock(&dt->stage_lock);
+exit:
+ dept_exit_recursive(flags);
+}
+EXPORT_SYMBOL_GPL(dept_stage_wait);
+
+/*
+ * Reset everything that dept_stage_wait() has staged in @dt. The callers
+ * in this file hold dt->stage_lock around it.
+ */
+static void __dept_clean_stage(struct dept_task *dt)
+{
+	dt->stage_ip = 0UL;
+	dt->stage_w_fn = NULL;
+	dt->stage_sched_map = false;
+	dt->stage_real_m = NULL;
+	memset(&dt->stage_m, 0x0, sizeof(dt->stage_m));
+}
+
+/*
+ * Drop the wait that the current task has staged with dept_stage_wait(),
+ * if any.
+ */
+void dept_clean_stage(void)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ /*
+ * Allow recursive entrance.
+ */
+ flags = dept_enter_recursive();
+ arch_spin_lock(&dt->stage_lock);
+ __dept_clean_stage(dt);
+ arch_spin_unlock(&dt->stage_lock);
+ dept_exit_recursive(flags);
+}
+EXPORT_SYMBOL_GPL(dept_clean_stage);
+
+/*
+ * Commit the wait that the current task has staged, if any: give the
+ * staged map a new wgen and add the wait as one that sleeps in the
+ * scheduler.
+ *
+ * Always called from __schedule().
+ */
+void dept_request_event_wait_commit(void)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+ unsigned int wg;
+ unsigned long ip;
+ const char *w_fn;
+ bool sched_map;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ /*
+ * It's impossible that __schedule() is called while Dept is
+ * working that already disabled IRQ at the entrance.
+ */
+ if (DEPT_WARN_ON(dt->recursive))
+ return;
+
+ flags = dept_enter();
+
+ arch_spin_lock(&dt->stage_lock);
+
+ /*
+ * Checks if current has staged a wait.
+ */
+ if (!dt->stage_m.keys) {
+ arch_spin_unlock(&dt->stage_lock);
+ goto exit;
+ }
+
+ /*
+ * Take a snapshot of the staged properties with the lock held.
+ */
+ w_fn = dt->stage_w_fn;
+ ip = dt->stage_ip;
+ sched_map = dt->stage_sched_map;
+
+ wg = next_wgen();
+ WRITE_ONCE(dt->stage_m.wgen, wg);
+ arch_spin_unlock(&dt->stage_lock);
+
+ __dept_wait(&dt->stage_m, 1UL, ip, w_fn, 0, true, sched_map);
+exit:
+ dept_exit(flags);
+}
+
+/*
+ * Consume the wait that @requestor has staged, if any, and handle the
+ * wake-up at @ip as the event for it in the current task's context.
+ *
+ * Always called from try_to_wake_up().
+ */
+void dept_ttwu_stage_wait(struct task_struct *requestor, unsigned long ip)
+{
+ struct dept_task *dt = dept_task();
+ struct dept_task *dt_req = &requestor->dept_task;
+ unsigned long flags;
+ struct dept_map m;
+ struct dept_map *real_m;
+ bool sched_map;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (dt->recursive)
+ return;
+
+ flags = dept_enter();
+
+ arch_spin_lock(&dt_req->stage_lock);
+
+ /*
+ * Serializing is unnecessary as long as it always comes from
+ * try_to_wake_up().
+ */
+ m = dt_req->stage_m;
+ sched_map = dt_req->stage_sched_map;
+ real_m = dt_req->stage_real_m;
+ __dept_clean_stage(dt_req);
+ arch_spin_unlock(&dt_req->stage_lock);
+
+ /*
+ * ->stage_m.keys should not be NULL if it's in use. Should
+ * make sure that it's not NULL when staging a valid map.
+ */
+ if (!m.keys)
+ goto exit;
+
+ /*
+ * Work with the local copy of the staged map from here.
+ */
+ __dept_event(&m, real_m, 1UL, ip, "try_to_wake_up", sched_map);
+exit:
+ dept_exit(flags);
+}
+
+/*
+ * Modifies the latest ecxt corresponding to m and e_f.
+ *
+ * The held ecxt is popped and a new one is added in its place with the
+ * new properties:
+ *
+ *    @new_k    : the key to apply to the map, or NULL to keep it.
+ *    @new_e_f  : the new event. The event found is kept if no bit is set.
+ *    @new_ip, @new_c_fn, @new_e_fn: passed to add_ecxt() as they are.
+ *    @new_sub_l: the new sub_l. The current one is kept if it's negative.
+ *
+ * Nothing is done if no ecxt is held for any event in @e_f.
+ */
+void dept_map_ecxt_modify(struct dept_map *m, unsigned long e_f,
+ struct dept_key *new_k, unsigned long new_e_f,
+ unsigned long new_ip, const char *new_c_fn,
+ const char *new_e_fn, int new_sub_l)
+{
+ struct dept_task *dt = dept_task();
+ struct dept_ecxt_held *eh;
+ struct dept_class *c;
+ struct dept_key *k;
+ unsigned long flags;
+ int pos = -1;
+ int new_e;
+ int e;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ /*
+ * XXX: Couldn't handle re-enterance cases. Ignore it for now.
+ */
+ if (dt->recursive)
+ return;
+
+ /*
+ * Should go ahead no matter whether ->nocheck == true or not
+ * because ->nocheck value can be changed within the ecxt area
+ * delimitated by dept_ecxt_enter() and dept_ecxt_exit().
+ */
+
+ flags = dept_enter();
+
+ for_each_set_bit(e, &e_f, DEPT_MAX_SUBCLASSES_EVT) {
+ k = m->keys ?: &m->map_key;
+ c = check_new_class(&m->map_key, k,
+ sub_id(m, e), m->name, false);
+ if (!c)
+ continue;
+
+ /*
+ * When it found an ecxt for any event in e_f, done.
+ */
+ pos = find_ecxt_pos(m, c, true);
+ if (pos != -1)
+ break;
+ }
+
+ if (unlikely(pos == -1))
+ goto exit;
+
+ /*
+ * e and c refer to the event and the class found from here on.
+ */
+ eh = dt->ecxt_held + pos;
+ new_sub_l = new_sub_l >= 0 ? new_sub_l : eh->sub_l;
+
+ new_e = find_first_bit(&new_e_f, DEPT_MAX_SUBCLASSES_EVT);
+
+ if (new_e < DEPT_MAX_SUBCLASSES_EVT)
+ /*
+ * Let it work with the first bit anyway.
+ */
+ DEPT_WARN_ON(1UL << new_e != new_e_f);
+ else
+ new_e = e;
+
+ pop_ecxt(m, c);
+
+ /*
+ * Apply the key to the map.
+ */
+ if (new_k)
+ dept_map_reinit(m, new_k, -1, NULL);
+
+ k = m->keys ?: &m->map_key;
+ c = check_new_class(&m->map_key, k, sub_id(m, new_e), m->name, false);
+
+ if (c && add_ecxt(m, c, new_ip, new_c_fn, new_e_fn, new_sub_l))
+ goto exit;
+
+ /*
+ * Successfully pop_ecxt()ed but failed to add_ecxt().
+ */
+ dt->missing_ecxt++;
+exit:
+ dept_exit(flags);
+}
+EXPORT_SYMBOL_GPL(dept_map_ecxt_modify);
+
+/*
+ * Tell Dept that the current task enters an event context at @ip for the
+ * event corresponding to the first bit set in @e_f on the map @m.
+ *
+ * ->missing_ecxt of the task is increased instead if the ecxt cannot be
+ * added, including the case of recursive entrance.
+ */
+void dept_ecxt_enter(struct dept_map *m, unsigned long e_f, unsigned long ip,
+ const char *c_fn, const char *e_fn, int sub_l)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+ struct dept_class *c;
+ struct dept_key *k;
+ int e;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (dt->recursive) {
+ dt->missing_ecxt++;
+ return;
+ }
+
+ /*
+ * Should go ahead no matter whether ->nocheck == true or not
+ * because ->nocheck value can be changed within the ecxt area
+ * delimitated by dept_ecxt_enter() and dept_ecxt_exit().
+ */
+
+ flags = dept_enter();
+
+ e = find_first_bit(&e_f, DEPT_MAX_SUBCLASSES_EVT);
+
+ /*
+ * No event has been passed.
+ */
+ if (e >= DEPT_MAX_SUBCLASSES_EVT)
+ goto missing_ecxt;
+
+ /*
+ * An event is an event. If the caller passed more than single
+ * event, then warn it and handle the event corresponding to
+ * the first bit anyway.
+ */
+ DEPT_WARN_ON(1UL << e != e_f);
+
+ k = m->keys ?: &m->map_key;
+ c = check_new_class(&m->map_key, k, sub_id(m, e), m->name, false);
+
+ if (c && add_ecxt(m, c, ip, c_fn, e_fn, sub_l))
+ goto exit;
+missing_ecxt:
+ dt->missing_ecxt++;
+exit:
+ dept_exit(flags);
+}
+EXPORT_SYMBOL_GPL(dept_ecxt_enter);
+
+/*
+ * Return true if the current task holds an ecxt on the map @m for any
+ * event in @e_f. Returns false if Dept is not working or on recursive
+ * entrance.
+ */
+bool dept_ecxt_holding(struct dept_map *m, unsigned long e_f)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+ bool ret = false;
+ int e;
+
+ if (unlikely(!dept_working()))
+ return false;
+
+ if (dt->recursive)
+ return false;
+
+ flags = dept_enter();
+
+ for_each_set_bit(e, &e_f, DEPT_MAX_SUBCLASSES_EVT) {
+ struct dept_class *c;
+ struct dept_key *k;
+
+ k = m->keys ?: &m->map_key;
+ c = check_new_class(&m->map_key, k,
+ sub_id(m, e), m->name, false);
+ if (!c)
+ continue;
+
+ if (find_ecxt_pos(m, c, true) != -1) {
+ ret = true;
+ break;
+ }
+ }
+
+ dept_exit(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dept_ecxt_holding);
+
+/*
+ * Request an event on the map @m by giving the map a new wgen. Events on
+ * a map whose wgen is zero are ignored by do_event().
+ */
+void dept_request_event(struct dept_map *m)
+{
+ unsigned long flags;
+ unsigned int wg;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (m->nocheck)
+ return;
+
+ /*
+ * Allow recursive entrance.
+ */
+ flags = dept_enter_recursive();
+
+ wg = next_wgen();
+ WRITE_ONCE(m->wgen, wg);
+
+ dept_exit_recursive(flags);
+}
+EXPORT_SYMBOL_GPL(dept_request_event);
+
+/*
+ * Tell Dept that the event corresponding to the first bit set in @e_f on
+ * the map @m is triggered at @ip in the function named @e_fn. The wgen
+ * of the map is reset to zero afterward, even on recursive entrance
+ * where the event itself is not handled.
+ */
+void dept_event(struct dept_map *m, unsigned long e_f,
+ unsigned long ip, const char *e_fn)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (m->nocheck)
+ return;
+
+ if (dt->recursive) {
+ /*
+ * Dept won't work with this even though an event
+ * context has been asked. Don't make it confused at
+ * handling the event. Disable it until the next.
+ */
+ WRITE_ONCE(m->wgen, 0U);
+ return;
+ }
+
+ flags = dept_enter();
+
+ __dept_event(m, m, e_f, ip, e_fn, false);
+
+ /*
+ * Keep the map disabled until the next sleep.
+ */
+ WRITE_ONCE(m->wgen, 0U);
+
+ dept_exit(flags);
+}
+EXPORT_SYMBOL_GPL(dept_event);
+
+/*
+ * Tell Dept that the current task leaves the event context on the map
+ * @m for an event in @e_f. The latest ecxt held for the first matching
+ * event is popped.
+ *
+ * ->missing_ecxt of the task is decreased instead if no ecxt is found,
+ * including the case of recursive entrance. @ip is not used.
+ */
+void dept_ecxt_exit(struct dept_map *m, unsigned long e_f,
+ unsigned long ip)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+ int e;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (dt->recursive) {
+ dt->missing_ecxt--;
+ return;
+ }
+
+ /*
+ * Should go ahead no matter whether ->nocheck == true or not
+ * because ->nocheck value can be changed within the ecxt area
+ * delimitated by dept_ecxt_enter() and dept_ecxt_exit().
+ */
+
+ flags = dept_enter();
+
+ for_each_set_bit(e, &e_f, DEPT_MAX_SUBCLASSES_EVT) {
+ struct dept_class *c;
+ struct dept_key *k;
+
+ k = m->keys ?: &m->map_key;
+ c = check_new_class(&m->map_key, k,
+ sub_id(m, e), m->name, false);
+ if (!c)
+ continue;
+
+ /*
+ * When it found an ecxt for any event in e_f, done.
+ */
+ if (pop_ecxt(m, c))
+ goto exit;
+ }
+
+ dt->missing_ecxt--;
+exit:
+ dept_exit(flags);
+}
+EXPORT_SYMBOL_GPL(dept_ecxt_exit);
+
+/*
+ * Put every Dept object that the task @t still keeps, that is, its
+ * stack, held ecxts with their classes and wait history, and turn Dept
+ * off with ->task_exit set. Runs with IRQ disabled.
+ */
+void dept_task_exit(struct task_struct *t)
+{
+ struct dept_task *dt = &t->dept_task;
+ int i;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ raw_local_irq_disable();
+
+ if (dt->stack) {
+ put_stack(dt->stack);
+ dt->stack = NULL;
+ }
+
+ for (i = 0; i < dt->ecxt_held_pos; i++) {
+ if (dt->ecxt_held[i].class) {
+ put_class(dt->ecxt_held[i].class);
+ dt->ecxt_held[i].class = NULL;
+ }
+ if (dt->ecxt_held[i].ecxt) {
+ put_ecxt(dt->ecxt_held[i].ecxt);
+ dt->ecxt_held[i].ecxt = NULL;
+ }
+ }
+
+ for (i = 0; i < DEPT_MAX_WAIT_HIST; i++) {
+ if (dt->wait_hist[i].wait) {
+ put_wait(dt->wait_hist[i].wait);
+ dt->wait_hist[i].wait = NULL;
+ }
+ }
+
+ /*
+ * dept_key_destroy() checks ->task_exit to tell this case from
+ * a real recursive entrance.
+ */
+ dt->task_exit = true;
+ dept_off();
+
+ raw_local_irq_enable();
+}
+
+/*
+ * Initialize the Dept state of the task @t: everything zeroed and
+ * ->stage_lock unlocked.
+ */
+void dept_task_init(struct task_struct *t)
+{
+ memset(&t->dept_task, 0x0, sizeof(struct dept_task));
+ t->dept_task.stage_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+}
+
+/*
+ * Initialize the key @k: clean its class cache and make sure that no
+ * class remains registered for it. Dept is stopped if any class is
+ * found or if it's called on recursive entrance.
+ */
+void dept_key_init(struct dept_key *k)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+ int sub_id;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (dt->recursive) {
+ DEPT_STOP("Key initialization fails.\n");
+ return;
+ }
+
+ flags = dept_enter();
+
+ clean_classes_cache(k);
+
+ /*
+ * dept_key_init() should not fail.
+ *
+ * FIXME: Should be fixed if dept_key_init() causes deadlock
+ * with dept_lock().
+ */
+ while (unlikely(!dept_lock()))
+ cpu_relax();
+
+ for (sub_id = 0; sub_id < DEPT_MAX_SUBCLASSES; sub_id++) {
+ struct dept_class *c;
+
+ c = lookup_class((unsigned long)k->base + sub_id);
+ if (!c)
+ continue;
+
+ /*
+ * Reporting the first one is enough.
+ */
+ DEPT_STOP("The class(%s/%d) has not been removed.\n",
+ c->name, sub_id);
+ break;
+ }
+
+ dept_unlock();
+ dept_exit(flags);
+}
+EXPORT_SYMBOL_GPL(dept_key_init);
+
+/*
+ * Remove every class registered for the key @k and then wait for an RCU
+ * grace period. Might sleep. Dept is stopped if it's called on
+ * recursive entrance other than the one made by dept_task_exit().
+ */
+void dept_key_destroy(struct dept_key *k)
+{
+ struct dept_task *dt = dept_task();
+ unsigned long flags;
+ int sub_id;
+
+ if (unlikely(!dept_working()))
+ return;
+
+ if (dt->recursive == 1 && dt->task_exit) {
+ /*
+ * Need to allow to go ahead in this case where
+ * ->recursive has been set to 1 by dept_off() in
+ * dept_task_exit() and ->task_exit has been set to
+ * true in dept_task_exit().
+ */
+ } else if (dt->recursive) {
+ DEPT_STOP("Key destroying fails.\n");
+ return;
+ }
+
+ flags = dept_enter();
+
+ /*
+ * dept_key_destroy() should not fail.
+ *
+ * FIXME: Should be fixed if dept_key_destroy() causes deadlock
+ * with dept_lock().
+ */
+ while (unlikely(!dept_lock()))
+ cpu_relax();
+
+ for (sub_id = 0; sub_id < DEPT_MAX_SUBCLASSES; sub_id++) {
+ struct dept_class *c;
+
+ c = lookup_class((unsigned long)k->base + sub_id);
+ if (!c)
+ continue;
+
+ hash_del_class(c);
+ disconnect_class(c);
+ list_del(&c->all_node);
+ invalidate_class(c);
+
+ /*
+ * Actual deletion will happen on the rcu callback
+ * that has been added in disconnect_class().
+ */
+ del_class(c);
+ }
+
+ dept_unlock();
+ dept_exit(flags);
+
+ /*
+ * Wait until even lockless hash_lookup_class() for the class
+ * returns NULL.
+ */
+ might_sleep();
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(dept_key_destroy);
+
+/*
+ * Move every node on the lock-less list @from to @to. @from is left
+ * empty. Nothing is done if @from is empty.
+ */
+static void move_llist(struct llist_head *to, struct llist_head *from)
+{
+	struct llist_node *first = llist_del_all(from);
+	struct llist_node *last;
+
+	if (!first)
+		return;
+
+	/*
+	 * Walk to the tail since llist_add_batch() needs both ends.
+	 */
+	for (last = first; llist_next(last); last = llist_next(last))
+		;
+
+	llist_add_batch(first, last, to);
+}
+
+/*
+ * Move the objects of every pool's ->boot_pool to the per_cpu local pool
+ * of the boot CPU. Warns if it's not running on the boot CPU.
+ */
+static void migrate_per_cpu_pool(void)
+{
+ const int boot_cpu = 0;
+ int i;
+
+ /*
+ * The boot CPU has been using the temporal local pool so far.
+ * From now on that per_cpu areas have been ready, use the
+ * per_cpu local pool instead.
+ */
+ DEPT_WARN_ON(smp_processor_id() != boot_cpu);
+ for (i = 0; i < OBJECT_NR; i++) {
+ struct llist_head *from;
+ struct llist_head *to;
+
+ from = &pool[i].boot_pool;
+ to = per_cpu_ptr(pool[i].lpool, boot_cpu);
+ move_llist(to, from);
+ }
+}
+
+#define B2KB(B) ((B) / 1024)
+
+/*
+ * Switch the boot CPU over to the per_cpu local pools and print the Dept
+ * configuration and memory usage.
+ *
+ * Should be called after setup_per_cpu_areas() and before any non-boot
+ * CPU has been brought up.
+ */
+void __init dept_init(void)
+{
+ size_t mem_total = 0;
+
+ local_irq_disable();
+ dept_per_cpu_ready = 1;
+ migrate_per_cpu_pool();
+ local_irq_enable();
+
+ /*
+ * dept_hash.h and dept_object.h are included repeatedly below with
+ * HASH() and OBJECT() defined differently each time: first to
+ * build-check each hash's bits, then to sum up the memory used by
+ * the objects and the hash heads.
+ */
+#define HASH(id, bits) BUILD_BUG_ON(1 << (bits) <= 0);
+ #include "dept_hash.h"
+#undef HASH
+#define OBJECT(id, nr) mem_total += sizeof(struct dept_##id) * nr;
+ #include "dept_object.h"
+#undef OBJECT
+#define HASH(id, bits) mem_total += sizeof(struct hlist_head) * (1 << (bits));
+ #include "dept_hash.h"
+#undef HASH
+
+ pr_info("DEPendency Tracker: Copyright (c) 2020 LG Electronics, Inc., Byungchul Park\n");
+ pr_info("... DEPT_MAX_STACK_ENTRY: %d\n", DEPT_MAX_STACK_ENTRY);
+ pr_info("... DEPT_MAX_WAIT_HIST : %d\n", DEPT_MAX_WAIT_HIST);
+ pr_info("... DEPT_MAX_ECXT_HELD : %d\n", DEPT_MAX_ECXT_HELD);
+ pr_info("... DEPT_MAX_SUBCLASSES : %d\n", DEPT_MAX_SUBCLASSES);
+ /*
+ * One line for each kind of object and for each hash.
+ */
+#define OBJECT(id, nr) \
+ pr_info("... memory used by %s: %zu KB\n", \
+ #id, B2KB(sizeof(struct dept_##id) * nr));
+ #include "dept_object.h"
+#undef OBJECT
+#define HASH(id, bits) \
+ pr_info("... hash list head used by %s: %zu KB\n", \
+ #id, B2KB(sizeof(struct hlist_head) * (1 << (bits))));
+ #include "dept_hash.h"
+#undef HASH
+ pr_info("... total memory used by objects and hashs: %zu KB\n", B2KB(mem_total));
+ pr_info("... per task memory footprint: %zu bytes\n", sizeof(struct dept_task));
+}
diff --git a/kernel/dependency/dept_hash.h b/kernel/dependency/dept_hash.h
new file mode 100644
index 000000000000..fd85aab1fdfb
--- /dev/null
+++ b/kernel/dependency/dept_hash.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * HASH(id, bits)
+ *
+ * id : Id for the object of struct dept_##id.
+ * bits: 1UL << bits is the hash table size.
+ */
+
+HASH(dep, 12)
+HASH(class, 12)
diff --git a/kernel/dependency/dept_object.h b/kernel/dependency/dept_object.h
new file mode 100644
index 000000000000..0b7eb16fe9fb
--- /dev/null
+++ b/kernel/dependency/dept_object.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OBJECT(id, nr)
+ *
+ * id: Id for the object of struct dept_##id.
+ * nr: # of the object that should be kept in the pool.
+ */
+
+OBJECT(dep, 1024 * 8)
+OBJECT(class, 1024 * 8)
+OBJECT(stack, 1024 * 32)
+OBJECT(ecxt, 1024 * 16)
+OBJECT(wait, 1024 * 32)
diff --git a/kernel/exit.c b/kernel/exit.c
index 343eb97543d5..88c0fbec9967 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1001,6 +1001,7 @@ void __noreturn do_exit(long code)
exit_tasks_rcu_finish();
lockdep_free_task(tsk);
+ dept_task_exit(tsk);
do_task_dead();
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 6ca8689a83b5..c6fe9a23ac0a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -106,6 +106,7 @@
#include <linux/pidfs.h>
#include <linux/tick.h>
#include <linux/unwind_deferred.h>
+#include <linux/dept.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -2127,6 +2128,7 @@ __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_LOCKDEP
lockdep_init_task(p);
#endif
+ dept_task_init(p);
p->blocked_on = NULL; /* not blocked yet */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index c66b26184936..6ad78f0a58b6 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1375,12 +1375,14 @@ static void free_mod_mem(struct module *mod)
/* Free lock-classes; relies on the preceding sync_rcu(). */
lockdep_free_key_range(mod_mem->base, mod_mem->size);
+ dept_free_range(mod_mem->base, mod_mem->size);
if (mod_mem->size)
module_memory_free(mod, type);
}
/* MOD_DATA hosts mod, so free it at last */
lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
+ dept_free_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
module_memory_free(mod, MOD_DATA);
}
@@ -3548,6 +3550,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
for_class_mod_mem_type(type, core_data) {
lockdep_free_key_range(mod->mem[type].base,
mod->mem[type].size);
+ dept_free_range(mod->mem[type].base,
+ mod->mem[type].size);
}
module_memory_restore_rox(mod);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ccba6fc3c3fe..db942591fb1a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -67,6 +67,7 @@
#include <linux/wait_api.h>
#include <linux/workqueue_api.h>
#include <linux/livepatch_sched.h>
+#include <linux/dept.h>
#ifdef CONFIG_PREEMPT_DYNAMIC
# ifdef CONFIG_GENERIC_IRQ_ENTRY
@@ -4246,6 +4247,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
break;
+ dept_ttwu_stage_wait(p, _RET_IP_);
+
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -6835,6 +6838,11 @@ static void __sched notrace __schedule(int sched_mode)
rq = cpu_rq(cpu);
prev = rq->curr;
+ prev_state = READ_ONCE(prev->__state);
+ if (sched_mode != SM_PREEMPT && prev_state & TASK_NORMAL)
+ dept_request_event_wait_commit();
+
+ dept_sched_enter();
schedule_debug(prev, preempt);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
@@ -6969,6 +6977,7 @@ static void __sched notrace __schedule(int sched_mode)
raw_spin_rq_unlock_irq(rq);
}
trace_sched_exit_tp(is_switch);
+ dept_sched_exit();
}
void __noreturn do_task_dead(void)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index dc0e0c6ed075..b9cff0bec6f2 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1365,6 +1365,32 @@ config DEBUG_PREEMPT
menu "Lock Debugging (spinlocks, mutexes, etc...)"
+config DEPT
+ bool "Dependency tracking (EXPERIMENTAL)"
+ depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
+ select DEBUG_SPINLOCK
+ select DEBUG_MUTEXES if !PREEMPT_RT
+ select DEBUG_RT_MUTEXES if RT_MUTEXES
+ select DEBUG_RWSEMS if !PREEMPT_RT
+ select DEBUG_WW_MUTEX_SLOWPATH
+ select DEBUG_LOCK_ALLOC
+ select TRACE_IRQFLAGS
+ select STACKTRACE
+ select KALLSYMS
+ select KALLSYMS_ALL
+ select PROVE_LOCKING
+ default n
+ help
+ Check dependencies between waits and events and report them if
+ a deadlock possibility has been detected. Multiple reports are
+ allowed if there is more than a single problem.
+
+ This feature is considered EXPERIMENTAL and might produce
+ false positive reports because it starts tracking new
+ dependencies that have never been tracked before. To mitigate
+ the impact of false positives, multiple reports are
+ supported.
+
config LOCK_DEBUGGING_SUPPORT
bool
depends on TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index ed99344317f5..18228afccea5 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -1398,6 +1398,8 @@ static void reset_locks(void)
local_irq_disable();
lockdep_free_key_range(&ww_lockdep.acquire_key, 1);
lockdep_free_key_range(&ww_lockdep.mutex_key, 1);
+ dept_free_range(&ww_lockdep.acquire_key, 1);
+ dept_free_range(&ww_lockdep.mutex_key, 1);
I1(A); I1(B); I1(C); I1(D);
I1(X1); I1(X2); I1(Y1); I1(Y2); I1(Z1); I1(Z2);
--
2.17.1
^ permalink raw reply related
* [PATCH v17 04/47] dept: add lock dependency tracker APIs
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
Wrap the base APIs for easier annotation on typical lock.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
include/linux/dept_ldt.h | 78 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 78 insertions(+)
create mode 100644 include/linux/dept_ldt.h
diff --git a/include/linux/dept_ldt.h b/include/linux/dept_ldt.h
new file mode 100644
index 000000000000..8047d0a531f1
--- /dev/null
+++ b/include/linux/dept_ldt.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Lock Dependency Tracker
+ *
+ * Started by Byungchul Park <max.byungchul.park@gmail.com>:
+ *
+ * Copyright (c) 2020 LG Electronics, Inc., Byungchul Park
+ * Copyright (c) 2024 SK hynix, Inc., Byungchul Park
+ */
+
+#ifndef __LINUX_DEPT_LDT_H
+#define __LINUX_DEPT_LDT_H
+
+#include <linux/dept.h>
+
+#ifdef CONFIG_DEPT
+#define LDT_EVT_L 1UL
+#define LDT_EVT_R 2UL
+#define LDT_EVT_W 1UL
+#define LDT_EVT_RW (LDT_EVT_R | LDT_EVT_W)
+#define LDT_EVT_ALL (LDT_EVT_L | LDT_EVT_RW)
+
+#define ldt_init(m, k, su, n) dept_map_init(m, k, su, n)
+#define ldt_lock(m, sl, t, n, i) \
+ do { \
+ if (n) \
+ dept_ecxt_enter_nokeep(m); \
+ else if (t) \
+ dept_ecxt_enter(m, LDT_EVT_L, i, "trylock", "unlock", sl);\
+ else { \
+ dept_wait(m, LDT_EVT_L, i, "lock", sl); \
+ dept_ecxt_enter(m, LDT_EVT_L, i, "lock", "unlock", sl);\
+ } \
+ } while (0)
+
+#define ldt_rlock(m, sl, t, n, i, q) \
+ do { \
+ if (n) \
+ dept_ecxt_enter_nokeep(m); \
+ else if (t) \
+ dept_ecxt_enter(m, LDT_EVT_R, i, "read_trylock", "read_unlock", sl);\
+ else { \
+ dept_wait(m, q ? LDT_EVT_RW : LDT_EVT_W, i, "read_lock", sl);\
+ dept_ecxt_enter(m, LDT_EVT_R, i, "read_lock", "read_unlock", sl);\
+ } \
+ } while (0)
+
+#define ldt_wlock(m, sl, t, n, i) \
+ do { \
+ if (n) \
+ dept_ecxt_enter_nokeep(m); \
+ else if (t) \
+ dept_ecxt_enter(m, LDT_EVT_W, i, "write_trylock", "write_unlock", sl);\
+ else { \
+ dept_wait(m, LDT_EVT_RW, i, "write_lock", sl); \
+ dept_ecxt_enter(m, LDT_EVT_W, i, "write_lock", "write_unlock", sl);\
+ } \
+ } while (0)
+
+#define ldt_unlock(m, i) dept_ecxt_exit(m, LDT_EVT_ALL, i)
+
+#define ldt_downgrade(m, i) \
+ do { \
+ if (dept_ecxt_holding(m, LDT_EVT_W)) \
+ dept_map_ecxt_modify(m, LDT_EVT_W, NULL, LDT_EVT_R, i, "downgrade", "read_unlock", -1);\
+ } while (0)
+
+#define ldt_set_class(m, n, k, sl, i) dept_map_ecxt_modify(m, LDT_EVT_ALL, k, 0UL, i, "lock_set_class", "(any)unlock", sl)
+#else /* !CONFIG_DEPT */
+#define ldt_init(m, k, su, n) do { (void)(k); } while (0)
+#define ldt_lock(m, sl, t, n, i) do { } while (0)
+#define ldt_rlock(m, sl, t, n, i, q) do { } while (0)
+#define ldt_wlock(m, sl, t, n, i) do { } while (0)
+#define ldt_unlock(m, i) do { } while (0)
+#define ldt_downgrade(m, i) do { } while (0)
+#define ldt_set_class(m, n, k, sl, i) do { } while (0)
+#endif
+#endif /* __LINUX_DEPT_LDT_H */
--
2.17.1
^ permalink raw reply related
* [PATCH v17 00/47] DEPT(DEPendency Tracker)
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
Found out a recent deadlock issue can be reported by dept. The issue is:
https://lore.kernel.org/all/20250513093448.592150-1-gavinguo@igalia.com/
I'm happy to see that dept reported real problems in practice. See:
https://lore.kernel.org/lkml/6383cde5-cf4b-facf-6e07-1378a485657d@I-love.SAKURA.ne.jp/
https://lore.kernel.org/lkml/1674268856-31807-1-git-send-email-byungchul.park@lge.com/
https://lore.kernel.org/all/b6e00e77-4a8c-4e05-ab79-266bf05fcc2d@igalia.com/
I added documents describing dept, that would help you understand what
dept is and how dept works. You can use dept just with CONFIG_DEPT on
and checking dmesg at runtime.
There are still false positives and some of them are already in progress
to suppress and the efforts need to be kept for a while as lockdep
experienced. Especially, since dept tracks PG_locked but folios have
never been split in class - which needs help from maybe fs guys tho.. -
we should put up with the AA report of PG_locked for a while, for
instance, any nested folio_lock()s will give the dept splat for now :(
It's worth noting that *EXPERIMENTAL* in Kconfig is tagged, which means
dept is not proper for an automation tool yet.
Thanks for the support and contribution, to:
Harry Yoo <harry.yoo@oracle.com>
Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
Yunseong Kim <ysk@kzalloc.com>
Yeoreum Yun <yeoreum.yun@arm.com>
---
Hi Linus and folks,
I've been developing a tool for detecting deadlock possibilities by
tracking wait/event rather than lock acquisition order to try to cover
all synchronization mechanisms.
Benefits:
0. Works with all lock primitives.
1. Works with wait_for_completion()/complete().
2. Works with PG_locked.
3. Works with swait/wakeup.
4. Works with waitqueue.
5. Works with wait_bit.
6. Multiple reports are allowed.
7. Deduplication control on multiple reports.
8. Withstand false positives thanks to 7.
9. Easy to annotate on waits/events.
Future works after getting merged:
0. To separate dept from lockdep.
1. To use dept as a dependency engine for lockdep.
2. To add missing annotations on waits/events.
How to interpret reports:
(See the document in this patchset for more detail.)
[S] the start of the event context
[W] the wait disturbing the event from being triggered
[E] the event that cannot be reached
Thanks.
Byungchul
---
Changes from v16:
1. Rebase on v6.17.
2. Fix a false positive from rcu (by Yunseong Kim)
3. Introduce APIs to set page's usage, dept_set_page_usage() and
dept_reset_page_usage() to avoid false positives.
4. Consider lock_page() as a potential wait unconditionally.
5. Consider folio_lock_killable() as a potential wait
unconditionally.
6. Add support for tracking PG_writeback waits and events.
7. Fix two build errors due to the additional debug information
added by dept. (by Yunseong Kim)
Changes from v15:
1. Fix typo and improve comments and commit messages (feedbacked
by ALOK TIWARI, Waiman Long, and kernel test robot).
2. Do not stop dept on detection of circular dependency of
recover event, allowing to keep reporting.
3. Add SK hynix to copyright.
4. Consider folio_lock() as a potential wait unconditionally.
5. Fix Kconfig dependency bug (feedbacked by kernel test robot).
6. Do not suppress reports that involve classes even that have
already involved in other reports, allowing to keep
reporting.
Changes from v14:
1. Rebase on the current latest, v6.15-rc6.
2. Refactor dept code.
3. With multi event sites for a single wait, even if an event
forms a circular dependency, the event can be recovered by
other event(or wake up) paths. Even though informing the
circular dependency is worthy but it should be suppressed
once informing it, if it doesn't lead an actual deadlock. So
introduce APIs to annotate the relationship between event
site and recover site, that are, event_site() and
dept_recover_event().
4. wait_for_completion() worked with dept map embedded in struct
completion. However, it generates a few false positives since
all the waits using the instance of struct completion, share
the map and key. To avoid the false positives, make it not to
share the map and key but each wait_for_completion() caller
have its own key by default. Of course, external maps also
can be used if needed.
5. Fix a bug about hardirq on/off tracing.
6. Implement basic unit test for dept.
7. Add more supports for dma fence synchronization.
8. Add emergency stop of dept e.g. on panic().
9. Fix false positives by mmu_notifier_invalidate_*().
10. Fix recursive call bug by DEPT_WARN_*() and DEPT_STOP().
11. Fix trivial bugs in DEPT_WARN_*() and DEPT_STOP().
12. Fix a bug that a spin lock, dept_pool_spin, is used in
both contexts of irq disabled and enabled without irq
disabled.
13. Suppress reports with classes, any of that already have
been reported, even though they have different chains but
being barely meaningful.
14. Print stacktrace of the wait that an event is now waking up,
not only stacktrace of the event.
15. Make dept aware of lockdep_cmp_fn() that is used to avoid
false positives in lockdep so that dept can also avoid them.
16. Do do_event() only if no ecxts have been
delimited.
17. Fix a bug that was not synchronized for stage_m in struct
dept_task, using a spin lock, dept_task()->stage_lock.
18. Fix a bug that dept didn't handle the case that multiple
ttwus for a single waiter can be called at the same time
i.e. a race issue.
19. Distinguish each kernel context from others, not only by
system call but also by user oriented fault so that dept can
work with more accuracy information about kernel context.
That helps to avoid a few false positives.
20. Limit dept's working to x86_64 and arm64.
Changes from v13:
1. Rebase on the current latest version, v6.9-rc7.
2. Add 'dept' documentation describing dept APIs.
Changes from v12:
1. Refine the whole document for dept.
2. Add 'Interpret dept report' section in the document, using a
deadlock report obtained in practice. Hope this version of
document helps guys understand dept better.
https://lore.kernel.org/lkml/6383cde5-cf4b-facf-6e07-1378a485657d@I-love.SAKURA.ne.jp/#t
https://lore.kernel.org/lkml/1674268856-31807-1-git-send-email-byungchul.park@lge.com/
Changes from v11:
1. Add 'dept' documentation describing the concept of dept.
2. Rewrite the commit messages of the following commits for
using weaker lockdep annotation, for better description.
fs/jbd2: Use a weaker annotation in journal handling
cpu/hotplug: Use a weaker annotation in AP thread
(feedbacked by Thomas Gleixner)
Changes from v10:
1. Fix noinstr warning when building kernel source.
2. dept has been reporting some false positives due to the folio
lock's unfairness. Reflect it and make dept work based on
dept annotations instead of just wait and wake up primitives.
3. Remove the support for PG_writeback while working on 2. I
will add the support later if needed.
4. dept didn't print stacktrace for [S] if the participant of a
deadlock is not lock mechanism but general wait and event.
However, it made hard to interpret the report in that case.
So add support to print stacktrace of the requestor who asked
the event context to run - usually a waiter of the event does
it just before going to wait state.
5. Give up tracking raw_local_irq_{disable,enable}() since it
totally messed up dept's irq tracking. So make it work in the
same way as lockdep does. I will consider it once any false
positives by those are observed again.
6. Change the manual rwsem_acquire_read(->j_trans_commit_map)
annotation in fs/jbd2/transaction.c to the try version so
that it works as much as it exactly needs.
7. Remove unnecessary 'inline' keyword in dept.c and add
'__maybe_unused' to a needed place.
Changes from v9:
1. Fix a bug. SDT tracking didn't work well because of my big
mistake that I should've used waiter's map to identify its
class but it had been working with waker's one. FYI,
PG_locked and PG_writeback weren't affected. They still
worked well. (reported by YoungJun)
Changes from v8:
1. Fix build error by adding EXPORT_SYMBOL(PG_locked_map) and
EXPORT_SYMBOL(PG_writeback_map) for kernel module build -
apologize for that. (reported by kernel test robot)
2. Fix build error by removing header file's circular dependency
that was caused by "atomic.h", "kernel.h" and "irqflags.h",
which I introduced - apologize for that. (reported by kernel
test robot)
Changes from v7:
1. Fix a bug that cannot track rwlock dependency properly,
introduced in v7. (reported by Boqun and lockdep selftest)
2. Track wait/event of PG_{locked,writeback} more aggressively
assuming that when a bit of PG_{locked,writeback} is cleared
there might be waits on the bit. (reported by Linus, Hillf
and syzbot)
3. Fix and clean bad style code i.e. unnecessarily introduced
a random pattern and so on. (pointed out by Linus)
4. Clean code for applying dept to wait_for_completion().
Changes from v6:
1. Tie to task scheduler code to track sleep and try_to_wake_up()
assuming sleeps cause waits, try_to_wake_up()s would be the
events that those are waiting for, of course with proper dept
annotations, sdt_might_sleep_weak(), sdt_might_sleep_strong()
and so on. For these cases, class is classified at sleep
entrance rather than the synchronization initialization code.
Which would extremely reduce false alarms.
2. Remove the dept associated instance in each page struct for
tracking dependencies by PG_locked and PG_writeback thanks to
the 1. work above.
3. Introduce CONFIG_DEPT_AGGRESSIVE_TIMEOUT_WAIT to suppress
reports in which waits with a timeout set are involved, for those
who don't like verbose reporting.
4. Add a mechanism to refill the internal memory pools on
running out so that dept could keep working as long as free
memory is available in the system.
5. Re-enable tracking hashed-waitqueue wait. That's going to no
longer generate false positives because class is classified
at sleep entrance rather than the waitqueue initialization.
6. Refactor to make it easier to port onto each new version of
the kernel.
7. Apply dept to dma fence.
8. Do trivial optimizations.
Changes from v5:
1. Use just pr_warn_once() rather than WARN_ONCE() on the lack
of internal resources because WARN_*() printing stacktrace is
too much for informing the lack. (feedback from Ted, Hyeonggon)
2. Fix trivial bugs like missing initializing a struct before
using it.
3. Assign a different class per task when handling onstack
variables for waitqueue or the like. Which makes dept
distinguish between onstack variables of different tasks so
as to prevent false positives. (reported by Hyeonggon)
4. Make dept aware of even raw_local_irq_*() to prevent false
positives. (reported by Hyeonggon)
5. Don't consider dependencies between the events that might be
triggered within __schedule() and the waits that requires
__schedule(), real ones. (reported by Hyeonggon)
6. Unstage the staged wait that has prepare_to_wait_event()'ed
*and* yet to get to __schedule(), if we encounter __schedule()
in-between for another sleep, which is possible if e.g. a
mutex_lock() exists in 'condition' of ___wait_event().
7. Turn on CONFIG_PROVE_LOCKING when CONFIG_DEPT is on, to rely
on the hardirq and softirq entrance tracing to make dept more
portable for now.
Changes from v4:
1. Fix some bugs that produce false alarms.
2. Distinguish each syscall context from another *for arm64*.
3. Make it not warn it but just print it in case dept ring
buffer gets exhausted. (feedback from Hyeonggon)
4. Explicitly describe "EXPERIMENTAL" and "dept might produce
false positive reports" in Kconfig. (feedback from Ted)
Changes from v3:
1. dept shouldn't create dependencies between different depths
of a class that were indicated by *_lock_nested(). dept
normally doesn't but it does once another lock class comes
in. So fixed it. (feedback from Hyeonggon)
2. dept considered a wait as a real wait once getting to
__schedule() even if it has been set to TASK_RUNNING by wake
up sources in advance. Fixed it so that dept doesn't consider
the case as a real wait. (feedback from Jan Kara)
3. Stop tracking dependencies with a map once the event
associated with the map has been handled. dept will start to
work with the map again, on the next sleep.
Changes from v2:
1. Disable dept on bit_wait_table[] in sched/wait_bit.c
reporting a lot of false positives, which is my fault.
Wait/event for bit_wait_table[] should've been tagged in a
higher layer for better work, which is a future work.
(feedback from Jan Kara)
2. Disable dept on crypto_larval's completion to prevent a false
positive.
Changes from v1:
1. Fix coding style and typo. (feedback from Steven)
2. Distinguish each work context from another in workqueue.
3. Skip checking lock acquisition with nest_lock, which is about
correct lock usage that should be checked by lockdep.
Changes from RFC(v0):
1. Prevent adding a wait tag at prepare_to_wait() but __schedule().
(feedback from Linus and Matthew)
2. Use try version at lockdep_acquire_cpus_lock() annotation.
3. Distinguish each syscall context from another.
Byungchul Park (47):
llist: move llist_{head,node} definition to types.h
dept: implement DEPT(DEPendency Tracker)
dept: add single event dependency tracker APIs
dept: add lock dependency tracker APIs
dept: tie to lockdep and IRQ tracing
dept: add proc knobs to show stats and dependency graph
dept: distinguish each kernel context from another
x86_64, dept: add support CONFIG_ARCH_HAS_DEPT_SUPPORT to x86_64
arm64, dept: add support CONFIG_ARCH_HAS_DEPT_SUPPORT to arm64
dept: distinguish each work from another
dept: add a mechanism to refill the internal memory pools on running
out
dept: record the latest one out of consecutive waits of the same class
dept: apply sdt_might_sleep_{start,end}() to
wait_for_completion()/complete()
dept: apply sdt_might_sleep_{start,end}() to swait
dept: apply sdt_might_sleep_{start,end}() to waitqueue wait
dept: apply sdt_might_sleep_{start,end}() to hashed-waitqueue wait
dept: apply sdt_might_sleep_{start,end}() to dma fence
dept: track timeout waits separately with a new Kconfig
dept: apply timeout consideration to wait_for_completion()/complete()
dept: apply timeout consideration to swait
dept: apply timeout consideration to waitqueue wait
dept: apply timeout consideration to hashed-waitqueue wait
dept: apply timeout consideration to dma fence wait
dept: make dept able to work with an external wgen
dept: track PG_locked with dept
dept: print staged wait's stacktrace on report
locking/lockdep: prevent various lockdep assertions when
lockdep_off()'ed
dept: add documentation for dept
cpu/hotplug: use a weaker annotation in AP thread
fs/jbd2: use a weaker annotation in journal handling
dept: assign dept map to mmu notifier invalidation synchronization
dept: assign unique dept_key to each distinct dma fence caller
dept: make dept aware of lockdep_set_lock_cmp_fn() annotation
dept: make dept stop from working on debug_locks_off()
i2c: rename wait_for_completion callback to wait_for_completion_cb
dept: assign unique dept_key to each distinct wait_for_completion()
caller
completion, dept: introduce init_completion_dmap() API
dept: introduce a new type of dependency tracking between multi event
sites
dept: add module support for struct dept_event_site and
dept_event_site_dep
dept: introduce event_site() to disable event tracking if it's
recoverable
dept: implement a basic unit test for dept
dept: call dept_hardirqs_off() in local_irq_*() regardless of irq
state
rcu/update: fix same dept key collision between various types of RCU
dept: introduce APIs to set page usage and use subclasses_evt for the
usage
dept: track PG_writeback with dept
SUNRPC: relocate struct rcu_head to the first field of struct rpc_xprt
mm: percpu: increase PERCPU_DYNAMIC_SIZE_SHIFT on DEPT and large
PAGE_SIZE
Documentation/dependency/dept.txt | 735 ++++++
Documentation/dependency/dept_api.txt | 117 +
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/syscall.c | 7 +
arch/arm64/mm/fault.c | 7 +
arch/x86/Kconfig | 1 +
arch/x86/entry/syscall_64.c | 7 +
arch/x86/mm/fault.c | 7 +
drivers/dma-buf/dma-fence.c | 23 +-
drivers/i2c/algos/i2c-algo-pca.c | 2 +-
drivers/i2c/busses/i2c-pca-isa.c | 2 +-
drivers/i2c/busses/i2c-pca-platform.c | 2 +-
fs/jbd2/transaction.c | 2 +-
include/asm-generic/vmlinux.lds.h | 13 +-
include/linux/completion.h | 124 +-
include/linux/dept.h | 647 +++++
include/linux/dept_ldt.h | 78 +
include/linux/dept_sdt.h | 68 +
include/linux/dept_unit_test.h | 67 +
include/linux/dma-fence.h | 74 +-
include/linux/hardirq.h | 3 +
include/linux/i2c-algo-pca.h | 2 +-
include/linux/irqflags.h | 21 +-
include/linux/llist.h | 8 -
include/linux/local_lock_internal.h | 1 +
include/linux/lockdep.h | 105 +-
include/linux/lockdep_types.h | 3 +
include/linux/mm_types.h | 4 +
include/linux/mmu_notifier.h | 26 +
include/linux/module.h | 5 +
include/linux/mutex.h | 1 +
include/linux/page-flags.h | 204 +-
include/linux/pagemap.h | 37 +-
include/linux/percpu-rwsem.h | 2 +-
include/linux/percpu.h | 4 +
include/linux/rcupdate_wait.h | 13 +-
include/linux/rtmutex.h | 1 +
include/linux/rwlock_types.h | 1 +
include/linux/rwsem.h | 1 +
include/linux/sched.h | 118 +
include/linux/seqlock.h | 2 +-
include/linux/spinlock_types_raw.h | 3 +
include/linux/srcu.h | 2 +-
include/linux/sunrpc/xprt.h | 9 +-
include/linux/swait.h | 3 +
include/linux/types.h | 8 +
include/linux/wait.h | 3 +
include/linux/wait_bit.h | 3 +
init/init_task.c | 2 +
init/main.c | 2 +
kernel/Makefile | 1 +
kernel/cpu.c | 2 +-
kernel/dependency/Makefile | 5 +
kernel/dependency/dept.c | 3499 +++++++++++++++++++++++++
kernel/dependency/dept_hash.h | 10 +
kernel/dependency/dept_internal.h | 65 +
kernel/dependency/dept_object.h | 13 +
kernel/dependency/dept_proc.c | 94 +
kernel/dependency/dept_unit_test.c | 173 ++
kernel/exit.c | 1 +
kernel/fork.c | 2 +
kernel/locking/lockdep.c | 33 +
kernel/module/main.c | 19 +
kernel/rcu/rcu.h | 1 +
kernel/rcu/update.c | 5 +-
kernel/sched/completion.c | 62 +-
kernel/sched/core.c | 9 +
kernel/workqueue.c | 3 +
lib/Kconfig.debug | 51 +
lib/debug_locks.c | 2 +
lib/locking-selftest.c | 2 +
mm/filemap.c | 37 +
mm/mm_init.c | 3 +
mm/mmu_notifier.c | 31 +-
74 files changed, 6570 insertions(+), 134 deletions(-)
create mode 100644 Documentation/dependency/dept.txt
create mode 100644 Documentation/dependency/dept_api.txt
create mode 100644 include/linux/dept.h
create mode 100644 include/linux/dept_ldt.h
create mode 100644 include/linux/dept_sdt.h
create mode 100644 include/linux/dept_unit_test.h
create mode 100644 kernel/dependency/Makefile
create mode 100644 kernel/dependency/dept.c
create mode 100644 kernel/dependency/dept_hash.h
create mode 100644 kernel/dependency/dept_internal.h
create mode 100644 kernel/dependency/dept_object.h
create mode 100644 kernel/dependency/dept_proc.c
create mode 100644 kernel/dependency/dept_unit_test.c
base-commit: e5f0a698b34ed76002dc5cff3804a61c80233a7a
--
2.17.1
^ permalink raw reply
* [PATCH v17 01/47] llist: move llist_{head,node} definition to types.h
From: Byungchul Park @ 2025-10-02 8:12 UTC (permalink / raw)
To: linux-kernel
Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
corbet, catalin.marinas, bp, dave.hansen, x86, hpa, luto,
sumit.semwal, gustavo, christian.koenig, andi.shyti, arnd,
lorenzo.stoakes, Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu,
da.gomez, samitolvanen, paulmck, frederic, neeraj.upadhyay,
joelagnelf, josh, urezki, mathieu.desnoyers, jiangshanlai,
qiang.zhang, juri.lelli, vincent.guittot, dietmar.eggemann,
bsegall, mgorman, vschneid, chuck.lever, neil, okorniev, Dai.Ngo,
tom, trondmy, anna, kees, bigeasy, clrkwllms, mark.rutland,
ada.coupriediaz, kristina.martsenko, wangkefeng.wang, broonie,
kevin.brodsky, dwmw, shakeel.butt, ast, ziy, yuzhao, baolin.wang,
usamaarif642, joel.granados, richard.weiyang, geert+renesas,
tim.c.chen, linux, alexander.shishkin, lillian, chenhuacai,
francesco, guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel
In-Reply-To: <20251002081247.51255-1-byungchul@sk.com>
llist_head and llist_node can be used by some other header files. For
example, dept for tracking dependencies uses llist in its header. To
avoid header dependency, move them to types.h.
Signed-off-by: Byungchul Park <byungchul@sk.com>
---
include/linux/llist.h | 8 --------
include/linux/types.h | 8 ++++++++
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 607b2360c938..6bcdf378ebd7 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -53,14 +53,6 @@
#include <linux/stddef.h>
#include <linux/types.h>
-struct llist_head {
- struct llist_node *first;
-};
-
-struct llist_node {
- struct llist_node *next;
-};
-
#define LLIST_HEAD_INIT(name) { NULL }
#define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name)
diff --git a/include/linux/types.h b/include/linux/types.h
index 6dfdb8e8e4c3..58882a3730eb 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -208,6 +208,14 @@ struct hlist_node {
struct hlist_node *next, **pprev;
};
+struct llist_head {
+ struct llist_node *first;
+};
+
+struct llist_node {
+ struct llist_node *next;
+};
+
struct ustat {
__kernel_daddr_t f_tfree;
#ifdef CONFIG_ARCH_32BIT_USTAT_F_TINODE
--
2.17.1
^ permalink raw reply related
* Re: [PATCH 0/6] module: enable force unloading of modules that have crashed during init
From: Julian LaGattuta @ 2025-09-30 15:51 UTC (permalink / raw)
To: Petr Pavlu
Cc: Luis Chamberlain, Sami Tolvanen, Daniel Gomez, linux-modules,
linux-kernel
In-Reply-To: <3fc3f7e8-3297-4586-91ca-41f07c8a9418@suse.com>
Thank you so much for your well written reply. I understand now where
I went wrong.
Have a good rest of your day.
Sincerely,
Julian
^ permalink raw reply
* Re: [PATCH 0/6] module: enable force unloading of modules that have crashed during init
From: Petr Pavlu @ 2025-09-30 13:16 UTC (permalink / raw)
To: Julian LaGattuta
Cc: Luis Chamberlain, Sami Tolvanen, Daniel Gomez, linux-modules,
linux-kernel
In-Reply-To: <CADuX1qJZ1V32d0U4hSOUOzte2KE-k-Hzop0zZd4=7Ap-kS3JzQ@mail.gmail.com>
On 9/25/25 12:16 AM, Julian LaGattuta wrote:
>> Could you please explain the motivation for doing this in more detail?
>>
>> I think we shouldn't attempt to do anything clever with modules that
>> crashed during initialization. Such a module can already leave the
>> system in an unstable state and trying to recover can cause even more
>> problems. For instance, I don't see how it is safe to call the module's
>> exit function.
>
> Thank you for your response Petr. The motivation comes from when I
> wanted to replace a crashed module with one which does not crash
> without having to reboot. I looked around and saw some other people
> complain about it on stackoverflow.
Hm, I'm still not sure I understand the use case. If it is about being
able to remove a crashed module when developing it, then I wouldn't
expect rebooting the machine to be much of an issue. If it is on the
other hand about removing it on a production machine, then I think
attempting this can leave the machine in a worse state and is not
something we should encourage or support.
>
> I thought that if a module crashed during init, it would be in no
> better position compared to if it were forcefully removed.
> Therefore, there is no reason why this shouldn't be an option as it
> couldn't make the problem worse.
A module can be halfway through its initialization when it crashes. It
may have already registered with various parts of the kernel and
I believe that removing the module from under the kernel's control could
result in even more problems.
The current support for forcefully removing a module overrides the
kernel's tracking of module references. This option was originally
introduced by "[PATCH] Forced module unload" [1]. As far as I can see,
it was related to the module loader rework at that time in "[PATCH]
In-kernel Module Loader" [2]. This rework provided raceless
loading/unloading and marked several MOD_INC_USE_COUNT/MOD_DEC_USE_COUNT
interfaces as obsolete and unsafe. Since some modules still used the old
racy interfaces, it seems the forced removal option was added to make it
possible to remove such modules.
However, this issue should have been fixed a long time ago, so I wonder
if even the current CONFIG_MODULE_FORCE_UNLOAD support is useful.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux-fullhistory.git/commit/?id=d0f8c9a4c2c9d93463d157248c73028670e80a97
[2] https://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux-fullhistory.git/commit/?id=4c877b08daf4b463c144cbd2748ed1659931a0dd
--
Thanks,
Petr
^ permalink raw reply
* linux-6.17/kernel/module/sysfs.c:275: Always true test in for loop ?
From: David Binderman @ 2025-09-30 10:25 UTC (permalink / raw)
To: mcgrof@kernel.org, petr.pavlu@suse.com, da.gomez@kernel.org,
samitolvanen@google.com, linux-modules@vger.kernel.org, LKML
Hello there,
Static analyser cppcheck says:
linux-6.17/kernel/module/sysfs.c:275:20: style: Pointer expression 'attr=&mod->modinfo_attrs[i]' converted to bool is always true. [knownPointerToBool]
Source code is
for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
Suggest code rework.
Regards
David Binderman
^ permalink raw reply
* Re: [PATCH RFC 019/104] module: add load_module_mem() helper
From: Petr Pavlu @ 2025-09-29 9:47 UTC (permalink / raw)
To: Vegard Nossum
Cc: Herbert Xu, David S. Miller, linux-crypto, Luis Chamberlain,
Daniel Gomez, Ard Biesheuvel, Eric Biggers, Jason A . Donenfeld,
Greg Kroah-Hartman, Wang, Jay, Nicolai Stange, Vladis Dronov,
Stephan Mueller, Sami Tolvanen, linux-modules,
Saeed Mirzamohammadi
In-Reply-To: <20250904155216.460962-20-vegard.nossum@oracle.com>
On 9/4/25 5:50 PM, Vegard Nossum wrote:
> Add a new helper function, load_module_mem(), which can load a kernel
> module from a byte array in memory.
>
> Also add a new module loader flag, MODULE_INIT_MEM, signalling that a
> module was loaded in this way.
>
> When a module is loaded with load_module_mem(), we do a few things
> differently:
>
> - don't do signature verification
> - ignore vermagic
Why is checking the vermagic skipped?
> - don't taint the kernel
Why is tainting the kernel skipped?
> - keep the initial reference to the module until the caller wants to
> drop it
>
> These changes are necessary for having a bundled (but separately
> compiled) FIPS module.
>
> We may want to let distros carry patches to disable tainting separately
> so this information is not lost in case somebody builds a non-distro
> kernel using a FIPS module compiled for an incompatible version.
>
> Co-developed-by: Saeed Mirzamohammadi <saeed.mirzamohammadi@oracle.com>
> Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
I realize this is posted as an RFC so I'm not sure if you're looking for
more detailed comments on the implementation at this point. Nonetheless,
some notes are provided below.
> ---
> include/linux/module.h | 2 +
> include/uapi/linux/module.h | 5 ++
> kernel/module/main.c | 99 ++++++++++++++++++++++++++-----------
> 3 files changed, 77 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/module.h b/include/linux/module.h
> index 3319a5269d28..00d85602fb6a 100644
> --- a/include/linux/module.h
> +++ b/include/linux/module.h
> @@ -586,6 +586,8 @@ struct module {
>
> #ifdef CONFIG_MODULES
>
> +extern int load_module_mem(const char *mem, size_t size);
> +
Nit: The extern keyword is unnecessary here. See
Documentation/process/coding-style.rst, 6.1) Function prototypes.
> /* Get/put a kernel symbol (calls must be symmetric) */
> void *__symbol_get(const char *symbol);
> void *__symbol_get_gpl(const char *symbol);
> diff --git a/include/uapi/linux/module.h b/include/uapi/linux/module.h
> index 03a33ffffcba..5dcd24018be7 100644
> --- a/include/uapi/linux/module.h
> +++ b/include/uapi/linux/module.h
> @@ -7,4 +7,9 @@
> #define MODULE_INIT_IGNORE_VERMAGIC 2
> #define MODULE_INIT_COMPRESSED_FILE 4
>
> +#ifdef __KERNEL__
> +/* Internal flags */
> +#define MODULE_INIT_MEM 30
> +#endif
> +
This looks to be incorrect, 30 is 0b11110. The value should be a flag
with only one bit set.
Additionally, I think referring to this special-type module as MEM is
misleading as all modules are eventually loaded from the kernel memory.
Perhaps call it MODULE_INIT_EMBEDDED_FILE, which also aligns with
MODULE_INIT_COMPRESSED_FILE?
> #endif /* _UAPI_LINUX_MODULE_H */
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index c66b26184936..12ce4bad29ca 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -2572,11 +2572,14 @@ static void module_augment_kernel_taints(struct module *mod, struct load_info *i
>
> static int check_modinfo(struct module *mod, struct load_info *info, int flags)
> {
> - const char *modmagic = get_modinfo(info, "vermagic");
> + const char *modmagic = NULL;
> int err;
>
> - if (flags & MODULE_INIT_IGNORE_VERMAGIC)
> - modmagic = NULL;
> + if (flags & MODULE_INIT_MEM)
> + return 0;
> +
> + if (!(flags & MODULE_INIT_IGNORE_VERMAGIC))
> + modmagic = get_modinfo(info, "vermagic");
>
> /* This is allowed: modprobe --force will invalidate it. */
> if (!modmagic) {
> @@ -3007,7 +3010,7 @@ module_param(async_probe, bool, 0644);
> * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
> * helper command 'lx-symbols'.
> */
> -static noinline int do_init_module(struct module *mod)
> +static noinline int do_init_module(struct module *mod, int flags)
> {
> int ret = 0;
> struct mod_initfree *freeinit;
> @@ -3071,7 +3074,8 @@ static noinline int do_init_module(struct module *mod)
> mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
> mutex_lock(&module_mutex);
> /* Drop initial reference. */
> - module_put(mod);
> + if (!(flags & MODULE_INIT_MEM))
> + module_put(mod);
> trim_init_extable(mod);
> #ifdef CONFIG_KALLSYMS
> /* Switch to core kallsyms now init is done: kallsyms may be walking! */
> @@ -3347,31 +3351,17 @@ static int early_mod_check(struct load_info *info, int flags)
> /*
> * Allocate and load the module: note that size of section 0 is always
> * zero, and we rely on this for optional sections.
> + *
> + * NOTE: module signature verification must have been done already.
> */
> -static int load_module(struct load_info *info, const char __user *uargs,
> - int flags)
> +static int _load_module(struct load_info *info, const char __user *uargs,
> + int flags)
> {
> struct module *mod;
> bool module_allocated = false;
> long err = 0;
> char *after_dashes;
>
> - /*
> - * Do the signature check (if any) first. All that
> - * the signature check needs is info->len, it does
> - * not need any of the section info. That can be
> - * set up later. This will minimize the chances
> - * of a corrupt module causing problems before
> - * we even get to the signature check.
> - *
> - * The check will also adjust info->len by stripping
> - * off the sig length at the end of the module, making
> - * checks against info->len more correct.
> - */
> - err = module_sig_check(info, flags);
> - if (err)
> - goto free_copy;
> -
> /*
> * Do basic sanity checks against the ELF header and
> * sections. Cache useful sections and set the
> @@ -3405,7 +3395,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
> * We are tainting your kernel if your module gets into
> * the modules linked list somehow.
> */
> - module_augment_kernel_taints(mod, info);
> + if (!(flags & MODULE_INIT_MEM))
> + module_augment_kernel_taints(mod, info);
>
> /* To avoid stressing percpu allocator, do this once we're unique. */
> err = percpu_modalloc(mod, info);
> @@ -3452,7 +3443,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
> flush_module_icache(mod);
>
> /* Now copy in args */
> - mod->args = strndup_user(uargs, ~0UL >> 1);
> + if ((flags & MODULE_INIT_MEM))
> + mod->args = kstrdup("", GFP_KERNEL);
> + else
> + mod->args = strndup_user(uargs, ~0UL >> 1);
> +
> if (IS_ERR(mod->args)) {
> err = PTR_ERR(mod->args);
> goto free_arch_cleanup;
> @@ -3500,13 +3495,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
> if (codetag_load_module(mod))
> goto sysfs_cleanup;
>
> - /* Get rid of temporary copy. */
> - free_copy(info, flags);
> -
> /* Done! */
> trace_module_load(mod);
>
> - return do_init_module(mod);
> + return do_init_module(mod, flags);
>
> sysfs_cleanup:
> mod_sysfs_teardown(mod);
> @@ -3562,7 +3554,52 @@ static int load_module(struct load_info *info, const char __user *uargs,
> audit_log_kern_module(info->name ? info->name : "?");
> mod_stat_bump_becoming(info, flags);
> }
> + return err;
> +}
> +
> +/*
> + * Load module from kernel memory without signature check.
> + */
> +int load_module_mem(const char *mem, size_t size)
The description and name of this function are not ideal. All module
loads via load_module() are from the kernel memory and skipping the
signature check is not the only different property.
I suggest calling the function load_embedded_module() and improving its
description. Please preferably also use a kernel-doc to describe it as
the function is external.
> +{
> + int err;
> + struct load_info info = { };
> +
> + info.sig_ok = true;
> + info.hdr = (Elf64_Ehdr *) mem;
> + info.len = size;
> +
> + err = _load_module(&info, NULL, MODULE_INIT_MEM);
> + if (0)
> + free_copy(&info, 0);
Remove the dead code.
> +
> + return err;
> +}
> +
> +static int load_module(struct load_info *info, const char __user *uargs,
> + int flags)
> +{
> + int err;
> +
> + /*
> + * Do the signature check (if any) first. All that
> + * the signature check needs is info->len, it does
> + * not need any of the section info. That can be
> + * set up later. This will minimize the chances
> + * of a corrupt module causing problems before
> + * we even get to the signature check.
> + *
> + * The check will also adjust info->len by stripping
> + * off the sig length at the end of the module, making
> + * checks against info->len more correct.
> + */
> + err = module_sig_check(info, flags);
> + if (!err)
> + err = _load_module(info, uargs, flags);
> +
> + /* Get rid of temporary copy. */
> free_copy(info, flags);
> +
> return err;
> }
In the current code, the load_module() function frees the temporary copy
prior to calling the module's init function, which should generally
result in less memory pressure. This behavior looks useful to me to
preserve.
You could keep the current load_module() as is but wrap its
module_sig_check() call with 'if (!info->sig_ok)'. Similarly, the
free_copy() call could be protected by
'if (!(flags & MODULE_INIT_MEM))'.
>
> @@ -3728,6 +3765,10 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
>
> pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
>
> + /*
> + * Deliberately omitting MODULE_INIT_MEM as it is for internal use
> + * only.
> + */
> if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
> |MODULE_INIT_IGNORE_VERMAGIC
> |MODULE_INIT_COMPRESSED_FILE))
Nit: I suggest the following to improve the comment flow:
/*
* Check flags validity. Deliberately omit MODULE_INIT_MEM as it is for
* internal use only.
*/
--
Thanks,
Petr
^ permalink raw reply
* Re: [PATCH 0/4] PCI: Add support and tests for FIXUP quirks in modules
From: Christoph Hellwig @ 2025-09-29 8:56 UTC (permalink / raw)
To: Brian Norris
Cc: Christoph Hellwig, Bjorn Helgaas, Luis Chamberlain, Petr Pavlu,
Daniel Gomez, linux-pci, David Gow, Rae Moar, linux-kselftest,
linux-kernel, linux-modules, Johannes Berg, Sami Tolvanen,
Richard Weinberger, Wei Liu, Brendan Higgins, kunit-dev,
Anton Ivanov, linux-um
In-Reply-To: <aNGaBiUOb6_n8w8P@google.com>
On Mon, Sep 22, 2025 at 11:48:38AM -0700, Brian Norris wrote:
> On Mon, Sep 22, 2025 at 11:13:39AM -0700, Christoph Hellwig wrote:
> > Controller drivers are a special case I guess, but I'd rather still
> > not open it up to any random driver.
>
> I don't really see why this particular thing should develop restrictions
> beyond "can it work in modules?", but if you have an idea for how to do
> that reasonably, my ears are open.
PCI controllers seem pretty special in that they provide infrastructure.
^ permalink raw reply
* Re: [RFC PATCH 00/10] scalable symbol flags with __kflagstab
From: Sid Nayyar @ 2025-09-26 0:11 UTC (permalink / raw)
To: Petr Pavlu
Cc: Nathan Chancellor, Luis Chamberlain, Sami Tolvanen,
Nicolas Schier, Arnd Bergmann, linux-kbuild, linux-arch,
linux-modules, linux-kernel, Giuliano Procida,
Matthias Männich
In-Reply-To: <2bf54830-ea9c-4962-a7ef-653fbed8f8c0@suse.com>
On Mon, Sep 22, 2025 at 12:41 PM Petr Pavlu <petr.pavlu@suse.com> wrote:
> This is useful information. However, I was specifically interested in
> the impact of having the new flags field present as part of __ksymtab
> (kernel_symbol), compared to keeping it in a separate section. Sorry for
> not being clear.
>
> I ran a small test to get a better understanding of the different sizes.
> I used v6.17-rc6 together with the openSUSE x86_64 config [1], which is
> fairly large. The resulting vmlinux.bin (no debuginfo) had an on-disk
> size of 58 MiB, and included 5937 + 6589 (GPL-only) exported symbols.
>
> The following table summarizes my measurements and calculations
> regarding the sizes of all sections related to exported symbols:
>
> | HAVE_ARCH_PREL32_RELOCATIONS | !HAVE_ARCH_PREL32_RELOCATIONS
> Section | Base [B] | Ext. [B] | Sep. [B] | Base [B] | Ext. [B] | Sep. [B]
> ----------------------------------------------------------------------------------------
> __ksymtab | 71244 | 200416 | 150312 | 142488 | 400832 | 300624
> __ksymtab_gpl | 79068 | NA | NA | 158136 | NA | NA
> __kcrctab | 23748 | 50104 | 50104 | 23748 | 50104 | 50104
> __kcrctab_gpl | 26356 | NA | NA | 26356 | NA | NA
> __ksymtab_strings | 253628 | 253628 | 253628 | 253628 | 253628 | 253628
> __kflagstab | NA | NA | 12526 | NA | NA | 12526
> ----------------------------------------------------------------------------------------
> Total | 454044 | 504148 | 466570 | 604356 | 704564 | 616882
> Increase to base [%] | NA | 11.0 | 2.8 | NA | 16.6 | 2.1
>
> The column "HAVE_ARCH_PREL32_RELOCATIONS -> Base" contains the numbers
> that I measured. The rest of the values are calculated. The "Ext."
> column represents the variant of extending __ksymtab, and the "Sep."
> column represents the variant of having a separate __kflagstab. With
> HAVE_ARCH_PREL32_RELOCATIONS, each kernel_symbol is 12 B in size and is
> extended to 16 B. With !HAVE_ARCH_PREL32_RELOCATIONS, it is 24 B,
> extended to 32 B. Note that this does not include the metadata needed to
> relocate __ksymtab*, which is freed after the initial processing.
>
> The base export data in this case totals 0.43 MiB. About 50% is used for
> storing the names of exported symbols.
>
> Adding __kflagstab as a separate section has a negligible impact, as
> expected. When extending __ksymtab (kernel_symbol) instead, the worst
> case with !HAVE_ARCH_PREL32_RELOCATIONS increases the export data size
> by 16.6%.
>
> Based on the above, I think introducing __kflagstab makes sense, as the
> added complexity is minimal, although I feel we could probably also get
> away with extending kernel_symbol.
This investigation is very informative, thank you for sharing your
findings. I am in agreement with your conclusions.
> This seems to answer why the in-tree flag is not sufficient for you.
> However, I also suggested an alternative that the symbol protection
> could be determined by whether the module is signed by a key from the
> .builtin_trusted_keys keyring, as opposed to being signed by another key
> reachable from the .secondary_trusted_keys keyring or being completely
> unsigned.
>
> Distributions can require that external modules be signed and allow
> additional keys to be added as Machine Owner Keys, which can be made
> reachable from .secondary_trusted_keys. Nonetheless, such distributions
> might be still interested in limiting the number of symbols that such
> external modules can use.
>
> I think this option is worth considering, as it could potentially make
> this symbol protection useful for other distributions as well.
This sounds like a great solution to enhance trust and security.
Apologies for missing this in the previous email. I will explore this
approach, but I would like to do it in a separate series.
> I'm personally ok with adding the kflagstab support. I think it
> introduces minimal complexity and, as you point out, simplifies certain
> aspects. Additionally, if we add it, I believe that adding the proposed
> symbol protection is simple enough to be included as well, at least from
> my perspective.
Since we are in agreement, I would like to seek code review for this
series. The code is ready for review from my side, but if you prefer I
can send out a non-RFC patch series for code review.
--
Thanks,
Siddharth Nayyar
^ permalink raw reply
* Re: [PATCH 0/6] module: enable force unloading of modules that have crashed during init
From: Julian LaGattuta @ 2025-09-24 22:16 UTC (permalink / raw)
To: Petr Pavlu
Cc: Luis Chamberlain, Sami Tolvanen, Daniel Gomez, linux-modules,
linux-kernel
In-Reply-To: <000808f3-10cf-46ad-94f9-95a142c08b59@suse.com>
> Could you please explain the motivation for doing this in more detail?
>
> I think we shouldn't attempt to do anything clever with modules that
> crashed during initialization. Such a module can already leave the
> system in an unstable state and trying to recover can cause even more
> problems. For instance, I don't see how it is safe to call the module's
> exit function.
>
> --
> Thanks,
> Petr
Thank you for your response Petr. The motivation comes from when I
wanted to replace a crashed module with one which does not crash
without having to reboot. I looked around and saw some other people
complain about it on stackoverflow.
I thought that if a module crashed during init, it would be in no
better position compared to if it were forcefully removed.
Therefore, there is no reason why this shouldn't be an option as it
couldn't make the problem worse.
I agree that calling the exit function doesn't make sense and so I
could change the behavior.
That being said, I understand why someone would be wary of this type
of change; this is just my thought process.
Sincerely,
Julian
^ permalink raw reply
* Re: [PATCH v8 0/8] Add generated modalias to modules.builtin.modinfo
From: Nathan Chancellor @ 2025-09-24 16:17 UTC (permalink / raw)
To: Nathan Chancellor, Nicolas Schier, Petr Pavlu, Luis Chamberlain,
Sami Tolvanen, Daniel Gomez, Alexey Gladkov
Cc: linux-kernel, linux-modules, linux-kbuild
In-Reply-To: <cover.1758182101.git.legion@kernel.org>
On Thu, 18 Sep 2025 10:05:44 +0200, Alexey Gladkov wrote:
> The modules.builtin.modinfo file is used by userspace (kmod to be specific) to
> get information about builtin modules. Among other information about the module,
> information about module aliases is stored. This is very important to determine
> that a particular modalias will be handled by a module that is inside the
> kernel.
>
> There are several mechanisms for creating modalias for modules:
>
> [...]
Applied, thanks!
[1/8] s390: vmlinux.lds.S: Reorder sections
https://git.kernel.org/kbuild/c/8d18ef04f940a
[2/8] kbuild: always create intermediate vmlinux.unstripped
https://git.kernel.org/kbuild/c/0ce5139fd96e9
[3/8] kbuild: keep .modinfo section in vmlinux.unstripped
https://git.kernel.org/kbuild/c/3e86e4d74c049
[4/8] kbuild: extract modules.builtin.modinfo from vmlinux.unstripped
https://git.kernel.org/kbuild/c/39cfd5b12160b
[5/8] scsi: Always define blogic_pci_tbl structure
https://git.kernel.org/kbuild/c/b88f88c26705a
[6/8] modpost: Add modname to mod_device_table alias
https://git.kernel.org/kbuild/c/83fb49389bbe0
[7/8] modpost: Create modalias for builtin modules
https://git.kernel.org/kbuild/c/5ab23c7923a1d
[8/8] kbuild: vmlinux.unstripped should always depend on .vmlinux.export.o
https://git.kernel.org/kbuild/c/3328d39a8dca2
Best regards,
--
Nathan Chancellor <nathan@kernel.org>
^ permalink raw reply
* [PATCH v18 2/7] rust: str: add radix prefixed integer parsing functions
From: Andreas Hindborg @ 2025-09-24 12:39 UTC (permalink / raw)
To: Miguel Ojeda, Alex Gaynor, Boqun Feng, Gary Guo,
Björn Roy Baron, Alice Ryhl, Masahiro Yamada,
Nathan Chancellor, Luis Chamberlain, Danilo Krummrich,
Benno Lossin, Daniel Gomez, Benno Lossin, Nicolas Schier
Cc: Trevor Gross, Adam Bratschi-Kaye, rust-for-linux, linux-kernel,
linux-kbuild, Petr Pavlu, Sami Tolvanen, Daniel Gomez,
Simona Vetter, Greg KH, Fiona Behrens, Daniel Almeida,
linux-modules, Andreas Hindborg
In-Reply-To: <20250924-module-params-v3-v18-0-bf512c35d910@kernel.org>
Add the trait `ParseInt` for parsing string representations of integers
where the string representations are optionally prefixed by a radix
specifier. Implement the trait for the primitive integer types.
Suggested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Benno Lossin <lossin@kernel.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
---
rust/kernel/str.rs | 2 +
rust/kernel/str/parse_int.rs | 148 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 150 insertions(+)
diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs
index 6c892550c0ba9..23fe924070e7c 100644
--- a/rust/kernel/str.rs
+++ b/rust/kernel/str.rs
@@ -8,6 +8,8 @@
use crate::prelude::*;
+pub mod parse_int;
+
/// Byte string without UTF-8 validity guarantee.
#[repr(transparent)]
pub struct BStr([u8]);
diff --git a/rust/kernel/str/parse_int.rs b/rust/kernel/str/parse_int.rs
new file mode 100644
index 0000000000000..48eb4c202984c
--- /dev/null
+++ b/rust/kernel/str/parse_int.rs
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Integer parsing functions.
+//!
+//! Integer parsing functions for parsing signed and unsigned integers
+//! potentially prefixed with `0x`, `0o`, or `0b`.
+
+use crate::prelude::*;
+use crate::str::BStr;
+use core::ops::Deref;
+
+// Make `FromStrRadix` a public type with a private name. This seals
+// `ParseInt`, that is, prevents downstream users from implementing the
+// trait.
+mod private {
+ use crate::prelude::*;
+ use crate::str::BStr;
+
+ /// Trait that allows parsing a [`&BStr`] to an integer with a radix.
+ pub trait FromStrRadix: Sized {
+ /// Parse `src` to [`Self`] using radix `radix`.
+ fn from_str_radix(src: &BStr, radix: u32) -> Result<Self>;
+
+ /// Tries to convert `value` into [`Self`] and negates the resulting value.
+ fn from_u64_negated(value: u64) -> Result<Self>;
+ }
+}
+
+/// Extract the radix from an integer literal optionally prefixed with
+/// one of `0x`, `0X`, `0o`, `0O`, `0b`, `0B`, `0`.
+fn strip_radix(src: &BStr) -> (u32, &BStr) {
+ match src.deref() {
+ [b'0', b'x' | b'X', rest @ ..] => (16, rest.as_ref()),
+ [b'0', b'o' | b'O', rest @ ..] => (8, rest.as_ref()),
+ [b'0', b'b' | b'B', rest @ ..] => (2, rest.as_ref()),
+ // NOTE: We are including the leading zero to be able to parse
+ // literal `0` here. If we removed it as a radix prefix, we would
+ // not be able to parse `0`.
+ [b'0', ..] => (8, src),
+ _ => (10, src),
+ }
+}
+
+/// Trait for parsing string representations of integers.
+///
+/// Strings beginning with `0x`, `0o`, or `0b` are parsed as hex, octal, or
+/// binary respectively. Strings beginning with `0` otherwise are parsed as
+/// octal. Anything else is parsed as decimal. A leading `+` or `-` is also
+/// permitted. Any string parsed by [`kstrtol()`] or [`kstrtoul()`] will be
+/// successfully parsed.
+///
+/// [`kstrtol()`]: https://docs.kernel.org/core-api/kernel-api.html#c.kstrtol
+/// [`kstrtoul()`]: https://docs.kernel.org/core-api/kernel-api.html#c.kstrtoul
+///
+/// # Examples
+///
+/// ```
+/// # use kernel::str::parse_int::ParseInt;
+/// # use kernel::b_str;
+///
+/// assert_eq!(Ok(0u8), u8::from_str(b_str!("0")));
+///
+/// assert_eq!(Ok(0xa2u8), u8::from_str(b_str!("0xa2")));
+/// assert_eq!(Ok(-0xa2i32), i32::from_str(b_str!("-0xa2")));
+///
+/// assert_eq!(Ok(-0o57i8), i8::from_str(b_str!("-0o57")));
+/// assert_eq!(Ok(0o57i8), i8::from_str(b_str!("057")));
+///
+/// assert_eq!(Ok(0b1001i16), i16::from_str(b_str!("0b1001")));
+/// assert_eq!(Ok(-0b1001i16), i16::from_str(b_str!("-0b1001")));
+///
+/// assert_eq!(Ok(127i8), i8::from_str(b_str!("127")));
+/// assert!(i8::from_str(b_str!("128")).is_err());
+/// assert_eq!(Ok(-128i8), i8::from_str(b_str!("-128")));
+/// assert!(i8::from_str(b_str!("-129")).is_err());
+/// assert_eq!(Ok(255u8), u8::from_str(b_str!("255")));
+/// assert!(u8::from_str(b_str!("256")).is_err());
+/// ```
+pub trait ParseInt: private::FromStrRadix + TryFrom<u64> {
+ /// Parse a string according to the description in [`Self`].
+ fn from_str(src: &BStr) -> Result<Self> {
+ match src.deref() {
+ [b'-', rest @ ..] => {
+ let (radix, digits) = strip_radix(rest.as_ref());
+ // 2's complement values range from -2^(b-1) to 2^(b-1)-1.
+ // So if we want to parse negative numbers as positive and
+ // later multiply by -1, we have to parse into a larger
+ // integer. We choose `u64` as sufficiently large.
+ //
+ // NOTE: 128 bit integers are not available on all
+ // platforms, hence the choice of 64 bits.
+ let val =
+ u64::from_str_radix(core::str::from_utf8(digits).map_err(|_| EINVAL)?, radix)
+ .map_err(|_| EINVAL)?;
+ Self::from_u64_negated(val)
+ }
+ _ => {
+ let (radix, digits) = strip_radix(src);
+ Self::from_str_radix(digits, radix).map_err(|_| EINVAL)
+ }
+ }
+ }
+}
+
+macro_rules! impl_parse_int {
+ ($($ty:ty),*) => {
+ $(
+ impl private::FromStrRadix for $ty {
+ fn from_str_radix(src: &BStr, radix: u32) -> Result<Self> {
+ <$ty>::from_str_radix(core::str::from_utf8(src).map_err(|_| EINVAL)?, radix)
+ .map_err(|_| EINVAL)
+ }
+
+ fn from_u64_negated(value: u64) -> Result<Self> {
+ const ABS_MIN: u64 = {
+ #[allow(unused_comparisons)]
+ if <$ty>::MIN < 0 {
+ 1u64 << (<$ty>::BITS - 1)
+ } else {
+ 0
+ }
+ };
+
+ if value > ABS_MIN {
+ return Err(EINVAL);
+ }
+
+ if value == ABS_MIN {
+ return Ok(<$ty>::MIN);
+ }
+
+ // SAFETY: The above checks guarantee that `value` fits into `Self`:
+ // - if `Self` is unsigned, then `ABS_MIN == 0` and thus we have returned above
+ // (either `EINVAL` or `MIN`).
+ // - if `Self` is signed, then we have that `0 <= value < ABS_MIN`. And since
+ // `ABS_MIN - 1` fits into `Self` by construction, `value` also does.
+ let value: Self = unsafe { value.try_into().unwrap_unchecked() };
+
+ Ok((!value).wrapping_add(1))
+ }
+ }
+
+ impl ParseInt for $ty {}
+ )*
+ };
+}
+
+impl_parse_int![i8, u8, i16, u16, i32, u32, i64, u64, isize, usize];
--
2.47.2
^ permalink raw reply related
* [PATCH v18 6/7] rust: samples: add a module parameter to the rust_minimal sample
From: Andreas Hindborg @ 2025-09-24 12:39 UTC (permalink / raw)
To: Miguel Ojeda, Alex Gaynor, Boqun Feng, Gary Guo,
Björn Roy Baron, Alice Ryhl, Masahiro Yamada,
Nathan Chancellor, Luis Chamberlain, Danilo Krummrich,
Benno Lossin, Daniel Gomez, Benno Lossin, Nicolas Schier
Cc: Trevor Gross, Adam Bratschi-Kaye, rust-for-linux, linux-kernel,
linux-kbuild, Petr Pavlu, Sami Tolvanen, Daniel Gomez,
Simona Vetter, Greg KH, Fiona Behrens, Daniel Almeida,
linux-modules, Andreas Hindborg
In-Reply-To: <20250924-module-params-v3-v18-0-bf512c35d910@kernel.org>
Showcase the rust module parameter support by adding a module parameter to
the `rust_minimal` sample.
Reviewed-by: Benno Lossin <lossin@kernel.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
---
samples/rust/rust_minimal.rs | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/samples/rust/rust_minimal.rs b/samples/rust/rust_minimal.rs
index 1fc7a1be6b6d7..8eb9583571d72 100644
--- a/samples/rust/rust_minimal.rs
+++ b/samples/rust/rust_minimal.rs
@@ -10,6 +10,12 @@
authors: ["Rust for Linux Contributors"],
description: "Rust minimal sample",
license: "GPL",
+ params: {
+ test_parameter: i64 {
+ default: 1,
+ description: "This parameter has a default of 1",
+ },
+ },
}
struct RustMinimal {
@@ -20,6 +26,10 @@ impl kernel::Module for RustMinimal {
fn init(_module: &'static ThisModule) -> Result<Self> {
pr_info!("Rust minimal sample (init)\n");
pr_info!("Am I built-in? {}\n", !cfg!(MODULE));
+ pr_info!(
+ "test_parameter: {}\n",
+ *module_parameters::test_parameter.value()
+ );
let mut numbers = KVec::new();
numbers.push(72, GFP_KERNEL)?;
--
2.47.2
^ permalink raw reply related
* [PATCH v18 4/7] rust: module: use a reference in macros::module::module
From: Andreas Hindborg @ 2025-09-24 12:39 UTC (permalink / raw)
To: Miguel Ojeda, Alex Gaynor, Boqun Feng, Gary Guo,
Björn Roy Baron, Alice Ryhl, Masahiro Yamada,
Nathan Chancellor, Luis Chamberlain, Danilo Krummrich,
Benno Lossin, Daniel Gomez, Benno Lossin, Nicolas Schier
Cc: Trevor Gross, Adam Bratschi-Kaye, rust-for-linux, linux-kernel,
linux-kbuild, Petr Pavlu, Sami Tolvanen, Daniel Gomez,
Simona Vetter, Greg KH, Fiona Behrens, Daniel Almeida,
linux-modules, Andreas Hindborg
In-Reply-To: <20250924-module-params-v3-v18-0-bf512c35d910@kernel.org>
When we add parameter support to the module macro, we want to be able to
pass a reference to `ModuleInfo` to a helper function. That is not possible
when we move out of the local `info`. So change the function to access
the local by reference rather than by value.
Reviewed-by: Benno Lossin <lossin@kernel.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
---
rust/macros/module.rs | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/rust/macros/module.rs b/rust/macros/module.rs
index 5ee54a00c0b65..cbf3ac0a8f7ba 100644
--- a/rust/macros/module.rs
+++ b/rust/macros/module.rs
@@ -176,23 +176,23 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream {
// Rust does not allow hyphens in identifiers, use underscore instead.
let ident = info.name.replace('-', "_");
let mut modinfo = ModInfoBuilder::new(ident.as_ref());
- if let Some(authors) = info.authors {
+ if let Some(authors) = &info.authors {
for author in authors {
- modinfo.emit("author", &author);
+ modinfo.emit("author", author);
}
}
- if let Some(description) = info.description {
- modinfo.emit("description", &description);
+ if let Some(description) = &info.description {
+ modinfo.emit("description", description);
}
modinfo.emit("license", &info.license);
- if let Some(aliases) = info.alias {
+ if let Some(aliases) = &info.alias {
for alias in aliases {
- modinfo.emit("alias", &alias);
+ modinfo.emit("alias", alias);
}
}
- if let Some(firmware) = info.firmware {
+ if let Some(firmware) = &info.firmware {
for fw in firmware {
- modinfo.emit("firmware", &fw);
+ modinfo.emit("firmware", fw);
}
}
--
2.47.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox