* [PATCH RFC net-next] net: drop_monitor: Add debugfs support
@ 2025-10-15 10:14 Wang Liang
2025-10-15 10:40 ` Eric Dumazet
0 siblings, 1 reply; 5+ messages in thread
From: Wang Liang @ 2025-10-15 10:14 UTC (permalink / raw)
To: nhorman, davem, edumazet, kuba, pabeni, horms
Cc: netdev, linux-kernel, yuehaibing, zhangchangzhong, wangliang74
This patch add debugfs interfaces for drop monitor. Similar to kmemleak, we
can use the monitor by below commands:
echo clear > /sys/kernel/debug/drop_monitor/trace
echo start > /sys/kernel/debug/drop_monitor/trace
echo stop > /sys/kernel/debug/drop_monitor/trace
cat /sys/kernel/debug/drop_monitor/trace
The trace skb number limit can be set dynamically:
cat /sys/kernel/debug/drop_monitor/trace_limit
echo 200 > /sys/kernel/debug/drop_monitor/trace_limit
Compare to original netlink method, the callstack dump is supported. There
is a example for received udp packet with error checksum:
reason : UDP_CSUM (11)
pc : udp_queue_rcv_one_skb+0x14b/0x350
len : 12
protocol : 0x0800
stack :
sk_skb_reason_drop+0x8f/0x120
udp_queue_rcv_one_skb+0x14b/0x350
udp_unicast_rcv_skb+0x71/0x90
ip_protocol_deliver_rcu+0xa6/0x160
ip_local_deliver_finish+0x90/0x100
ip_sublist_rcv_finish+0x65/0x80
ip_sublist_rcv+0x130/0x1c0
ip_list_rcv+0xf7/0x130
__netif_receive_skb_list_core+0x21d/0x240
netif_receive_skb_list_internal+0x186/0x2b0
napi_complete_done+0x78/0x190
e1000_clean+0x27f/0x860
__napi_poll+0x25/0x1e0
net_rx_action+0x2ca/0x330
handle_softirqs+0xbc/0x290
irq_exit_rcu+0x90/0xb0
It's more friendly to use and not need user application to cooperate.
Furthermore, it is easier to add new feature. We can add reason/ip/port
filter by debugfs parameters, like ftrace, rather than netlink msg.
As the first version, only enable the sw switch when starting the monitor
and save drop skb info in kfree_skb_probe hook.
Signed-off-by: Wang Liang <wangliang74@huawei.com>
---
net/core/drop_monitor.c | 198 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 198 insertions(+)
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 60d31c2feed3..d790439bbc77 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -38,10 +38,35 @@
#include <trace/events/devlink.h>
#include <linux/unaligned.h>
+#include <linux/debugfs.h>
#define TRACE_ON 1
#define TRACE_OFF 0
+static struct dentry *dm_debugfs_dir;
+static LIST_HEAD(dm_entry_list);
+static DEFINE_RAW_SPINLOCK(dm_entry_lock);
+
+#define DM_MAX_STACK_TRACE_ENTRIES 16
+#define DM_MAX_STACK_TRACE_SKIP 1
+
+struct dm_skb_info {
+ unsigned int len;
+ unsigned short protocol;
+};
+
+struct dm_entry {
+ struct list_head dm_list;
+ void *pc;
+ enum skb_drop_reason reason;
+ unsigned int stack_len;
+ unsigned long stack[DM_MAX_STACK_TRACE_ENTRIES];
+ struct dm_skb_info skb_info;
+};
+
+static unsigned int dm_trace_limit = 100;
+static void init_dm_entry(struct sk_buff *skb, void *location, enum skb_drop_reason reason);
+
/*
* Globals, our netlink socket pointer
* and the work handle that will send up
@@ -268,6 +293,7 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,
enum skb_drop_reason reason,
struct sock *rx_sk)
{
+ init_dm_entry(skb, location, reason);
trace_drop_common(skb, location);
}
@@ -502,6 +528,8 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
struct sk_buff *nskb;
unsigned long flags;
+ init_dm_entry(skb, location, reason);
+
if (!skb_mac_header_was_set(skb))
return;
@@ -1723,6 +1751,173 @@ static void net_dm_hw_cpu_data_fini(int cpu)
__net_dm_cpu_data_fini(hw_data);
}
+static void init_dm_entry(struct sk_buff *skb, void *location, enum skb_drop_reason reason)
+{
+ struct dm_entry *entry;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&dm_entry_lock, flags);
+ if (list_count_nodes(&dm_entry_list) >= dm_trace_limit) {
+ raw_spin_unlock_irqrestore(&dm_entry_lock, flags);
+ return;
+ }
+ raw_spin_unlock_irqrestore(&dm_entry_lock, flags);
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return;
+ entry->pc = location;
+ entry->reason = reason;
+ entry->skb_info.len = skb->len;
+ entry->skb_info.protocol = be16_to_cpu(skb->protocol);
+
+ raw_spin_lock_irqsave(&dm_entry_lock, flags);
+ entry->stack_len = stack_trace_save(entry->stack, ARRAY_SIZE(entry->stack),
+ DM_MAX_STACK_TRACE_SKIP);
+ list_add_tail(&entry->dm_list, &dm_entry_list);
+ raw_spin_unlock_irqrestore(&dm_entry_lock, flags);
+}
+
+static void clear_dm_entry(void)
+{
+ struct dm_entry *entry, *tmp;
+
+ raw_spin_lock(&dm_entry_lock);
+ list_for_each_entry_safe(entry, tmp, &dm_entry_list, dm_list) {
+ list_del(&entry->dm_list);
+ kfree(entry);
+ }
+ raw_spin_unlock(&dm_entry_lock);
+}
+
+static void *dm_debugfs_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ raw_spin_lock(&dm_entry_lock);
+ return seq_list_start_head(&dm_entry_list, *pos);
+}
+
+static void dm_debugfs_seq_stop(struct seq_file *seq, void *v)
+{
+ raw_spin_unlock(&dm_entry_lock);
+}
+
+static void *dm_debugfs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &dm_entry_list, pos);
+}
+
+static const char *dm_show_reason_str(enum skb_drop_reason reason)
+{
+ const struct drop_reason_list *list;
+ unsigned int subsys, subsys_reason;
+
+ subsys = u32_get_bits(reason, SKB_DROP_REASON_SUBSYS_MASK);
+ if (subsys >= SKB_DROP_REASON_SUBSYS_NUM)
+ return NULL;
+ list = rcu_dereference(drop_reasons_by_subsys[subsys]);
+
+ subsys_reason = reason & ~SKB_DROP_REASON_SUBSYS_MASK;
+ if (subsys_reason >= list->n_reasons)
+ return NULL;
+ return list->reasons[subsys_reason];
+}
+
+static int dm_debugfs_seq_show(struct seq_file *seq, void *v)
+{
+ struct dm_entry *entry;
+ unsigned int i;
+
+ if (v == &dm_entry_list) {
+ seq_printf(seq, "drop count: %zu\n", list_count_nodes(&dm_entry_list));
+ return 0;
+ }
+
+ entry = list_entry(v, struct dm_entry, dm_list);
+ seq_puts(seq, "\n");
+ seq_printf(seq, "reason : %s (%d)\n", dm_show_reason_str(entry->reason), entry->reason);
+ seq_printf(seq, "pc : %pS\n", entry->pc);
+ seq_printf(seq, "len : %d\n", entry->skb_info.len);
+ seq_printf(seq, "protocol : 0x%04x\n", entry->skb_info.protocol);
+ seq_puts(seq, "stack :\n");
+ for (i = 0; i < entry->stack_len; i++)
+ seq_printf(seq, " %pS\n", (void *)entry->stack[i]);
+ return 0;
+}
+
+static const struct seq_operations dm_debugfs_seq_ops = {
+ .start = dm_debugfs_seq_start,
+ .next = dm_debugfs_seq_next,
+ .stop = dm_debugfs_seq_stop,
+ .show = dm_debugfs_seq_show,
+};
+
+static int dm_debugfs_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &dm_debugfs_seq_ops);
+}
+
+static void dm_debugfs_set_trace_sw(int state)
+{
+ mutex_lock(&net_dm_mutex);
+
+ if (state == TRACE_ON)
+ net_dm_monitor_start(true, false, NULL);
+ else
+ net_dm_monitor_stop(true, false, NULL);
+
+ mutex_unlock(&net_dm_mutex);
+}
+
+static ssize_t dm_debugfs_write(struct file *file, const char __user *user_buf,
+ size_t size, loff_t *ppos)
+{
+ char buf[64];
+ int buf_size;
+
+ buf_size = min(size, (sizeof(buf) - 1));
+ if (strncpy_from_user(buf, user_buf, buf_size) < 0)
+ return -EFAULT;
+ buf[buf_size] = 0;
+
+ if (strncmp(buf, "clear", strlen("clear")) == 0)
+ clear_dm_entry();
+ else if (strncmp(buf, "start", strlen("start")) == 0)
+ dm_debugfs_set_trace_sw(TRACE_ON);
+ else if (strncmp(buf, "stop", strlen("stop")) == 0)
+ dm_debugfs_set_trace_sw(TRACE_OFF);
+ else
+ return -EINVAL;
+
+ return size;
+}
+
+static const struct file_operations dm_debugfs_fops = {
+ .owner = THIS_MODULE,
+ .open = dm_debugfs_open,
+ .read = seq_read,
+ .write = dm_debugfs_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void net_dm_create_debugfs(void)
+{
+ dm_debugfs_dir = debugfs_create_dir("drop_monitor", NULL);
+ if (IS_ERR_OR_NULL(dm_debugfs_dir)) {
+ pr_warn("Cannot create drop monitor debugfs directory\n");
+ return;
+ }
+
+ debugfs_create_u32("trace_limit", 0644, dm_debugfs_dir, &dm_trace_limit);
+ debugfs_create_file("trace", 0644, dm_debugfs_dir, NULL, &dm_debugfs_fops);
+}
+
+static void net_dm_destroy_debugfs(void)
+{
+ debugfs_remove_recursive(dm_debugfs_dir);
+ clear_dm_entry();
+}
+
static int __init init_net_drop_monitor(void)
{
int cpu, rc;
@@ -1754,6 +1949,7 @@ static int __init init_net_drop_monitor(void)
rc = 0;
+ net_dm_create_debugfs();
goto out;
out_unreg:
@@ -1766,6 +1962,8 @@ static void exit_net_drop_monitor(void)
{
int cpu;
+ net_dm_destroy_debugfs();
+
/*
* Because of the module_get/put we do in the trace state change path
* we are guaranteed not to have any current users when we get here
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH RFC net-next] net: drop_monitor: Add debugfs support
2025-10-15 10:14 [PATCH RFC net-next] net: drop_monitor: Add debugfs support Wang Liang
@ 2025-10-15 10:40 ` Eric Dumazet
2025-10-15 11:14 ` Florian Westphal
2025-10-16 6:26 ` Wang Liang
0 siblings, 2 replies; 5+ messages in thread
From: Eric Dumazet @ 2025-10-15 10:40 UTC (permalink / raw)
To: Wang Liang
Cc: nhorman, davem, kuba, pabeni, horms, netdev, linux-kernel,
yuehaibing, zhangchangzhong
On Wed, Oct 15, 2025 at 2:51 AM Wang Liang <wangliang74@huawei.com> wrote:
>
> This patch add debugfs interfaces for drop monitor. Similar to kmemleak, we
> can use the monitor by below commands:
>
> echo clear > /sys/kernel/debug/drop_monitor/trace
> echo start > /sys/kernel/debug/drop_monitor/trace
> echo stop > /sys/kernel/debug/drop_monitor/trace
> cat /sys/kernel/debug/drop_monitor/trace
>
> The trace skb number limit can be set dynamically:
>
> cat /sys/kernel/debug/drop_monitor/trace_limit
> echo 200 > /sys/kernel/debug/drop_monitor/trace_limit
>
> Compare to original netlink method, the callstack dump is supported. There
> is a example for received udp packet with error checksum:
>
> reason : UDP_CSUM (11)
> pc : udp_queue_rcv_one_skb+0x14b/0x350
> len : 12
> protocol : 0x0800
> stack :
> sk_skb_reason_drop+0x8f/0x120
> udp_queue_rcv_one_skb+0x14b/0x350
> udp_unicast_rcv_skb+0x71/0x90
> ip_protocol_deliver_rcu+0xa6/0x160
> ip_local_deliver_finish+0x90/0x100
> ip_sublist_rcv_finish+0x65/0x80
> ip_sublist_rcv+0x130/0x1c0
> ip_list_rcv+0xf7/0x130
> __netif_receive_skb_list_core+0x21d/0x240
> netif_receive_skb_list_internal+0x186/0x2b0
> napi_complete_done+0x78/0x190
> e1000_clean+0x27f/0x860
> __napi_poll+0x25/0x1e0
> net_rx_action+0x2ca/0x330
> handle_softirqs+0xbc/0x290
> irq_exit_rcu+0x90/0xb0
>
> It's more friendly to use and not need user application to cooperate.
> Furthermore, it is easier to add new feature. We can add reason/ip/port
> filter by debugfs parameters, like ftrace, rather than netlink msg.
I do not understand the fascination with net/core/drop_monitor.c,
which looks very old school to me,
and misses all the features, flexibility, scalability that 'perf',
eBPF tracing, bpftrace, .... have today.
Adding /sys/kernel/debug/drop_monitor/* is even more old school.
Not mentioning the maintenance burden.
For me the choice is easy :
# CONFIG_NET_DROP_MONITOR is not set
perf record -ag -e skb:kfree_skb sleep 1
perf script # or perf report
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH RFC net-next] net: drop_monitor: Add debugfs support
2025-10-15 10:40 ` Eric Dumazet
@ 2025-10-15 11:14 ` Florian Westphal
2025-10-15 16:49 ` Simon Horman
2025-10-16 6:26 ` Wang Liang
1 sibling, 1 reply; 5+ messages in thread
From: Florian Westphal @ 2025-10-15 11:14 UTC (permalink / raw)
To: Eric Dumazet
Cc: Wang Liang, nhorman, davem, kuba, pabeni, horms, netdev,
linux-kernel, yuehaibing, zhangchangzhong
Eric Dumazet <edumazet@google.com> wrote:
> I do not understand the fascination with net/core/drop_monitor.c,
> which looks very old school to me,
> and misses all the features, flexibility, scalability that 'perf',
> eBPF tracing, bpftrace, .... have today.
>
> Adding /sys/kernel/debug/drop_monitor/* is even more old school.
>
> Not mentioning the maintenance burden.
>
> For me the choice is easy :
>
> # CONFIG_NET_DROP_MONITOR is not set
>
> perf record -ag -e skb:kfree_skb sleep 1
>
> perf script # or perf report
Maybe:
diff --git a/net/Kconfig b/net/Kconfig
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -400,15 +400,15 @@ config NET_PKTGEN
module will be called pktgen.
config NET_DROP_MONITOR
- tristate "Network packet drop alerting service"
+ tristate "Legacy network packet drop alerting service"
depends on INET && TRACEPOINTS
help
This feature provides an alerting service to userspace in the
event that packets are discarded in the network stack. Alerts
are broadcast via netlink socket to any listening user space
- process. If you don't need network drop alerts, or if you are ok
- just checking the various proc files and other utilities for
- drop statistics, say N here.
+ process. This feature is NOT related to "perf" based drop monitoring.
+ Say N here unless you need to support older userspace tools like
+ "dropwatch".
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH RFC net-next] net: drop_monitor: Add debugfs support
2025-10-15 11:14 ` Florian Westphal
@ 2025-10-15 16:49 ` Simon Horman
0 siblings, 0 replies; 5+ messages in thread
From: Simon Horman @ 2025-10-15 16:49 UTC (permalink / raw)
To: Florian Westphal
Cc: Eric Dumazet, Wang Liang, nhorman, davem, kuba, pabeni, netdev,
linux-kernel, yuehaibing, zhangchangzhong
On Wed, Oct 15, 2025 at 01:14:47PM +0200, Florian Westphal wrote:
> Eric Dumazet <edumazet@google.com> wrote:
> > I do not understand the fascination with net/core/drop_monitor.c,
> > which looks very old school to me,
> > and misses all the features, flexibility, scalability that 'perf',
> > eBPF tracing, bpftrace, .... have today.
> >
> > Adding /sys/kernel/debug/drop_monitor/* is even more old school.
> >
> > Not mentioning the maintenance burden.
> >
> > For me the choice is easy :
> >
> > # CONFIG_NET_DROP_MONITOR is not set
> >
> > perf record -ag -e skb:kfree_skb sleep 1
> >
> > perf script # or perf report
>
> Maybe:
>
> diff --git a/net/Kconfig b/net/Kconfig
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -400,15 +400,15 @@ config NET_PKTGEN
> module will be called pktgen.
>
> config NET_DROP_MONITOR
> - tristate "Network packet drop alerting service"
> + tristate "Legacy network packet drop alerting service"
+1
...
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH RFC net-next] net: drop_monitor: Add debugfs support
2025-10-15 10:40 ` Eric Dumazet
2025-10-15 11:14 ` Florian Westphal
@ 2025-10-16 6:26 ` Wang Liang
1 sibling, 0 replies; 5+ messages in thread
From: Wang Liang @ 2025-10-16 6:26 UTC (permalink / raw)
To: Eric Dumazet, Florian Westphal, Simon Horman
Cc: nhorman, davem, kuba, pabeni, netdev, linux-kernel, yuehaibing,
zhangchangzhong
在 2025/10/15 18:40, Eric Dumazet 写道:
> On Wed, Oct 15, 2025 at 2:51 AM Wang Liang <wangliang74@huawei.com> wrote:
>> This patch add debugfs interfaces for drop monitor. Similar to kmemleak, we
>> can use the monitor by below commands:
>>
>> echo clear > /sys/kernel/debug/drop_monitor/trace
>> echo start > /sys/kernel/debug/drop_monitor/trace
>> echo stop > /sys/kernel/debug/drop_monitor/trace
>> cat /sys/kernel/debug/drop_monitor/trace
>>
>> The trace skb number limit can be set dynamically:
>>
>> cat /sys/kernel/debug/drop_monitor/trace_limit
>> echo 200 > /sys/kernel/debug/drop_monitor/trace_limit
>>
>> Compare to original netlink method, the callstack dump is supported. There
>> is a example for received udp packet with error checksum:
>>
>> reason : UDP_CSUM (11)
>> pc : udp_queue_rcv_one_skb+0x14b/0x350
>> len : 12
>> protocol : 0x0800
>> stack :
>> sk_skb_reason_drop+0x8f/0x120
>> udp_queue_rcv_one_skb+0x14b/0x350
>> udp_unicast_rcv_skb+0x71/0x90
>> ip_protocol_deliver_rcu+0xa6/0x160
>> ip_local_deliver_finish+0x90/0x100
>> ip_sublist_rcv_finish+0x65/0x80
>> ip_sublist_rcv+0x130/0x1c0
>> ip_list_rcv+0xf7/0x130
>> __netif_receive_skb_list_core+0x21d/0x240
>> netif_receive_skb_list_internal+0x186/0x2b0
>> napi_complete_done+0x78/0x190
>> e1000_clean+0x27f/0x860
>> __napi_poll+0x25/0x1e0
>> net_rx_action+0x2ca/0x330
>> handle_softirqs+0xbc/0x290
>> irq_exit_rcu+0x90/0xb0
>>
>> It's more friendly to use and not need user application to cooperate.
>> Furthermore, it is easier to add new feature. We can add reason/ip/port
>> filter by debugfs parameters, like ftrace, rather than netlink msg.
> I do not understand the fascination with net/core/drop_monitor.c,
> which looks very old school to me,
> and misses all the features, flexibility, scalability that 'perf',
> eBPF tracing, bpftrace, .... have today.
>
> Adding /sys/kernel/debug/drop_monitor/* is even more old school.
>
> Not mentioning the maintenance burden.
>
> For me the choice is easy :
>
> # CONFIG_NET_DROP_MONITOR is not set
>
> perf record -ag -e skb:kfree_skb sleep 1
>
> perf script # or perf report
Thank you for taking time to review this patch!
My initially thought was that the drop_monitor may cover more drop
positions (not just kfree_skb), show more skb info, filter skb by ip/port
debugfs parameter (not support now), and not need userspace tools.
Currently perf is indeed a better choice, adding debugfs is not necessary.
Thanks!
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2025-10-16 6:26 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-15 10:14 [PATCH RFC net-next] net: drop_monitor: Add debugfs support Wang Liang
2025-10-15 10:40 ` Eric Dumazet
2025-10-15 11:14 ` Florian Westphal
2025-10-15 16:49 ` Simon Horman
2025-10-16 6:26 ` Wang Liang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox