* [PATCH] dropmon: add ability to detect when hardware drops rx packets @ 2009-05-08 19:50 Neil Horman 2009-05-09 6:30 ` Eric Dumazet 0 siblings, 1 reply; 36+ messages in thread From: Neil Horman @ 2009-05-08 19:50 UTC (permalink / raw) To: netdev; +Cc: nhorman, davem Hey all- A few weeks back I sent out an RFC in which I proposed that I could extend dropwatch to detect drops in hardware by adding a napi poll tracepoint, using that as an indicator that I should periodically check the stats to see if the rx drop counter had changed. Having not had any negative feedback to that proposal, heres the patch. I've tested it and it seems to work fairly well. Neil Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal everytime a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. 
Signed-off-by: Neil Horman <nhorman@tuxdriver.com> include/linux/net_dropmon.h | 7 ++ include/trace/napi.h | 11 ++++ net/core/dev.c | 5 +- net/core/drop_monitor.c | 107 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 131 insertions(+), 5 deletions(-) diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb8..9addc62 100644 --- a/include/linux/net_dropmon.h +++ b/include/linux/net_dropmon.h @@ -8,6 +8,13 @@ struct net_dm_drop_point { __u32 count; }; +#define is_drop_point_hw(x) do {\ + int ____i, ____j;\ + for (____i = 0; ____i < 8; i ____i++)\ + ____j |= x[____i];\ + ____j;\ +} while (0) + #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 diff --git a/include/trace/napi.h b/include/trace/napi.h new file mode 100644 index 0000000..7c39eb7 --- /dev/null +++ b/include/trace/napi.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_NAPI_H_ +#define _TRACE_NAPI_H_ + +#include <linux/skbuff.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(napi_poll, + TP_PROTO(struct napi_struct *napi), + TP_ARGS(napi)); + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index 637ea71..165979c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> +#include <trace/napi.h> #include "net-sysfs.h" @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. 
*/ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2797b71..538c1d4 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include <linux/timer.h> #include <linux/bitops.h> #include <net/genetlink.h> +#include <net/netevent.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> @@ -38,7 +40,7 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,6 +49,12 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -59,7 +67,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -115,7 +124,7 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; @@ -159,18 +168,59 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * Ratelimit our check time to dm_hw_check_delta jiffies + */ + if 
(!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } +} + + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + kfree(new_stat); + } + } + rcu_read_unlock(); break; default: rc = 1; @@ -179,6 +229,7 @@ static int set_all_monitor_traces(int state) if (rc) return -EINPROGRESS; + trace_state = state; return rc; } @@ -204,6 +255,43 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + rcu_read_lock(); + list_add_rcu(&new_stat->list, &hw_stats_list); + rcu_read_unlock(); + break; + case NETDEV_UNREGISTER: + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == dev) + new_stat->dev = NULL; + break; + } + + if (trace_state == TRACE_OFF) { + rcu_read_lock(); + list_del_rcu(&new_stat->list); + rcu_read_unlock(); + kfree(new_stat); + } + break; + } +out: + return NOTIFY_DONE; 
+} static struct genl_ops dropmon_ops[] = { { @@ -220,6 +308,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -243,12 +335,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -259,6 +357,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb456..b07b25b 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -20,6 +20,7 @@ #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> #include <asm/bitops.h> @@ -27,3 +28,6 @@ DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +DEFINE_TRACE(napi_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b5873bd..446d3ba 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> +#include <trace/napi.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi->dev); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); ^ permalink raw reply related 
[flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets 2009-05-08 19:50 [PATCH] dropmon: add ability to detect when hardware drops rx packets Neil Horman @ 2009-05-09 6:30 ` Eric Dumazet 2009-05-09 18:07 ` Neil Horman 2009-05-12 16:30 ` Neil Horman 0 siblings, 2 replies; 36+ messages in thread From: Eric Dumazet @ 2009-05-09 6:30 UTC (permalink / raw) To: Neil Horman; +Cc: netdev, davem Neil Horman a écrit : > Hey all- > A few weeks back I sent out an RFC in which I proposed that I could > extend dropwatch to detect drops in hardware by adding a napi poll tracepoint, > using that as an indicator that I should periodically check the stats to see if > the rx drop counter had changed. Having not had any negative feedback to that > proposal, heres the patch. I've tested it and it seems to work fairly well. > Neil > > > Patch to add the ability to detect drops in hardware interfaces via dropwatch. > Adds a tracepoint to net_rx_action to signal everytime a napi instance is > polled. The dropmon code then periodically checks to see if the rx_frames > counter has changed, and if so, adds a drop notification to the netlink > protocol, using the reserved all-0's vector to indicate the drop location was in > hardware, rather than somewhere in the code. > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > Hi Neil I dont fully understand your patch, but at least have some questions about rcu stuff. 
> > include/linux/net_dropmon.h | 7 ++ > include/trace/napi.h | 11 ++++ > net/core/dev.c | 5 +- > net/core/drop_monitor.c | 107 ++++++++++++++++++++++++++++++++++++++++++-- > net/core/net-traces.c | 4 + > net/core/netpoll.c | 2 > 6 files changed, 131 insertions(+), 5 deletions(-) > > diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h > index 0217fb8..9addc62 100644 > --- a/include/linux/net_dropmon.h > +++ b/include/linux/net_dropmon.h > @@ -8,6 +8,13 @@ struct net_dm_drop_point { > __u32 count; > }; > > +#define is_drop_point_hw(x) do {\ > + int ____i, ____j;\ > + for (____i = 0; ____i < 8; i ____i++)\ > + ____j |= x[____i];\ > + ____j;\ > +} while (0) > + > #define NET_DM_CFG_VERSION 0 > #define NET_DM_CFG_ALERT_COUNT 1 > #define NET_DM_CFG_ALERT_DELAY 2 > diff --git a/include/trace/napi.h b/include/trace/napi.h > new file mode 100644 > index 0000000..7c39eb7 > --- /dev/null > +++ b/include/trace/napi.h > @@ -0,0 +1,11 @@ > +#ifndef _TRACE_NAPI_H_ > +#define _TRACE_NAPI_H_ > + > +#include <linux/skbuff.h> > +#include <linux/tracepoint.h> > + > +DECLARE_TRACE(napi_poll, > + TP_PROTO(struct napi_struct *napi), > + TP_ARGS(napi)); > + > +#endif > diff --git a/net/core/dev.c b/net/core/dev.c > index 637ea71..165979c 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -126,6 +126,7 @@ > #include <linux/in.h> > #include <linux/jhash.h> > #include <linux/random.h> > +#include <trace/napi.h> > > #include "net-sysfs.h" > > @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) > * accidently calling ->poll() when NAPI is not scheduled. 
> */ > work = 0; > - if (test_bit(NAPI_STATE_SCHED, &n->state)) > + if (test_bit(NAPI_STATE_SCHED, &n->state)) { > work = n->poll(n, weight); > + trace_napi_poll(n); > + } > > WARN_ON_ONCE(work > weight); > > diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c > index 2797b71..538c1d4 100644 > --- a/net/core/drop_monitor.c > +++ b/net/core/drop_monitor.c > @@ -22,8 +22,10 @@ > #include <linux/timer.h> > #include <linux/bitops.h> > #include <net/genetlink.h> > +#include <net/netevent.h> > > #include <trace/skb.h> > +#include <trace/napi.h> > > #include <asm/unaligned.h> > > @@ -38,7 +40,7 @@ static void send_dm_alert(struct work_struct *unused); > * and the work handle that will send up > * netlink alerts > */ > -struct sock *dm_sock; > +static int trace_state = TRACE_OFF; > > struct per_cpu_dm_data { > struct work_struct dm_alert_work; > @@ -47,6 +49,12 @@ struct per_cpu_dm_data { > struct timer_list send_timer; > }; > > +struct dm_hw_stat_delta { > + struct net_device *dev; > + struct list_head list; > + unsigned long last_drop_val; > +}; > + > static struct genl_family net_drop_monitor_family = { > .id = GENL_ID_GENERATE, > .hdrsize = 0, > @@ -59,7 +67,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); > > static int dm_hit_limit = 64; > static int dm_delay = 1; > - > +static unsigned long dm_hw_check_delta = 2*HZ; > +static LIST_HEAD(hw_stats_list); > > static void reset_per_cpu_data(struct per_cpu_dm_data *data) > { > @@ -115,7 +124,7 @@ static void sched_send_work(unsigned long unused) > schedule_work(&data->dm_alert_work); > } > > -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) > +static void trace_drop_common(struct sk_buff *skb, void *location) > { > struct net_dm_alert_msg *msg; > struct nlmsghdr *nlh; > @@ -159,18 +168,59 @@ out: > return; > } > > +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) > +{ > + trace_drop_common(skb, location); > +} > + > +static void trace_napi_poll_hit(struct 
napi_struct *napi) > +{ > + struct dm_hw_stat_delta *new_stat; > + > + /* > + * Ratelimit our check time to dm_hw_check_delta jiffies > + */ > + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) > + return; > + > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if ((new_stat->dev == napi->dev) && > + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { > + trace_drop_common(NULL, NULL); > + new_stat->last_drop_val = napi->dev->stats.rx_dropped; > + break; > + } > + } > +} > + > + > static int set_all_monitor_traces(int state) > { > int rc = 0; > + struct dm_hw_stat_delta *new_stat = NULL; > > switch (state) { > case TRACE_ON: > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); > + rc |= register_trace_napi_poll(trace_napi_poll_hit); > break; > case TRACE_OFF: > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); > + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > tracepoint_synchronize_unregister(); > + > + /* > + * Clean the device list > + */ > + rcu_read_lock(); > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == NULL) { > + list_del_rcu(&new_stat->list); > + kfree(new_stat); I can't see how list_del_rcu(&new_stat->list) can be followed by kfree(new_stat) safely. You need a grace period between two operations. That can be done with call_rcu() helper. Also, rcu_read_lock()/rcu_read_unlock() is not the proper locking pragmas needed here. multiple writers are/should be forbidden here. If caller already does a locking, then no additional locking is needed here. 
> + } > + } > + rcu_read_unlock(); > break; > default: > rc = 1; > @@ -179,6 +229,7 @@ static int set_all_monitor_traces(int state) > > if (rc) > return -EINPROGRESS; > + trace_state = state; > return rc; > } > > @@ -204,6 +255,43 @@ static int net_dm_cmd_trace(struct sk_buff *skb, > return -ENOTSUPP; > } > > +int dropmon_net_event(struct notifier_block *ev_block, > + unsigned long event, void *ptr) > +{ > + struct net_device *dev = ptr; > + struct dm_hw_stat_delta *new_stat = NULL; > + > + > + switch (event) { > + case NETDEV_REGISTER: > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > + > + if (!new_stat) > + goto out; > + > + new_stat->dev = dev; > + rcu_read_lock(); same point here, rcu_read_lock is not the proper way to protect the list. > + list_add_rcu(&new_stat->list, &hw_stats_list); > + rcu_read_unlock(); > + break; > + case NETDEV_UNREGISTER: > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == dev) > + new_stat->dev = NULL; > + break; > + } > + > + if (trace_state == TRACE_OFF) { > + rcu_read_lock(); > + list_del_rcu(&new_stat->list); > + rcu_read_unlock(); > + kfree(new_stat); same probleme here. If rcu is used, then you need a grace period. 
> + } > + break; > + } > +out: > + return NOTIFY_DONE; > +} > > static struct genl_ops dropmon_ops[] = { > { > @@ -220,6 +308,10 @@ static struct genl_ops dropmon_ops[] = { > }, > }; > > +static struct notifier_block dropmon_net_notifier = { > + .notifier_call = dropmon_net_event > +}; > + > static int __init init_net_drop_monitor(void) > { > int cpu; > @@ -243,12 +335,18 @@ static int __init init_net_drop_monitor(void) > ret = genl_register_ops(&net_drop_monitor_family, > &dropmon_ops[i]); > if (ret) { > - printk(KERN_CRIT "failed to register operation %d\n", > + printk(KERN_CRIT "Failed to register operation %d\n", > dropmon_ops[i].cmd); > goto out_unreg; > } > } > > + rc = register_netdevice_notifier(&dropmon_net_notifier); > + if (rc < 0) { > + printk(KERN_CRIT "Failed to register netdevice notifier\n"); > + goto out_unreg; > + } > + > rc = 0; > > for_each_present_cpu(cpu) { > @@ -259,6 +357,7 @@ static int __init init_net_drop_monitor(void) > data->send_timer.data = cpu; > data->send_timer.function = sched_send_work; > } > + > goto out; > > out_unreg: > diff --git a/net/core/net-traces.c b/net/core/net-traces.c > index c8fb456..b07b25b 100644 > --- a/net/core/net-traces.c > +++ b/net/core/net-traces.c > @@ -20,6 +20,7 @@ > #include <linux/netlink.h> > #include <linux/net_dropmon.h> > #include <trace/skb.h> > +#include <trace/napi.h> > > #include <asm/unaligned.h> > #include <asm/bitops.h> > @@ -27,3 +28,6 @@ > > DEFINE_TRACE(kfree_skb); > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); > + > +DEFINE_TRACE(napi_poll); > +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); > diff --git a/net/core/netpoll.c b/net/core/netpoll.c > index b5873bd..446d3ba 100644 > --- a/net/core/netpoll.c > +++ b/net/core/netpoll.c > @@ -24,6 +24,7 @@ > #include <net/tcp.h> > #include <net/udp.h> > #include <asm/unaligned.h> > +#include <trace/napi.h> > > /* > * We maintain a small pool of fully-sized skbs, to make sure the > @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, > 
set_bit(NAPI_STATE_NPSVC, &napi->state); > > work = napi->poll(napi, budget); > + trace_napi_poll(napi->dev); > > clear_bit(NAPI_STATE_NPSVC, &napi->state); > atomic_dec(&trapped); > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets 2009-05-09 6:30 ` Eric Dumazet @ 2009-05-09 18:07 ` Neil Horman 2009-05-12 16:30 ` Neil Horman 1 sibling, 0 replies; 36+ messages in thread From: Neil Horman @ 2009-05-09 18:07 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev, davem On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote: > Neil Horman a écrit : > > Hey all- > > A few weeks back I sent out an RFC in which I proposed that I could > > extend dropwatch to detect drops in hardware by adding a napi poll tracepoint, > > using that as an indicator that I should periodically check the stats to see if > > the rx drop counter had changed. Having not had any negative feedback to that > > proposal, heres the patch. I've tested it and it seems to work fairly well. > > Neil > > > > > > Patch to add the ability to detect drops in hardware interfaces via dropwatch. > > Adds a tracepoint to net_rx_action to signal everytime a napi instance is > > polled. The dropmon code then periodically checks to see if the rx_frames > > counter has changed, and if so, adds a drop notification to the netlink > > protocol, using the reserved all-0's vector to indicate the drop location was in > > hardware, rather than somewhere in the code. > > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > > > > Hi Neil > > I dont fully understand your patch, but at least have some questions > about rcu stuff. > The idea (if you'll recall the dropmon protocol that I added a few months ago to detect frame loss at specific locations in the kernel), is to extend that idea to detect frame loss in the hardware as well. This is the first part of a patch to do that in the receive path. If it gets past review, I'll be extending it to do transmit drop detection as well. 
Regarding the rcu locking, I was under the impression that the rcu_read_lock/unlock point were meant to provide that syncronization, although I admit that I didn't look to closely, and assumed that the lack of crashing when Flooded traffic on the box while adding and removing net drivers mean I understood the locking structure. Obviously looking at the implementation I've misunderstood. I'll self-nak this and repost when I understand the rcu interface a bit better (sometime in the next week I expect). Thanks! Neil > > > > include/linux/net_dropmon.h | 7 ++ > > include/trace/napi.h | 11 ++++ > > net/core/dev.c | 5 +- > > net/core/drop_monitor.c | 107 ++++++++++++++++++++++++++++++++++++++++++-- > > net/core/net-traces.c | 4 + > > net/core/netpoll.c | 2 > > 6 files changed, 131 insertions(+), 5 deletions(-) > > > > diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h > > index 0217fb8..9addc62 100644 > > --- a/include/linux/net_dropmon.h > > +++ b/include/linux/net_dropmon.h > > @@ -8,6 +8,13 @@ struct net_dm_drop_point { > > __u32 count; > > }; > > > > +#define is_drop_point_hw(x) do {\ > > + int ____i, ____j;\ > > + for (____i = 0; ____i < 8; i ____i++)\ > > + ____j |= x[____i];\ > > + ____j;\ > > +} while (0) > > + > > #define NET_DM_CFG_VERSION 0 > > #define NET_DM_CFG_ALERT_COUNT 1 > > #define NET_DM_CFG_ALERT_DELAY 2 > > diff --git a/include/trace/napi.h b/include/trace/napi.h > > new file mode 100644 > > index 0000000..7c39eb7 > > --- /dev/null > > +++ b/include/trace/napi.h > > @@ -0,0 +1,11 @@ > > +#ifndef _TRACE_NAPI_H_ > > +#define _TRACE_NAPI_H_ > > + > > +#include <linux/skbuff.h> > > +#include <linux/tracepoint.h> > > + > > +DECLARE_TRACE(napi_poll, > > + TP_PROTO(struct napi_struct *napi), > > + TP_ARGS(napi)); > > + > > +#endif > > diff --git a/net/core/dev.c b/net/core/dev.c > > index 637ea71..165979c 100644 > > --- a/net/core/dev.c > > +++ b/net/core/dev.c > > @@ -126,6 +126,7 @@ > > #include <linux/in.h> > > #include 
<linux/jhash.h> > > #include <linux/random.h> > > +#include <trace/napi.h> > > > > #include "net-sysfs.h" > > > > @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) > > * accidently calling ->poll() when NAPI is not scheduled. > > */ > > work = 0; > > - if (test_bit(NAPI_STATE_SCHED, &n->state)) > > + if (test_bit(NAPI_STATE_SCHED, &n->state)) { > > work = n->poll(n, weight); > > + trace_napi_poll(n); > > + } > > > > WARN_ON_ONCE(work > weight); > > > > diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c > > index 2797b71..538c1d4 100644 > > --- a/net/core/drop_monitor.c > > +++ b/net/core/drop_monitor.c > > @@ -22,8 +22,10 @@ > > #include <linux/timer.h> > > #include <linux/bitops.h> > > #include <net/genetlink.h> > > +#include <net/netevent.h> > > > > #include <trace/skb.h> > > +#include <trace/napi.h> > > > > #include <asm/unaligned.h> > > > > @@ -38,7 +40,7 @@ static void send_dm_alert(struct work_struct *unused); > > * and the work handle that will send up > > * netlink alerts > > */ > > -struct sock *dm_sock; > > +static int trace_state = TRACE_OFF; > > > > struct per_cpu_dm_data { > > struct work_struct dm_alert_work; > > @@ -47,6 +49,12 @@ struct per_cpu_dm_data { > > struct timer_list send_timer; > > }; > > > > +struct dm_hw_stat_delta { > > + struct net_device *dev; > > + struct list_head list; > > + unsigned long last_drop_val; > > +}; > > + > > static struct genl_family net_drop_monitor_family = { > > .id = GENL_ID_GENERATE, > > .hdrsize = 0, > > @@ -59,7 +67,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); > > > > static int dm_hit_limit = 64; > > static int dm_delay = 1; > > - > > +static unsigned long dm_hw_check_delta = 2*HZ; > > +static LIST_HEAD(hw_stats_list); > > > > static void reset_per_cpu_data(struct per_cpu_dm_data *data) > > { > > @@ -115,7 +124,7 @@ static void sched_send_work(unsigned long unused) > > schedule_work(&data->dm_alert_work); > > } > > > > -static void 
trace_kfree_skb_hit(struct sk_buff *skb, void *location) > > +static void trace_drop_common(struct sk_buff *skb, void *location) > > { > > struct net_dm_alert_msg *msg; > > struct nlmsghdr *nlh; > > @@ -159,18 +168,59 @@ out: > > return; > > } > > > > +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) > > +{ > > + trace_drop_common(skb, location); > > +} > > + > > +static void trace_napi_poll_hit(struct napi_struct *napi) > > +{ > > + struct dm_hw_stat_delta *new_stat; > > + > > + /* > > + * Ratelimit our check time to dm_hw_check_delta jiffies > > + */ > > + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) > > + return; > > + > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > + if ((new_stat->dev == napi->dev) && > > + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { > > + trace_drop_common(NULL, NULL); > > + new_stat->last_drop_val = napi->dev->stats.rx_dropped; > > + break; > > + } > > + } > > +} > > + > > + > > static int set_all_monitor_traces(int state) > > { > > int rc = 0; > > + struct dm_hw_stat_delta *new_stat = NULL; > > > > switch (state) { > > case TRACE_ON: > > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); > > + rc |= register_trace_napi_poll(trace_napi_poll_hit); > > break; > > case TRACE_OFF: > > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); > > + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > > > tracepoint_synchronize_unregister(); > > + > > + /* > > + * Clean the device list > > + */ > > + rcu_read_lock(); > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > + if (new_stat->dev == NULL) { > > + list_del_rcu(&new_stat->list); > > + kfree(new_stat); > > I can't see how list_del_rcu(&new_stat->list) can be followed by kfree(new_stat) > safely. You need a grace period between two operations. That can be done > with call_rcu() helper. > > Also, rcu_read_lock()/rcu_read_unlock() is not the proper locking pragmas > needed here. 
multiple writers are/should be forbidden here. If caller already > does a locking, then no additional locking is needed here. > > > > + } > > + } > > + rcu_read_unlock(); > > break; > > default: > > rc = 1; > > @@ -179,6 +229,7 @@ static int set_all_monitor_traces(int state) > > > > if (rc) > > return -EINPROGRESS; > > + trace_state = state; > > return rc; > > } > > > > @@ -204,6 +255,43 @@ static int net_dm_cmd_trace(struct sk_buff *skb, > > return -ENOTSUPP; > > } > > > > +int dropmon_net_event(struct notifier_block *ev_block, > > + unsigned long event, void *ptr) > > +{ > > + struct net_device *dev = ptr; > > + struct dm_hw_stat_delta *new_stat = NULL; > > + > > + > > + switch (event) { > > + case NETDEV_REGISTER: > > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > > + > > + if (!new_stat) > > + goto out; > > + > > + new_stat->dev = dev; > > + rcu_read_lock(); > > same point here, rcu_read_lock is not the proper way to protect the list. > > > + list_add_rcu(&new_stat->list, &hw_stats_list); > > + rcu_read_unlock(); > > + break; > > + case NETDEV_UNREGISTER: > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > + if (new_stat->dev == dev) > > + new_stat->dev = NULL; > > + break; > > + } > > + > > + if (trace_state == TRACE_OFF) { > > + rcu_read_lock(); > > + list_del_rcu(&new_stat->list); > > + rcu_read_unlock(); > > + kfree(new_stat); > > same probleme here. If rcu is used, then you need a grace period. 
> > > + } > > + break; > > + } > > +out: > > + return NOTIFY_DONE; > > +} > > > > static struct genl_ops dropmon_ops[] = { > > { > > @@ -220,6 +308,10 @@ static struct genl_ops dropmon_ops[] = { > > }, > > }; > > > > +static struct notifier_block dropmon_net_notifier = { > > + .notifier_call = dropmon_net_event > > +}; > > + > > static int __init init_net_drop_monitor(void) > > { > > int cpu; > > @@ -243,12 +335,18 @@ static int __init init_net_drop_monitor(void) > > ret = genl_register_ops(&net_drop_monitor_family, > > &dropmon_ops[i]); > > if (ret) { > > - printk(KERN_CRIT "failed to register operation %d\n", > > + printk(KERN_CRIT "Failed to register operation %d\n", > > dropmon_ops[i].cmd); > > goto out_unreg; > > } > > } > > > > + rc = register_netdevice_notifier(&dropmon_net_notifier); > > + if (rc < 0) { > > + printk(KERN_CRIT "Failed to register netdevice notifier\n"); > > + goto out_unreg; > > + } > > + > > rc = 0; > > > > for_each_present_cpu(cpu) { > > @@ -259,6 +357,7 @@ static int __init init_net_drop_monitor(void) > > data->send_timer.data = cpu; > > data->send_timer.function = sched_send_work; > > } > > + > > goto out; > > > > out_unreg: > > diff --git a/net/core/net-traces.c b/net/core/net-traces.c > > index c8fb456..b07b25b 100644 > > --- a/net/core/net-traces.c > > +++ b/net/core/net-traces.c > > @@ -20,6 +20,7 @@ > > #include <linux/netlink.h> > > #include <linux/net_dropmon.h> > > #include <trace/skb.h> > > +#include <trace/napi.h> > > > > #include <asm/unaligned.h> > > #include <asm/bitops.h> > > @@ -27,3 +28,6 @@ > > > > DEFINE_TRACE(kfree_skb); > > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); > > + > > +DEFINE_TRACE(napi_poll); > > +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); > > diff --git a/net/core/netpoll.c b/net/core/netpoll.c > > index b5873bd..446d3ba 100644 > > --- a/net/core/netpoll.c > > +++ b/net/core/netpoll.c > > @@ -24,6 +24,7 @@ > > #include <net/tcp.h> > > #include <net/udp.h> > > #include <asm/unaligned.h> > > +#include 
<trace/napi.h> > > > > /* > > * We maintain a small pool of fully-sized skbs, to make sure the > > @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, > > set_bit(NAPI_STATE_NPSVC, &napi->state); > > > > work = napi->poll(napi, budget); > > + trace_napi_poll(napi->dev); > > > > clear_bit(NAPI_STATE_NPSVC, &napi->state); > > atomic_dec(&trapped); > > > > > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets 2009-05-09 6:30 ` Eric Dumazet 2009-05-09 18:07 ` Neil Horman @ 2009-05-12 16:30 ` Neil Horman 2009-05-13 18:23 ` [PATCH] dropmon: add ability to detect when hardware drops rxpackets Paul E. McKenney 1 sibling, 1 reply; 36+ messages in thread From: Neil Horman @ 2009-05-12 16:30 UTC (permalink / raw) To: Eric Dumazet; +Cc: netdev, davem, nhorman On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote: > > I dont fully understand your patch, but at least have some questions > about rcu stuff. > Ok, so I went back and I think I managed to better understand the RCU interface. New patch attached, works the same way, saving for the gross previous misuse of rcu. Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal everytime a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. 
Signed-off-by: Neil Horman <nhorman@tuxdriver.com> include/linux/net_dropmon.h | 7 ++ include/trace/napi.h | 11 ++++ net/core/dev.c | 5 + net/core/drop_monitor.c | 114 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 138 insertions(+), 5 deletions(-) diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb8..9addc62 100644 --- a/include/linux/net_dropmon.h +++ b/include/linux/net_dropmon.h @@ -8,6 +8,13 @@ struct net_dm_drop_point { __u32 count; }; +#define is_drop_point_hw(x) do {\ + int ____i, ____j;\ + for (____i = 0; ____i < 8; i ____i++)\ + ____j |= x[____i];\ + ____j;\ +} while (0) + #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 diff --git a/include/trace/napi.h b/include/trace/napi.h new file mode 100644 index 0000000..7c39eb7 --- /dev/null +++ b/include/trace/napi.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_NAPI_H_ +#define _TRACE_NAPI_H_ + +#include <linux/skbuff.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(napi_poll, + TP_PROTO(struct napi_struct *napi), + TP_ARGS(napi)); + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index 637ea71..165979c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> +#include <trace/napi.h> #include "net-sysfs.h" @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. 
*/ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2797b71..5153479 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include <linux/timer.h> #include <linux/bitops.h> #include <net/genetlink.h> +#include <net/netevent.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,6 +50,13 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; @@ -159,26 +170,79 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * 
Ratelimit our check time to dm_hw_check_delta jiffies + */ + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } + rcu_read_unlock(); +} + + +static void free_dm_hw_stat(struct rcu_head *head) +{ + struct dm_hw_stat_delta *n; + n = container_of(head, struct dm_hw_stat_delta, rcu); + kfree(n); +} + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + + spin_lock(&trace_state_lock); switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } break; default: rc = 1; break; } + spin_unlock(&trace_state_lock); + if (rc) return -EINPROGRESS; + trace_state = state; return rc; } @@ -204,6 +268,37 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + INIT_RCU_HEAD(&new_stat->rcu); + list_add_rcu(&new_stat->list, &hw_stats_list); + break; + case NETDEV_UNREGISTER: + rcu_read_lock(); + 
list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == dev) + new_stat->dev = NULL; + break; + } + rcu_read_unlock(); + break; + } +out: + return NOTIFY_DONE; +} static struct genl_ops dropmon_ops[] = { { @@ -220,6 +315,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -243,12 +342,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -259,6 +364,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb456..b07b25b 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -20,6 +20,7 @@ #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> #include <asm/bitops.h> @@ -27,3 +28,6 @@ DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +DEFINE_TRACE(napi_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b5873bd..446d3ba 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> +#include <trace/napi.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, 
set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi->dev); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); ^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rxpackets 2009-05-12 16:30 ` Neil Horman @ 2009-05-13 18:23 ` Paul E. McKenney 2009-05-14 0:45 ` Neil Horman 0 siblings, 1 reply; 36+ messages in thread From: Paul E. McKenney @ 2009-05-13 18:23 UTC (permalink / raw) To: Neil Horman; +Cc: Eric Dumazet, netdev, davem On Tue, May 12, 2009 at 12:30:44PM -0400, Neil Horman wrote: > On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote: > > > > I dont fully understand your patch, but at least have some questions > > about rcu stuff. > > > > > Ok, so I went back and I think I managed to better understand the RCU interface. > New patch attached, works the same way, saving for the gross previous misuse of > rcu. > > > Patch to add the ability to detect drops in hardware interfaces via dropwatch. > Adds a tracepoint to net_rx_action to signal everytime a napi instance is > polled. The dropmon code then periodically checks to see if the rx_frames > counter has changed, and if so, adds a drop notification to the netlink > protocol, using the reserved all-0's vector to indicate the drop location was in > hardware, rather than somewhere in the code. One concern shown below. 
Thanx, Paul > Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > > > include/linux/net_dropmon.h | 7 ++ > include/trace/napi.h | 11 ++++ > net/core/dev.c | 5 + > net/core/drop_monitor.c | 114 ++++++++++++++++++++++++++++++++++++++++++-- > net/core/net-traces.c | 4 + > net/core/netpoll.c | 2 > 6 files changed, 138 insertions(+), 5 deletions(-) > > diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h > index 0217fb8..9addc62 100644 > --- a/include/linux/net_dropmon.h > +++ b/include/linux/net_dropmon.h > @@ -8,6 +8,13 @@ struct net_dm_drop_point { > __u32 count; > }; > > +#define is_drop_point_hw(x) do {\ > + int ____i, ____j;\ > + for (____i = 0; ____i < 8; i ____i++)\ > + ____j |= x[____i];\ > + ____j;\ > +} while (0) > + > #define NET_DM_CFG_VERSION 0 > #define NET_DM_CFG_ALERT_COUNT 1 > #define NET_DM_CFG_ALERT_DELAY 2 > diff --git a/include/trace/napi.h b/include/trace/napi.h > new file mode 100644 > index 0000000..7c39eb7 > --- /dev/null > +++ b/include/trace/napi.h > @@ -0,0 +1,11 @@ > +#ifndef _TRACE_NAPI_H_ > +#define _TRACE_NAPI_H_ > + > +#include <linux/skbuff.h> > +#include <linux/tracepoint.h> > + > +DECLARE_TRACE(napi_poll, > + TP_PROTO(struct napi_struct *napi), > + TP_ARGS(napi)); > + > +#endif > diff --git a/net/core/dev.c b/net/core/dev.c > index 637ea71..165979c 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -126,6 +126,7 @@ > #include <linux/in.h> > #include <linux/jhash.h> > #include <linux/random.h> > +#include <trace/napi.h> > > #include "net-sysfs.h" > > @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) > * accidently calling ->poll() when NAPI is not scheduled. 
> */ > work = 0; > - if (test_bit(NAPI_STATE_SCHED, &n->state)) > + if (test_bit(NAPI_STATE_SCHED, &n->state)) { > work = n->poll(n, weight); > + trace_napi_poll(n); > + } > > WARN_ON_ONCE(work > weight); > > diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c > index 2797b71..5153479 100644 > --- a/net/core/drop_monitor.c > +++ b/net/core/drop_monitor.c > @@ -22,8 +22,10 @@ > #include <linux/timer.h> > #include <linux/bitops.h> > #include <net/genetlink.h> > +#include <net/netevent.h> > > #include <trace/skb.h> > +#include <trace/napi.h> > > #include <asm/unaligned.h> > > @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); > * and the work handle that will send up > * netlink alerts > */ > -struct sock *dm_sock; > +static int trace_state = TRACE_OFF; > +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; > > struct per_cpu_dm_data { > struct work_struct dm_alert_work; > @@ -47,6 +50,13 @@ struct per_cpu_dm_data { > struct timer_list send_timer; > }; > > +struct dm_hw_stat_delta { > + struct net_device *dev; > + struct list_head list; > + struct rcu_head rcu; > + unsigned long last_drop_val; > +}; > + > static struct genl_family net_drop_monitor_family = { > .id = GENL_ID_GENERATE, > .hdrsize = 0, > @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); > > static int dm_hit_limit = 64; > static int dm_delay = 1; > - > +static unsigned long dm_hw_check_delta = 2*HZ; > +static LIST_HEAD(hw_stats_list); > > static void reset_per_cpu_data(struct per_cpu_dm_data *data) > { > @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) > schedule_work(&data->dm_alert_work); > } > > -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) > +static void trace_drop_common(struct sk_buff *skb, void *location) > { > struct net_dm_alert_msg *msg; > struct nlmsghdr *nlh; > @@ -159,26 +170,79 @@ out: > return; > } > > +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) > +{ > + 
trace_drop_common(skb, location); > +} > + > +static void trace_napi_poll_hit(struct napi_struct *napi) > +{ > + struct dm_hw_stat_delta *new_stat; > + > + /* > + * Ratelimit our check time to dm_hw_check_delta jiffies > + */ > + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) > + return; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if ((new_stat->dev == napi->dev) && > + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { > + trace_drop_common(NULL, NULL); > + new_stat->last_drop_val = napi->dev->stats.rx_dropped; > + break; > + } > + } > + rcu_read_unlock(); > +} > + > + > +static void free_dm_hw_stat(struct rcu_head *head) > +{ > + struct dm_hw_stat_delta *n; > + n = container_of(head, struct dm_hw_stat_delta, rcu); > + kfree(n); > +} > + > static int set_all_monitor_traces(int state) > { > int rc = 0; > + struct dm_hw_stat_delta *new_stat = NULL; > + > + spin_lock(&trace_state_lock); > > switch (state) { > case TRACE_ON: > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); > + rc |= register_trace_napi_poll(trace_napi_poll_hit); > break; > case TRACE_OFF: > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); > + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > tracepoint_synchronize_unregister(); > + > + /* > + * Clean the device list > + */ > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == NULL) { > + list_del_rcu(&new_stat->list); > + call_rcu(&new_stat->rcu, free_dm_hw_stat); Much better! 
;-) > + } > + } > break; > default: > rc = 1; > break; > } > > + spin_unlock(&trace_state_lock); > + > if (rc) > return -EINPROGRESS; > + trace_state = state; > return rc; > } > > @@ -204,6 +268,37 @@ static int net_dm_cmd_trace(struct sk_buff *skb, > return -ENOTSUPP; > } > > +static int dropmon_net_event(struct notifier_block *ev_block, > + unsigned long event, void *ptr) > +{ > + struct net_device *dev = ptr; > + struct dm_hw_stat_delta *new_stat = NULL; > + > + > + switch (event) { > + case NETDEV_REGISTER: > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > + > + if (!new_stat) > + goto out; > + > + new_stat->dev = dev; > + INIT_RCU_HEAD(&new_stat->rcu); > + list_add_rcu(&new_stat->list, &hw_stats_list); Don't we need to be holding trace_state_lock at this point? Otherwise, can't we mangle the list with a concurrent list_add_rcu() and list_del_rcu()? > + break; > + case NETDEV_UNREGISTER: > + rcu_read_lock(); > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == dev) > + new_stat->dev = NULL; > + break; > + } > + rcu_read_unlock(); > + break; > + } > +out: > + return NOTIFY_DONE; > +} > > static struct genl_ops dropmon_ops[] = { > { > @@ -220,6 +315,10 @@ static struct genl_ops dropmon_ops[] = { > }, > }; > > +static struct notifier_block dropmon_net_notifier = { > + .notifier_call = dropmon_net_event > +}; > + > static int __init init_net_drop_monitor(void) > { > int cpu; > @@ -243,12 +342,18 @@ static int __init init_net_drop_monitor(void) > ret = genl_register_ops(&net_drop_monitor_family, > &dropmon_ops[i]); > if (ret) { > - printk(KERN_CRIT "failed to register operation %d\n", > + printk(KERN_CRIT "Failed to register operation %d\n", > dropmon_ops[i].cmd); > goto out_unreg; > } > } > > + rc = register_netdevice_notifier(&dropmon_net_notifier); > + if (rc < 0) { > + printk(KERN_CRIT "Failed to register netdevice notifier\n"); > + goto out_unreg; > + } > + > rc = 0; > > for_each_present_cpu(cpu) { > @@ 
-259,6 +364,7 @@ static int __init init_net_drop_monitor(void) > data->send_timer.data = cpu; > data->send_timer.function = sched_send_work; > } > + > goto out; > > out_unreg: > diff --git a/net/core/net-traces.c b/net/core/net-traces.c > index c8fb456..b07b25b 100644 > --- a/net/core/net-traces.c > +++ b/net/core/net-traces.c > @@ -20,6 +20,7 @@ > #include <linux/netlink.h> > #include <linux/net_dropmon.h> > #include <trace/skb.h> > +#include <trace/napi.h> > > #include <asm/unaligned.h> > #include <asm/bitops.h> > @@ -27,3 +28,6 @@ > > DEFINE_TRACE(kfree_skb); > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); > + > +DEFINE_TRACE(napi_poll); > +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); > diff --git a/net/core/netpoll.c b/net/core/netpoll.c > index b5873bd..446d3ba 100644 > --- a/net/core/netpoll.c > +++ b/net/core/netpoll.c > @@ -24,6 +24,7 @@ > #include <net/tcp.h> > #include <net/udp.h> > #include <asm/unaligned.h> > +#include <trace/napi.h> > > /* > * We maintain a small pool of fully-sized skbs, to make sure the > @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, > set_bit(NAPI_STATE_NPSVC, &napi->state); > > work = napi->poll(napi, budget); > + trace_napi_poll(napi->dev); > > clear_bit(NAPI_STATE_NPSVC, &napi->state); > atomic_dec(&trapped); > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rxpackets 2009-05-13 18:23 ` [PATCH] dropmon: add ability to detect when hardware drops rxpackets Paul E. McKenney @ 2009-05-14 0:45 ` Neil Horman 2009-05-14 1:03 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Paul E. McKenney 0 siblings, 1 reply; 36+ messages in thread From: Neil Horman @ 2009-05-14 0:45 UTC (permalink / raw) To: Paul E. McKenney; +Cc: Eric Dumazet, netdev, davem On Wed, May 13, 2009 at 11:23:54AM -0700, Paul E. McKenney wrote: > On Tue, May 12, 2009 at 12:30:44PM -0400, Neil Horman wrote: > > On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote: > > > > > > I dont fully understand your patch, but at least have some questions > > > about rcu stuff. > > > > > > > > > Ok, so I went back and I think I managed to better understand the RCU interface. > > New patch attached, works the same way, saving for the gross previous misuse of > > rcu. > > > > > > Patch to add the ability to detect drops in hardware interfaces via dropwatch. > > Adds a tracepoint to net_rx_action to signal everytime a napi instance is > > polled. The dropmon code then periodically checks to see if the rx_frames > > counter has changed, and if so, adds a drop notification to the netlink > > protocol, using the reserved all-0's vector to indicate the drop location was in > > hardware, rather than somewhere in the code. > > One concern shown below. > <snip> > tracepoint_synchronize_unregister(); > > + > > + /* > > + * Clean the device list > > + */ > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > + if (new_stat->dev == NULL) { > > + list_del_rcu(&new_stat->list); > > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > > Much better! ;-) > Thanks! 
:) <snip> > > + > > + switch (event) { > > + case NETDEV_REGISTER: > > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > > + > > + if (!new_stat) > > + goto out; > > + > > + new_stat->dev = dev; > > + INIT_RCU_HEAD(&new_stat->rcu); > > + list_add_rcu(&new_stat->list, &hw_stats_list); > > Don't we need to be holding trace_state_lock at this point? Otherwise, > can't we mangle the list with a concurrent list_add_rcu() and > list_del_rcu()? > I thought the purpose of list_add_rcu and list_del_rcu was to be able to modify lists without needing to hold additional locks. Or am I missing something else about the nuance of how RCU works? Neil > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-14 0:45 ` Neil Horman @ 2009-05-14 1:03 ` Paul E. McKenney 2009-05-14 12:33 ` Neil Horman 0 siblings, 1 reply; 36+ messages in thread From: Paul E. McKenney @ 2009-05-14 1:03 UTC (permalink / raw) To: Neil Horman; +Cc: Eric Dumazet, netdev, davem On Wed, May 13, 2009 at 08:45:48PM -0400, Neil Horman wrote: > On Wed, May 13, 2009 at 11:23:54AM -0700, Paul E. McKenney wrote: > > On Tue, May 12, 2009 at 12:30:44PM -0400, Neil Horman wrote: > > > On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote: > > > > > > > > I dont fully understand your patch, but at least have some questions > > > > about rcu stuff. > > > > > > > > > > > > > Ok, so I went back and I think I managed to better understand the RCU interface. > > > New patch attached, works the same way, saving for the gross previous misuse of > > > rcu. > > > > > > > > > Patch to add the ability to detect drops in hardware interfaces via dropwatch. > > > Adds a tracepoint to net_rx_action to signal everytime a napi instance is > > > polled. The dropmon code then periodically checks to see if the rx_frames > > > counter has changed, and if so, adds a drop notification to the netlink > > > protocol, using the reserved all-0's vector to indicate the drop location was in > > > hardware, rather than somewhere in the code. > > > > One concern shown below. > > > <snip> > > tracepoint_synchronize_unregister(); > > > + > > > + /* > > > + * Clean the device list > > > + */ > > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > > + if (new_stat->dev == NULL) { > > > + list_del_rcu(&new_stat->list); > > > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > > > > Much better! ;-) > > > Thanks! 
:) > <snip> > > > > + > > > + switch (event) { > > > + case NETDEV_REGISTER: > > > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > > > + > > > + if (!new_stat) > > > + goto out; > > > + > > > + new_stat->dev = dev; > > > + INIT_RCU_HEAD(&new_stat->rcu); > > > + list_add_rcu(&new_stat->list, &hw_stats_list); > > > > Don't we need to be holding trace_state_lock at this point? Otherwise, > > can't we mangle the list with a concurrent list_add_rcu() and > > list_del_rcu()? > > > I thought the purpose of list_add_rcu and list_del_rcu was to be > able to modify lists without needing to hold additional locks. Or am > I missing something else about the nuance of how RCU works? The purpose of list_add_rcu() and list_del_rcu() is to be able to permit concurrent readers to traverse the list while you are modifying that same list. You still need something to handle concurrent updates to the list. The usual approach is to use locks, in this case, to hold trace_state_lock across the addition, given that it is already held across the removal. The reason that I expect holding the lock to be OK is that you cannot add elements faster than you delete them long-term, so given that you are willing to hold the lock over deletions, you are probably OK also holding that same lock over additions. But please let me know if that is not the case! Thanx, Paul ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-14 1:03 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Paul E. McKenney @ 2009-05-14 12:33 ` Neil Horman 2009-05-14 12:44 ` Jiri Pirko 2009-05-14 16:18 ` [PATCH] dropmon: add ability to detect when hardwaredropsrxpackets Paul E. McKenney 0 siblings, 2 replies; 36+ messages in thread From: Neil Horman @ 2009-05-14 12:33 UTC (permalink / raw) To: Paul E. McKenney; +Cc: Eric Dumazet, netdev, davem > > The purpose of list_add_rcu() and list_del_rcu() is to be able to permit > concurrent readers to traverse the list while you are modifying that same > list. You still need something to handle concurrent updates to the list. > The usual approach is to use locks, in this case, to hold trace_state_lock > across the addition, given that it is already held across the removal. > > The reason that I expect holding the lock to be OK is that you cannot > add elements faster than you delete them long-term, so given that you > are willing to hold the lock over deletions, you are probably OK also > holding that same lock over additions. But please let me know if that > is not the case! > > Thanx, Paul > Ahh, I did learn something then :). That makes sense, ok. New patch attached. Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal everytime a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. 
change notes: 1) added use of trace_state_lock around list_add_rcu operation Signed-off-by: Neil Horman <nhorman@tuxdriver.com> include/linux/net_dropmon.h | 7 ++ include/trace/napi.h | 11 ++++ net/core/dev.c | 5 + net/core/drop_monitor.c | 115 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 139 insertions(+), 5 deletions(-) diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb8..9addc62 100644 --- a/include/linux/net_dropmon.h +++ b/include/linux/net_dropmon.h @@ -8,6 +8,13 @@ struct net_dm_drop_point { __u32 count; }; +#define is_drop_point_hw(x) do {\ + int ____i, ____j;\ + for (____i = 0; ____i < 8; i ____i++)\ + ____j |= x[____i];\ + ____j;\ +} while (0) + #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 diff --git a/include/trace/napi.h b/include/trace/napi.h new file mode 100644 index 0000000..7c39eb7 --- /dev/null +++ b/include/trace/napi.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_NAPI_H_ +#define _TRACE_NAPI_H_ + +#include <linux/skbuff.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(napi_poll, + TP_PROTO(struct napi_struct *napi), + TP_ARGS(napi)); + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index 637ea71..165979c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> +#include <trace/napi.h> #include "net-sysfs.h" @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. 
*/ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2797b71..f449971 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include <linux/timer.h> #include <linux/bitops.h> #include <net/genetlink.h> +#include <net/netevent.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,6 +50,13 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; @@ -159,26 +170,79 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * 
Ratelimit our check time to dm_hw_check_delta jiffies + */ + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } + rcu_read_unlock(); +} + + +static void free_dm_hw_stat(struct rcu_head *head) +{ + struct dm_hw_stat_delta *n; + n = container_of(head, struct dm_hw_stat_delta, rcu); + kfree(n); +} + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + + spin_lock(&trace_state_lock); switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } break; default: rc = 1; break; } + spin_unlock(&trace_state_lock); + if (rc) return -EINPROGRESS; + trace_state = state; return rc; } @@ -204,6 +268,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + INIT_RCU_HEAD(&new_stat->rcu); + spin_lock(&trace_state_lock); + list_add_rcu(&new_stat->list, &hw_stats_list); + spin_unlock(&trace_state_lock); + break; 
+ case NETDEV_UNREGISTER: + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == dev) + new_stat->dev = NULL; + break; + } + rcu_read_unlock(); + break; + } +out: + return NOTIFY_DONE; +} static struct genl_ops dropmon_ops[] = { { @@ -220,6 +316,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -243,12 +343,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -259,6 +365,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb456..b07b25b 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -20,6 +20,7 @@ #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> #include <asm/bitops.h> @@ -27,3 +28,6 @@ DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +DEFINE_TRACE(napi_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b5873bd..446d3ba 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> +#include <trace/napi.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -137,6 +138,7 @@ static int 
poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi->dev); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); ^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-14 12:33 ` Neil Horman @ 2009-05-14 12:44 ` Jiri Pirko 2009-05-14 16:17 ` [PATCH] dropmon: add ability to detect when hardwaredropsrxpackets Paul E. McKenney 2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Neil Horman 2009-05-14 16:18 ` [PATCH] dropmon: add ability to detect when hardwaredropsrxpackets Paul E. McKenney 1 sibling, 2 replies; 36+ messages in thread From: Jiri Pirko @ 2009-05-14 12:44 UTC (permalink / raw) To: Neil Horman; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem Thu, May 14, 2009 at 02:33:00PM CEST, nhorman@tuxdriver.com wrote: >> >> The purpose of list_add_rcu() and list_del_rcu() is to be able to permit >> concurrent readers to traverse the list while you are modifying that same >> list. You still need something to handle concurrent updates to the list. >> The usual approach is to use locks, in this case, to hold trace_state_lock >> across the addition, given that it is already held across the removal. >> >> The reason that I expect holding the lock to be OK is that you cannot >> add elements faster than you delete them long-term, so given that you >> are willing to hold the lock over deletions, you are probably OK also >> holding that same lock over additions. But please let me know if that >> is not the case! >> >> Thanx, Paul >> > > >Ahh, I did learn something then :). That makes sense, ok. New patch attached. > >Patch to add the ability to detect drops in hardware interfaces via dropwatch. >Adds a tracepoint to net_rx_action to signal everytime a napi instance is >polled. The dropmon code then periodically checks to see if the rx_frames >counter has changed, and if so, adds a drop notification to the netlink >protocol, using the reserved all-0's vector to indicate the drop location was in >hardware, rather than somewhere in the code. 
> >change notes: >1) added use of trace_state_lock around list_add_rcu operation > > >Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > > > include/linux/net_dropmon.h | 7 ++ > include/trace/napi.h | 11 ++++ > net/core/dev.c | 5 + > net/core/drop_monitor.c | 115 ++++++++++++++++++++++++++++++++++++++++++-- > net/core/net-traces.c | 4 + > net/core/netpoll.c | 2 > 6 files changed, 139 insertions(+), 5 deletions(-) > >diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h >index 0217fb8..9addc62 100644 >--- a/include/linux/net_dropmon.h >+++ b/include/linux/net_dropmon.h >@@ -8,6 +8,13 @@ struct net_dm_drop_point { > __u32 count; > }; > >+#define is_drop_point_hw(x) do {\ >+ int ____i, ____j;\ >+ for (____i = 0; ____i < 8; i ____i++)\ >+ ____j |= x[____i];\ >+ ____j;\ >+} while (0) >+ > #define NET_DM_CFG_VERSION 0 > #define NET_DM_CFG_ALERT_COUNT 1 > #define NET_DM_CFG_ALERT_DELAY 2 >diff --git a/include/trace/napi.h b/include/trace/napi.h >new file mode 100644 >index 0000000..7c39eb7 >--- /dev/null >+++ b/include/trace/napi.h >@@ -0,0 +1,11 @@ >+#ifndef _TRACE_NAPI_H_ >+#define _TRACE_NAPI_H_ >+ >+#include <linux/skbuff.h> >+#include <linux/tracepoint.h> >+ >+DECLARE_TRACE(napi_poll, >+ TP_PROTO(struct napi_struct *napi), >+ TP_ARGS(napi)); >+ >+#endif >diff --git a/net/core/dev.c b/net/core/dev.c >index 637ea71..165979c 100644 >--- a/net/core/dev.c >+++ b/net/core/dev.c >@@ -126,6 +126,7 @@ > #include <linux/in.h> > #include <linux/jhash.h> > #include <linux/random.h> >+#include <trace/napi.h> > > #include "net-sysfs.h" > >@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) > * accidently calling ->poll() when NAPI is not scheduled. 
> */ > work = 0; >- if (test_bit(NAPI_STATE_SCHED, &n->state)) >+ if (test_bit(NAPI_STATE_SCHED, &n->state)) { > work = n->poll(n, weight); >+ trace_napi_poll(n); >+ } > > WARN_ON_ONCE(work > weight); > >diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c >index 2797b71..f449971 100644 >--- a/net/core/drop_monitor.c >+++ b/net/core/drop_monitor.c >@@ -22,8 +22,10 @@ > #include <linux/timer.h> > #include <linux/bitops.h> > #include <net/genetlink.h> >+#include <net/netevent.h> > > #include <trace/skb.h> >+#include <trace/napi.h> > > #include <asm/unaligned.h> > >@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); > * and the work handle that will send up > * netlink alerts > */ >-struct sock *dm_sock; >+static int trace_state = TRACE_OFF; >+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; > > struct per_cpu_dm_data { > struct work_struct dm_alert_work; >@@ -47,6 +50,13 @@ struct per_cpu_dm_data { > struct timer_list send_timer; > }; > >+struct dm_hw_stat_delta { >+ struct net_device *dev; >+ struct list_head list; >+ struct rcu_head rcu; >+ unsigned long last_drop_val; >+}; >+ > static struct genl_family net_drop_monitor_family = { > .id = GENL_ID_GENERATE, > .hdrsize = 0, >@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); > > static int dm_hit_limit = 64; > static int dm_delay = 1; >- >+static unsigned long dm_hw_check_delta = 2*HZ; >+static LIST_HEAD(hw_stats_list); > > static void reset_per_cpu_data(struct per_cpu_dm_data *data) > { >@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) > schedule_work(&data->dm_alert_work); > } > >-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) >+static void trace_drop_common(struct sk_buff *skb, void *location) > { > struct net_dm_alert_msg *msg; > struct nlmsghdr *nlh; >@@ -159,26 +170,79 @@ out: > return; > } > >+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) >+{ >+ trace_drop_common(skb, location); 
>+} >+ >+static void trace_napi_poll_hit(struct napi_struct *napi) >+{ >+ struct dm_hw_stat_delta *new_stat; >+ >+ /* >+ * Ratelimit our check time to dm_hw_check_delta jiffies >+ */ >+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) >+ return; >+ >+ rcu_read_lock(); >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >+ if ((new_stat->dev == napi->dev) && >+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { >+ trace_drop_common(NULL, NULL); >+ new_stat->last_drop_val = napi->dev->stats.rx_dropped; >+ break; >+ } >+ } >+ rcu_read_unlock(); >+} >+ >+ >+static void free_dm_hw_stat(struct rcu_head *head) >+{ >+ struct dm_hw_stat_delta *n; >+ n = container_of(head, struct dm_hw_stat_delta, rcu); >+ kfree(n); >+} >+ > static int set_all_monitor_traces(int state) > { > int rc = 0; >+ struct dm_hw_stat_delta *new_stat = NULL; >+ >+ spin_lock(&trace_state_lock); > > switch (state) { > case TRACE_ON: > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); >+ rc |= register_trace_napi_poll(trace_napi_poll_hit); > break; > case TRACE_OFF: > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); >+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > tracepoint_synchronize_unregister(); >+ >+ /* >+ * Clean the device list >+ */ >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { ^^^^^^^^^^^^^^^^^^^^^^^ This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. Also it would be good to use list_for_each_entry_safe here since you're modifying the list. 
Jirka >+ if (new_stat->dev == NULL) { >+ list_del_rcu(&new_stat->list); >+ call_rcu(&new_stat->rcu, free_dm_hw_stat); >+ } >+ } > break; > default: > rc = 1; > break; > } > >+ spin_unlock(&trace_state_lock); >+ > if (rc) > return -EINPROGRESS; >+ trace_state = state; > return rc; > } > >@@ -204,6 +268,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb, > return -ENOTSUPP; > } > >+static int dropmon_net_event(struct notifier_block *ev_block, >+ unsigned long event, void *ptr) >+{ >+ struct net_device *dev = ptr; >+ struct dm_hw_stat_delta *new_stat = NULL; >+ >+ switch (event) { >+ case NETDEV_REGISTER: >+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); >+ >+ if (!new_stat) >+ goto out; >+ >+ new_stat->dev = dev; >+ INIT_RCU_HEAD(&new_stat->rcu); >+ spin_lock(&trace_state_lock); >+ list_add_rcu(&new_stat->list, &hw_stats_list); >+ spin_unlock(&trace_state_lock); >+ break; >+ case NETDEV_UNREGISTER: >+ rcu_read_lock(); >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >+ if (new_stat->dev == dev) >+ new_stat->dev = NULL; >+ break; >+ } >+ rcu_read_unlock(); >+ break; >+ } >+out: >+ return NOTIFY_DONE; >+} > > static struct genl_ops dropmon_ops[] = { > { >@@ -220,6 +316,10 @@ static struct genl_ops dropmon_ops[] = { > }, > }; > >+static struct notifier_block dropmon_net_notifier = { >+ .notifier_call = dropmon_net_event >+}; >+ > static int __init init_net_drop_monitor(void) > { > int cpu; >@@ -243,12 +343,18 @@ static int __init init_net_drop_monitor(void) > ret = genl_register_ops(&net_drop_monitor_family, > &dropmon_ops[i]); > if (ret) { >- printk(KERN_CRIT "failed to register operation %d\n", >+ printk(KERN_CRIT "Failed to register operation %d\n", > dropmon_ops[i].cmd); > goto out_unreg; > } > } > >+ rc = register_netdevice_notifier(&dropmon_net_notifier); >+ if (rc < 0) { >+ printk(KERN_CRIT "Failed to register netdevice notifier\n"); >+ goto out_unreg; >+ } >+ > rc = 0; > > for_each_present_cpu(cpu) { >@@ -259,6 +365,7 @@ static 
int __init init_net_drop_monitor(void) > data->send_timer.data = cpu; > data->send_timer.function = sched_send_work; > } >+ > goto out; > > out_unreg: >diff --git a/net/core/net-traces.c b/net/core/net-traces.c >index c8fb456..b07b25b 100644 >--- a/net/core/net-traces.c >+++ b/net/core/net-traces.c >@@ -20,6 +20,7 @@ > #include <linux/netlink.h> > #include <linux/net_dropmon.h> > #include <trace/skb.h> >+#include <trace/napi.h> > > #include <asm/unaligned.h> > #include <asm/bitops.h> >@@ -27,3 +28,6 @@ > > DEFINE_TRACE(kfree_skb); > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); >+ >+DEFINE_TRACE(napi_poll); >+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); >diff --git a/net/core/netpoll.c b/net/core/netpoll.c >index b5873bd..446d3ba 100644 >--- a/net/core/netpoll.c >+++ b/net/core/netpoll.c >@@ -24,6 +24,7 @@ > #include <net/tcp.h> > #include <net/udp.h> > #include <asm/unaligned.h> >+#include <trace/napi.h> > > /* > * We maintain a small pool of fully-sized skbs, to make sure the >@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, > set_bit(NAPI_STATE_NPSVC, &napi->state); > > work = napi->poll(napi, budget); >+ trace_napi_poll(napi->dev); > > clear_bit(NAPI_STATE_NPSVC, &napi->state); > atomic_dec(&trapped); >-- >To unsubscribe from this list: send the line "unsubscribe netdev" in >the body of a message to majordomo@vger.kernel.org >More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardwaredropsrxpackets 2009-05-14 12:44 ` Jiri Pirko @ 2009-05-14 16:17 ` Paul E. McKenney 2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Neil Horman 1 sibling, 0 replies; 36+ messages in thread From: Paul E. McKenney @ 2009-05-14 16:17 UTC (permalink / raw) To: Jiri Pirko; +Cc: Neil Horman, Eric Dumazet, netdev, davem On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: > Thu, May 14, 2009 at 02:33:00PM CEST, nhorman@tuxdriver.com wrote: > >> > >> The purpose of list_add_rcu() and list_del_rcu() is to be able to permit > >> concurrent readers to traverse the list while you are modifying that same > >> list. You still need something to handle concurrent updates to the list. > >> The usual approach is to use locks, in this case, to hold trace_state_lock > >> across the addition, given that it is already held across the removal. > >> > >> The reason that I expect holding the lock to be OK is that you cannot > >> add elements faster than you delete them long-term, so given that you > >> are willing to hold the lock over deletions, you are probably OK also > >> holding that same lock over additions. But please let me know if that > >> is not the case! > >> > >> Thanx, Paul > >> > > > > > >Ahh, I did learn something then :). That makes sense, ok. New patch attached. > > > >Patch to add the ability to detect drops in hardware interfaces via dropwatch. > >Adds a tracepoint to net_rx_action to signal everytime a napi instance is > >polled. The dropmon code then periodically checks to see if the rx_frames > >counter has changed, and if so, adds a drop notification to the netlink > >protocol, using the reserved all-0's vector to indicate the drop location was in > >hardware, rather than somewhere in the code. 
> > > >change notes: > >1) added use of trace_state_lock around list_add_rcu operation > > > > > >Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > > > > > > include/linux/net_dropmon.h | 7 ++ > > include/trace/napi.h | 11 ++++ > > net/core/dev.c | 5 + > > net/core/drop_monitor.c | 115 ++++++++++++++++++++++++++++++++++++++++++-- > > net/core/net-traces.c | 4 + > > net/core/netpoll.c | 2 > > 6 files changed, 139 insertions(+), 5 deletions(-) > > > >diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h [ . . . ] > > static int set_all_monitor_traces(int state) > > { > > int rc = 0; > >+ struct dm_hw_stat_delta *new_stat = NULL; > >+ > >+ spin_lock(&trace_state_lock); > > > > switch (state) { > > case TRACE_ON: > > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); > >+ rc |= register_trace_napi_poll(trace_napi_poll_hit); > > break; > > case TRACE_OFF: > > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); > >+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > > > tracepoint_synchronize_unregister(); > >+ > >+ /* > >+ * Clean the device list > >+ */ > >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > ^^^^^^^^^^^^^^^^^^^^^^^ > This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. > Also it would be good to use list_for_each_entry_safe here since you're > modifying the list. Either way works, as the list_del_rcu() leaves the forward pointers intact. So I don't have an opinion either way on this particular piece of code. More experience will be needed to work out which approach is less confusing. :-/ If this code was shared between the read side and the update side, then you really would want to be able to use list_for_each_entry_rcu() on the update side. Thanx, Paul ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-14 12:44 ` Jiri Pirko 2009-05-14 16:17 ` [PATCH] dropmon: add ability to detect when hardwaredropsrxpackets Paul E. McKenney @ 2009-05-14 17:29 ` Neil Horman 2009-05-15 5:49 ` Jarek Poplawski 2009-05-15 6:51 ` Jiri Pirko 1 sibling, 2 replies; 36+ messages in thread From: Neil Horman @ 2009-05-14 17:29 UTC (permalink / raw) To: Jiri Pirko; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: > >+ > >+ /* > >+ * Clean the device list > >+ */ > >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > ^^^^^^^^^^^^^^^^^^^^^^^ > This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. > Also it would be good to use list_for_each_entry_safe here since you're > modifying the list. > The definition of list_for_each_entry_rcu specifically says its safe against list-mutation primitives, so its fine. Although you are correct, in that its safety is dependent on the protection of rcu_read_lock(), so I'll add that in. Thanks for the catch! New patch attached Change notes: 1) Add rcu_read_lock/unlock protection around TRACE_OFF event Neil Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal everytime a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. 
Signed-off-by: Neil Horman <nhorman@tuxdriver.com> include/linux/net_dropmon.h | 7 ++ include/trace/napi.h | 11 ++++ net/core/dev.c | 5 + net/core/drop_monitor.c | 117 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 141 insertions(+), 5 deletions(-) diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb8..9addc62 100644 --- a/include/linux/net_dropmon.h +++ b/include/linux/net_dropmon.h @@ -8,6 +8,13 @@ struct net_dm_drop_point { __u32 count; }; +#define is_drop_point_hw(x) do {\ + int ____i, ____j;\ + for (____i = 0; ____i < 8; i ____i++)\ + ____j |= x[____i];\ + ____j;\ +} while (0) + #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 diff --git a/include/trace/napi.h b/include/trace/napi.h new file mode 100644 index 0000000..7c39eb7 --- /dev/null +++ b/include/trace/napi.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_NAPI_H_ +#define _TRACE_NAPI_H_ + +#include <linux/skbuff.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(napi_poll, + TP_PROTO(struct napi_struct *napi), + TP_ARGS(napi)); + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index 637ea71..165979c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> +#include <trace/napi.h> #include "net-sysfs.h" @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. 
*/ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2797b71..f9130d5 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include <linux/timer.h> #include <linux/bitops.h> #include <net/genetlink.h> +#include <net/netevent.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,6 +50,13 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; @@ -159,26 +170,81 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * 
Ratelimit our check time to dm_hw_check_delta jiffies + */ + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } + rcu_read_unlock(); +} + + +static void free_dm_hw_stat(struct rcu_head *head) +{ + struct dm_hw_stat_delta *n; + n = container_of(head, struct dm_hw_stat_delta, rcu); + kfree(n); +} + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + + spin_lock(&trace_state_lock); switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } + rcu_read_unlock(); break; default: rc = 1; break; } + spin_unlock(&trace_state_lock); + if (rc) return -EINPROGRESS; + trace_state = state; return rc; } @@ -204,6 +270,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + INIT_RCU_HEAD(&new_stat->rcu); + spin_lock(&trace_state_lock); + list_add_rcu(&new_stat->list, &hw_stats_list); + 
spin_unlock(&trace_state_lock); + break; + case NETDEV_UNREGISTER: + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == dev) + new_stat->dev = NULL; + break; + } + rcu_read_unlock(); + break; + } +out: + return NOTIFY_DONE; +} static struct genl_ops dropmon_ops[] = { { @@ -220,6 +318,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -243,12 +345,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -259,6 +367,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb456..b07b25b 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -20,6 +20,7 @@ #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> #include <asm/bitops.h> @@ -27,3 +28,6 @@ DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +DEFINE_TRACE(napi_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b5873bd..446d3ba 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> +#include <trace/napi.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ 
-137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi->dev); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); ^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Neil Horman @ 2009-05-15 5:49 ` Jarek Poplawski 2009-05-15 11:01 ` Neil Horman 2009-05-15 6:51 ` Jiri Pirko 1 sibling, 1 reply; 36+ messages in thread From: Jarek Poplawski @ 2009-05-15 5:49 UTC (permalink / raw) To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem On 14-05-2009 19:29, Neil Horman wrote: > On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: >>> + >>> + /* >>> + * Clean the device list >>> + */ >>> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >> ^^^^^^^^^^^^^^^^^^^^^^^ >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. >> Also it would be good to use list_for_each_entry_safe here since you're >> modifying the list. >> > > The definition of list_for_each_entry_rcu specifically says its safe against > list-mutation primitives, so its fine. Although you are correct, in that its > safety is dependent on the protection of rcu_read_lock(), so I'll add that in. > Thanks for the catch! New patch attached > > Change notes: > 1) Add rcu_read_lock/unlock protection around TRACE_OFF event > > Neil ... 
> static int set_all_monitor_traces(int state) > { > int rc = 0; > + struct dm_hw_stat_delta *new_stat = NULL; > + > + spin_lock(&trace_state_lock); > > switch (state) { > case TRACE_ON: > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); > + rc |= register_trace_napi_poll(trace_napi_poll_hit); > break; > case TRACE_OFF: > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); > + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > tracepoint_synchronize_unregister(); > + > + /* > + * Clean the device list > + */ > + rcu_read_lock(); > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == NULL) { > + list_del_rcu(&new_stat->list); > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > + } > + } > + rcu_read_unlock(); IMHO it looks worse now. rcu_read_lock() suggests it's a read side, and spin_lock(&trace_state_lock) protects something else. Jarek P. ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 5:49 ` Jarek Poplawski @ 2009-05-15 11:01 ` Neil Horman 2009-05-15 11:12 ` Jarek Poplawski 0 siblings, 1 reply; 36+ messages in thread From: Neil Horman @ 2009-05-15 11:01 UTC (permalink / raw) To: Jarek Poplawski; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote: > On 14-05-2009 19:29, Neil Horman wrote: > > On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: > >>> + > >>> + /* > >>> + * Clean the device list > >>> + */ > >>> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > >> ^^^^^^^^^^^^^^^^^^^^^^^ > >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. > >> Also it would be good to use list_for_each_entry_safe here since you're > >> modifying the list. > >> > > > > The definition of list_for_each_entry_rcu specifically says its safe against > > list-mutation primitives, so its fine. Although you are correct, in that its > > safety is dependent on the protection of rcu_read_lock(), so I'll add that in. > > Thanks for the catch! New patch attached > > > > Change notes: > > 1) Add rcu_read_lock/unlock protection around TRACE_OFF event > > > > Neil > ... 
> > static int set_all_monitor_traces(int state) > > { > > int rc = 0; > > + struct dm_hw_stat_delta *new_stat = NULL; > > + > > + spin_lock(&trace_state_lock); > > > > switch (state) { > > case TRACE_ON: > > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); > > + rc |= register_trace_napi_poll(trace_napi_poll_hit); > > break; > > case TRACE_OFF: > > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); > > + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > > > tracepoint_synchronize_unregister(); > > + > > + /* > > + * Clean the device list > > + */ > > + rcu_read_lock(); > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > + if (new_stat->dev == NULL) { > > + list_del_rcu(&new_stat->list); > > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > > + } > > + } > > + rcu_read_unlock(); > > IMHO it looks worse now. rcu_read_lock() suggests it's a read side, > and spin_lock(&trace_state_lock) protects something else. > the read lock is required (according to the comments for the list loop primitive) to protect against the embedded mutation primitive, so its required. I understand that its a bit counterintuitive, but intuition takes a backseat to functionality. :) Neil > Jarek P. > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 11:01 ` Neil Horman @ 2009-05-15 11:12 ` Jarek Poplawski 2009-05-15 11:15 ` Neil Horman 0 siblings, 1 reply; 36+ messages in thread From: Jarek Poplawski @ 2009-05-15 11:12 UTC (permalink / raw) To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem On Fri, May 15, 2009 at 07:01:41AM -0400, Neil Horman wrote: > On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote: ... > > IMHO it looks worse now. rcu_read_lock() suggests it's a read side, > > and spin_lock(&trace_state_lock) protects something else. > > > the read lock is required (according to the comments for the list loop > primitive) to protect against the embedded mutation primitive, so its required. > I understand that its a bit counterintuitive, but intuition takes a backseat to > functionality. :) > Neil > I guess, you missed: > Looks good from an RCU viewpoint! > > Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> for the previous version... Jarek P. ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 11:12 ` Jarek Poplawski @ 2009-05-15 11:15 ` Neil Horman 2009-05-15 11:40 ` Jarek Poplawski 0 siblings, 1 reply; 36+ messages in thread From: Neil Horman @ 2009-05-15 11:15 UTC (permalink / raw) To: Jarek Poplawski; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem On Fri, May 15, 2009 at 11:12:14AM +0000, Jarek Poplawski wrote: > On Fri, May 15, 2009 at 07:01:41AM -0400, Neil Horman wrote: > > On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote: > ... > > > IMHO it looks worse now. rcu_read_lock() suggests it's a read side, > > > and spin_lock(&trace_state_lock) protects something else. > > > > > the read lock is required (according to the comments for the list loop > > primitive) to protect against the embedded mutation primitive, so its required. > > I understand that its a bit counterintuitive, but intuition takes a backseat to > > functionality. :) > > Neil > > > > I guess, you missed: > > > Looks good from an RCU viewpoint! > > > > Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > > for the previous version... > I didn't, our comments passed in flight. Nevertheless, I'm not sure what this adds (other than additional overhead), which I agree is bad and so might should be removed, but there are some outstanding questions regarding if it is needed in relation to the list primitives I'm using here. According to Eric, list_for_each_entry_safe might be less intrusive here, and I'm trying to figure out if I agree. :) Neil > Jarek P. > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 11:15 ` Neil Horman @ 2009-05-15 11:40 ` Jarek Poplawski 2009-05-16 0:07 ` Paul E. McKenney 0 siblings, 1 reply; 36+ messages in thread From: Jarek Poplawski @ 2009-05-15 11:40 UTC (permalink / raw) To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem On Fri, May 15, 2009 at 07:15:30AM -0400, Neil Horman wrote: > On Fri, May 15, 2009 at 11:12:14AM +0000, Jarek Poplawski wrote: > > On Fri, May 15, 2009 at 07:01:41AM -0400, Neil Horman wrote: > > > On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote: > > ... > > > > IMHO it looks worse now. rcu_read_lock() suggests it's a read side, > > > > and spin_lock(&trace_state_lock) protects something else. > > > > > > > the read lock is required (according to the comments for the list loop > > > primitive) to protect against the embedded mutation primitive, so its required. > > > I understand that its a bit counterintuitive, but intuition takes a backseat to > > > functionality. :) > > > Neil > > > > > > > I guess, you missed: > > > > > Looks good from an RCU viewpoint! > > > > > > Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > > > > for the previous version... > > > I didn't, our comments passed in flight. Nevertheless, I'm not sure what this > adds (other than additional overhead), which I agree is bad and so might should > be removed, but there are some outstanding questions regarding if it is needed > in relation to the list primitives I'm using here. According to Eric, > list_for_each_entry_safe might be less intrusive here, and I'm trying to figure > out if I agree. :) > Neil Paul "acked" two variants, and Eric prefers one of them. Adding rcu_read_lock() makes sense only "If this code was shared between the read side and the update side". Anyway it would need additional comment. Otherwise it's misleading (but not wrong). 
And, since Paul reviewed this, it's definitely not needed here because Paul is simply always right ;-) Jarek P. ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 11:40 ` Jarek Poplawski @ 2009-05-16 0:07 ` Paul E. McKenney 0 siblings, 0 replies; 36+ messages in thread From: Paul E. McKenney @ 2009-05-16 0:07 UTC (permalink / raw) To: Jarek Poplawski; +Cc: Neil Horman, Jiri Pirko, Eric Dumazet, netdev, davem On Fri, May 15, 2009 at 11:40:29AM +0000, Jarek Poplawski wrote: > On Fri, May 15, 2009 at 07:15:30AM -0400, Neil Horman wrote: > > On Fri, May 15, 2009 at 11:12:14AM +0000, Jarek Poplawski wrote: > > > On Fri, May 15, 2009 at 07:01:41AM -0400, Neil Horman wrote: > > > > On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote: > > > ... > > > > > IMHO it looks worse now. rcu_read_lock() suggests it's a read side, > > > > > and spin_lock(&trace_state_lock) protects something else. > > > > > > > > > the read lock is required (according to the comments for the list loop > > > > primitive) to protect against the embedded mutation primitive, so its required. > > > > I understand that its a bit counterintuitive, but intuition takes a backseat to > > > > functionality. :) > > > > Neil > > > > > > > > > > I guess, you missed: > > > > > > > Looks good from an RCU viewpoint! > > > > > > > > Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > > > > > > for the previous version... > > > > > I didn't, our comments passed in flight. Nevertheless, I'm not sure what this > > adds (other than additional overhead), which I agree is bad and so might should > > be removed, but there are some outstanding questions regarding if it is needed > > in relation to the list primitives I'm using here. According to Eric, > > list_for_each_entry_safe might be less intrusive here, and I'm trying to figure > > out if I agree. :) > > Neil > > Paul "acked" two variants, and Eric prefers one of them. Adding > rcu_read_lock() makes sense only "If this code was shared between the > read side and the update side". Anyway it would need additional > comment. 
Otherwise it's misleading (but not wrong). And, since Paul > reviewed this, it's definitely not needed here because Paul is simply > always right ;-) Much as I appreciate the vote of confidence... ;-) I believe that both versions work correctly, and that the difference is therefore a matter of style. My mild preference would be to use rcu_read_lock() only if there was some possibility that a reader (some task not holding the update-side lock) would execute this code. Thanx, Paul ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Neil Horman 2009-05-15 5:49 ` Jarek Poplawski @ 2009-05-15 6:51 ` Jiri Pirko 2009-05-15 7:37 ` Eric Dumazet 2009-05-15 10:59 ` Neil Horman 1 sibling, 2 replies; 36+ messages in thread From: Jiri Pirko @ 2009-05-15 6:51 UTC (permalink / raw) To: Neil Horman; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote: >On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: >> >+ >> >+ /* >> >+ * Clean the device list >> >+ */ >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >> ^^^^^^^^^^^^^^^^^^^^^^^ >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. >> Also it would be good to use list_for_each_entry_safe here since you're >> modifying the list. >> > >The definition of list_for_each_entry_rcu specifically says its safe against >list-mutation primitives, so its fine. Although you are correct, in that its >safety is dependent on the protection of rcu_read_lock(), so I'll add that in. You are right that list_for_each_entry_rcu is safe against list-mutation primitives. But there's no need for this on the update side when you hold a writer spinlock. Here I think it's better (and also less confusing) to use ordinary list_for_each_entry which in this case must be list_for_each_entry_safe. Jirka >Thanks for the catch! New patch attached > >Change notes: >1) Add rcu_read_lock/unlock protection around TRACE_OFF event > >Neil > > >Patch to add the ability to detect drops in hardware interfaces via dropwatch. >Adds a tracepoint to net_rx_action to signal everytime a napi instance is >polled. 
The dropmon code then periodically checks to see if the rx_frames >counter has changed, and if so, adds a drop notification to the netlink >protocol, using the reserved all-0's vector to indicate the drop location was in >hardware, rather than somewhere in the code. > >Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > > > include/linux/net_dropmon.h | 7 ++ > include/trace/napi.h | 11 ++++ > net/core/dev.c | 5 + > net/core/drop_monitor.c | 117 ++++++++++++++++++++++++++++++++++++++++++-- > net/core/net-traces.c | 4 + > net/core/netpoll.c | 2 > 6 files changed, 141 insertions(+), 5 deletions(-) > >diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h >index 0217fb8..9addc62 100644 >--- a/include/linux/net_dropmon.h >+++ b/include/linux/net_dropmon.h >@@ -8,6 +8,13 @@ struct net_dm_drop_point { > __u32 count; > }; > >+#define is_drop_point_hw(x) do {\ >+ int ____i, ____j;\ >+ for (____i = 0; ____i < 8; i ____i++)\ >+ ____j |= x[____i];\ >+ ____j;\ >+} while (0) >+ > #define NET_DM_CFG_VERSION 0 > #define NET_DM_CFG_ALERT_COUNT 1 > #define NET_DM_CFG_ALERT_DELAY 2 >diff --git a/include/trace/napi.h b/include/trace/napi.h >new file mode 100644 >index 0000000..7c39eb7 >--- /dev/null >+++ b/include/trace/napi.h >@@ -0,0 +1,11 @@ >+#ifndef _TRACE_NAPI_H_ >+#define _TRACE_NAPI_H_ >+ >+#include <linux/skbuff.h> >+#include <linux/tracepoint.h> >+ >+DECLARE_TRACE(napi_poll, >+ TP_PROTO(struct napi_struct *napi), >+ TP_ARGS(napi)); >+ >+#endif >diff --git a/net/core/dev.c b/net/core/dev.c >index 637ea71..165979c 100644 >--- a/net/core/dev.c >+++ b/net/core/dev.c >@@ -126,6 +126,7 @@ > #include <linux/in.h> > #include <linux/jhash.h> > #include <linux/random.h> >+#include <trace/napi.h> > > #include "net-sysfs.h" > >@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) > * accidently calling ->poll() when NAPI is not scheduled. 
> */ > work = 0; >- if (test_bit(NAPI_STATE_SCHED, &n->state)) >+ if (test_bit(NAPI_STATE_SCHED, &n->state)) { > work = n->poll(n, weight); >+ trace_napi_poll(n); >+ } > > WARN_ON_ONCE(work > weight); > >diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c >index 2797b71..f9130d5 100644 >--- a/net/core/drop_monitor.c >+++ b/net/core/drop_monitor.c >@@ -22,8 +22,10 @@ > #include <linux/timer.h> > #include <linux/bitops.h> > #include <net/genetlink.h> >+#include <net/netevent.h> > > #include <trace/skb.h> >+#include <trace/napi.h> > > #include <asm/unaligned.h> > >@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); > * and the work handle that will send up > * netlink alerts > */ >-struct sock *dm_sock; >+static int trace_state = TRACE_OFF; >+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; > > struct per_cpu_dm_data { > struct work_struct dm_alert_work; >@@ -47,6 +50,13 @@ struct per_cpu_dm_data { > struct timer_list send_timer; > }; > >+struct dm_hw_stat_delta { >+ struct net_device *dev; >+ struct list_head list; >+ struct rcu_head rcu; >+ unsigned long last_drop_val; >+}; >+ > static struct genl_family net_drop_monitor_family = { > .id = GENL_ID_GENERATE, > .hdrsize = 0, >@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); > > static int dm_hit_limit = 64; > static int dm_delay = 1; >- >+static unsigned long dm_hw_check_delta = 2*HZ; >+static LIST_HEAD(hw_stats_list); > > static void reset_per_cpu_data(struct per_cpu_dm_data *data) > { >@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) > schedule_work(&data->dm_alert_work); > } > >-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) >+static void trace_drop_common(struct sk_buff *skb, void *location) > { > struct net_dm_alert_msg *msg; > struct nlmsghdr *nlh; >@@ -159,26 +170,81 @@ out: > return; > } > >+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) >+{ >+ trace_drop_common(skb, location); 
>+} >+ >+static void trace_napi_poll_hit(struct napi_struct *napi) >+{ >+ struct dm_hw_stat_delta *new_stat; >+ >+ /* >+ * Ratelimit our check time to dm_hw_check_delta jiffies >+ */ >+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) >+ return; >+ >+ rcu_read_lock(); >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >+ if ((new_stat->dev == napi->dev) && >+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { >+ trace_drop_common(NULL, NULL); >+ new_stat->last_drop_val = napi->dev->stats.rx_dropped; >+ break; >+ } >+ } >+ rcu_read_unlock(); >+} >+ >+ >+static void free_dm_hw_stat(struct rcu_head *head) >+{ >+ struct dm_hw_stat_delta *n; >+ n = container_of(head, struct dm_hw_stat_delta, rcu); >+ kfree(n); >+} >+ > static int set_all_monitor_traces(int state) > { > int rc = 0; >+ struct dm_hw_stat_delta *new_stat = NULL; >+ >+ spin_lock(&trace_state_lock); > > switch (state) { > case TRACE_ON: > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); >+ rc |= register_trace_napi_poll(trace_napi_poll_hit); > break; > case TRACE_OFF: > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); >+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > tracepoint_synchronize_unregister(); >+ >+ /* >+ * Clean the device list >+ */ >+ rcu_read_lock(); >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >+ if (new_stat->dev == NULL) { >+ list_del_rcu(&new_stat->list); >+ call_rcu(&new_stat->rcu, free_dm_hw_stat); >+ } >+ } >+ rcu_read_unlock(); > break; > default: > rc = 1; > break; > } > >+ spin_unlock(&trace_state_lock); >+ > if (rc) > return -EINPROGRESS; >+ trace_state = state; > return rc; > } > >@@ -204,6 +270,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb, > return -ENOTSUPP; > } > >+static int dropmon_net_event(struct notifier_block *ev_block, >+ unsigned long event, void *ptr) >+{ >+ struct net_device *dev = ptr; >+ struct dm_hw_stat_delta *new_stat = NULL; >+ >+ switch (event) { >+ case NETDEV_REGISTER: >+ new_stat 
= kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); >+ >+ if (!new_stat) >+ goto out; >+ >+ new_stat->dev = dev; >+ INIT_RCU_HEAD(&new_stat->rcu); >+ spin_lock(&trace_state_lock); >+ list_add_rcu(&new_stat->list, &hw_stats_list); >+ spin_unlock(&trace_state_lock); >+ break; >+ case NETDEV_UNREGISTER: >+ rcu_read_lock(); >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >+ if (new_stat->dev == dev) >+ new_stat->dev = NULL; >+ break; >+ } >+ rcu_read_unlock(); >+ break; >+ } >+out: >+ return NOTIFY_DONE; >+} > > static struct genl_ops dropmon_ops[] = { > { >@@ -220,6 +318,10 @@ static struct genl_ops dropmon_ops[] = { > }, > }; > >+static struct notifier_block dropmon_net_notifier = { >+ .notifier_call = dropmon_net_event >+}; >+ > static int __init init_net_drop_monitor(void) > { > int cpu; >@@ -243,12 +345,18 @@ static int __init init_net_drop_monitor(void) > ret = genl_register_ops(&net_drop_monitor_family, > &dropmon_ops[i]); > if (ret) { >- printk(KERN_CRIT "failed to register operation %d\n", >+ printk(KERN_CRIT "Failed to register operation %d\n", > dropmon_ops[i].cmd); > goto out_unreg; > } > } > >+ rc = register_netdevice_notifier(&dropmon_net_notifier); >+ if (rc < 0) { >+ printk(KERN_CRIT "Failed to register netdevice notifier\n"); >+ goto out_unreg; >+ } >+ > rc = 0; > > for_each_present_cpu(cpu) { >@@ -259,6 +367,7 @@ static int __init init_net_drop_monitor(void) > data->send_timer.data = cpu; > data->send_timer.function = sched_send_work; > } >+ > goto out; > > out_unreg: >diff --git a/net/core/net-traces.c b/net/core/net-traces.c >index c8fb456..b07b25b 100644 >--- a/net/core/net-traces.c >+++ b/net/core/net-traces.c >@@ -20,6 +20,7 @@ > #include <linux/netlink.h> > #include <linux/net_dropmon.h> > #include <trace/skb.h> >+#include <trace/napi.h> > > #include <asm/unaligned.h> > #include <asm/bitops.h> >@@ -27,3 +28,6 @@ > > DEFINE_TRACE(kfree_skb); > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); >+ >+DEFINE_TRACE(napi_poll); 
>+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); >diff --git a/net/core/netpoll.c b/net/core/netpoll.c >index b5873bd..446d3ba 100644 >--- a/net/core/netpoll.c >+++ b/net/core/netpoll.c >@@ -24,6 +24,7 @@ > #include <net/tcp.h> > #include <net/udp.h> > #include <asm/unaligned.h> >+#include <trace/napi.h> > > /* > * We maintain a small pool of fully-sized skbs, to make sure the >@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, > set_bit(NAPI_STATE_NPSVC, &napi->state); > > work = napi->poll(napi, budget); >+ trace_napi_poll(napi->dev); > > clear_bit(NAPI_STATE_NPSVC, &napi->state); > atomic_dec(&trapped); ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 6:51 ` Jiri Pirko @ 2009-05-15 7:37 ` Eric Dumazet 2009-05-15 11:12 ` Neil Horman 2009-05-15 10:59 ` Neil Horman 1 sibling, 1 reply; 36+ messages in thread From: Eric Dumazet @ 2009-05-15 7:37 UTC (permalink / raw) To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem Jiri Pirko a écrit : > Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote: >> On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: >>>> + >>>> + /* >>>> + * Clean the device list >>>> + */ >>>> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >>> ^^^^^^^^^^^^^^^^^^^^^^^ >>> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. >>> Also it would be good to use list_for_each_entry_safe here since you're >>> modifying the list. >>> >> The definition of list_for_each_entry_rcu specifically says its safe against >> list-mutation primitives, so its fine. Although you are correct, in that its >> safety is dependent on the protection of rcu_read_lock(), so I'll add that in. > > You are right that list_for_each_entry_rcu is safe against list-mutation > primitives. But there's no need for this on the update side when you hold a writer > spinlock. Here I think it's better (and also less confusing) to use ordinary > list_for_each_entry which in this case must be list_for_each_entry_safe. Absolutely. RCU is tricky, and must be well understood. Using the right verbs is really needed to help everybody read the code and be able to understand it quickly and maintain it if needed. In this particular case, the list_for_each_entry_rcu() is a litle bit more expensive than regular list_for_each_entry(), as it includes additionnal barriers. 
Just reading code like this : + spin_lock(&trace_state_lock); + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } + rcu_read_unlock(); + spin_unlock(&trace_state_lock); We *know* something is wrong, as rcu_read_lock() is supposed to guard a readside, so having both rcu_read_lock() and list_del_rcu() is certainly wrong, we dont have to actually understand what is really done by the algorithm. So following is much better : (but maybe not correct... I let you check if list_for_each_entry_safe() would not be better here, since you delete elements in a list while iterating in it) Maybe you can add a break; after call_rcu() if you know only one element can match the "if (new_stat->dev == NULL) " condition, and use normal list_for_each_entry() + spin_lock(&trace_state_lock); + list_for_each_entry(new_stat, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } + spin_unlock(&trace_state_lock); Even if the 'wrong' version was not buggy (no crashes or corruption), the 'right' one is a commonly used construct in kernel that most dev are able to read without asking themselves "is is correct or not ? Dont we have something strange here ?" Neil, you need to read more code playing with RCU to get familiar with it, we all did same errors in the past :) ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 7:37 ` Eric Dumazet @ 2009-05-15 11:12 ` Neil Horman 0 siblings, 0 replies; 36+ messages in thread From: Neil Horman @ 2009-05-15 11:12 UTC (permalink / raw) To: Eric Dumazet; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem On Fri, May 15, 2009 at 09:37:28AM +0200, Eric Dumazet wrote: > Jiri Pirko a écrit : > > Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote: > >> On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: > >>>> + > >>>> + /* > >>>> + * Clean the device list > >>>> + */ > >>>> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > >>> ^^^^^^^^^^^^^^^^^^^^^^^ > >>> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. > >>> Also it would be good to use list_for_each_entry_safe here since you're > >>> modifying the list. > >>> > >> The definition of list_for_each_entry_rcu specifically says its safe against > >> list-mutation primitives, so its fine. Although you are correct, in that its > >> safety is dependent on the protection of rcu_read_lock(), so I'll add that in. > > > > You are right that list_for_each_entry_rcu is safe against list-mutation > > primitives. But there's no need for this on the update side when you hold a writer > > spinlock. Here I think it's better (and also less confusing) to use ordinary > > list_for_each_entry which in this case must be list_for_each_entry_safe. > > Absolutely. > > RCU is tricky, and must be well understood. Using the right verbs is really needed > to help everybody read the code and be able to understand it quickly and maintain it > if needed. > > In this particular case, the list_for_each_entry_rcu() is a litle bit more > expensive than regular list_for_each_entry(), as it includes additionnal barriers. 
> > Just reading code like this : > > + spin_lock(&trace_state_lock); > + rcu_read_lock(); > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == NULL) { > + list_del_rcu(&new_stat->list); > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > + } > + } > + rcu_read_unlock(); > + spin_unlock(&trace_state_lock); > > We *know* something is wrong, as rcu_read_lock() is supposed to guard a readside, > so having both rcu_read_lock() and list_del_rcu() is certainly wrong, we dont have > to actually understand what is really done by the algorithm. > > So following is much better : (but maybe not correct... I let you > check if list_for_each_entry_safe() would not be better here, since > you delete elements in a list while iterating in it) > > Maybe you can add a break; after call_rcu() if you know only one element > can match the "if (new_stat->dev == NULL) " condition, and use normal list_for_each_entry() > > + spin_lock(&trace_state_lock); > + list_for_each_entry(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == NULL) { > + list_del_rcu(&new_stat->list); > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > + } > + } > + spin_unlock(&trace_state_lock); > > Even if the 'wrong' version was not buggy (no crashes or corruption), the 'right' one > is a commonly used construct in kernel that most dev are able to read without asking > themselves "is is correct or not ? Dont we have something strange here ?" > > Neil, you need to read more code playing with RCU to get familiar with it, we all did same > errors in the past :) > Thanks :) So help me understand something here then. The trace_state_lock has no intention of protecting the actual read side of this list (the the napi trace hook that this patch adds). My understanding of list_for_each_entry_rcu and list_del_rcu, was that even with a call list_del_rcu, the list remained unchanged unti a quiescent point was passed by all cpus (which is detected via the counter in rcu_read_[un]lock. 
How is it insufficient to use rcu_read_[un]lock here to implement that protection? I agree it looks counterintuitive, but it makes sense to me from a functional standpoint. Further to that point, the example that you have above of what you think _should_ work is exactly what I have in the set_all_monitor_trace function currently (save for the conversion of list_for_each_entry_rcu to list_for_each_entry_safe and the removal of the read_lock). If those changes improve performance, then I can get behind them, but I don't see what problem they avoid if I don't use them. Can you clarify that? Regards Neil > > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 6:51 ` Jiri Pirko 2009-05-15 7:37 ` Eric Dumazet @ 2009-05-15 10:59 ` Neil Horman 2009-05-15 11:27 ` Jiri Pirko 1 sibling, 1 reply; 36+ messages in thread From: Neil Horman @ 2009-05-15 10:59 UTC (permalink / raw) To: Jiri Pirko; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem On Fri, May 15, 2009 at 08:51:02AM +0200, Jiri Pirko wrote: > Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote: > >On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: > >> >+ > >> >+ /* > >> >+ * Clean the device list > >> >+ */ > >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > >> ^^^^^^^^^^^^^^^^^^^^^^^ > >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. > >> Also it would be good to use list_for_each_entry_safe here since you're > >> modifying the list. > >> > > > >The definition of list_for_each_entry_rcu specifically says its safe against > >list-mutation primitives, so its fine. Although you are correct, in that its > >safety is dependent on the protection of rcu_read_lock(), so I'll add that in. > > You are right that list_for_each_entry_rcu is safe against list-mutation > primitives. But there's no need for this on the update side when you hold a writer > spinlock. Here I think it's better (and also less confusing) to use ordinary > list_for_each_entry which in this case must be list_for_each_entry_safe. > > Jirka > I disagree. I think visually its more understandable with the rcu variant, and that spinlock isn't a writer spinlock, the other uses of this list don't use that lock (to imporove performance). The spinlock is there simply to serialize changes to the trace state (so that one cpu doen't try to turn dropmon on, while another tries to turn it off). So we need the rcu variant to protect the list removal against concurrent use on the reader side in the new registered trace point function. Neil > >Thanks for the catch! 
New patch attached > > > >Change notes: > >1) Add rcu_read_lock/unlock protection around TRACE_OFF event > > > >Neil > > > > > >Patch to add the ability to detect drops in hardware interfaces via dropwatch. > >Adds a tracepoint to net_rx_action to signal everytime a napi instance is > >polled. The dropmon code then periodically checks to see if the rx_frames > >counter has changed, and if so, adds a drop notification to the netlink > >protocol, using the reserved all-0's vector to indicate the drop location was in > >hardware, rather than somewhere in the code. > > > >Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > > > > > > include/linux/net_dropmon.h | 7 ++ > > include/trace/napi.h | 11 ++++ > > net/core/dev.c | 5 + > > net/core/drop_monitor.c | 117 ++++++++++++++++++++++++++++++++++++++++++-- > > net/core/net-traces.c | 4 + > > net/core/netpoll.c | 2 > > 6 files changed, 141 insertions(+), 5 deletions(-) > > > >diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h > >index 0217fb8..9addc62 100644 > >--- a/include/linux/net_dropmon.h > >+++ b/include/linux/net_dropmon.h > >@@ -8,6 +8,13 @@ struct net_dm_drop_point { > > __u32 count; > > }; > > > >+#define is_drop_point_hw(x) do {\ > >+ int ____i, ____j;\ > >+ for (____i = 0; ____i < 8; i ____i++)\ > >+ ____j |= x[____i];\ > >+ ____j;\ > >+} while (0) > >+ > > #define NET_DM_CFG_VERSION 0 > > #define NET_DM_CFG_ALERT_COUNT 1 > > #define NET_DM_CFG_ALERT_DELAY 2 > >diff --git a/include/trace/napi.h b/include/trace/napi.h > >new file mode 100644 > >index 0000000..7c39eb7 > >--- /dev/null > >+++ b/include/trace/napi.h > >@@ -0,0 +1,11 @@ > >+#ifndef _TRACE_NAPI_H_ > >+#define _TRACE_NAPI_H_ > >+ > >+#include <linux/skbuff.h> > >+#include <linux/tracepoint.h> > >+ > >+DECLARE_TRACE(napi_poll, > >+ TP_PROTO(struct napi_struct *napi), > >+ TP_ARGS(napi)); > >+ > >+#endif > >diff --git a/net/core/dev.c b/net/core/dev.c > >index 637ea71..165979c 100644 > >--- a/net/core/dev.c > >+++ b/net/core/dev.c 
> >@@ -126,6 +126,7 @@ > > #include <linux/in.h> > > #include <linux/jhash.h> > > #include <linux/random.h> > >+#include <trace/napi.h> > > > > #include "net-sysfs.h" > > > >@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) > > * accidently calling ->poll() when NAPI is not scheduled. > > */ > > work = 0; > >- if (test_bit(NAPI_STATE_SCHED, &n->state)) > >+ if (test_bit(NAPI_STATE_SCHED, &n->state)) { > > work = n->poll(n, weight); > >+ trace_napi_poll(n); > >+ } > > > > WARN_ON_ONCE(work > weight); > > > >diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c > >index 2797b71..f9130d5 100644 > >--- a/net/core/drop_monitor.c > >+++ b/net/core/drop_monitor.c > >@@ -22,8 +22,10 @@ > > #include <linux/timer.h> > > #include <linux/bitops.h> > > #include <net/genetlink.h> > >+#include <net/netevent.h> > > > > #include <trace/skb.h> > >+#include <trace/napi.h> > > > > #include <asm/unaligned.h> > > > >@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); > > * and the work handle that will send up > > * netlink alerts > > */ > >-struct sock *dm_sock; > >+static int trace_state = TRACE_OFF; > >+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; > > > > struct per_cpu_dm_data { > > struct work_struct dm_alert_work; > >@@ -47,6 +50,13 @@ struct per_cpu_dm_data { > > struct timer_list send_timer; > > }; > > > >+struct dm_hw_stat_delta { > >+ struct net_device *dev; > >+ struct list_head list; > >+ struct rcu_head rcu; > >+ unsigned long last_drop_val; > >+}; > >+ > > static struct genl_family net_drop_monitor_family = { > > .id = GENL_ID_GENERATE, > > .hdrsize = 0, > >@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); > > > > static int dm_hit_limit = 64; > > static int dm_delay = 1; > >- > >+static unsigned long dm_hw_check_delta = 2*HZ; > >+static LIST_HEAD(hw_stats_list); > > > > static void reset_per_cpu_data(struct per_cpu_dm_data *data) > > { > >@@ -115,7 +126,7 @@ static void 
sched_send_work(unsigned long unused) > > schedule_work(&data->dm_alert_work); > > } > > > >-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) > >+static void trace_drop_common(struct sk_buff *skb, void *location) > > { > > struct net_dm_alert_msg *msg; > > struct nlmsghdr *nlh; > >@@ -159,26 +170,81 @@ out: > > return; > > } > > > >+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) > >+{ > >+ trace_drop_common(skb, location); > >+} > >+ > >+static void trace_napi_poll_hit(struct napi_struct *napi) > >+{ > >+ struct dm_hw_stat_delta *new_stat; > >+ > >+ /* > >+ * Ratelimit our check time to dm_hw_check_delta jiffies > >+ */ > >+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) > >+ return; > >+ > >+ rcu_read_lock(); > >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > >+ if ((new_stat->dev == napi->dev) && > >+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { > >+ trace_drop_common(NULL, NULL); > >+ new_stat->last_drop_val = napi->dev->stats.rx_dropped; > >+ break; > >+ } > >+ } > >+ rcu_read_unlock(); > >+} > >+ > >+ > >+static void free_dm_hw_stat(struct rcu_head *head) > >+{ > >+ struct dm_hw_stat_delta *n; > >+ n = container_of(head, struct dm_hw_stat_delta, rcu); > >+ kfree(n); > >+} > >+ > > static int set_all_monitor_traces(int state) > > { > > int rc = 0; > >+ struct dm_hw_stat_delta *new_stat = NULL; > >+ > >+ spin_lock(&trace_state_lock); > > > > switch (state) { > > case TRACE_ON: > > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); > >+ rc |= register_trace_napi_poll(trace_napi_poll_hit); > > break; > > case TRACE_OFF: > > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); > >+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > > > tracepoint_synchronize_unregister(); > >+ > >+ /* > >+ * Clean the device list > >+ */ > >+ rcu_read_lock(); > >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > >+ if (new_stat->dev == NULL) { > >+ 
list_del_rcu(&new_stat->list); > >+ call_rcu(&new_stat->rcu, free_dm_hw_stat); > >+ } > >+ } > >+ rcu_read_unlock(); > > break; > > default: > > rc = 1; > > break; > > } > > > >+ spin_unlock(&trace_state_lock); > >+ > > if (rc) > > return -EINPROGRESS; > >+ trace_state = state; > > return rc; > > } > > > >@@ -204,6 +270,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb, > > return -ENOTSUPP; > > } > > > >+static int dropmon_net_event(struct notifier_block *ev_block, > >+ unsigned long event, void *ptr) > >+{ > >+ struct net_device *dev = ptr; > >+ struct dm_hw_stat_delta *new_stat = NULL; > >+ > >+ switch (event) { > >+ case NETDEV_REGISTER: > >+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > >+ > >+ if (!new_stat) > >+ goto out; > >+ > >+ new_stat->dev = dev; > >+ INIT_RCU_HEAD(&new_stat->rcu); > >+ spin_lock(&trace_state_lock); > >+ list_add_rcu(&new_stat->list, &hw_stats_list); > >+ spin_unlock(&trace_state_lock); > >+ break; > >+ case NETDEV_UNREGISTER: > >+ rcu_read_lock(); > >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > >+ if (new_stat->dev == dev) > >+ new_stat->dev = NULL; > >+ break; > >+ } > >+ rcu_read_unlock(); > >+ break; > >+ } > >+out: > >+ return NOTIFY_DONE; > >+} > > > > static struct genl_ops dropmon_ops[] = { > > { > >@@ -220,6 +318,10 @@ static struct genl_ops dropmon_ops[] = { > > }, > > }; > > > >+static struct notifier_block dropmon_net_notifier = { > >+ .notifier_call = dropmon_net_event > >+}; > >+ > > static int __init init_net_drop_monitor(void) > > { > > int cpu; > >@@ -243,12 +345,18 @@ static int __init init_net_drop_monitor(void) > > ret = genl_register_ops(&net_drop_monitor_family, > > &dropmon_ops[i]); > > if (ret) { > >- printk(KERN_CRIT "failed to register operation %d\n", > >+ printk(KERN_CRIT "Failed to register operation %d\n", > > dropmon_ops[i].cmd); > > goto out_unreg; > > } > > } > > > >+ rc = register_netdevice_notifier(&dropmon_net_notifier); > >+ if (rc < 0) { > >+ 
printk(KERN_CRIT "Failed to register netdevice notifier\n"); > >+ goto out_unreg; > >+ } > >+ > > rc = 0; > > > > for_each_present_cpu(cpu) { > >@@ -259,6 +367,7 @@ static int __init init_net_drop_monitor(void) > > data->send_timer.data = cpu; > > data->send_timer.function = sched_send_work; > > } > >+ > > goto out; > > > > out_unreg: > >diff --git a/net/core/net-traces.c b/net/core/net-traces.c > >index c8fb456..b07b25b 100644 > >--- a/net/core/net-traces.c > >+++ b/net/core/net-traces.c > >@@ -20,6 +20,7 @@ > > #include <linux/netlink.h> > > #include <linux/net_dropmon.h> > > #include <trace/skb.h> > >+#include <trace/napi.h> > > > > #include <asm/unaligned.h> > > #include <asm/bitops.h> > >@@ -27,3 +28,6 @@ > > > > DEFINE_TRACE(kfree_skb); > > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); > >+ > >+DEFINE_TRACE(napi_poll); > >+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); > >diff --git a/net/core/netpoll.c b/net/core/netpoll.c > >index b5873bd..446d3ba 100644 > >--- a/net/core/netpoll.c > >+++ b/net/core/netpoll.c > >@@ -24,6 +24,7 @@ > > #include <net/tcp.h> > > #include <net/udp.h> > > #include <asm/unaligned.h> > >+#include <trace/napi.h> > > > > /* > > * We maintain a small pool of fully-sized skbs, to make sure the > >@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, > > set_bit(NAPI_STATE_NPSVC, &napi->state); > > > > work = napi->poll(napi, budget); > >+ trace_napi_poll(napi->dev); > > > > clear_bit(NAPI_STATE_NPSVC, &napi->state); > > atomic_dec(&trapped); > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 10:59 ` Neil Horman @ 2009-05-15 11:27 ` Jiri Pirko 2009-05-15 16:07 ` Neil Horman 0 siblings, 1 reply; 36+ messages in thread From: Jiri Pirko @ 2009-05-15 11:27 UTC (permalink / raw) To: Neil Horman; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem Fri, May 15, 2009 at 12:59:09PM CEST, nhorman@tuxdriver.com wrote: >On Fri, May 15, 2009 at 08:51:02AM +0200, Jiri Pirko wrote: >> Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote: >> >On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: >> >> >+ >> >> >+ /* >> >> >+ * Clean the device list >> >> >+ */ >> >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >> >> ^^^^^^^^^^^^^^^^^^^^^^^ >> >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. >> >> Also it would be good to use list_for_each_entry_safe here since you're >> >> modifying the list. >> >> >> > >> >The definition of list_for_each_entry_rcu specifically says its safe against >> >list-mutation primitives, so its fine. Although you are correct, in that its >> >safety is dependent on the protection of rcu_read_lock(), so I'll add that in. >> >> You are right that list_for_each_entry_rcu is safe against list-mutation >> primitives. But there's no need for this on the update side when you hold a writer >> spinlock. Here I think it's better (and also less confusing) to use ordinary >> list_for_each_entry which in this case must be list_for_each_entry_safe. >> >> Jirka >> >I disagree. I think visually its more understandable with the rcu variant, and >that spinlock isn't a writer spinlock, the other uses of this list don't use >that lock (to imporove performance). The spinlock is there simply to serialize >changes to the trace state (so that one cpu doen't try to turn dropmon on, while >another tries to turn it off). 
So we need the rcu variant to protect the list >removal against concurrent use on the reader side in the new registered trace >point function. Ok, since your spinlock is not a writer lock, you need to protect writers from concurrent list updates anyway! And this spinlock does that work. Then you _do not need_ rcu_read_lock to read from list, you work with it as with ordinary list while reading. As Eric wrote, this approach is common. Jirka > >Neil > >> >Thanks for the catch! New patch attached >> > >> >Change notes: >> >1) Add rcu_read_lock/unlock protection around TRACE_OFF event >> > >> >Neil >> > >> > >> >Patch to add the ability to detect drops in hardware interfaces via dropwatch. >> >Adds a tracepoint to net_rx_action to signal every time a napi instance is >> >polled. The dropmon code then periodically checks to see if the rx_frames >> >counter has changed, and if so, adds a drop notification to the netlink >> >protocol, using the reserved all-0's vector to indicate the drop location was in >> >hardware, rather than somewhere in the code. 
>> > >> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com> >> > >> > >> > include/linux/net_dropmon.h | 7 ++ >> > include/trace/napi.h | 11 ++++ >> > net/core/dev.c | 5 + >> > net/core/drop_monitor.c | 117 ++++++++++++++++++++++++++++++++++++++++++-- >> > net/core/net-traces.c | 4 + >> > net/core/netpoll.c | 2 >> > 6 files changed, 141 insertions(+), 5 deletions(-) >> > >> >diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h >> >index 0217fb8..9addc62 100644 >> >--- a/include/linux/net_dropmon.h >> >+++ b/include/linux/net_dropmon.h >> >@@ -8,6 +8,13 @@ struct net_dm_drop_point { >> > __u32 count; >> > }; >> > >> >+#define is_drop_point_hw(x) do {\ >> >+ int ____i, ____j;\ >> >+ for (____i = 0; ____i < 8; i ____i++)\ >> >+ ____j |= x[____i];\ >> >+ ____j;\ >> >+} while (0) >> >+ >> > #define NET_DM_CFG_VERSION 0 >> > #define NET_DM_CFG_ALERT_COUNT 1 >> > #define NET_DM_CFG_ALERT_DELAY 2 >> >diff --git a/include/trace/napi.h b/include/trace/napi.h >> >new file mode 100644 >> >index 0000000..7c39eb7 >> >--- /dev/null >> >+++ b/include/trace/napi.h >> >@@ -0,0 +1,11 @@ >> >+#ifndef _TRACE_NAPI_H_ >> >+#define _TRACE_NAPI_H_ >> >+ >> >+#include <linux/skbuff.h> >> >+#include <linux/tracepoint.h> >> >+ >> >+DECLARE_TRACE(napi_poll, >> >+ TP_PROTO(struct napi_struct *napi), >> >+ TP_ARGS(napi)); >> >+ >> >+#endif >> >diff --git a/net/core/dev.c b/net/core/dev.c >> >index 637ea71..165979c 100644 >> >--- a/net/core/dev.c >> >+++ b/net/core/dev.c >> >@@ -126,6 +126,7 @@ >> > #include <linux/in.h> >> > #include <linux/jhash.h> >> > #include <linux/random.h> >> >+#include <trace/napi.h> >> > >> > #include "net-sysfs.h" >> > >> >@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) >> > * accidently calling ->poll() when NAPI is not scheduled. 
>> > */ >> > work = 0; >> >- if (test_bit(NAPI_STATE_SCHED, &n->state)) >> >+ if (test_bit(NAPI_STATE_SCHED, &n->state)) { >> > work = n->poll(n, weight); >> >+ trace_napi_poll(n); >> >+ } >> > >> > WARN_ON_ONCE(work > weight); >> > >> >diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c >> >index 2797b71..f9130d5 100644 >> >--- a/net/core/drop_monitor.c >> >+++ b/net/core/drop_monitor.c >> >@@ -22,8 +22,10 @@ >> > #include <linux/timer.h> >> > #include <linux/bitops.h> >> > #include <net/genetlink.h> >> >+#include <net/netevent.h> >> > >> > #include <trace/skb.h> >> >+#include <trace/napi.h> >> > >> > #include <asm/unaligned.h> >> > >> >@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); >> > * and the work handle that will send up >> > * netlink alerts >> > */ >> >-struct sock *dm_sock; >> >+static int trace_state = TRACE_OFF; >> >+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; >> > >> > struct per_cpu_dm_data { >> > struct work_struct dm_alert_work; >> >@@ -47,6 +50,13 @@ struct per_cpu_dm_data { >> > struct timer_list send_timer; >> > }; >> > >> >+struct dm_hw_stat_delta { >> >+ struct net_device *dev; >> >+ struct list_head list; >> >+ struct rcu_head rcu; >> >+ unsigned long last_drop_val; >> >+}; >> >+ >> > static struct genl_family net_drop_monitor_family = { >> > .id = GENL_ID_GENERATE, >> > .hdrsize = 0, >> >@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); >> > >> > static int dm_hit_limit = 64; >> > static int dm_delay = 1; >> >- >> >+static unsigned long dm_hw_check_delta = 2*HZ; >> >+static LIST_HEAD(hw_stats_list); >> > >> > static void reset_per_cpu_data(struct per_cpu_dm_data *data) >> > { >> >@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) >> > schedule_work(&data->dm_alert_work); >> > } >> > >> >-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) >> >+static void trace_drop_common(struct sk_buff *skb, void *location) >> > { >> > struct 
net_dm_alert_msg *msg; >> > struct nlmsghdr *nlh; >> >@@ -159,26 +170,81 @@ out: >> > return; >> > } >> > >> >+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) >> >+{ >> >+ trace_drop_common(skb, location); >> >+} >> >+ >> >+static void trace_napi_poll_hit(struct napi_struct *napi) >> >+{ >> >+ struct dm_hw_stat_delta *new_stat; >> >+ >> >+ /* >> >+ * Ratelimit our check time to dm_hw_check_delta jiffies >> >+ */ >> >+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) >> >+ return; >> >+ >> >+ rcu_read_lock(); >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >> >+ if ((new_stat->dev == napi->dev) && >> >+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { >> >+ trace_drop_common(NULL, NULL); >> >+ new_stat->last_drop_val = napi->dev->stats.rx_dropped; >> >+ break; >> >+ } >> >+ } >> >+ rcu_read_unlock(); >> >+} >> >+ >> >+ >> >+static void free_dm_hw_stat(struct rcu_head *head) >> >+{ >> >+ struct dm_hw_stat_delta *n; >> >+ n = container_of(head, struct dm_hw_stat_delta, rcu); >> >+ kfree(n); >> >+} >> >+ >> > static int set_all_monitor_traces(int state) >> > { >> > int rc = 0; >> >+ struct dm_hw_stat_delta *new_stat = NULL; >> >+ >> >+ spin_lock(&trace_state_lock); >> > >> > switch (state) { >> > case TRACE_ON: >> > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); >> >+ rc |= register_trace_napi_poll(trace_napi_poll_hit); >> > break; >> > case TRACE_OFF: >> > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); >> >+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit); >> > >> > tracepoint_synchronize_unregister(); >> >+ >> >+ /* >> >+ * Clean the device list >> >+ */ >> >+ rcu_read_lock(); >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >> >+ if (new_stat->dev == NULL) { >> >+ list_del_rcu(&new_stat->list); >> >+ call_rcu(&new_stat->rcu, free_dm_hw_stat); >> >+ } >> >+ } >> >+ rcu_read_unlock(); >> > break; >> > default: >> > rc = 1; >> > break; >> > } >> > >> >+ 
spin_unlock(&trace_state_lock); >> >+ >> > if (rc) >> > return -EINPROGRESS; >> >+ trace_state = state; >> > return rc; >> > } >> > >> >@@ -204,6 +270,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb, >> > return -ENOTSUPP; >> > } >> > >> >+static int dropmon_net_event(struct notifier_block *ev_block, >> >+ unsigned long event, void *ptr) >> >+{ >> >+ struct net_device *dev = ptr; >> >+ struct dm_hw_stat_delta *new_stat = NULL; >> >+ >> >+ switch (event) { >> >+ case NETDEV_REGISTER: >> >+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); >> >+ >> >+ if (!new_stat) >> >+ goto out; >> >+ >> >+ new_stat->dev = dev; >> >+ INIT_RCU_HEAD(&new_stat->rcu); >> >+ spin_lock(&trace_state_lock); >> >+ list_add_rcu(&new_stat->list, &hw_stats_list); >> >+ spin_unlock(&trace_state_lock); >> >+ break; >> >+ case NETDEV_UNREGISTER: >> >+ rcu_read_lock(); >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { >> >+ if (new_stat->dev == dev) >> >+ new_stat->dev = NULL; >> >+ break; >> >+ } >> >+ rcu_read_unlock(); >> >+ break; >> >+ } >> >+out: >> >+ return NOTIFY_DONE; >> >+} >> > >> > static struct genl_ops dropmon_ops[] = { >> > { >> >@@ -220,6 +318,10 @@ static struct genl_ops dropmon_ops[] = { >> > }, >> > }; >> > >> >+static struct notifier_block dropmon_net_notifier = { >> >+ .notifier_call = dropmon_net_event >> >+}; >> >+ >> > static int __init init_net_drop_monitor(void) >> > { >> > int cpu; >> >@@ -243,12 +345,18 @@ static int __init init_net_drop_monitor(void) >> > ret = genl_register_ops(&net_drop_monitor_family, >> > &dropmon_ops[i]); >> > if (ret) { >> >- printk(KERN_CRIT "failed to register operation %d\n", >> >+ printk(KERN_CRIT "Failed to register operation %d\n", >> > dropmon_ops[i].cmd); >> > goto out_unreg; >> > } >> > } >> > >> >+ rc = register_netdevice_notifier(&dropmon_net_notifier); >> >+ if (rc < 0) { >> >+ printk(KERN_CRIT "Failed to register netdevice notifier\n"); >> >+ goto out_unreg; >> >+ } >> >+ >> > rc = 0; >> > >> > 
for_each_present_cpu(cpu) { >> >@@ -259,6 +367,7 @@ static int __init init_net_drop_monitor(void) >> > data->send_timer.data = cpu; >> > data->send_timer.function = sched_send_work; >> > } >> >+ >> > goto out; >> > >> > out_unreg: >> >diff --git a/net/core/net-traces.c b/net/core/net-traces.c >> >index c8fb456..b07b25b 100644 >> >--- a/net/core/net-traces.c >> >+++ b/net/core/net-traces.c >> >@@ -20,6 +20,7 @@ >> > #include <linux/netlink.h> >> > #include <linux/net_dropmon.h> >> > #include <trace/skb.h> >> >+#include <trace/napi.h> >> > >> > #include <asm/unaligned.h> >> > #include <asm/bitops.h> >> >@@ -27,3 +28,6 @@ >> > >> > DEFINE_TRACE(kfree_skb); >> > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); >> >+ >> >+DEFINE_TRACE(napi_poll); >> >+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); >> >diff --git a/net/core/netpoll.c b/net/core/netpoll.c >> >index b5873bd..446d3ba 100644 >> >--- a/net/core/netpoll.c >> >+++ b/net/core/netpoll.c >> >@@ -24,6 +24,7 @@ >> > #include <net/tcp.h> >> > #include <net/udp.h> >> > #include <asm/unaligned.h> >> >+#include <trace/napi.h> >> > >> > /* >> > * We maintain a small pool of fully-sized skbs, to make sure the >> >@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, >> > set_bit(NAPI_STATE_NPSVC, &napi->state); >> > >> > work = napi->poll(napi, budget); >> >+ trace_napi_poll(napi->dev); >> > >> > clear_bit(NAPI_STATE_NPSVC, &napi->state); >> > atomic_dec(&trapped); >> ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 11:27 ` Jiri Pirko @ 2009-05-15 16:07 ` Neil Horman 2009-05-15 18:11 ` Eric Dumazet 2009-05-15 18:18 ` Stephen Hemminger 0 siblings, 2 replies; 36+ messages in thread From: Neil Horman @ 2009-05-15 16:07 UTC (permalink / raw) To: Jiri Pirko; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem On Fri, May 15, 2009 at 01:27:36PM +0200, Jiri Pirko wrote: > Fri, May 15, 2009 at 12:59:09PM CEST, nhorman@tuxdriver.com wrote: > >On Fri, May 15, 2009 at 08:51:02AM +0200, Jiri Pirko wrote: > >> Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote: > >> >On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote: > >> >> >+ > >> >> >+ /* > >> >> >+ * Clean the device list > >> >> >+ */ > >> >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > >> >> ^^^^^^^^^^^^^^^^^^^^^^^ > >> >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock. > >> >> Also it would be good to use list_for_each_entry_safe here since you're > >> >> modifying the list. > >> >> > >> > > >> >The definition of list_for_each_entry_rcu specifically says its safe against > >> >list-mutation primitives, so its fine. Although you are correct, in that its > >> >safety is dependent on the protection of rcu_read_lock(), so I'll add that in. > >> > >> You are right that list_for_each_entry_rcu is safe against list-mutation > >> primitives. But there's no need for this on the update side when you hold a writer > >> spinlock. Here I think it's better (and also less confusing) to use ordinary > >> list_for_each_entry which in this case must be list_for_each_entry_safe. > >> > >> Jirka > >> > >I disagree. I think visually its more understandable with the rcu variant, and > >that spinlock isn't a writer spinlock, the other uses of this list don't use > >that lock (to imporove performance). 
The spinlock is there simply to serialize > >changes to the trace state (so that one cpu doen't try to turn dropmon on, while > >another tries to turn it off). So we need the rcu variant to protect the list > >removal against concurrent use on the reader side in the new registered trace > >point function. > > Ok, since you spinlock is not writer lock, you need to protect writers from > concurrent list updates anyway! And this spinlock does that work. Then you > _donotneed_ rcu_read_lock to read from list, you work with it as with ordinary > list while reading. As Eric wrote this approach is common. > > Jirka > Ok, I've spent the morning looking at several other examples, and re-reading your comments and Erics, and agree, the rcu_read_lock doesn't need to be there, and a writer mutex does (which I can use trace_state_lock for). As such, I've got a new patch below, tested and working. I think it fully takes your and Erics comments into account. V4 Change Notes: 1) Remove rcu_read_lock from TRACE_OFF case in set_all_monitor_traces, since its not needed in light of the use of the trace_state_lock to protect list mutations 2) Change list_for_each_entry_rcu to list_for_each_entry_safe, since rcu list traversal isn't needed here. 3) Fully protect global tracer state with trace_state lock 4) Add freeing code to the NETDEV_UNREGISTER event notification to safe on memory if the tracer is never used (using trace_state_lock properly here too) Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal everytime a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. 
Signed-off-by: Neil Horman <nhorman@tuxdriver.com> include/linux/net_dropmon.h | 7 ++ include/trace/napi.h | 11 +++ net/core/dev.c | 5 + net/core/drop_monitor.c | 127 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 151 insertions(+), 5 deletions(-) diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb8..9addc62 100644 --- a/include/linux/net_dropmon.h +++ b/include/linux/net_dropmon.h @@ -8,6 +8,13 @@ struct net_dm_drop_point { __u32 count; }; +#define is_drop_point_hw(x) do {\ + int ____i, ____j;\ + for (____i = 0; ____i < 8; i ____i++)\ + ____j |= x[____i];\ + ____j;\ +} while (0) + #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 diff --git a/include/trace/napi.h b/include/trace/napi.h new file mode 100644 index 0000000..7c39eb7 --- /dev/null +++ b/include/trace/napi.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_NAPI_H_ +#define _TRACE_NAPI_H_ + +#include <linux/skbuff.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(napi_poll, + TP_PROTO(struct napi_struct *napi), + TP_ARGS(napi)); + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index 637ea71..165979c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> +#include <trace/napi.h> #include "net-sysfs.h" @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. 
*/ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2797b71..4f7e915 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include <linux/timer.h> #include <linux/bitops.h> #include <net/genetlink.h> +#include <net/netevent.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,6 +50,13 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; @@ -159,24 +170,80 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * 
Ratelimit our check time to dm_hw_check_delta jiffies + */ + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } + rcu_read_unlock(); +} + + +static void free_dm_hw_stat(struct rcu_head *head) +{ + struct dm_hw_stat_delta *n; + n = container_of(head, struct dm_hw_stat_delta, rcu); + kfree(n); +} + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *temp; + + spin_lock(&trace_state_lock); switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } break; default: rc = 1; break; } + if (!rc) + trace_state = state; + + spin_unlock(&trace_state_lock); + if (rc) return -EINPROGRESS; return rc; @@ -204,6 +271,47 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + int found = 0; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + INIT_RCU_HEAD(&new_stat->rcu); + spin_lock(&trace_state_lock); + 
list_add_rcu(&new_stat->list, &hw_stats_list); + spin_unlock(&trace_state_lock); + break; + case NETDEV_UNREGISTER: + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if (new_stat->dev == dev) + new_stat->dev = NULL; + found = 1; + break; + } + rcu_read_unlock(); + + spin_lock(&trace_state_lock); + if (found && (trace_state == TRACE_OFF)) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + spin_unlock(&trace_state_lock); + break; + } +out: + return NOTIFY_DONE; +} static struct genl_ops dropmon_ops[] = { { @@ -220,6 +328,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -243,12 +355,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -259,6 +377,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb456..b07b25b 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -20,6 +20,7 @@ #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> #include <asm/bitops.h> @@ -27,3 +28,6 @@ DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +DEFINE_TRACE(napi_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 
b5873bd..446d3ba 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> +#include <trace/napi.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi->dev); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); ^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 16:07 ` Neil Horman @ 2009-05-15 18:11 ` Eric Dumazet 2009-05-15 18:23 ` Stephen Hemminger ` (2 more replies) 2009-05-15 18:18 ` Stephen Hemminger 1 sibling, 3 replies; 36+ messages in thread From: Eric Dumazet @ 2009-05-15 18:11 UTC (permalink / raw) To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem Neil Horman a écrit : > +static int dropmon_net_event(struct notifier_block *ev_block, > + unsigned long event, void *ptr) > +{ > + struct net_device *dev = ptr; > + struct dm_hw_stat_delta *new_stat = NULL; > + int found = 0; > + > + switch (event) { > + case NETDEV_REGISTER: > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > + > + if (!new_stat) > + goto out; > + > + new_stat->dev = dev; > + INIT_RCU_HEAD(&new_stat->rcu); > + spin_lock(&trace_state_lock); > + list_add_rcu(&new_stat->list, &hw_stats_list); > + spin_unlock(&trace_state_lock); > + break; > + case NETDEV_UNREGISTER: > + rcu_read_lock(); > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == dev) > + new_stat->dev = NULL; > + found = 1; > + break; > + } > + rcu_read_unlock(); This is racy, unless caller already owns a lock. If caller aleady owns a lock, you dont need : rcu_read_lock() list_for_each_entry_rcu() rcu_read_unlock(); > + > + spin_lock(&trace_state_lock); > + if (found && (trace_state == TRACE_OFF)) { > + list_del_rcu(&new_stat->list); > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > + } > + spin_unlock(&trace_state_lock); > + break; Its not clear that you use trace_state_lock as the lock guarding all this. 
If this is the case I suggest a plain and straight forward : case NETDEV_UNREGISTER: spin_lock(&trace_state_lock); if (trace_state == TRACE_OFF) { list_for_each_entry(new_stat, &hw_stats_list, list) { if (new_stat->dev == dev) { new_stat->dev = NULL; list_del_rcu(&new_stat->list); call_rcu(&new_stat->rcu, free_dm_hw_stat); break; } } } spin_unlock(&trace_state_lock); break; ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 18:11 ` Eric Dumazet @ 2009-05-15 18:23 ` Stephen Hemminger 2009-05-15 19:53 ` Neil Horman 2009-05-15 19:23 ` Neil Horman 2009-05-16 12:40 ` Neil Horman 2 siblings, 1 reply; 36+ messages in thread From: Stephen Hemminger @ 2009-05-15 18:23 UTC (permalink / raw) To: Eric Dumazet; +Cc: Neil Horman, Jiri Pirko, Paul E. McKenney, netdev, davem On Fri, 15 May 2009 20:11:57 +0200 Eric Dumazet <dada1@cosmosbay.com> wrote: > Neil Horman a écrit : > > +static int dropmon_net_event(struct notifier_block *ev_block, > > + unsigned long event, void *ptr) > > +{ > > + struct net_device *dev = ptr; > > + struct dm_hw_stat_delta *new_stat = NULL; > > + int found = 0; > > + > > + switch (event) { > > + case NETDEV_REGISTER: > > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > > + > > + if (!new_stat) > > + goto out; > > + > > + new_stat->dev = dev; > > + INIT_RCU_HEAD(&new_stat->rcu); > > + spin_lock(&trace_state_lock); > > + list_add_rcu(&new_stat->list, &hw_stats_list); > > + spin_unlock(&trace_state_lock); > > + break; > > + case NETDEV_UNREGISTER: > > + rcu_read_lock(); > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > + if (new_stat->dev == dev) > > + new_stat->dev = NULL; > > + found = 1; > > + break; > > + } > > + rcu_read_unlock(); > > This is racy, unless caller already owns a lock. > > If caller aleady owns a lock, you dont need : > > rcu_read_lock() > list_for_each_entry_rcu() > rcu_read_unlock(); RTNL mutex is always held on notification call backs. Actually why is trace_state_lock needed at all? Why not just use the RTNL mutex? -- ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 18:23 ` Stephen Hemminger @ 2009-05-15 19:53 ` Neil Horman 0 siblings, 0 replies; 36+ messages in thread From: Neil Horman @ 2009-05-15 19:53 UTC (permalink / raw) To: Stephen Hemminger Cc: Eric Dumazet, Jiri Pirko, Paul E. McKenney, netdev, davem On Fri, May 15, 2009 at 11:23:45AM -0700, Stephen Hemminger wrote: > On Fri, 15 May 2009 20:11:57 +0200 > Eric Dumazet <dada1@cosmosbay.com> wrote: > > > Neil Horman a écrit : > > > +static int dropmon_net_event(struct notifier_block *ev_block, > > > + unsigned long event, void *ptr) > > > +{ > > > + struct net_device *dev = ptr; > > > + struct dm_hw_stat_delta *new_stat = NULL; > > > + int found = 0; > > > + > > > + switch (event) { > > > + case NETDEV_REGISTER: > > > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > > > + > > > + if (!new_stat) > > > + goto out; > > > + > > > + new_stat->dev = dev; > > > + INIT_RCU_HEAD(&new_stat->rcu); > > > + spin_lock(&trace_state_lock); > > > + list_add_rcu(&new_stat->list, &hw_stats_list); > > > + spin_unlock(&trace_state_lock); > > > + break; > > > + case NETDEV_UNREGISTER: > > > + rcu_read_lock(); > > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > > + if (new_stat->dev == dev) > > > + new_stat->dev = NULL; > > > + found = 1; > > > + break; > > > + } > > > + rcu_read_unlock(); > > > > This is racy, unless caller already owns a lock. > > > > If caller aleady owns a lock, you dont need : > > > > rcu_read_lock() > > list_for_each_entry_rcu() > > rcu_read_unlock(); > > RTNL mutex is always held on notification call backs. > Actually why is trace_state_lock needed at all? Why not > just use the RTNL mutex? > Theres also a need for this mutex to protect simeoustaneous state changes to the tracer from userspace. I suppose I could use the rtnl lock for that, but I think this is more readable. 
Neil > > > -- > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 18:11 ` Eric Dumazet 2009-05-15 18:23 ` Stephen Hemminger @ 2009-05-15 19:23 ` Neil Horman 2009-05-16 12:40 ` Neil Horman 2 siblings, 0 replies; 36+ messages in thread From: Neil Horman @ 2009-05-15 19:23 UTC (permalink / raw) To: Eric Dumazet; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem On Fri, May 15, 2009 at 08:11:57PM +0200, Eric Dumazet wrote: > Neil Horman a écrit : > > +static int dropmon_net_event(struct notifier_block *ev_block, > > + unsigned long event, void *ptr) > > +{ > > + struct net_device *dev = ptr; > > + struct dm_hw_stat_delta *new_stat = NULL; > > + int found = 0; > > + > > + switch (event) { > > + case NETDEV_REGISTER: > > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > > + > > + if (!new_stat) > > + goto out; > > + > > + new_stat->dev = dev; > > + INIT_RCU_HEAD(&new_stat->rcu); > > + spin_lock(&trace_state_lock); > > + list_add_rcu(&new_stat->list, &hw_stats_list); > > + spin_unlock(&trace_state_lock); > > + break; > > + case NETDEV_UNREGISTER: > > + rcu_read_lock(); > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > + if (new_stat->dev == dev) > > + new_stat->dev = NULL; > > + found = 1; > > + break; > > + } > > + rcu_read_unlock(); > > This is racy, unless caller already owns a lock. > Racy in what way? the rcu_read_lock (As I understand it), prevents list mutation while we're traversing it here, so we're not going to dereference a bogus pointer while walking it. If you're worried about changing the value of a struct member without an exclusive lock, thats safe in this case, as the worst case scenario is that we miss detecting a hardware frame drop right before we stop monitoring the device anyway, and I think thats better than the performance impact of taking a lock here. 
> If caller aleady owns a lock, you dont need : > > rcu_read_lock() > list_for_each_entry_rcu() > rcu_read_unlock(); > Stephen below points out that notification callbacks always hold the rtnl_lock, which I wasn't aware of, but I'm not sure thats ok for me to rely on here (at least I'd rather not) > > + > > + spin_lock(&trace_state_lock); > > + if (found && (trace_state == TRACE_OFF)) { > > + list_del_rcu(&new_stat->list); > > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > > + } > > + spin_unlock(&trace_state_lock); > > + break; > > > > > Its not clear that you use trace_state_lock as the lock guarding all this. > How so? > If this is the case I suggest a plain and straight forward : > > case NETDEV_UNREGISTER: > spin_lock(&trace_state_lock); > if (trace_state == TRACE_OFF) { > list_for_each_entry(new_stat, &hw_stats_list, list) { > if (new_stat->dev == dev) { > new_stat->dev = NULL; > list_del_rcu(&new_stat->list); > call_rcu(&new_stat->rcu, free_dm_hw_stat); > break; > } > } > } > spin_unlock(&trace_state_lock); > break; > I was hoping to avoid holding the lock while I traversed the entire list. Not a huge deal I suppose, but I'd like to avoid doing that if I can. > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 18:11 ` Eric Dumazet 2009-05-15 18:23 ` Stephen Hemminger 2009-05-15 19:23 ` Neil Horman @ 2009-05-16 12:40 ` Neil Horman 2009-05-18 14:46 ` Neil Horman 2 siblings, 1 reply; 36+ messages in thread From: Neil Horman @ 2009-05-16 12:40 UTC (permalink / raw) To: Eric Dumazet; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem On Fri, May 15, 2009 at 08:11:57PM +0200, Eric Dumazet wrote: > Neil Horman a écrit : > > +static int dropmon_net_event(struct notifier_block *ev_block, > > + unsigned long event, void *ptr) > > +{ > > + struct net_device *dev = ptr; > > + struct dm_hw_stat_delta *new_stat = NULL; > > + int found = 0; > > + > > + switch (event) { > > + case NETDEV_REGISTER: > > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > > + > > + if (!new_stat) > > + goto out; > > + > > + new_stat->dev = dev; > > + INIT_RCU_HEAD(&new_stat->rcu); > > + spin_lock(&trace_state_lock); > > + list_add_rcu(&new_stat->list, &hw_stats_list); > > + spin_unlock(&trace_state_lock); > > + break; > > + case NETDEV_UNREGISTER: > > + rcu_read_lock(); > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > > + if (new_stat->dev == dev) > > + new_stat->dev = NULL; > > + found = 1; > > + break; > > + } > > + rcu_read_unlock(); > > This is racy, unless caller already owns a lock. > > If caller aleady owns a lock, you dont need : > > rcu_read_lock() > list_for_each_entry_rcu() > rcu_read_unlock(); > > > + > > + spin_lock(&trace_state_lock); > > + if (found && (trace_state == TRACE_OFF)) { > > + list_del_rcu(&new_stat->list); > > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > > + } > > + spin_unlock(&trace_state_lock); > > + break; > > > > > Its not clear that you use trace_state_lock as the lock guarding all this. 
> > If this is the case I suggest a plain and straight forward : > > case NETDEV_UNREGISTER: > spin_lock(&trace_state_lock); > if (trace_state == TRACE_OFF) { > list_for_each_entry(new_stat, &hw_stats_list, list) { > if (new_stat->dev == dev) { > new_stat->dev = NULL; > list_del_rcu(&new_stat->list); > call_rcu(&new_stat->rcu, free_dm_hw_stat); > break; > } > } > } > spin_unlock(&trace_state_lock); > break; > > > I was thinking about this last night, and I agree, that I think your solution would be better here (my solution might be racy with regards to a trace state change resulting in leaked memory. I'll post a new patch monday. Sorry for the trouble. Neil ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-16 12:40 ` Neil Horman @ 2009-05-18 14:46 ` Neil Horman 2009-05-21 7:17 ` David Miller 0 siblings, 1 reply; 36+ messages in thread From: Neil Horman @ 2009-05-18 14:46 UTC (permalink / raw) To: Eric Dumazet; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem On Sat, May 16, 2009 at 08:40:29AM -0400, Neil Horman wrote: > Ok, new version of the patch (v5 I think), fixes up Erics comments regarding the locking in the NETDEV unregister event: Change notes: 1) Modify the NETDEV_UNREGISTER event handling so that its clear the trace_state_lock protects the integrity of the stats list while we are traversing (and possibly deleting) members Regards Neil Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal everytime a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. 
Signed-off-by: Neil Horman <nhorman@tuxdriver.com> include/linux/net_dropmon.h | 8 ++ include/trace/napi.h | 11 +++ net/core/dev.c | 5 + net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 149 insertions(+), 5 deletions(-) diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb8..e8a8b5c 100644 --- a/include/linux/net_dropmon.h +++ b/include/linux/net_dropmon.h @@ -2,12 +2,20 @@ #define __NET_DROPMON_H #include <linux/netlink.h> +#include <linux/types.h> struct net_dm_drop_point { __u8 pc[8]; __u32 count; }; +#define is_drop_point_hw(x) do {\ + int ____i, ____j;\ + for (____i = 0; ____i < 8; i ____i++)\ + ____j |= x[____i];\ + ____j;\ +} while (0) + #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 diff --git a/include/trace/napi.h b/include/trace/napi.h new file mode 100644 index 0000000..7c39eb7 --- /dev/null +++ b/include/trace/napi.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_NAPI_H_ +#define _TRACE_NAPI_H_ + +#include <linux/skbuff.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(napi_poll, + TP_PROTO(struct napi_struct *napi), + TP_ARGS(napi)); + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index 637ea71..165979c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> +#include <trace/napi.h> #include "net-sysfs.h" @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. 
*/ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2797b71..a6c2ac2 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include <linux/timer.h> #include <linux/bitops.h> #include <net/genetlink.h> +#include <net/netevent.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,6 +50,13 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; @@ -159,24 +170,80 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * 
Ratelimit our check time to dm_hw_check_delta jiffies + */ + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } + rcu_read_unlock(); +} + + +static void free_dm_hw_stat(struct rcu_head *head) +{ + struct dm_hw_stat_delta *n; + n = container_of(head, struct dm_hw_stat_delta, rcu); + kfree(n); +} + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *temp; + + spin_lock(&trace_state_lock); switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } break; default: rc = 1; break; } + if (!rc) + trace_state = state; + + spin_unlock(&trace_state_lock); + if (rc) return -EINPROGRESS; return rc; @@ -204,6 +271,44 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *tmp; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + INIT_RCU_HEAD(&new_stat->rcu); + spin_lock(&trace_state_lock); + 
list_add_rcu(&new_stat->list, &hw_stats_list); + spin_unlock(&trace_state_lock); + break; + case NETDEV_UNREGISTER: + spin_lock(&trace_state_lock); + list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { + if (new_stat->dev == dev) { + new_stat->dev = NULL; + if (trace_state == TRACE_OFF) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + break; + } + } + } + spin_unlock(&trace_state_lock); + break; + } +out: + return NOTIFY_DONE; +} static struct genl_ops dropmon_ops[] = { { @@ -220,6 +325,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -243,12 +352,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -259,6 +374,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb456..b07b25b 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -20,6 +20,7 @@ #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> #include <asm/bitops.h> @@ -27,3 +28,6 @@ DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +DEFINE_TRACE(napi_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b5873bd..446d3ba 100644 --- a/net/core/netpoll.c +++ 
b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> +#include <trace/napi.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi->dev); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); ^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-18 14:46 ` Neil Horman @ 2009-05-21 7:17 ` David Miller 2009-05-21 17:36 ` Neil Horman 0 siblings, 1 reply; 36+ messages in thread From: David Miller @ 2009-05-21 7:17 UTC (permalink / raw) To: nhorman; +Cc: dada1, jpirko, paulmck, netdev From: Neil Horman <nhorman@tuxdriver.com> Date: Mon, 18 May 2009 10:46:22 -0400 > @@ -0,0 +1,11 @@ > +#ifndef _TRACE_NAPI_H_ > +#define _TRACE_NAPI_H_ > + > +#include <linux/skbuff.h> > +#include <linux/tracepoint.h> > + > +DECLARE_TRACE(napi_poll, > + TP_PROTO(struct napi_struct *napi), > + TP_ARGS(napi)); > + > +#endif If skbuff.h gets you the declaration for "struct napi_struct" it's via some indirect accident rather than it actually being in linux/skbuff.h Please include the correct headers, thanks. ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-21 7:17 ` David Miller @ 2009-05-21 17:36 ` Neil Horman 2009-05-21 22:15 ` David Miller 0 siblings, 1 reply; 36+ messages in thread From: Neil Horman @ 2009-05-21 17:36 UTC (permalink / raw) To: David Miller; +Cc: dada1, jpirko, paulmck, netdev On Thu, May 21, 2009 at 12:17:36AM -0700, David Miller wrote: > From: Neil Horman <nhorman@tuxdriver.com> > Date: Mon, 18 May 2009 10:46:22 -0400 > > > @@ -0,0 +1,11 @@ > > +#ifndef _TRACE_NAPI_H_ > > +#define _TRACE_NAPI_H_ > > + > > +#include <linux/skbuff.h> > > +#include <linux/tracepoint.h> > > + > > +DECLARE_TRACE(napi_poll, > > + TP_PROTO(struct napi_struct *napi), > > + TP_ARGS(napi)); > > + > > +#endif > > If skbuff.h gets you the declaration for "struct napi_struct" > it's via some indirect accident rather than it actually being > in linux/skbuff.h > > Please include the correct headers, thanks. > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html Thank you Dave, copy n' paste bit me. Patch v6 Change notes: 1) updated header file in trace/napi.h to include netdevice.h so that I correctly pull in struct napi_struct definition Neil Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal everytime a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. 
Signed-off-by: Neil Horman <nhorman@tuxdriver.com> include/linux/net_dropmon.h | 8 ++ include/trace/napi.h | 11 +++ net/core/dev.c | 5 + net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 149 insertions(+), 5 deletions(-) diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h index 0217fb8..e8a8b5c 100644 --- a/include/linux/net_dropmon.h +++ b/include/linux/net_dropmon.h @@ -2,12 +2,20 @@ #define __NET_DROPMON_H #include <linux/netlink.h> +#include <linux/types.h> struct net_dm_drop_point { __u8 pc[8]; __u32 count; }; +#define is_drop_point_hw(x) do {\ + int ____i, ____j;\ + for (____i = 0; ____i < 8; i ____i++)\ + ____j |= x[____i];\ + ____j;\ +} while (0) + #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 diff --git a/include/trace/napi.h b/include/trace/napi.h new file mode 100644 index 0000000..a8989c4 --- /dev/null +++ b/include/trace/napi.h @@ -0,0 +1,11 @@ +#ifndef _TRACE_NAPI_H_ +#define _TRACE_NAPI_H_ + +#include <linux/netdevice.h> +#include <linux/tracepoint.h> + +DECLARE_TRACE(napi_poll, + TP_PROTO(struct napi_struct *napi), + TP_ARGS(napi)); + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index 637ea71..165979c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> +#include <trace/napi.h> #include "net-sysfs.h" @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. 
*/ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2797b71..a6c2ac2 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include <linux/timer.h> #include <linux/bitops.h> #include <net/genetlink.h> +#include <net/netevent.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,6 +50,13 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; @@ -159,24 +170,80 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * 
Ratelimit our check time to dm_hw_check_delta jiffies + */ + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } + rcu_read_unlock(); +} + + +static void free_dm_hw_stat(struct rcu_head *head) +{ + struct dm_hw_stat_delta *n; + n = container_of(head, struct dm_hw_stat_delta, rcu); + kfree(n); +} + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *temp; + + spin_lock(&trace_state_lock); switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } break; default: rc = 1; break; } + if (!rc) + trace_state = state; + + spin_unlock(&trace_state_lock); + if (rc) return -EINPROGRESS; return rc; @@ -204,6 +271,44 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *tmp; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + INIT_RCU_HEAD(&new_stat->rcu); + spin_lock(&trace_state_lock); + 
list_add_rcu(&new_stat->list, &hw_stats_list); + spin_unlock(&trace_state_lock); + break; + case NETDEV_UNREGISTER: + spin_lock(&trace_state_lock); + list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { + if (new_stat->dev == dev) { + new_stat->dev = NULL; + if (trace_state == TRACE_OFF) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + break; + } + } + } + spin_unlock(&trace_state_lock); + break; + } +out: + return NOTIFY_DONE; +} static struct genl_ops dropmon_ops[] = { { @@ -220,6 +325,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -243,12 +352,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -259,6 +374,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb456..b07b25b 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -20,6 +20,7 @@ #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <trace/skb.h> +#include <trace/napi.h> #include <asm/unaligned.h> #include <asm/bitops.h> @@ -27,3 +28,6 @@ DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +DEFINE_TRACE(napi_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b5873bd..446d3ba 100644 --- a/net/core/netpoll.c +++ 
b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> +#include <trace/napi.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi->dev); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); ^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets 2009-05-21 17:36 ` Neil Horman @ 2009-05-21 22:15 ` David Miller 2009-05-22 0:09 ` Neil Horman 0 siblings, 1 reply; 36+ messages in thread From: David Miller @ 2009-05-21 22:15 UTC (permalink / raw) To: nhorman; +Cc: dada1, jpirko, paulmck, netdev From: Neil Horman <nhorman@tuxdriver.com> Date: Thu, 21 May 2009 13:36:08 -0400 > Thank you Dave, copy n' paste bit me. > > Patch v6 Applied to net-next-2.6, thanks! ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets 2009-05-21 22:15 ` David Miller @ 2009-05-22 0:09 ` Neil Horman 0 siblings, 0 replies; 36+ messages in thread From: Neil Horman @ 2009-05-22 0:09 UTC (permalink / raw) To: David Miller; +Cc: dada1, jpirko, paulmck, netdev On Thu, May 21, 2009 at 03:15:03PM -0700, David Miller wrote: > From: Neil Horman <nhorman@tuxdriver.com> > Date: Thu, 21 May 2009 13:36:08 -0400 > > > Thank you Dave, copy n' paste bit me. > > > > Patch v6 > > Applied to net-next-2.6, thanks! > Thanks Dave. I'll try to get it right the first try next time around. Ok, maybe the third try :) Neil ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 16:07 ` Neil Horman 2009-05-15 18:11 ` Eric Dumazet @ 2009-05-15 18:18 ` Stephen Hemminger 2009-05-15 19:12 ` Neil Horman 1 sibling, 1 reply; 36+ messages in thread From: Stephen Hemminger @ 2009-05-15 18:18 UTC (permalink / raw) To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem > +#define is_drop_point_hw(x) do {\ > + int ____i, ____j;\ > + for (____i = 0; ____i < 8; i ____i++)\ > + ____j |= x[____i];\ > + ____j;\ > +} while (0) Would this code be less ugly if it were an inline function? ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets 2009-05-15 18:18 ` Stephen Hemminger @ 2009-05-15 19:12 ` Neil Horman 0 siblings, 0 replies; 36+ messages in thread From: Neil Horman @ 2009-05-15 19:12 UTC (permalink / raw) To: Stephen Hemminger Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem On Fri, May 15, 2009 at 11:18:52AM -0700, Stephen Hemminger wrote: > > > +#define is_drop_point_hw(x) do {\ > > + int ____i, ____j;\ > > + for (____i = 0; ____i < 8; i ____i++)\ > > + ____j |= x[____i];\ > > + ____j;\ > > +} while (0) > > Would this code be less ugly if it were an inline function? > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html Possibly, I don't actually use that yet, but was planning on it shortly. I'll fiddle with it and update it in a later patch if another format makes mroe sense > ^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardwaredropsrxpackets 2009-05-14 12:33 ` Neil Horman 2009-05-14 12:44 ` Jiri Pirko @ 2009-05-14 16:18 ` Paul E. McKenney 1 sibling, 0 replies; 36+ messages in thread From: Paul E. McKenney @ 2009-05-14 16:18 UTC (permalink / raw) To: Neil Horman; +Cc: Eric Dumazet, netdev, davem On Thu, May 14, 2009 at 08:33:00AM -0400, Neil Horman wrote: > > > > The purpose of list_add_rcu() and list_del_rcu() is to be able to permit > > concurrent readers to traverse the list while you are modifying that same > > list. You still need something to handle concurrent updates to the list. > > The usual approach is to use locks, in this case, to hold trace_state_lock > > across the addition, given that it is already held across the removal. > > > > The reason that I expect holding the lock to be OK is that you cannot > > add elements faster than you delete them long-term, so given that you > > are willing to hold the lock over deletions, you are probably OK also > > holding that same lock over additions. But please let me know if that > > is not the case! > > > > Thanx, Paul > > > > > Ahh, I did learn something then :). That makes sense, ok. New patch attached. > > Patch to add the ability to detect drops in hardware interfaces via dropwatch. > Adds a tracepoint to net_rx_action to signal everytime a napi instance is > polled. The dropmon code then periodically checks to see if the rx_frames > counter has changed, and if so, adds a drop notification to the netlink > protocol, using the reserved all-0's vector to indicate the drop location was in > hardware, rather than somewhere in the code. > > change notes: > 1) added use of trace_state_lock around list_add_rcu operation Looks good from an RCU viewpoint! Reviewed-by: Paul E. 
McKenney <paulmck@linux.vnet.ibm.com> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com> > > > include/linux/net_dropmon.h | 7 ++ > include/trace/napi.h | 11 ++++ > net/core/dev.c | 5 + > net/core/drop_monitor.c | 115 ++++++++++++++++++++++++++++++++++++++++++-- > net/core/net-traces.c | 4 + > net/core/netpoll.c | 2 > 6 files changed, 139 insertions(+), 5 deletions(-) > > diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h > index 0217fb8..9addc62 100644 > --- a/include/linux/net_dropmon.h > +++ b/include/linux/net_dropmon.h > @@ -8,6 +8,13 @@ struct net_dm_drop_point { > __u32 count; > }; > > +#define is_drop_point_hw(x) do {\ > + int ____i, ____j;\ > + for (____i = 0; ____i < 8; i ____i++)\ > + ____j |= x[____i];\ > + ____j;\ > +} while (0) > + > #define NET_DM_CFG_VERSION 0 > #define NET_DM_CFG_ALERT_COUNT 1 > #define NET_DM_CFG_ALERT_DELAY 2 > diff --git a/include/trace/napi.h b/include/trace/napi.h > new file mode 100644 > index 0000000..7c39eb7 > --- /dev/null > +++ b/include/trace/napi.h > @@ -0,0 +1,11 @@ > +#ifndef _TRACE_NAPI_H_ > +#define _TRACE_NAPI_H_ > + > +#include <linux/skbuff.h> > +#include <linux/tracepoint.h> > + > +DECLARE_TRACE(napi_poll, > + TP_PROTO(struct napi_struct *napi), > + TP_ARGS(napi)); > + > +#endif > diff --git a/net/core/dev.c b/net/core/dev.c > index 637ea71..165979c 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -126,6 +126,7 @@ > #include <linux/in.h> > #include <linux/jhash.h> > #include <linux/random.h> > +#include <trace/napi.h> > > #include "net-sysfs.h" > > @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h) > * accidently calling ->poll() when NAPI is not scheduled. 
> */ > work = 0; > - if (test_bit(NAPI_STATE_SCHED, &n->state)) > + if (test_bit(NAPI_STATE_SCHED, &n->state)) { > work = n->poll(n, weight); > + trace_napi_poll(n); > + } > > WARN_ON_ONCE(work > weight); > > diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c > index 2797b71..f449971 100644 > --- a/net/core/drop_monitor.c > +++ b/net/core/drop_monitor.c > @@ -22,8 +22,10 @@ > #include <linux/timer.h> > #include <linux/bitops.h> > #include <net/genetlink.h> > +#include <net/netevent.h> > > #include <trace/skb.h> > +#include <trace/napi.h> > > #include <asm/unaligned.h> > > @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); > * and the work handle that will send up > * netlink alerts > */ > -struct sock *dm_sock; > +static int trace_state = TRACE_OFF; > +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; > > struct per_cpu_dm_data { > struct work_struct dm_alert_work; > @@ -47,6 +50,13 @@ struct per_cpu_dm_data { > struct timer_list send_timer; > }; > > +struct dm_hw_stat_delta { > + struct net_device *dev; > + struct list_head list; > + struct rcu_head rcu; > + unsigned long last_drop_val; > +}; > + > static struct genl_family net_drop_monitor_family = { > .id = GENL_ID_GENERATE, > .hdrsize = 0, > @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); > > static int dm_hit_limit = 64; > static int dm_delay = 1; > - > +static unsigned long dm_hw_check_delta = 2*HZ; > +static LIST_HEAD(hw_stats_list); > > static void reset_per_cpu_data(struct per_cpu_dm_data *data) > { > @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) > schedule_work(&data->dm_alert_work); > } > > -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) > +static void trace_drop_common(struct sk_buff *skb, void *location) > { > struct net_dm_alert_msg *msg; > struct nlmsghdr *nlh; > @@ -159,26 +170,79 @@ out: > return; > } > > +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) > +{ > + 
trace_drop_common(skb, location); > +} > + > +static void trace_napi_poll_hit(struct napi_struct *napi) > +{ > + struct dm_hw_stat_delta *new_stat; > + > + /* > + * Ratelimit our check time to dm_hw_check_delta jiffies > + */ > + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) > + return; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if ((new_stat->dev == napi->dev) && > + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { > + trace_drop_common(NULL, NULL); > + new_stat->last_drop_val = napi->dev->stats.rx_dropped; > + break; > + } > + } > + rcu_read_unlock(); > +} > + > + > +static void free_dm_hw_stat(struct rcu_head *head) > +{ > + struct dm_hw_stat_delta *n; > + n = container_of(head, struct dm_hw_stat_delta, rcu); > + kfree(n); > +} > + > static int set_all_monitor_traces(int state) > { > int rc = 0; > + struct dm_hw_stat_delta *new_stat = NULL; > + > + spin_lock(&trace_state_lock); > > switch (state) { > case TRACE_ON: > rc |= register_trace_kfree_skb(trace_kfree_skb_hit); > + rc |= register_trace_napi_poll(trace_napi_poll_hit); > break; > case TRACE_OFF: > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); > + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); > > tracepoint_synchronize_unregister(); > + > + /* > + * Clean the device list > + */ > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == NULL) { > + list_del_rcu(&new_stat->list); > + call_rcu(&new_stat->rcu, free_dm_hw_stat); > + } > + } > break; > default: > rc = 1; > break; > } > > + spin_unlock(&trace_state_lock); > + > if (rc) > return -EINPROGRESS; > + trace_state = state; > return rc; > } > > @@ -204,6 +268,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb, > return -ENOTSUPP; > } > > +static int dropmon_net_event(struct notifier_block *ev_block, > + unsigned long event, void *ptr) > +{ > + struct net_device *dev = ptr; > + struct dm_hw_stat_delta *new_stat = NULL; > + > + 
switch (event) { > + case NETDEV_REGISTER: > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); > + > + if (!new_stat) > + goto out; > + > + new_stat->dev = dev; > + INIT_RCU_HEAD(&new_stat->rcu); > + spin_lock(&trace_state_lock); > + list_add_rcu(&new_stat->list, &hw_stats_list); > + spin_unlock(&trace_state_lock); > + break; > + case NETDEV_UNREGISTER: > + rcu_read_lock(); > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { > + if (new_stat->dev == dev) > + new_stat->dev = NULL; > + break; > + } > + rcu_read_unlock(); > + break; > + } > +out: > + return NOTIFY_DONE; > +} > > static struct genl_ops dropmon_ops[] = { > { > @@ -220,6 +316,10 @@ static struct genl_ops dropmon_ops[] = { > }, > }; > > +static struct notifier_block dropmon_net_notifier = { > + .notifier_call = dropmon_net_event > +}; > + > static int __init init_net_drop_monitor(void) > { > int cpu; > @@ -243,12 +343,18 @@ static int __init init_net_drop_monitor(void) > ret = genl_register_ops(&net_drop_monitor_family, > &dropmon_ops[i]); > if (ret) { > - printk(KERN_CRIT "failed to register operation %d\n", > + printk(KERN_CRIT "Failed to register operation %d\n", > dropmon_ops[i].cmd); > goto out_unreg; > } > } > > + rc = register_netdevice_notifier(&dropmon_net_notifier); > + if (rc < 0) { > + printk(KERN_CRIT "Failed to register netdevice notifier\n"); > + goto out_unreg; > + } > + > rc = 0; > > for_each_present_cpu(cpu) { > @@ -259,6 +365,7 @@ static int __init init_net_drop_monitor(void) > data->send_timer.data = cpu; > data->send_timer.function = sched_send_work; > } > + > goto out; > > out_unreg: > diff --git a/net/core/net-traces.c b/net/core/net-traces.c > index c8fb456..b07b25b 100644 > --- a/net/core/net-traces.c > +++ b/net/core/net-traces.c > @@ -20,6 +20,7 @@ > #include <linux/netlink.h> > #include <linux/net_dropmon.h> > #include <trace/skb.h> > +#include <trace/napi.h> > > #include <asm/unaligned.h> > #include <asm/bitops.h> > @@ -27,3 +28,6 @@ > > 
DEFINE_TRACE(kfree_skb); > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); > + > +DEFINE_TRACE(napi_poll); > +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); > diff --git a/net/core/netpoll.c b/net/core/netpoll.c > index b5873bd..446d3ba 100644 > --- a/net/core/netpoll.c > +++ b/net/core/netpoll.c > @@ -24,6 +24,7 @@ > #include <net/tcp.h> > #include <net/udp.h> > #include <asm/unaligned.h> > +#include <trace/napi.h> > > /* > * We maintain a small pool of fully-sized skbs, to make sure the > @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, > set_bit(NAPI_STATE_NPSVC, &napi->state); > > work = napi->poll(napi, budget); > + trace_napi_poll(napi->dev); > > clear_bit(NAPI_STATE_NPSVC, &napi->state); > atomic_dec(&trapped); > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 36+ messages in thread
end of thread, other threads:[~2009-05-22 0:09 UTC | newest] Thread overview: 36+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2009-05-08 19:50 [PATCH] dropmon: add ability to detect when hardware drops rx packets Neil Horman 2009-05-09 6:30 ` Eric Dumazet 2009-05-09 18:07 ` Neil Horman 2009-05-12 16:30 ` Neil Horman 2009-05-13 18:23 ` [PATCH] dropmon: add ability to detect when hardware drops rxpackets Paul E. McKenney 2009-05-14 0:45 ` Neil Horman 2009-05-14 1:03 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Paul E. McKenney 2009-05-14 12:33 ` Neil Horman 2009-05-14 12:44 ` Jiri Pirko 2009-05-14 16:17 ` [PATCH] dropmon: add ability to detect when hardwaredropsrxpackets Paul E. McKenney 2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Neil Horman 2009-05-15 5:49 ` Jarek Poplawski 2009-05-15 11:01 ` Neil Horman 2009-05-15 11:12 ` Jarek Poplawski 2009-05-15 11:15 ` Neil Horman 2009-05-15 11:40 ` Jarek Poplawski 2009-05-16 0:07 ` Paul E. McKenney 2009-05-15 6:51 ` Jiri Pirko 2009-05-15 7:37 ` Eric Dumazet 2009-05-15 11:12 ` Neil Horman 2009-05-15 10:59 ` Neil Horman 2009-05-15 11:27 ` Jiri Pirko 2009-05-15 16:07 ` Neil Horman 2009-05-15 18:11 ` Eric Dumazet 2009-05-15 18:23 ` Stephen Hemminger 2009-05-15 19:53 ` Neil Horman 2009-05-15 19:23 ` Neil Horman 2009-05-16 12:40 ` Neil Horman 2009-05-18 14:46 ` Neil Horman 2009-05-21 7:17 ` David Miller 2009-05-21 17:36 ` Neil Horman 2009-05-21 22:15 ` David Miller 2009-05-22 0:09 ` Neil Horman 2009-05-15 18:18 ` Stephen Hemminger 2009-05-15 19:12 ` Neil Horman 2009-05-14 16:18 ` [PATCH] dropmon: add ability to detect when hardwaredropsrxpackets Paul E. McKenney
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).