* [PATCH] dropmon: add ability to detect when hardware drops rx packets
@ 2009-05-08 19:50 Neil Horman
2009-05-09 6:30 ` Eric Dumazet
0 siblings, 1 reply; 36+ messages in thread
From: Neil Horman @ 2009-05-08 19:50 UTC (permalink / raw)
To: netdev; +Cc: nhorman, davem
Hey all-
A few weeks back I sent out an RFC in which I proposed that I could
extend dropwatch to detect drops in hardware by adding a napi poll tracepoint,
using that as an indicator that I should periodically check the stats to see if
the rx drop counter had changed. Having not had any negative feedback to that
proposal, here's the patch. I've tested it and it seems to work fairly well.
Neil
Patch to add the ability to detect drops in hardware interfaces via dropwatch.
Adds a tracepoint to net_rx_action to signal every time a napi instance is
polled. The dropmon code then periodically checks to see if the rx_frames
counter has changed, and if so, adds a drop notification to the netlink
protocol, using the reserved all-0's vector to indicate the drop location was in
hardware, rather than somewhere in the code.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
include/linux/net_dropmon.h | 7 ++
include/trace/napi.h | 11 ++++
net/core/dev.c | 5 +-
net/core/drop_monitor.c | 107 ++++++++++++++++++++++++++++++++++++++++++--
net/core/net-traces.c | 4 +
net/core/netpoll.c | 2
6 files changed, 131 insertions(+), 5 deletions(-)
diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
index 0217fb8..9addc62 100644
--- a/include/linux/net_dropmon.h
+++ b/include/linux/net_dropmon.h
@@ -8,6 +8,13 @@ struct net_dm_drop_point {
__u32 count;
};
+#define is_drop_point_hw(x) ({\
+	int ____i, ____j = 0;\
+	for (____i = 0; ____i < 8; ____i++)\
+		____j |= x[____i];\
+	____j == 0;\
+})
+
#define NET_DM_CFG_VERSION 0
#define NET_DM_CFG_ALERT_COUNT 1
#define NET_DM_CFG_ALERT_DELAY 2
diff --git a/include/trace/napi.h b/include/trace/napi.h
new file mode 100644
index 0000000..7c39eb7
--- /dev/null
+++ b/include/trace/napi.h
@@ -0,0 +1,11 @@
+#ifndef _TRACE_NAPI_H_
+#define _TRACE_NAPI_H_
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(napi_poll,
+ TP_PROTO(struct napi_struct *napi),
+ TP_ARGS(napi));
+
+#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 637ea71..165979c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -126,6 +126,7 @@
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <trace/napi.h>
#include "net-sysfs.h"
@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
* accidently calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state))
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
WARN_ON_ONCE(work > weight);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 2797b71..538c1d4 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -22,8 +22,10 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <net/genetlink.h>
+#include <net/netevent.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
@@ -38,7 +40,7 @@ static void send_dm_alert(struct work_struct *unused);
* and the work handle that will send up
* netlink alerts
*/
-struct sock *dm_sock;
+static int trace_state = TRACE_OFF;
struct per_cpu_dm_data {
struct work_struct dm_alert_work;
@@ -47,6 +49,12 @@ struct per_cpu_dm_data {
struct timer_list send_timer;
};
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ struct list_head list;
+ unsigned long last_drop_val;
+};
+
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
@@ -59,7 +67,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
-
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
@@ -115,7 +124,7 @@ static void sched_send_work(unsigned long unused)
schedule_work(&data->dm_alert_work);
}
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
@@ -159,18 +168,59 @@ out:
return;
}
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Ratelimit our check time to dm_hw_check_delta jiffies
+ */
+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
+ return;
+
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if ((new_stat->dev == napi->dev) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ break;
+ }
+ }
+}
+
+
static int set_all_monitor_traces(int state)
{
int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
switch (state) {
case TRACE_ON:
rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ kfree(new_stat);
+ }
+ }
+ rcu_read_unlock();
break;
default:
rc = 1;
@@ -179,6 +229,7 @@ static int set_all_monitor_traces(int state)
if (rc)
return -EINPROGRESS;
+ trace_state = state;
return rc;
}
@@ -204,6 +255,43 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
return -ENOTSUPP;
}
+int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct dm_hw_stat_delta *new_stat = NULL;
+
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ rcu_read_lock();
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ rcu_read_unlock();
+ break;
+ case NETDEV_UNREGISTER:
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == dev)
+ new_stat->dev = NULL;
+ break;
+ }
+
+ if (trace_state == TRACE_OFF) {
+ rcu_read_lock();
+ list_del_rcu(&new_stat->list);
+ rcu_read_unlock();
+ kfree(new_stat);
+ }
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
static struct genl_ops dropmon_ops[] = {
{
@@ -220,6 +308,10 @@ static struct genl_ops dropmon_ops[] = {
},
};
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
static int __init init_net_drop_monitor(void)
{
int cpu;
@@ -243,12 +335,18 @@ static int __init init_net_drop_monitor(void)
ret = genl_register_ops(&net_drop_monitor_family,
&dropmon_ops[i]);
if (ret) {
- printk(KERN_CRIT "failed to register operation %d\n",
+ printk(KERN_CRIT "Failed to register operation %d\n",
dropmon_ops[i].cmd);
goto out_unreg;
}
}
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
rc = 0;
for_each_present_cpu(cpu) {
@@ -259,6 +357,7 @@ static int __init init_net_drop_monitor(void)
data->send_timer.data = cpu;
data->send_timer.function = sched_send_work;
}
+
goto out;
out_unreg:
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb456..b07b25b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
@@ -27,3 +28,6 @@
DEFINE_TRACE(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+DEFINE_TRACE(napi_poll);
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b5873bd..446d3ba 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -24,6 +24,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
+#include <trace/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ trace_napi_poll(napi);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
atomic_dec(&trapped);
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-08 19:50 [PATCH] dropmon: add ability to detect when hardware drops rx packets Neil Horman
@ 2009-05-09 6:30 ` Eric Dumazet
2009-05-09 18:07 ` Neil Horman
2009-05-12 16:30 ` Neil Horman
0 siblings, 2 replies; 36+ messages in thread
From: Eric Dumazet @ 2009-05-09 6:30 UTC (permalink / raw)
To: Neil Horman; +Cc: netdev, davem
Neil Horman a écrit :
> Hey all-
> A few weeks back I sent out an RFC in which I proposed that I could
> extend dropwatch to detect drops in hardware by adding a napi poll tracepoint,
> using that as an indicator that I should periodically check the stats to see if
> the rx drop counter had changed. Having not had any negative feedback to that
> proposal, heres the patch. I've tested it and it seems to work fairly well.
> Neil
>
>
> Patch to add the ability to detect drops in hardware interfaces via dropwatch.
> Adds a tracepoint to net_rx_action to signal everytime a napi instance is
> polled. The dropmon code then periodically checks to see if the rx_frames
> counter has changed, and if so, adds a drop notification to the netlink
> protocol, using the reserved all-0's vector to indicate the drop location was in
> hardware, rather than somewhere in the code.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>
Hi Neil
I dont fully understand your patch, but at least have some questions
about rcu stuff.
>
> include/linux/net_dropmon.h | 7 ++
> include/trace/napi.h | 11 ++++
> net/core/dev.c | 5 +-
> net/core/drop_monitor.c | 107 ++++++++++++++++++++++++++++++++++++++++++--
> net/core/net-traces.c | 4 +
> net/core/netpoll.c | 2
> 6 files changed, 131 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
> index 0217fb8..9addc62 100644
> --- a/include/linux/net_dropmon.h
> +++ b/include/linux/net_dropmon.h
> @@ -8,6 +8,13 @@ struct net_dm_drop_point {
> __u32 count;
> };
>
> +#define is_drop_point_hw(x) do {\
> + int ____i, ____j;\
> + for (____i = 0; ____i < 8; i ____i++)\
> + ____j |= x[____i];\
> + ____j;\
> +} while (0)
> +
> #define NET_DM_CFG_VERSION 0
> #define NET_DM_CFG_ALERT_COUNT 1
> #define NET_DM_CFG_ALERT_DELAY 2
> diff --git a/include/trace/napi.h b/include/trace/napi.h
> new file mode 100644
> index 0000000..7c39eb7
> --- /dev/null
> +++ b/include/trace/napi.h
> @@ -0,0 +1,11 @@
> +#ifndef _TRACE_NAPI_H_
> +#define _TRACE_NAPI_H_
> +
> +#include <linux/skbuff.h>
> +#include <linux/tracepoint.h>
> +
> +DECLARE_TRACE(napi_poll,
> + TP_PROTO(struct napi_struct *napi),
> + TP_ARGS(napi));
> +
> +#endif
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 637ea71..165979c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -126,6 +126,7 @@
> #include <linux/in.h>
> #include <linux/jhash.h>
> #include <linux/random.h>
> +#include <trace/napi.h>
>
> #include "net-sysfs.h"
>
> @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
> * accidently calling ->poll() when NAPI is not scheduled.
> */
> work = 0;
> - if (test_bit(NAPI_STATE_SCHED, &n->state))
> + if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> work = n->poll(n, weight);
> + trace_napi_poll(n);
> + }
>
> WARN_ON_ONCE(work > weight);
>
> diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
> index 2797b71..538c1d4 100644
> --- a/net/core/drop_monitor.c
> +++ b/net/core/drop_monitor.c
> @@ -22,8 +22,10 @@
> #include <linux/timer.h>
> #include <linux/bitops.h>
> #include <net/genetlink.h>
> +#include <net/netevent.h>
>
> #include <trace/skb.h>
> +#include <trace/napi.h>
>
> #include <asm/unaligned.h>
>
> @@ -38,7 +40,7 @@ static void send_dm_alert(struct work_struct *unused);
> * and the work handle that will send up
> * netlink alerts
> */
> -struct sock *dm_sock;
> +static int trace_state = TRACE_OFF;
>
> struct per_cpu_dm_data {
> struct work_struct dm_alert_work;
> @@ -47,6 +49,12 @@ struct per_cpu_dm_data {
> struct timer_list send_timer;
> };
>
> +struct dm_hw_stat_delta {
> + struct net_device *dev;
> + struct list_head list;
> + unsigned long last_drop_val;
> +};
> +
> static struct genl_family net_drop_monitor_family = {
> .id = GENL_ID_GENERATE,
> .hdrsize = 0,
> @@ -59,7 +67,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
>
> static int dm_hit_limit = 64;
> static int dm_delay = 1;
> -
> +static unsigned long dm_hw_check_delta = 2*HZ;
> +static LIST_HEAD(hw_stats_list);
>
> static void reset_per_cpu_data(struct per_cpu_dm_data *data)
> {
> @@ -115,7 +124,7 @@ static void sched_send_work(unsigned long unused)
> schedule_work(&data->dm_alert_work);
> }
>
> -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> +static void trace_drop_common(struct sk_buff *skb, void *location)
> {
> struct net_dm_alert_msg *msg;
> struct nlmsghdr *nlh;
> @@ -159,18 +168,59 @@ out:
> return;
> }
>
> +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> +{
> + trace_drop_common(skb, location);
> +}
> +
> +static void trace_napi_poll_hit(struct napi_struct *napi)
> +{
> + struct dm_hw_stat_delta *new_stat;
> +
> + /*
> + * Ratelimit our check time to dm_hw_check_delta jiffies
> + */
> + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
> + return;
> +
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if ((new_stat->dev == napi->dev) &&
> + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
> + trace_drop_common(NULL, NULL);
> + new_stat->last_drop_val = napi->dev->stats.rx_dropped;
> + break;
> + }
> + }
> +}
> +
> +
> static int set_all_monitor_traces(int state)
> {
> int rc = 0;
> + struct dm_hw_stat_delta *new_stat = NULL;
>
> switch (state) {
> case TRACE_ON:
> rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
> + rc |= register_trace_napi_poll(trace_napi_poll_hit);
> break;
> case TRACE_OFF:
> rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
> + rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
>
> tracepoint_synchronize_unregister();
> +
> + /*
> + * Clean the device list
> + */
> + rcu_read_lock();
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == NULL) {
> + list_del_rcu(&new_stat->list);
> + kfree(new_stat);
I can't see how list_del_rcu(&new_stat->list) can be followed by kfree(new_stat)
safely. You need a grace period between two operations. That can be done
with call_rcu() helper.
Also, rcu_read_lock()/rcu_read_unlock() is not the proper locking pragmas
needed here. multiple writers are/should be forbidden here. If caller already
does a locking, then no additional locking is needed here.
> + }
> + }
> + rcu_read_unlock();
> break;
> default:
> rc = 1;
> @@ -179,6 +229,7 @@ static int set_all_monitor_traces(int state)
>
> if (rc)
> return -EINPROGRESS;
> + trace_state = state;
> return rc;
> }
>
> @@ -204,6 +255,43 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
> return -ENOTSUPP;
> }
>
> +int dropmon_net_event(struct notifier_block *ev_block,
> + unsigned long event, void *ptr)
> +{
> + struct net_device *dev = ptr;
> + struct dm_hw_stat_delta *new_stat = NULL;
> +
> +
> + switch (event) {
> + case NETDEV_REGISTER:
> + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> +
> + if (!new_stat)
> + goto out;
> +
> + new_stat->dev = dev;
> + rcu_read_lock();
same point here, rcu_read_lock is not the proper way to protect the list.
> + list_add_rcu(&new_stat->list, &hw_stats_list);
> + rcu_read_unlock();
> + break;
> + case NETDEV_UNREGISTER:
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == dev)
> + new_stat->dev = NULL;
> + break;
> + }
> +
> + if (trace_state == TRACE_OFF) {
> + rcu_read_lock();
> + list_del_rcu(&new_stat->list);
> + rcu_read_unlock();
> + kfree(new_stat);
same problem here. If rcu is used, then you need a grace period.
> + }
> + break;
> + }
> +out:
> + return NOTIFY_DONE;
> +}
>
> static struct genl_ops dropmon_ops[] = {
> {
> @@ -220,6 +308,10 @@ static struct genl_ops dropmon_ops[] = {
> },
> };
>
> +static struct notifier_block dropmon_net_notifier = {
> + .notifier_call = dropmon_net_event
> +};
> +
> static int __init init_net_drop_monitor(void)
> {
> int cpu;
> @@ -243,12 +335,18 @@ static int __init init_net_drop_monitor(void)
> ret = genl_register_ops(&net_drop_monitor_family,
> &dropmon_ops[i]);
> if (ret) {
> - printk(KERN_CRIT "failed to register operation %d\n",
> + printk(KERN_CRIT "Failed to register operation %d\n",
> dropmon_ops[i].cmd);
> goto out_unreg;
> }
> }
>
> + rc = register_netdevice_notifier(&dropmon_net_notifier);
> + if (rc < 0) {
> + printk(KERN_CRIT "Failed to register netdevice notifier\n");
> + goto out_unreg;
> + }
> +
> rc = 0;
>
> for_each_present_cpu(cpu) {
> @@ -259,6 +357,7 @@ static int __init init_net_drop_monitor(void)
> data->send_timer.data = cpu;
> data->send_timer.function = sched_send_work;
> }
> +
> goto out;
>
> out_unreg:
> diff --git a/net/core/net-traces.c b/net/core/net-traces.c
> index c8fb456..b07b25b 100644
> --- a/net/core/net-traces.c
> +++ b/net/core/net-traces.c
> @@ -20,6 +20,7 @@
> #include <linux/netlink.h>
> #include <linux/net_dropmon.h>
> #include <trace/skb.h>
> +#include <trace/napi.h>
>
> #include <asm/unaligned.h>
> #include <asm/bitops.h>
> @@ -27,3 +28,6 @@
>
> DEFINE_TRACE(kfree_skb);
> EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
> +
> +DEFINE_TRACE(napi_poll);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
> diff --git a/net/core/netpoll.c b/net/core/netpoll.c
> index b5873bd..446d3ba 100644
> --- a/net/core/netpoll.c
> +++ b/net/core/netpoll.c
> @@ -24,6 +24,7 @@
> #include <net/tcp.h>
> #include <net/udp.h>
> #include <asm/unaligned.h>
> +#include <trace/napi.h>
>
> /*
> * We maintain a small pool of fully-sized skbs, to make sure the
> @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
> set_bit(NAPI_STATE_NPSVC, &napi->state);
>
> work = napi->poll(napi, budget);
> + trace_napi_poll(napi->dev);
>
> clear_bit(NAPI_STATE_NPSVC, &napi->state);
> atomic_dec(&trapped);
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-09 6:30 ` Eric Dumazet
@ 2009-05-09 18:07 ` Neil Horman
2009-05-12 16:30 ` Neil Horman
1 sibling, 0 replies; 36+ messages in thread
From: Neil Horman @ 2009-05-09 18:07 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, davem
On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote:
> Neil Horman a écrit :
> > Hey all-
> > A few weeks back I sent out an RFC in which I proposed that I could
> > extend dropwatch to detect drops in hardware by adding a napi poll tracepoint,
> > using that as an indicator that I should periodically check the stats to see if
> > the rx drop counter had changed. Having not had any negative feedback to that
> > proposal, heres the patch. I've tested it and it seems to work fairly well.
> > Neil
> >
> >
> > Patch to add the ability to detect drops in hardware interfaces via dropwatch.
> > Adds a tracepoint to net_rx_action to signal everytime a napi instance is
> > polled. The dropmon code then periodically checks to see if the rx_frames
> > counter has changed, and if so, adds a drop notification to the netlink
> > protocol, using the reserved all-0's vector to indicate the drop location was in
> > hardware, rather than somewhere in the code.
> >
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >
>
> Hi Neil
>
> I dont fully understand your patch, but at least have some questions
> about rcu stuff.
>
The idea (if you'll recall the dropmon protocol that I added a few months ago to
detect frame loss at specific locations in the kernel), is to extend that idea
to detect frame loss in the hardware as well. This is the first part of a patch
to do that in the receive path. If it gets past review, I'll be extending it to
do transmit drop detection as well.
Regarding the rcu locking, I was under the impression that the
rcu_read_lock/unlock points were meant to provide that synchronization, although I
admit that I didn't look too closely, and assumed that the lack of crashing when
I flooded traffic on the box while adding and removing net drivers meant I
understood the locking structure. Obviously, looking at the implementation, I've
misunderstood. I'll self-nak this and repost when I understand the rcu
interface a bit better (sometime in the next week I expect).
Thanks!
Neil
> >
> > include/linux/net_dropmon.h | 7 ++
> > include/trace/napi.h | 11 ++++
> > net/core/dev.c | 5 +-
> > net/core/drop_monitor.c | 107 ++++++++++++++++++++++++++++++++++++++++++--
> > net/core/net-traces.c | 4 +
> > net/core/netpoll.c | 2
> > 6 files changed, 131 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
> > index 0217fb8..9addc62 100644
> > --- a/include/linux/net_dropmon.h
> > +++ b/include/linux/net_dropmon.h
> > @@ -8,6 +8,13 @@ struct net_dm_drop_point {
> > __u32 count;
> > };
> >
> > +#define is_drop_point_hw(x) do {\
> > + int ____i, ____j;\
> > + for (____i = 0; ____i < 8; i ____i++)\
> > + ____j |= x[____i];\
> > + ____j;\
> > +} while (0)
> > +
> > #define NET_DM_CFG_VERSION 0
> > #define NET_DM_CFG_ALERT_COUNT 1
> > #define NET_DM_CFG_ALERT_DELAY 2
> > diff --git a/include/trace/napi.h b/include/trace/napi.h
> > new file mode 100644
> > index 0000000..7c39eb7
> > --- /dev/null
> > +++ b/include/trace/napi.h
> > @@ -0,0 +1,11 @@
> > +#ifndef _TRACE_NAPI_H_
> > +#define _TRACE_NAPI_H_
> > +
> > +#include <linux/skbuff.h>
> > +#include <linux/tracepoint.h>
> > +
> > +DECLARE_TRACE(napi_poll,
> > + TP_PROTO(struct napi_struct *napi),
> > + TP_ARGS(napi));
> > +
> > +#endif
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 637ea71..165979c 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -126,6 +126,7 @@
> > #include <linux/in.h>
> > #include <linux/jhash.h>
> > #include <linux/random.h>
> > +#include <trace/napi.h>
> >
> > #include "net-sysfs.h"
> >
> > @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
> > * accidently calling ->poll() when NAPI is not scheduled.
> > */
> > work = 0;
> > - if (test_bit(NAPI_STATE_SCHED, &n->state))
> > + if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> > work = n->poll(n, weight);
> > + trace_napi_poll(n);
> > + }
> >
> > WARN_ON_ONCE(work > weight);
> >
> > diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
> > index 2797b71..538c1d4 100644
> > --- a/net/core/drop_monitor.c
> > +++ b/net/core/drop_monitor.c
> > @@ -22,8 +22,10 @@
> > #include <linux/timer.h>
> > #include <linux/bitops.h>
> > #include <net/genetlink.h>
> > +#include <net/netevent.h>
> >
> > #include <trace/skb.h>
> > +#include <trace/napi.h>
> >
> > #include <asm/unaligned.h>
> >
> > @@ -38,7 +40,7 @@ static void send_dm_alert(struct work_struct *unused);
> > * and the work handle that will send up
> > * netlink alerts
> > */
> > -struct sock *dm_sock;
> > +static int trace_state = TRACE_OFF;
> >
> > struct per_cpu_dm_data {
> > struct work_struct dm_alert_work;
> > @@ -47,6 +49,12 @@ struct per_cpu_dm_data {
> > struct timer_list send_timer;
> > };
> >
> > +struct dm_hw_stat_delta {
> > + struct net_device *dev;
> > + struct list_head list;
> > + unsigned long last_drop_val;
> > +};
> > +
> > static struct genl_family net_drop_monitor_family = {
> > .id = GENL_ID_GENERATE,
> > .hdrsize = 0,
> > @@ -59,7 +67,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
> >
> > static int dm_hit_limit = 64;
> > static int dm_delay = 1;
> > -
> > +static unsigned long dm_hw_check_delta = 2*HZ;
> > +static LIST_HEAD(hw_stats_list);
> >
> > static void reset_per_cpu_data(struct per_cpu_dm_data *data)
> > {
> > @@ -115,7 +124,7 @@ static void sched_send_work(unsigned long unused)
> > schedule_work(&data->dm_alert_work);
> > }
> >
> > -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> > +static void trace_drop_common(struct sk_buff *skb, void *location)
> > {
> > struct net_dm_alert_msg *msg;
> > struct nlmsghdr *nlh;
> > @@ -159,18 +168,59 @@ out:
> > return;
> > }
> >
> > +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> > +{
> > + trace_drop_common(skb, location);
> > +}
> > +
> > +static void trace_napi_poll_hit(struct napi_struct *napi)
> > +{
> > + struct dm_hw_stat_delta *new_stat;
> > +
> > + /*
> > + * Ratelimit our check time to dm_hw_check_delta jiffies
> > + */
> > + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
> > + return;
> > +
> > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > + if ((new_stat->dev == napi->dev) &&
> > + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
> > + trace_drop_common(NULL, NULL);
> > + new_stat->last_drop_val = napi->dev->stats.rx_dropped;
> > + break;
> > + }
> > + }
> > +}
> > +
> > +
> > static int set_all_monitor_traces(int state)
> > {
> > int rc = 0;
> > + struct dm_hw_stat_delta *new_stat = NULL;
> >
> > switch (state) {
> > case TRACE_ON:
> > rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
> > + rc |= register_trace_napi_poll(trace_napi_poll_hit);
> > break;
> > case TRACE_OFF:
> > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
> > + rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
> >
> > tracepoint_synchronize_unregister();
> > +
> > + /*
> > + * Clean the device list
> > + */
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > + if (new_stat->dev == NULL) {
> > + list_del_rcu(&new_stat->list);
> > + kfree(new_stat);
>
> I can't see how list_del_rcu(&new_stat->list) can be followed by kfree(new_stat)
> safely. You need a grace period between two operations. That can be done
> with call_rcu() helper.
>
> Also, rcu_read_lock()/rcu_read_unlock() is not the proper locking pragmas
> needed here. multiple writers are/should be forbidden here. If caller already
> does a locking, then no additional locking is needed here.
>
>
> > + }
> > + }
> > + rcu_read_unlock();
> > break;
> > default:
> > rc = 1;
> > @@ -179,6 +229,7 @@ static int set_all_monitor_traces(int state)
> >
> > if (rc)
> > return -EINPROGRESS;
> > + trace_state = state;
> > return rc;
> > }
> >
> > @@ -204,6 +255,43 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
> > return -ENOTSUPP;
> > }
> >
> > +int dropmon_net_event(struct notifier_block *ev_block,
> > + unsigned long event, void *ptr)
> > +{
> > + struct net_device *dev = ptr;
> > + struct dm_hw_stat_delta *new_stat = NULL;
> > +
> > +
> > + switch (event) {
> > + case NETDEV_REGISTER:
> > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> > +
> > + if (!new_stat)
> > + goto out;
> > +
> > + new_stat->dev = dev;
> > + rcu_read_lock();
>
> same point here, rcu_read_lock is not the proper way to protect the list.
>
> > + list_add_rcu(&new_stat->list, &hw_stats_list);
> > + rcu_read_unlock();
> > + break;
> > + case NETDEV_UNREGISTER:
> > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > + if (new_stat->dev == dev)
> > + new_stat->dev = NULL;
> > + break;
> > + }
> > +
> > + if (trace_state == TRACE_OFF) {
> > + rcu_read_lock();
> > + list_del_rcu(&new_stat->list);
> > + rcu_read_unlock();
> > + kfree(new_stat);
>
> same probleme here. If rcu is used, then you need a grace period.
>
> > + }
> > + break;
> > + }
> > +out:
> > + return NOTIFY_DONE;
> > +}
> >
> > static struct genl_ops dropmon_ops[] = {
> > {
> > @@ -220,6 +308,10 @@ static struct genl_ops dropmon_ops[] = {
> > },
> > };
> >
> > +static struct notifier_block dropmon_net_notifier = {
> > + .notifier_call = dropmon_net_event
> > +};
> > +
> > static int __init init_net_drop_monitor(void)
> > {
> > int cpu;
> > @@ -243,12 +335,18 @@ static int __init init_net_drop_monitor(void)
> > ret = genl_register_ops(&net_drop_monitor_family,
> > &dropmon_ops[i]);
> > if (ret) {
> > - printk(KERN_CRIT "failed to register operation %d\n",
> > + printk(KERN_CRIT "Failed to register operation %d\n",
> > dropmon_ops[i].cmd);
> > goto out_unreg;
> > }
> > }
> >
> > + rc = register_netdevice_notifier(&dropmon_net_notifier);
> > + if (rc < 0) {
> > + printk(KERN_CRIT "Failed to register netdevice notifier\n");
> > + goto out_unreg;
> > + }
> > +
> > rc = 0;
> >
> > for_each_present_cpu(cpu) {
> > @@ -259,6 +357,7 @@ static int __init init_net_drop_monitor(void)
> > data->send_timer.data = cpu;
> > data->send_timer.function = sched_send_work;
> > }
> > +
> > goto out;
> >
> > out_unreg:
> > diff --git a/net/core/net-traces.c b/net/core/net-traces.c
> > index c8fb456..b07b25b 100644
> > --- a/net/core/net-traces.c
> > +++ b/net/core/net-traces.c
> > @@ -20,6 +20,7 @@
> > #include <linux/netlink.h>
> > #include <linux/net_dropmon.h>
> > #include <trace/skb.h>
> > +#include <trace/napi.h>
> >
> > #include <asm/unaligned.h>
> > #include <asm/bitops.h>
> > @@ -27,3 +28,6 @@
> >
> > DEFINE_TRACE(kfree_skb);
> > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
> > +
> > +DEFINE_TRACE(napi_poll);
> > +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
> > diff --git a/net/core/netpoll.c b/net/core/netpoll.c
> > index b5873bd..446d3ba 100644
> > --- a/net/core/netpoll.c
> > +++ b/net/core/netpoll.c
> > @@ -24,6 +24,7 @@
> > #include <net/tcp.h>
> > #include <net/udp.h>
> > #include <asm/unaligned.h>
> > +#include <trace/napi.h>
> >
> > /*
> > * We maintain a small pool of fully-sized skbs, to make sure the
> > @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
> > set_bit(NAPI_STATE_NPSVC, &napi->state);
> >
> > work = napi->poll(napi, budget);
> > + trace_napi_poll(napi->dev);
> >
> > clear_bit(NAPI_STATE_NPSVC, &napi->state);
> > atomic_dec(&trapped);
> >
>
>
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-09 6:30 ` Eric Dumazet
2009-05-09 18:07 ` Neil Horman
@ 2009-05-12 16:30 ` Neil Horman
2009-05-13 18:23 ` [PATCH] dropmon: add ability to detect when hardware drops rxpackets Paul E. McKenney
1 sibling, 1 reply; 36+ messages in thread
From: Neil Horman @ 2009-05-12 16:30 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, davem, nhorman
On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote:
>
> I dont fully understand your patch, but at least have some questions
> about rcu stuff.
>
Ok, so I went back and I think I managed to better understand the RCU interface.
New patch attached, works the same way, save for the gross previous misuse of
rcu.
Patch to add the ability to detect drops in hardware interfaces via dropwatch.
Adds a tracepoint to net_rx_action to signal every time a napi instance is
polled. The dropmon code then periodically checks to see if the rx_frames
counter has changed, and if so, adds a drop notification to the netlink
protocol, using the reserved all-0's vector to indicate the drop location was in
hardware, rather than somewhere in the code.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
include/linux/net_dropmon.h | 7 ++
include/trace/napi.h | 11 ++++
net/core/dev.c | 5 +
net/core/drop_monitor.c | 114 ++++++++++++++++++++++++++++++++++++++++++--
net/core/net-traces.c | 4 +
net/core/netpoll.c | 2
6 files changed, 138 insertions(+), 5 deletions(-)
diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
index 0217fb8..9addc62 100644
--- a/include/linux/net_dropmon.h
+++ b/include/linux/net_dropmon.h
@@ -8,6 +8,13 @@ struct net_dm_drop_point {
__u32 count;
};
+#define is_drop_point_hw(x) do {\
+ int ____i, ____j;\
+ for (____i = 0; ____i < 8; i ____i++)\
+ ____j |= x[____i];\
+ ____j;\
+} while (0)
+
#define NET_DM_CFG_VERSION 0
#define NET_DM_CFG_ALERT_COUNT 1
#define NET_DM_CFG_ALERT_DELAY 2
diff --git a/include/trace/napi.h b/include/trace/napi.h
new file mode 100644
index 0000000..7c39eb7
--- /dev/null
+++ b/include/trace/napi.h
@@ -0,0 +1,11 @@
+#ifndef _TRACE_NAPI_H_
+#define _TRACE_NAPI_H_
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(napi_poll,
+ TP_PROTO(struct napi_struct *napi),
+ TP_ARGS(napi));
+
+#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 637ea71..165979c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -126,6 +126,7 @@
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <trace/napi.h>
#include "net-sysfs.h"
@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
* accidently calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state))
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
WARN_ON_ONCE(work > weight);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 2797b71..5153479 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -22,8 +22,10 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <net/genetlink.h>
+#include <net/netevent.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
* and the work handle that will send up
* netlink alerts
*/
-struct sock *dm_sock;
+static int trace_state = TRACE_OFF;
+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
struct per_cpu_dm_data {
struct work_struct dm_alert_work;
@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
struct timer_list send_timer;
};
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
-
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
schedule_work(&data->dm_alert_work);
}
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
@@ -159,26 +170,79 @@ out:
return;
}
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Ratelimit our check time to dm_hw_check_delta jiffies
+ */
+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if ((new_stat->dev == napi->dev) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+static void free_dm_hw_stat(struct rcu_head *head)
+{
+ struct dm_hw_stat_delta *n;
+ n = container_of(head, struct dm_hw_stat_delta, rcu);
+ kfree(n);
+}
+
static int set_all_monitor_traces(int state)
{
int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+
+ spin_lock(&trace_state_lock);
switch (state) {
case TRACE_ON:
rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
break;
default:
rc = 1;
break;
}
+ spin_unlock(&trace_state_lock);
+
if (rc)
return -EINPROGRESS;
+ trace_state = state;
return rc;
}
@@ -204,6 +268,37 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
return -ENOTSUPP;
}
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct dm_hw_stat_delta *new_stat = NULL;
+
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ INIT_RCU_HEAD(&new_stat->rcu);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ break;
+ case NETDEV_UNREGISTER:
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == dev)
+ new_stat->dev = NULL;
+ break;
+ }
+ rcu_read_unlock();
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
static struct genl_ops dropmon_ops[] = {
{
@@ -220,6 +315,10 @@ static struct genl_ops dropmon_ops[] = {
},
};
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
static int __init init_net_drop_monitor(void)
{
int cpu;
@@ -243,12 +342,18 @@ static int __init init_net_drop_monitor(void)
ret = genl_register_ops(&net_drop_monitor_family,
&dropmon_ops[i]);
if (ret) {
- printk(KERN_CRIT "failed to register operation %d\n",
+ printk(KERN_CRIT "Failed to register operation %d\n",
dropmon_ops[i].cmd);
goto out_unreg;
}
}
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
rc = 0;
for_each_present_cpu(cpu) {
@@ -259,6 +364,7 @@ static int __init init_net_drop_monitor(void)
data->send_timer.data = cpu;
data->send_timer.function = sched_send_work;
}
+
goto out;
out_unreg:
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb456..b07b25b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
@@ -27,3 +28,6 @@
DEFINE_TRACE(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+DEFINE_TRACE(napi_poll);
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b5873bd..446d3ba 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -24,6 +24,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
+#include <trace/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ trace_napi_poll(napi->dev);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
atomic_dec(&trapped);
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rxpackets
2009-05-12 16:30 ` Neil Horman
@ 2009-05-13 18:23 ` Paul E. McKenney
2009-05-14 0:45 ` Neil Horman
0 siblings, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2009-05-13 18:23 UTC (permalink / raw)
To: Neil Horman; +Cc: Eric Dumazet, netdev, davem
On Tue, May 12, 2009 at 12:30:44PM -0400, Neil Horman wrote:
> On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote:
> >
> > I dont fully understand your patch, but at least have some questions
> > about rcu stuff.
> >
>
>
> Ok, so I went back and I think I managed to better understand the RCU interface.
> New patch attached, works the same way, saving for the gross previous misuse of
> rcu.
>
>
> Patch to add the ability to detect drops in hardware interfaces via dropwatch.
> Adds a tracepoint to net_rx_action to signal everytime a napi instance is
> polled. The dropmon code then periodically checks to see if the rx_frames
> counter has changed, and if so, adds a drop notification to the netlink
> protocol, using the reserved all-0's vector to indicate the drop location was in
> hardware, rather than somewhere in the code.
One concern shown below.
Thanx, Paul
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>
>
> include/linux/net_dropmon.h | 7 ++
> include/trace/napi.h | 11 ++++
> net/core/dev.c | 5 +
> net/core/drop_monitor.c | 114 ++++++++++++++++++++++++++++++++++++++++++--
> net/core/net-traces.c | 4 +
> net/core/netpoll.c | 2
> 6 files changed, 138 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
> index 0217fb8..9addc62 100644
> --- a/include/linux/net_dropmon.h
> +++ b/include/linux/net_dropmon.h
> @@ -8,6 +8,13 @@ struct net_dm_drop_point {
> __u32 count;
> };
>
> +#define is_drop_point_hw(x) do {\
> + int ____i, ____j;\
> + for (____i = 0; ____i < 8; i ____i++)\
> + ____j |= x[____i];\
> + ____j;\
> +} while (0)
> +
> #define NET_DM_CFG_VERSION 0
> #define NET_DM_CFG_ALERT_COUNT 1
> #define NET_DM_CFG_ALERT_DELAY 2
> diff --git a/include/trace/napi.h b/include/trace/napi.h
> new file mode 100644
> index 0000000..7c39eb7
> --- /dev/null
> +++ b/include/trace/napi.h
> @@ -0,0 +1,11 @@
> +#ifndef _TRACE_NAPI_H_
> +#define _TRACE_NAPI_H_
> +
> +#include <linux/skbuff.h>
> +#include <linux/tracepoint.h>
> +
> +DECLARE_TRACE(napi_poll,
> + TP_PROTO(struct napi_struct *napi),
> + TP_ARGS(napi));
> +
> +#endif
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 637ea71..165979c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -126,6 +126,7 @@
> #include <linux/in.h>
> #include <linux/jhash.h>
> #include <linux/random.h>
> +#include <trace/napi.h>
>
> #include "net-sysfs.h"
>
> @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
> * accidently calling ->poll() when NAPI is not scheduled.
> */
> work = 0;
> - if (test_bit(NAPI_STATE_SCHED, &n->state))
> + if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> work = n->poll(n, weight);
> + trace_napi_poll(n);
> + }
>
> WARN_ON_ONCE(work > weight);
>
> diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
> index 2797b71..5153479 100644
> --- a/net/core/drop_monitor.c
> +++ b/net/core/drop_monitor.c
> @@ -22,8 +22,10 @@
> #include <linux/timer.h>
> #include <linux/bitops.h>
> #include <net/genetlink.h>
> +#include <net/netevent.h>
>
> #include <trace/skb.h>
> +#include <trace/napi.h>
>
> #include <asm/unaligned.h>
>
> @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
> * and the work handle that will send up
> * netlink alerts
> */
> -struct sock *dm_sock;
> +static int trace_state = TRACE_OFF;
> +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
>
> struct per_cpu_dm_data {
> struct work_struct dm_alert_work;
> @@ -47,6 +50,13 @@ struct per_cpu_dm_data {
> struct timer_list send_timer;
> };
>
> +struct dm_hw_stat_delta {
> + struct net_device *dev;
> + struct list_head list;
> + struct rcu_head rcu;
> + unsigned long last_drop_val;
> +};
> +
> static struct genl_family net_drop_monitor_family = {
> .id = GENL_ID_GENERATE,
> .hdrsize = 0,
> @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
>
> static int dm_hit_limit = 64;
> static int dm_delay = 1;
> -
> +static unsigned long dm_hw_check_delta = 2*HZ;
> +static LIST_HEAD(hw_stats_list);
>
> static void reset_per_cpu_data(struct per_cpu_dm_data *data)
> {
> @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
> schedule_work(&data->dm_alert_work);
> }
>
> -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> +static void trace_drop_common(struct sk_buff *skb, void *location)
> {
> struct net_dm_alert_msg *msg;
> struct nlmsghdr *nlh;
> @@ -159,26 +170,79 @@ out:
> return;
> }
>
> +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> +{
> + trace_drop_common(skb, location);
> +}
> +
> +static void trace_napi_poll_hit(struct napi_struct *napi)
> +{
> + struct dm_hw_stat_delta *new_stat;
> +
> + /*
> + * Ratelimit our check time to dm_hw_check_delta jiffies
> + */
> + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
> + return;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if ((new_stat->dev == napi->dev) &&
> + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
> + trace_drop_common(NULL, NULL);
> + new_stat->last_drop_val = napi->dev->stats.rx_dropped;
> + break;
> + }
> + }
> + rcu_read_unlock();
> +}
> +
> +
> +static void free_dm_hw_stat(struct rcu_head *head)
> +{
> + struct dm_hw_stat_delta *n;
> + n = container_of(head, struct dm_hw_stat_delta, rcu);
> + kfree(n);
> +}
> +
> static int set_all_monitor_traces(int state)
> {
> int rc = 0;
> + struct dm_hw_stat_delta *new_stat = NULL;
> +
> + spin_lock(&trace_state_lock);
>
> switch (state) {
> case TRACE_ON:
> rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
> + rc |= register_trace_napi_poll(trace_napi_poll_hit);
> break;
> case TRACE_OFF:
> rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
> + rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
>
> tracepoint_synchronize_unregister();
> +
> + /*
> + * Clean the device list
> + */
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == NULL) {
> + list_del_rcu(&new_stat->list);
> + call_rcu(&new_stat->rcu, free_dm_hw_stat);
Much better! ;-)
> + }
> + }
> break;
> default:
> rc = 1;
> break;
> }
>
> + spin_unlock(&trace_state_lock);
> +
> if (rc)
> return -EINPROGRESS;
> + trace_state = state;
> return rc;
> }
>
> @@ -204,6 +268,37 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
> return -ENOTSUPP;
> }
>
> +static int dropmon_net_event(struct notifier_block *ev_block,
> + unsigned long event, void *ptr)
> +{
> + struct net_device *dev = ptr;
> + struct dm_hw_stat_delta *new_stat = NULL;
> +
> +
> + switch (event) {
> + case NETDEV_REGISTER:
> + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> +
> + if (!new_stat)
> + goto out;
> +
> + new_stat->dev = dev;
> + INIT_RCU_HEAD(&new_stat->rcu);
> + list_add_rcu(&new_stat->list, &hw_stats_list);
Don't we need to be holding trace_state_lock at this point? Otherwise,
can't we mangle the list with a concurrent list_add_rcu() and
list_del_rcu()?
> + break;
> + case NETDEV_UNREGISTER:
> + rcu_read_lock();
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == dev)
> + new_stat->dev = NULL;
> + break;
> + }
> + rcu_read_unlock();
> + break;
> + }
> +out:
> + return NOTIFY_DONE;
> +}
>
> static struct genl_ops dropmon_ops[] = {
> {
> @@ -220,6 +315,10 @@ static struct genl_ops dropmon_ops[] = {
> },
> };
>
> +static struct notifier_block dropmon_net_notifier = {
> + .notifier_call = dropmon_net_event
> +};
> +
> static int __init init_net_drop_monitor(void)
> {
> int cpu;
> @@ -243,12 +342,18 @@ static int __init init_net_drop_monitor(void)
> ret = genl_register_ops(&net_drop_monitor_family,
> &dropmon_ops[i]);
> if (ret) {
> - printk(KERN_CRIT "failed to register operation %d\n",
> + printk(KERN_CRIT "Failed to register operation %d\n",
> dropmon_ops[i].cmd);
> goto out_unreg;
> }
> }
>
> + rc = register_netdevice_notifier(&dropmon_net_notifier);
> + if (rc < 0) {
> + printk(KERN_CRIT "Failed to register netdevice notifier\n");
> + goto out_unreg;
> + }
> +
> rc = 0;
>
> for_each_present_cpu(cpu) {
> @@ -259,6 +364,7 @@ static int __init init_net_drop_monitor(void)
> data->send_timer.data = cpu;
> data->send_timer.function = sched_send_work;
> }
> +
> goto out;
>
> out_unreg:
> diff --git a/net/core/net-traces.c b/net/core/net-traces.c
> index c8fb456..b07b25b 100644
> --- a/net/core/net-traces.c
> +++ b/net/core/net-traces.c
> @@ -20,6 +20,7 @@
> #include <linux/netlink.h>
> #include <linux/net_dropmon.h>
> #include <trace/skb.h>
> +#include <trace/napi.h>
>
> #include <asm/unaligned.h>
> #include <asm/bitops.h>
> @@ -27,3 +28,6 @@
>
> DEFINE_TRACE(kfree_skb);
> EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
> +
> +DEFINE_TRACE(napi_poll);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
> diff --git a/net/core/netpoll.c b/net/core/netpoll.c
> index b5873bd..446d3ba 100644
> --- a/net/core/netpoll.c
> +++ b/net/core/netpoll.c
> @@ -24,6 +24,7 @@
> #include <net/tcp.h>
> #include <net/udp.h>
> #include <asm/unaligned.h>
> +#include <trace/napi.h>
>
> /*
> * We maintain a small pool of fully-sized skbs, to make sure the
> @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
> set_bit(NAPI_STATE_NPSVC, &napi->state);
>
> work = napi->poll(napi, budget);
> + trace_napi_poll(napi->dev);
>
> clear_bit(NAPI_STATE_NPSVC, &napi->state);
> atomic_dec(&trapped);
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rxpackets
2009-05-13 18:23 ` [PATCH] dropmon: add ability to detect when hardware drops rxpackets Paul E. McKenney
@ 2009-05-14 0:45 ` Neil Horman
2009-05-14 1:03 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Paul E. McKenney
0 siblings, 1 reply; 36+ messages in thread
From: Neil Horman @ 2009-05-14 0:45 UTC (permalink / raw)
To: Paul E. McKenney; +Cc: Eric Dumazet, netdev, davem
On Wed, May 13, 2009 at 11:23:54AM -0700, Paul E. McKenney wrote:
> On Tue, May 12, 2009 at 12:30:44PM -0400, Neil Horman wrote:
> > On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote:
> > >
> > > I dont fully understand your patch, but at least have some questions
> > > about rcu stuff.
> > >
> >
> >
> > Ok, so I went back and I think I managed to better understand the RCU interface.
> > New patch attached, works the same way, saving for the gross previous misuse of
> > rcu.
> >
> >
> > Patch to add the ability to detect drops in hardware interfaces via dropwatch.
> > Adds a tracepoint to net_rx_action to signal everytime a napi instance is
> > polled. The dropmon code then periodically checks to see if the rx_frames
> > counter has changed, and if so, adds a drop notification to the netlink
> > protocol, using the reserved all-0's vector to indicate the drop location was in
> > hardware, rather than somewhere in the code.
>
> One concern shown below.
>
<snip>
> tracepoint_synchronize_unregister();
> > +
> > + /*
> > + * Clean the device list
> > + */
> > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > + if (new_stat->dev == NULL) {
> > + list_del_rcu(&new_stat->list);
> > + call_rcu(&new_stat->rcu, free_dm_hw_stat);
>
> Much better! ;-)
>
Thanks! :)
<snip>
> > +
> > + switch (event) {
> > + case NETDEV_REGISTER:
> > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> > +
> > + if (!new_stat)
> > + goto out;
> > +
> > + new_stat->dev = dev;
> > + INIT_RCU_HEAD(&new_stat->rcu);
> > + list_add_rcu(&new_stat->list, &hw_stats_list);
>
> Don't we need to be holding trace_state_lock at this point? Otherwise,
> can't we mangle the list with a concurrent list_add_rcu() and
> list_del_rcu()?
>
I thought the purpose of list_add_rcu and list_del_rcu was to be able to modify
lists without needing to hold additional locks. Or am I missing something else
about the nuance of how RCU works?
Neil
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets
2009-05-14 0:45 ` Neil Horman
@ 2009-05-14 1:03 ` Paul E. McKenney
2009-05-14 12:33 ` Neil Horman
0 siblings, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2009-05-14 1:03 UTC (permalink / raw)
To: Neil Horman; +Cc: Eric Dumazet, netdev, davem
On Wed, May 13, 2009 at 08:45:48PM -0400, Neil Horman wrote:
> On Wed, May 13, 2009 at 11:23:54AM -0700, Paul E. McKenney wrote:
> > On Tue, May 12, 2009 at 12:30:44PM -0400, Neil Horman wrote:
> > > On Sat, May 09, 2009 at 08:30:35AM +0200, Eric Dumazet wrote:
> > > >
> > > > I dont fully understand your patch, but at least have some questions
> > > > about rcu stuff.
> > > >
> > >
> > >
> > > Ok, so I went back and I think I managed to better understand the RCU interface.
> > > New patch attached, works the same way, saving for the gross previous misuse of
> > > rcu.
> > >
> > >
> > > Patch to add the ability to detect drops in hardware interfaces via dropwatch.
> > > Adds a tracepoint to net_rx_action to signal everytime a napi instance is
> > > polled. The dropmon code then periodically checks to see if the rx_frames
> > > counter has changed, and if so, adds a drop notification to the netlink
> > > protocol, using the reserved all-0's vector to indicate the drop location was in
> > > hardware, rather than somewhere in the code.
> >
> > One concern shown below.
> >
> <snip>
> > tracepoint_synchronize_unregister();
> > > +
> > > + /*
> > > + * Clean the device list
> > > + */
> > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > > + if (new_stat->dev == NULL) {
> > > + list_del_rcu(&new_stat->list);
> > > + call_rcu(&new_stat->rcu, free_dm_hw_stat);
> >
> > Much better! ;-)
> >
> Thanks! :)
> <snip>
>
> > > +
> > > + switch (event) {
> > > + case NETDEV_REGISTER:
> > > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> > > +
> > > + if (!new_stat)
> > > + goto out;
> > > +
> > > + new_stat->dev = dev;
> > > + INIT_RCU_HEAD(&new_stat->rcu);
> > > + list_add_rcu(&new_stat->list, &hw_stats_list);
> >
> > Don't we need to be holding trace_state_lock at this point? Otherwise,
> > can't we mangle the list with a concurrent list_add_rcu() and
> > list_del_rcu()?
> >
> I thought the purpose of list_add_rcu and list_del_rcu was to be
> able to modify lists without needing to hold additional locks. Or am
> I missing something else about the nuance of how RCU works?
The purpose of list_add_rcu() and list_del_rcu() is to be able to permit
concurrent readers to traverse the list while you are modifying that same
list. You still need something to handle concurrent updates to the list.
The usual approach is to use locks, in this case, to hold trace_state_lock
across the addition, given that it is already held across the removal.
The reason that I expect holding the lock to be OK is that you cannot
add elements faster than you delete them long-term, so given that you
are willing to hold the lock over deletions, you are probably OK also
holding that same lock over additions. But please let me know if that
is not the case!
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware dropsrxpackets
2009-05-14 1:03 ` [PATCH] dropmon: add ability to detect when hardware dropsrxpackets Paul E. McKenney
@ 2009-05-14 12:33 ` Neil Horman
2009-05-14 12:44 ` Jiri Pirko
2009-05-14 16:18 ` [PATCH] dropmon: add ability to detect when hardwaredropsrxpackets Paul E. McKenney
0 siblings, 2 replies; 36+ messages in thread
From: Neil Horman @ 2009-05-14 12:33 UTC (permalink / raw)
To: Paul E. McKenney; +Cc: Eric Dumazet, netdev, davem
>
> The purpose of list_add_rcu() and list_del_rcu() is to be able to permit
> concurrent readers to traverse the list while you are modifying that same
> list. You still need something to handle concurrent updates to the list.
> The usual approach is to use locks, in this case, to hold trace_state_lock
> across the addition, given that it is already held across the removal.
>
> The reason that I expect holding the lock to be OK is that you cannot
> add elements faster than you delete them long-term, so given that you
> are willing to hold the lock over deletions, you are probably OK also
> holding that same lock over additions. But please let me know if that
> is not the case!
>
> Thanx, Paul
>
Ahh, I did learn something then :). That makes sense, ok. New patch attached.
Patch to add the ability to detect drops in hardware interfaces via dropwatch.
Adds a tracepoint to net_rx_action to signal every time a napi instance is
polled. The dropmon code then periodically checks to see if the rx_frames
counter has changed, and if so, adds a drop notification to the netlink
protocol, using the reserved all-0's vector to indicate the drop location was in
hardware, rather than somewhere in the code.
change notes:
1) added use of trace_state_lock around list_add_rcu operation
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
include/linux/net_dropmon.h | 7 ++
include/trace/napi.h | 11 ++++
net/core/dev.c | 5 +
net/core/drop_monitor.c | 115 ++++++++++++++++++++++++++++++++++++++++++--
net/core/net-traces.c | 4 +
net/core/netpoll.c | 2
6 files changed, 139 insertions(+), 5 deletions(-)
diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
index 0217fb8..9addc62 100644
--- a/include/linux/net_dropmon.h
+++ b/include/linux/net_dropmon.h
@@ -8,6 +8,13 @@ struct net_dm_drop_point {
__u32 count;
};
+#define is_drop_point_hw(x) do {\
+ int ____i, ____j;\
+ for (____i = 0; ____i < 8; i ____i++)\
+ ____j |= x[____i];\
+ ____j;\
+} while (0)
+
#define NET_DM_CFG_VERSION 0
#define NET_DM_CFG_ALERT_COUNT 1
#define NET_DM_CFG_ALERT_DELAY 2
diff --git a/include/trace/napi.h b/include/trace/napi.h
new file mode 100644
index 0000000..7c39eb7
--- /dev/null
+++ b/include/trace/napi.h
@@ -0,0 +1,11 @@
+#ifndef _TRACE_NAPI_H_
+#define _TRACE_NAPI_H_
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(napi_poll,
+ TP_PROTO(struct napi_struct *napi),
+ TP_ARGS(napi));
+
+#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 637ea71..165979c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -126,6 +126,7 @@
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <trace/napi.h>
#include "net-sysfs.h"
@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
* accidently calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state))
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
WARN_ON_ONCE(work > weight);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 2797b71..f449971 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -22,8 +22,10 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <net/genetlink.h>
+#include <net/netevent.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
* and the work handle that will send up
* netlink alerts
*/
-struct sock *dm_sock;
+static int trace_state = TRACE_OFF;
+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
struct per_cpu_dm_data {
struct work_struct dm_alert_work;
@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
struct timer_list send_timer;
};
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
-
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
schedule_work(&data->dm_alert_work);
}
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
@@ -159,26 +170,79 @@ out:
return;
}
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Ratelimit our check time to dm_hw_check_delta jiffies
+ */
+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if ((new_stat->dev == napi->dev) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+static void free_dm_hw_stat(struct rcu_head *head)
+{
+ struct dm_hw_stat_delta *n;
+ n = container_of(head, struct dm_hw_stat_delta, rcu);
+ kfree(n);
+}
+
static int set_all_monitor_traces(int state)
{
int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+
+ spin_lock(&trace_state_lock);
switch (state) {
case TRACE_ON:
rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
break;
default:
rc = 1;
break;
}
+ spin_unlock(&trace_state_lock);
+
if (rc)
return -EINPROGRESS;
+ trace_state = state;
return rc;
}
@@ -204,6 +268,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
return -ENOTSUPP;
}
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct dm_hw_stat_delta *new_stat = NULL;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ INIT_RCU_HEAD(&new_stat->rcu);
+ spin_lock(&trace_state_lock);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ spin_unlock(&trace_state_lock);
+ break;
+ case NETDEV_UNREGISTER:
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == dev)
+ new_stat->dev = NULL;
+ break;
+ }
+ rcu_read_unlock();
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
static struct genl_ops dropmon_ops[] = {
{
@@ -220,6 +316,10 @@ static struct genl_ops dropmon_ops[] = {
},
};
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
static int __init init_net_drop_monitor(void)
{
int cpu;
@@ -243,12 +343,18 @@ static int __init init_net_drop_monitor(void)
ret = genl_register_ops(&net_drop_monitor_family,
&dropmon_ops[i]);
if (ret) {
- printk(KERN_CRIT "failed to register operation %d\n",
+ printk(KERN_CRIT "Failed to register operation %d\n",
dropmon_ops[i].cmd);
goto out_unreg;
}
}
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
rc = 0;
for_each_present_cpu(cpu) {
@@ -259,6 +365,7 @@ static int __init init_net_drop_monitor(void)
data->send_timer.data = cpu;
data->send_timer.function = sched_send_work;
}
+
goto out;
out_unreg:
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb456..b07b25b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
@@ -27,3 +28,6 @@
DEFINE_TRACE(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+DEFINE_TRACE(napi_poll);
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b5873bd..446d3ba 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -24,6 +24,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
+#include <trace/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ trace_napi_poll(napi->dev);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
atomic_dec(&trapped);
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-14 12:33 ` Neil Horman
@ 2009-05-14 12:44 ` Jiri Pirko
2009-05-14 16:17 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Paul E. McKenney
2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Neil Horman
2009-05-14 16:18 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Paul E. McKenney
1 sibling, 2 replies; 36+ messages in thread
From: Jiri Pirko @ 2009-05-14 12:44 UTC (permalink / raw)
To: Neil Horman; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem
Thu, May 14, 2009 at 02:33:00PM CEST, nhorman@tuxdriver.com wrote:
>>
>> The purpose of list_add_rcu() and list_del_rcu() is to be able to permit
>> concurrent readers to traverse the list while you are modifying that same
>> list. You still need something to handle concurrent updates to the list.
>> The usual approach is to use locks, in this case, to hold trace_state_lock
>> across the addition, given that it is already held across the removal.
>>
>> The reason that I expect holding the lock to be OK is that you cannot
>> add elements faster than you delete them long-term, so given that you
>> are willing to hold the lock over deletions, you are probably OK also
>> holding that same lock over additions. But please let me know if that
>> is not the case!
>>
>> Thanx, Paul
>>
>
>
>Ahh, I did learn something then :). That makes sense, ok. New patch attached.
>
>Patch to add the ability to detect drops in hardware interfaces via dropwatch.
>Adds a tracepoint to net_rx_action to signal everytime a napi instance is
>polled. The dropmon code then periodically checks to see if the rx_frames
>counter has changed, and if so, adds a drop notification to the netlink
>protocol, using the reserved all-0's vector to indicate the drop location was in
>hardware, rather than somewhere in the code.
>
>change notes:
>1) added use of trace_state_lock around list_add_rcu operation
>
>
>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>
>
> include/linux/net_dropmon.h | 7 ++
> include/trace/napi.h | 11 ++++
> net/core/dev.c | 5 +
> net/core/drop_monitor.c | 115 ++++++++++++++++++++++++++++++++++++++++++--
> net/core/net-traces.c | 4 +
> net/core/netpoll.c | 2
> 6 files changed, 139 insertions(+), 5 deletions(-)
>
>diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
>index 0217fb8..9addc62 100644
>--- a/include/linux/net_dropmon.h
>+++ b/include/linux/net_dropmon.h
>@@ -8,6 +8,13 @@ struct net_dm_drop_point {
> __u32 count;
> };
>
>+#define is_drop_point_hw(x) do {\
>+ int ____i, ____j;\
>+ for (____i = 0; ____i < 8; i ____i++)\
>+ ____j |= x[____i];\
>+ ____j;\
>+} while (0)
>+
> #define NET_DM_CFG_VERSION 0
> #define NET_DM_CFG_ALERT_COUNT 1
> #define NET_DM_CFG_ALERT_DELAY 2
>diff --git a/include/trace/napi.h b/include/trace/napi.h
>new file mode 100644
>index 0000000..7c39eb7
>--- /dev/null
>+++ b/include/trace/napi.h
>@@ -0,0 +1,11 @@
>+#ifndef _TRACE_NAPI_H_
>+#define _TRACE_NAPI_H_
>+
>+#include <linux/skbuff.h>
>+#include <linux/tracepoint.h>
>+
>+DECLARE_TRACE(napi_poll,
>+ TP_PROTO(struct napi_struct *napi),
>+ TP_ARGS(napi));
>+
>+#endif
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 637ea71..165979c 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -126,6 +126,7 @@
> #include <linux/in.h>
> #include <linux/jhash.h>
> #include <linux/random.h>
>+#include <trace/napi.h>
>
> #include "net-sysfs.h"
>
>@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
> * accidently calling ->poll() when NAPI is not scheduled.
> */
> work = 0;
>- if (test_bit(NAPI_STATE_SCHED, &n->state))
>+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> work = n->poll(n, weight);
>+ trace_napi_poll(n);
>+ }
>
> WARN_ON_ONCE(work > weight);
>
>diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
>index 2797b71..f449971 100644
>--- a/net/core/drop_monitor.c
>+++ b/net/core/drop_monitor.c
>@@ -22,8 +22,10 @@
> #include <linux/timer.h>
> #include <linux/bitops.h>
> #include <net/genetlink.h>
>+#include <net/netevent.h>
>
> #include <trace/skb.h>
>+#include <trace/napi.h>
>
> #include <asm/unaligned.h>
>
>@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
> * and the work handle that will send up
> * netlink alerts
> */
>-struct sock *dm_sock;
>+static int trace_state = TRACE_OFF;
>+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
>
> struct per_cpu_dm_data {
> struct work_struct dm_alert_work;
>@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
> struct timer_list send_timer;
> };
>
>+struct dm_hw_stat_delta {
>+ struct net_device *dev;
>+ struct list_head list;
>+ struct rcu_head rcu;
>+ unsigned long last_drop_val;
>+};
>+
> static struct genl_family net_drop_monitor_family = {
> .id = GENL_ID_GENERATE,
> .hdrsize = 0,
>@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
>
> static int dm_hit_limit = 64;
> static int dm_delay = 1;
>-
>+static unsigned long dm_hw_check_delta = 2*HZ;
>+static LIST_HEAD(hw_stats_list);
>
> static void reset_per_cpu_data(struct per_cpu_dm_data *data)
> {
>@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
> schedule_work(&data->dm_alert_work);
> }
>
>-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
>+static void trace_drop_common(struct sk_buff *skb, void *location)
> {
> struct net_dm_alert_msg *msg;
> struct nlmsghdr *nlh;
>@@ -159,26 +170,79 @@ out:
> return;
> }
>
>+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
>+{
>+ trace_drop_common(skb, location);
>+}
>+
>+static void trace_napi_poll_hit(struct napi_struct *napi)
>+{
>+ struct dm_hw_stat_delta *new_stat;
>+
>+ /*
>+ * Ratelimit our check time to dm_hw_check_delta jiffies
>+ */
>+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
>+ return;
>+
>+ rcu_read_lock();
>+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>+ if ((new_stat->dev == napi->dev) &&
>+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
>+ trace_drop_common(NULL, NULL);
>+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
>+ break;
>+ }
>+ }
>+ rcu_read_unlock();
>+}
>+
>+
>+static void free_dm_hw_stat(struct rcu_head *head)
>+{
>+ struct dm_hw_stat_delta *n;
>+ n = container_of(head, struct dm_hw_stat_delta, rcu);
>+ kfree(n);
>+}
>+
> static int set_all_monitor_traces(int state)
> {
> int rc = 0;
>+ struct dm_hw_stat_delta *new_stat = NULL;
>+
>+ spin_lock(&trace_state_lock);
>
> switch (state) {
> case TRACE_ON:
> rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
>+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
> break;
> case TRACE_OFF:
> rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
>+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
>
> tracepoint_synchronize_unregister();
>+
>+ /*
>+ * Clean the device list
>+ */
>+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
^^^^^^^^^^^^^^^^^^^^^^^
This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
Also it would be good to use list_for_each_entry_safe here since you're
modifying the list.
Jirka
>+ if (new_stat->dev == NULL) {
>+ list_del_rcu(&new_stat->list);
>+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
>+ }
>+ }
> break;
> default:
> rc = 1;
> break;
> }
>
>+ spin_unlock(&trace_state_lock);
>+
> if (rc)
> return -EINPROGRESS;
>+ trace_state = state;
> return rc;
> }
>
>@@ -204,6 +268,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
> return -ENOTSUPP;
> }
>
>+static int dropmon_net_event(struct notifier_block *ev_block,
>+ unsigned long event, void *ptr)
>+{
>+ struct net_device *dev = ptr;
>+ struct dm_hw_stat_delta *new_stat = NULL;
>+
>+ switch (event) {
>+ case NETDEV_REGISTER:
>+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
>+
>+ if (!new_stat)
>+ goto out;
>+
>+ new_stat->dev = dev;
>+ INIT_RCU_HEAD(&new_stat->rcu);
>+ spin_lock(&trace_state_lock);
>+ list_add_rcu(&new_stat->list, &hw_stats_list);
>+ spin_unlock(&trace_state_lock);
>+ break;
>+ case NETDEV_UNREGISTER:
>+ rcu_read_lock();
>+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>+ if (new_stat->dev == dev)
>+ new_stat->dev = NULL;
>+ break;
>+ }
>+ rcu_read_unlock();
>+ break;
>+ }
>+out:
>+ return NOTIFY_DONE;
>+}
>
> static struct genl_ops dropmon_ops[] = {
> {
>@@ -220,6 +316,10 @@ static struct genl_ops dropmon_ops[] = {
> },
> };
>
>+static struct notifier_block dropmon_net_notifier = {
>+ .notifier_call = dropmon_net_event
>+};
>+
> static int __init init_net_drop_monitor(void)
> {
> int cpu;
>@@ -243,12 +343,18 @@ static int __init init_net_drop_monitor(void)
> ret = genl_register_ops(&net_drop_monitor_family,
> &dropmon_ops[i]);
> if (ret) {
>- printk(KERN_CRIT "failed to register operation %d\n",
>+ printk(KERN_CRIT "Failed to register operation %d\n",
> dropmon_ops[i].cmd);
> goto out_unreg;
> }
> }
>
>+ rc = register_netdevice_notifier(&dropmon_net_notifier);
>+ if (rc < 0) {
>+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
>+ goto out_unreg;
>+ }
>+
> rc = 0;
>
> for_each_present_cpu(cpu) {
>@@ -259,6 +365,7 @@ static int __init init_net_drop_monitor(void)
> data->send_timer.data = cpu;
> data->send_timer.function = sched_send_work;
> }
>+
> goto out;
>
> out_unreg:
>diff --git a/net/core/net-traces.c b/net/core/net-traces.c
>index c8fb456..b07b25b 100644
>--- a/net/core/net-traces.c
>+++ b/net/core/net-traces.c
>@@ -20,6 +20,7 @@
> #include <linux/netlink.h>
> #include <linux/net_dropmon.h>
> #include <trace/skb.h>
>+#include <trace/napi.h>
>
> #include <asm/unaligned.h>
> #include <asm/bitops.h>
>@@ -27,3 +28,6 @@
>
> DEFINE_TRACE(kfree_skb);
> EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
>+
>+DEFINE_TRACE(napi_poll);
>+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
>diff --git a/net/core/netpoll.c b/net/core/netpoll.c
>index b5873bd..446d3ba 100644
>--- a/net/core/netpoll.c
>+++ b/net/core/netpoll.c
>@@ -24,6 +24,7 @@
> #include <net/tcp.h>
> #include <net/udp.h>
> #include <asm/unaligned.h>
>+#include <trace/napi.h>
>
> /*
> * We maintain a small pool of fully-sized skbs, to make sure the
>@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
> set_bit(NAPI_STATE_NPSVC, &napi->state);
>
> work = napi->poll(napi, budget);
>+ trace_napi_poll(napi->dev);
>
> clear_bit(NAPI_STATE_NPSVC, &napi->state);
> atomic_dec(&trapped);
>--
>To unsubscribe from this list: send the line "unsubscribe netdev" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-14 12:44 ` Jiri Pirko
@ 2009-05-14 16:17 ` Paul E. McKenney
2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Neil Horman
1 sibling, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2009-05-14 16:17 UTC (permalink / raw)
To: Jiri Pirko; +Cc: Neil Horman, Eric Dumazet, netdev, davem
On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
> Thu, May 14, 2009 at 02:33:00PM CEST, nhorman@tuxdriver.com wrote:
> >>
> >> The purpose of list_add_rcu() and list_del_rcu() is to be able to permit
> >> concurrent readers to traverse the list while you are modifying that same
> >> list. You still need something to handle concurrent updates to the list.
> >> The usual approach is to use locks, in this case, to hold trace_state_lock
> >> across the addition, given that it is already held across the removal.
> >>
> >> The reason that I expect holding the lock to be OK is that you cannot
> >> add elements faster than you delete them long-term, so given that you
> >> are willing to hold the lock over deletions, you are probably OK also
> >> holding that same lock over additions. But please let me know if that
> >> is not the case!
> >>
> >> Thanx, Paul
> >>
> >
> >
> >Ahh, I did learn something then :). That makes sense, ok. New patch attached.
> >
> >Patch to add the ability to detect drops in hardware interfaces via dropwatch.
> >Adds a tracepoint to net_rx_action to signal everytime a napi instance is
> >polled. The dropmon code then periodically checks to see if the rx_frames
> >counter has changed, and if so, adds a drop notification to the netlink
> >protocol, using the reserved all-0's vector to indicate the drop location was in
> >hardware, rather than somewhere in the code.
> >
> >change notes:
> >1) added use of trace_state_lock around list_add_rcu operation
> >
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >
> >
> > include/linux/net_dropmon.h | 7 ++
> > include/trace/napi.h | 11 ++++
> > net/core/dev.c | 5 +
> > net/core/drop_monitor.c | 115 ++++++++++++++++++++++++++++++++++++++++++--
> > net/core/net-traces.c | 4 +
> > net/core/netpoll.c | 2
> > 6 files changed, 139 insertions(+), 5 deletions(-)
> >
> >diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
[ . . . ]
> > static int set_all_monitor_traces(int state)
> > {
> > int rc = 0;
> >+ struct dm_hw_stat_delta *new_stat = NULL;
> >+
> >+ spin_lock(&trace_state_lock);
> >
> > switch (state) {
> > case TRACE_ON:
> > rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
> >+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
> > break;
> > case TRACE_OFF:
> > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
> >+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
> >
> > tracepoint_synchronize_unregister();
> >+
> >+ /*
> >+ * Clean the device list
> >+ */
> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> ^^^^^^^^^^^^^^^^^^^^^^^
> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
> Also it would be good to use list_for_each_entry_safe here since you're
> modifying the list.
Either way works, as the list_del_rcu() leaves the forward pointers
intact. So I don't have an opinion either way on this particular piece
of code. More experience will be needed to work out which approach is
less confusing. :-/
If this code was shared between the read side and the update side, then
you really would want to be able to use list_for_each_entry_rcu() on the
update side.
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-14 12:33 ` Neil Horman
2009-05-14 12:44 ` Jiri Pirko
@ 2009-05-14 16:18 ` Paul E. McKenney
1 sibling, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2009-05-14 16:18 UTC (permalink / raw)
To: Neil Horman; +Cc: Eric Dumazet, netdev, davem
On Thu, May 14, 2009 at 08:33:00AM -0400, Neil Horman wrote:
> >
> > The purpose of list_add_rcu() and list_del_rcu() is to be able to permit
> > concurrent readers to traverse the list while you are modifying that same
> > list. You still need something to handle concurrent updates to the list.
> > The usual approach is to use locks, in this case, to hold trace_state_lock
> > across the addition, given that it is already held across the removal.
> >
> > The reason that I expect holding the lock to be OK is that you cannot
> > add elements faster than you delete them long-term, so given that you
> > are willing to hold the lock over deletions, you are probably OK also
> > holding that same lock over additions. But please let me know if that
> > is not the case!
> >
> > Thanx, Paul
> >
>
>
> Ahh, I did learn something then :). That makes sense, ok. New patch attached.
>
> Patch to add the ability to detect drops in hardware interfaces via dropwatch.
> Adds a tracepoint to net_rx_action to signal everytime a napi instance is
> polled. The dropmon code then periodically checks to see if the rx_frames
> counter has changed, and if so, adds a drop notification to the netlink
> protocol, using the reserved all-0's vector to indicate the drop location was in
> hardware, rather than somewhere in the code.
>
> change notes:
> 1) added use of trace_state_lock around list_add_rcu operation
Looks good from an RCU viewpoint!
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>
>
> include/linux/net_dropmon.h | 7 ++
> include/trace/napi.h | 11 ++++
> net/core/dev.c | 5 +
> net/core/drop_monitor.c | 115 ++++++++++++++++++++++++++++++++++++++++++--
> net/core/net-traces.c | 4 +
> net/core/netpoll.c | 2
> 6 files changed, 139 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
> index 0217fb8..9addc62 100644
> --- a/include/linux/net_dropmon.h
> +++ b/include/linux/net_dropmon.h
> @@ -8,6 +8,13 @@ struct net_dm_drop_point {
> __u32 count;
> };
>
> +#define is_drop_point_hw(x) do {\
> + int ____i, ____j;\
> + for (____i = 0; ____i < 8; i ____i++)\
> + ____j |= x[____i];\
> + ____j;\
> +} while (0)
> +
> #define NET_DM_CFG_VERSION 0
> #define NET_DM_CFG_ALERT_COUNT 1
> #define NET_DM_CFG_ALERT_DELAY 2
> diff --git a/include/trace/napi.h b/include/trace/napi.h
> new file mode 100644
> index 0000000..7c39eb7
> --- /dev/null
> +++ b/include/trace/napi.h
> @@ -0,0 +1,11 @@
> +#ifndef _TRACE_NAPI_H_
> +#define _TRACE_NAPI_H_
> +
> +#include <linux/skbuff.h>
> +#include <linux/tracepoint.h>
> +
> +DECLARE_TRACE(napi_poll,
> + TP_PROTO(struct napi_struct *napi),
> + TP_ARGS(napi));
> +
> +#endif
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 637ea71..165979c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -126,6 +126,7 @@
> #include <linux/in.h>
> #include <linux/jhash.h>
> #include <linux/random.h>
> +#include <trace/napi.h>
>
> #include "net-sysfs.h"
>
> @@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
> * accidently calling ->poll() when NAPI is not scheduled.
> */
> work = 0;
> - if (test_bit(NAPI_STATE_SCHED, &n->state))
> + if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> work = n->poll(n, weight);
> + trace_napi_poll(n);
> + }
>
> WARN_ON_ONCE(work > weight);
>
> diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
> index 2797b71..f449971 100644
> --- a/net/core/drop_monitor.c
> +++ b/net/core/drop_monitor.c
> @@ -22,8 +22,10 @@
> #include <linux/timer.h>
> #include <linux/bitops.h>
> #include <net/genetlink.h>
> +#include <net/netevent.h>
>
> #include <trace/skb.h>
> +#include <trace/napi.h>
>
> #include <asm/unaligned.h>
>
> @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
> * and the work handle that will send up
> * netlink alerts
> */
> -struct sock *dm_sock;
> +static int trace_state = TRACE_OFF;
> +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
>
> struct per_cpu_dm_data {
> struct work_struct dm_alert_work;
> @@ -47,6 +50,13 @@ struct per_cpu_dm_data {
> struct timer_list send_timer;
> };
>
> +struct dm_hw_stat_delta {
> + struct net_device *dev;
> + struct list_head list;
> + struct rcu_head rcu;
> + unsigned long last_drop_val;
> +};
> +
> static struct genl_family net_drop_monitor_family = {
> .id = GENL_ID_GENERATE,
> .hdrsize = 0,
> @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
>
> static int dm_hit_limit = 64;
> static int dm_delay = 1;
> -
> +static unsigned long dm_hw_check_delta = 2*HZ;
> +static LIST_HEAD(hw_stats_list);
>
> static void reset_per_cpu_data(struct per_cpu_dm_data *data)
> {
> @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
> schedule_work(&data->dm_alert_work);
> }
>
> -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> +static void trace_drop_common(struct sk_buff *skb, void *location)
> {
> struct net_dm_alert_msg *msg;
> struct nlmsghdr *nlh;
> @@ -159,26 +170,79 @@ out:
> return;
> }
>
> +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> +{
> + trace_drop_common(skb, location);
> +}
> +
> +static void trace_napi_poll_hit(struct napi_struct *napi)
> +{
> + struct dm_hw_stat_delta *new_stat;
> +
> + /*
> + * Ratelimit our check time to dm_hw_check_delta jiffies
> + */
> + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
> + return;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if ((new_stat->dev == napi->dev) &&
> + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
> + trace_drop_common(NULL, NULL);
> + new_stat->last_drop_val = napi->dev->stats.rx_dropped;
> + break;
> + }
> + }
> + rcu_read_unlock();
> +}
> +
> +
> +static void free_dm_hw_stat(struct rcu_head *head)
> +{
> + struct dm_hw_stat_delta *n;
> + n = container_of(head, struct dm_hw_stat_delta, rcu);
> + kfree(n);
> +}
> +
> static int set_all_monitor_traces(int state)
> {
> int rc = 0;
> + struct dm_hw_stat_delta *new_stat = NULL;
> +
> + spin_lock(&trace_state_lock);
>
> switch (state) {
> case TRACE_ON:
> rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
> + rc |= register_trace_napi_poll(trace_napi_poll_hit);
> break;
> case TRACE_OFF:
> rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
> + rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
>
> tracepoint_synchronize_unregister();
> +
> + /*
> + * Clean the device list
> + */
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == NULL) {
> + list_del_rcu(&new_stat->list);
> + call_rcu(&new_stat->rcu, free_dm_hw_stat);
> + }
> + }
> break;
> default:
> rc = 1;
> break;
> }
>
> + spin_unlock(&trace_state_lock);
> +
> if (rc)
> return -EINPROGRESS;
> + trace_state = state;
> return rc;
> }
>
> @@ -204,6 +268,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
> return -ENOTSUPP;
> }
>
> +static int dropmon_net_event(struct notifier_block *ev_block,
> + unsigned long event, void *ptr)
> +{
> + struct net_device *dev = ptr;
> + struct dm_hw_stat_delta *new_stat = NULL;
> +
> + switch (event) {
> + case NETDEV_REGISTER:
> + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> +
> + if (!new_stat)
> + goto out;
> +
> + new_stat->dev = dev;
> + INIT_RCU_HEAD(&new_stat->rcu);
> + spin_lock(&trace_state_lock);
> + list_add_rcu(&new_stat->list, &hw_stats_list);
> + spin_unlock(&trace_state_lock);
> + break;
> + case NETDEV_UNREGISTER:
> + rcu_read_lock();
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == dev)
> + new_stat->dev = NULL;
> + break;
> + }
> + rcu_read_unlock();
> + break;
> + }
> +out:
> + return NOTIFY_DONE;
> +}
>
> static struct genl_ops dropmon_ops[] = {
> {
> @@ -220,6 +316,10 @@ static struct genl_ops dropmon_ops[] = {
> },
> };
>
> +static struct notifier_block dropmon_net_notifier = {
> + .notifier_call = dropmon_net_event
> +};
> +
> static int __init init_net_drop_monitor(void)
> {
> int cpu;
> @@ -243,12 +343,18 @@ static int __init init_net_drop_monitor(void)
> ret = genl_register_ops(&net_drop_monitor_family,
> &dropmon_ops[i]);
> if (ret) {
> - printk(KERN_CRIT "failed to register operation %d\n",
> + printk(KERN_CRIT "Failed to register operation %d\n",
> dropmon_ops[i].cmd);
> goto out_unreg;
> }
> }
>
> + rc = register_netdevice_notifier(&dropmon_net_notifier);
> + if (rc < 0) {
> + printk(KERN_CRIT "Failed to register netdevice notifier\n");
> + goto out_unreg;
> + }
> +
> rc = 0;
>
> for_each_present_cpu(cpu) {
> @@ -259,6 +365,7 @@ static int __init init_net_drop_monitor(void)
> data->send_timer.data = cpu;
> data->send_timer.function = sched_send_work;
> }
> +
> goto out;
>
> out_unreg:
> diff --git a/net/core/net-traces.c b/net/core/net-traces.c
> index c8fb456..b07b25b 100644
> --- a/net/core/net-traces.c
> +++ b/net/core/net-traces.c
> @@ -20,6 +20,7 @@
> #include <linux/netlink.h>
> #include <linux/net_dropmon.h>
> #include <trace/skb.h>
> +#include <trace/napi.h>
>
> #include <asm/unaligned.h>
> #include <asm/bitops.h>
> @@ -27,3 +28,6 @@
>
> DEFINE_TRACE(kfree_skb);
> EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
> +
> +DEFINE_TRACE(napi_poll);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
> diff --git a/net/core/netpoll.c b/net/core/netpoll.c
> index b5873bd..446d3ba 100644
> --- a/net/core/netpoll.c
> +++ b/net/core/netpoll.c
> @@ -24,6 +24,7 @@
> #include <net/tcp.h>
> #include <net/udp.h>
> #include <asm/unaligned.h>
> +#include <trace/napi.h>
>
> /*
> * We maintain a small pool of fully-sized skbs, to make sure the
> @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
> set_bit(NAPI_STATE_NPSVC, &napi->state);
>
> work = napi->poll(napi, budget);
> + trace_napi_poll(napi->dev);
>
> clear_bit(NAPI_STATE_NPSVC, &napi->state);
> atomic_dec(&trapped);
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-14 12:44 ` Jiri Pirko
2009-05-14 16:17 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Paul E. McKenney
@ 2009-05-14 17:29 ` Neil Horman
2009-05-15 5:49 ` Jarek Poplawski
2009-05-15 6:51 ` Jiri Pirko
1 sibling, 2 replies; 36+ messages in thread
From: Neil Horman @ 2009-05-14 17:29 UTC (permalink / raw)
To: Jiri Pirko; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem
On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
> >+
> >+ /*
> >+ * Clean the device list
> >+ */
> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> ^^^^^^^^^^^^^^^^^^^^^^^
> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
> Also it would be good to use list_for_each_entry_safe here since you're
> modifying the list.
>
The definition of list_for_each_entry_rcu specifically says its safe against
list-mutation primitives, so its fine. Although you are correct, in that its
safety is dependent on the protection of rcu_read_lock(), so I'll add that in.
Thanks for the catch! New patch attached
Change notes:
1) Add rcu_read_lock/unlock protection around TRACE_OFF event
Neil
Patch to add the ability to detect drops in hardware interfaces via dropwatch.
Adds a tracepoint to net_rx_action to signal everytime a napi instance is
polled. The dropmon code then periodically checks to see if the rx_frames
counter has changed, and if so, adds a drop notification to the netlink
protocol, using the reserved all-0's vector to indicate the drop location was in
hardware, rather than somewhere in the code.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
include/linux/net_dropmon.h | 7 ++
include/trace/napi.h | 11 ++++
net/core/dev.c | 5 +
net/core/drop_monitor.c | 117 ++++++++++++++++++++++++++++++++++++++++++--
net/core/net-traces.c | 4 +
net/core/netpoll.c | 2
6 files changed, 141 insertions(+), 5 deletions(-)
diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
index 0217fb8..9addc62 100644
--- a/include/linux/net_dropmon.h
+++ b/include/linux/net_dropmon.h
@@ -8,6 +8,13 @@ struct net_dm_drop_point {
__u32 count;
};
+#define is_drop_point_hw(x) do {\
+ int ____i, ____j;\
+ for (____i = 0; ____i < 8; i ____i++)\
+ ____j |= x[____i];\
+ ____j;\
+} while (0)
+
#define NET_DM_CFG_VERSION 0
#define NET_DM_CFG_ALERT_COUNT 1
#define NET_DM_CFG_ALERT_DELAY 2
diff --git a/include/trace/napi.h b/include/trace/napi.h
new file mode 100644
index 0000000..7c39eb7
--- /dev/null
+++ b/include/trace/napi.h
@@ -0,0 +1,11 @@
+#ifndef _TRACE_NAPI_H_
+#define _TRACE_NAPI_H_
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(napi_poll,
+ TP_PROTO(struct napi_struct *napi),
+ TP_ARGS(napi));
+
+#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 637ea71..165979c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -126,6 +126,7 @@
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <trace/napi.h>
#include "net-sysfs.h"
@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
* accidently calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state))
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
WARN_ON_ONCE(work > weight);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 2797b71..f9130d5 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -22,8 +22,10 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <net/genetlink.h>
+#include <net/netevent.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
* and the work handle that will send up
* netlink alerts
*/
-struct sock *dm_sock;
+static int trace_state = TRACE_OFF;
+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
struct per_cpu_dm_data {
struct work_struct dm_alert_work;
@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
struct timer_list send_timer;
};
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
-
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
schedule_work(&data->dm_alert_work);
}
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
@@ -159,26 +170,81 @@ out:
return;
}
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Ratelimit our check time to dm_hw_check_delta jiffies
+ */
+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if ((new_stat->dev == napi->dev) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+static void free_dm_hw_stat(struct rcu_head *head)
+{
+ struct dm_hw_stat_delta *n;
+ n = container_of(head, struct dm_hw_stat_delta, rcu);
+ kfree(n);
+}
+
static int set_all_monitor_traces(int state)
{
int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+
+ spin_lock(&trace_state_lock);
switch (state) {
case TRACE_ON:
rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
+ rcu_read_unlock();
break;
default:
rc = 1;
break;
}
+ spin_unlock(&trace_state_lock);
+
if (rc)
return -EINPROGRESS;
+ trace_state = state;
return rc;
}
@@ -204,6 +270,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
return -ENOTSUPP;
}
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct dm_hw_stat_delta *new_stat = NULL;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ INIT_RCU_HEAD(&new_stat->rcu);
+ spin_lock(&trace_state_lock);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ spin_unlock(&trace_state_lock);
+ break;
+ case NETDEV_UNREGISTER:
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == dev)
+ new_stat->dev = NULL;
+ break;
+ }
+ rcu_read_unlock();
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
static struct genl_ops dropmon_ops[] = {
{
@@ -220,6 +318,10 @@ static struct genl_ops dropmon_ops[] = {
},
};
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
static int __init init_net_drop_monitor(void)
{
int cpu;
@@ -243,12 +345,18 @@ static int __init init_net_drop_monitor(void)
ret = genl_register_ops(&net_drop_monitor_family,
&dropmon_ops[i]);
if (ret) {
- printk(KERN_CRIT "failed to register operation %d\n",
+ printk(KERN_CRIT "Failed to register operation %d\n",
dropmon_ops[i].cmd);
goto out_unreg;
}
}
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
rc = 0;
for_each_present_cpu(cpu) {
@@ -259,6 +367,7 @@ static int __init init_net_drop_monitor(void)
data->send_timer.data = cpu;
data->send_timer.function = sched_send_work;
}
+
goto out;
out_unreg:
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb456..b07b25b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
@@ -27,3 +28,6 @@
DEFINE_TRACE(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+DEFINE_TRACE(napi_poll);
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b5873bd..446d3ba 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -24,6 +24,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
+#include <trace/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ trace_napi_poll(napi->dev);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
atomic_dec(&trapped);
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Neil Horman
@ 2009-05-15 5:49 ` Jarek Poplawski
2009-05-15 11:01 ` Neil Horman
2009-05-15 6:51 ` Jiri Pirko
1 sibling, 1 reply; 36+ messages in thread
From: Jarek Poplawski @ 2009-05-15 5:49 UTC (permalink / raw)
To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem
On 14-05-2009 19:29, Neil Horman wrote:
> On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
>>> +
>>> + /*
>>> + * Clean the device list
>>> + */
>>> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>> ^^^^^^^^^^^^^^^^^^^^^^^
>> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
>> Also it would be good to use list_for_each_entry_safe here since you're
>> modifying the list.
>>
>
> The definition of list_for_each_entry_rcu specifically says its safe against
> list-mutation primitives, so its fine. Although you are correct, in that its
> safety is dependent on the protection of rcu_read_lock(), so I'll add that in.
> Thanks for the catch! New patch attached
>
> Change notes:
> 1) Add rcu_read_lock/unlock protection around TRACE_OFF event
>
> Neil
...
> static int set_all_monitor_traces(int state)
> {
> int rc = 0;
> + struct dm_hw_stat_delta *new_stat = NULL;
> +
> + spin_lock(&trace_state_lock);
>
> switch (state) {
> case TRACE_ON:
> rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
> + rc |= register_trace_napi_poll(trace_napi_poll_hit);
> break;
> case TRACE_OFF:
> rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
> + rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
>
> tracepoint_synchronize_unregister();
> +
> + /*
> + * Clean the device list
> + */
> + rcu_read_lock();
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == NULL) {
> + list_del_rcu(&new_stat->list);
> + call_rcu(&new_stat->rcu, free_dm_hw_stat);
> + }
> + }
> + rcu_read_unlock();
IMHO it looks worse now. rcu_read_lock() suggests it's a read side,
and spin_lock(&trace_state_lock) protects something else.
Jarek P.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Neil Horman
2009-05-15 5:49 ` Jarek Poplawski
@ 2009-05-15 6:51 ` Jiri Pirko
2009-05-15 7:37 ` Eric Dumazet
2009-05-15 10:59 ` Neil Horman
1 sibling, 2 replies; 36+ messages in thread
From: Jiri Pirko @ 2009-05-15 6:51 UTC (permalink / raw)
To: Neil Horman; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem
Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote:
>On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
>> >+
>> >+ /*
>> >+ * Clean the device list
>> >+ */
>> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>> ^^^^^^^^^^^^^^^^^^^^^^^
>> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
>> Also it would be good to use list_for_each_entry_safe here since you're
>> modifying the list.
>>
>
>The definition of list_for_each_entry_rcu specifically says its safe against
>list-mutation primitives, so its fine. Although you are correct, in that its
>safety is dependent on the protection of rcu_read_lock(), so I'll add that in.
You are right that list_for_each_entry_rcu is safe against list-mutation
primitives. But there's no need for this on the update side when you hold a writer
spinlock. Here I think it's better (and also less confusing) to use ordinary
list_for_each_entry which in this case must be list_for_each_entry_safe.
Jirka
>Thanks for the catch! New patch attached
>
>Change notes:
>1) Add rcu_read_lock/unlock protection around TRACE_OFF event
>
>Neil
>
>
>Patch to add the ability to detect drops in hardware interfaces via dropwatch.
>Adds a tracepoint to net_rx_action to signal everytime a napi instance is
>polled. The dropmon code then periodically checks to see if the rx_frames
>counter has changed, and if so, adds a drop notification to the netlink
>protocol, using the reserved all-0's vector to indicate the drop location was in
>hardware, rather than somewhere in the code.
>
>Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>
>
> include/linux/net_dropmon.h | 7 ++
> include/trace/napi.h | 11 ++++
> net/core/dev.c | 5 +
> net/core/drop_monitor.c | 117 ++++++++++++++++++++++++++++++++++++++++++--
> net/core/net-traces.c | 4 +
> net/core/netpoll.c | 2
> 6 files changed, 141 insertions(+), 5 deletions(-)
>
>diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
>index 0217fb8..9addc62 100644
>--- a/include/linux/net_dropmon.h
>+++ b/include/linux/net_dropmon.h
>@@ -8,6 +8,13 @@ struct net_dm_drop_point {
> __u32 count;
> };
>
>+#define is_drop_point_hw(x) do {\
>+ int ____i, ____j;\
>+ for (____i = 0; ____i < 8; i ____i++)\
>+ ____j |= x[____i];\
>+ ____j;\
>+} while (0)
>+
> #define NET_DM_CFG_VERSION 0
> #define NET_DM_CFG_ALERT_COUNT 1
> #define NET_DM_CFG_ALERT_DELAY 2
>diff --git a/include/trace/napi.h b/include/trace/napi.h
>new file mode 100644
>index 0000000..7c39eb7
>--- /dev/null
>+++ b/include/trace/napi.h
>@@ -0,0 +1,11 @@
>+#ifndef _TRACE_NAPI_H_
>+#define _TRACE_NAPI_H_
>+
>+#include <linux/skbuff.h>
>+#include <linux/tracepoint.h>
>+
>+DECLARE_TRACE(napi_poll,
>+ TP_PROTO(struct napi_struct *napi),
>+ TP_ARGS(napi));
>+
>+#endif
>diff --git a/net/core/dev.c b/net/core/dev.c
>index 637ea71..165979c 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -126,6 +126,7 @@
> #include <linux/in.h>
> #include <linux/jhash.h>
> #include <linux/random.h>
>+#include <trace/napi.h>
>
> #include "net-sysfs.h"
>
>@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
> * accidently calling ->poll() when NAPI is not scheduled.
> */
> work = 0;
>- if (test_bit(NAPI_STATE_SCHED, &n->state))
>+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> work = n->poll(n, weight);
>+ trace_napi_poll(n);
>+ }
>
> WARN_ON_ONCE(work > weight);
>
>diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
>index 2797b71..f9130d5 100644
>--- a/net/core/drop_monitor.c
>+++ b/net/core/drop_monitor.c
>@@ -22,8 +22,10 @@
> #include <linux/timer.h>
> #include <linux/bitops.h>
> #include <net/genetlink.h>
>+#include <net/netevent.h>
>
> #include <trace/skb.h>
>+#include <trace/napi.h>
>
> #include <asm/unaligned.h>
>
>@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
> * and the work handle that will send up
> * netlink alerts
> */
>-struct sock *dm_sock;
>+static int trace_state = TRACE_OFF;
>+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
>
> struct per_cpu_dm_data {
> struct work_struct dm_alert_work;
>@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
> struct timer_list send_timer;
> };
>
>+struct dm_hw_stat_delta {
>+ struct net_device *dev;
>+ struct list_head list;
>+ struct rcu_head rcu;
>+ unsigned long last_drop_val;
>+};
>+
> static struct genl_family net_drop_monitor_family = {
> .id = GENL_ID_GENERATE,
> .hdrsize = 0,
>@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
>
> static int dm_hit_limit = 64;
> static int dm_delay = 1;
>-
>+static unsigned long dm_hw_check_delta = 2*HZ;
>+static LIST_HEAD(hw_stats_list);
>
> static void reset_per_cpu_data(struct per_cpu_dm_data *data)
> {
>@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
> schedule_work(&data->dm_alert_work);
> }
>
>-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
>+static void trace_drop_common(struct sk_buff *skb, void *location)
> {
> struct net_dm_alert_msg *msg;
> struct nlmsghdr *nlh;
>@@ -159,26 +170,81 @@ out:
> return;
> }
>
>+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
>+{
>+ trace_drop_common(skb, location);
>+}
>+
>+static void trace_napi_poll_hit(struct napi_struct *napi)
>+{
>+ struct dm_hw_stat_delta *new_stat;
>+
>+ /*
>+ * Ratelimit our check time to dm_hw_check_delta jiffies
>+ */
>+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
>+ return;
>+
>+ rcu_read_lock();
>+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>+ if ((new_stat->dev == napi->dev) &&
>+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
>+ trace_drop_common(NULL, NULL);
>+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
>+ break;
>+ }
>+ }
>+ rcu_read_unlock();
>+}
>+
>+
>+static void free_dm_hw_stat(struct rcu_head *head)
>+{
>+ struct dm_hw_stat_delta *n;
>+ n = container_of(head, struct dm_hw_stat_delta, rcu);
>+ kfree(n);
>+}
>+
> static int set_all_monitor_traces(int state)
> {
> int rc = 0;
>+ struct dm_hw_stat_delta *new_stat = NULL;
>+
>+ spin_lock(&trace_state_lock);
>
> switch (state) {
> case TRACE_ON:
> rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
>+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
> break;
> case TRACE_OFF:
> rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
>+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
>
> tracepoint_synchronize_unregister();
>+
>+ /*
>+ * Clean the device list
>+ */
>+ rcu_read_lock();
>+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>+ if (new_stat->dev == NULL) {
>+ list_del_rcu(&new_stat->list);
>+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
>+ }
>+ }
>+ rcu_read_unlock();
> break;
> default:
> rc = 1;
> break;
> }
>
>+ spin_unlock(&trace_state_lock);
>+
> if (rc)
> return -EINPROGRESS;
>+ trace_state = state;
> return rc;
> }
>
>@@ -204,6 +270,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
> return -ENOTSUPP;
> }
>
>+static int dropmon_net_event(struct notifier_block *ev_block,
>+ unsigned long event, void *ptr)
>+{
>+ struct net_device *dev = ptr;
>+ struct dm_hw_stat_delta *new_stat = NULL;
>+
>+ switch (event) {
>+ case NETDEV_REGISTER:
>+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
>+
>+ if (!new_stat)
>+ goto out;
>+
>+ new_stat->dev = dev;
>+ INIT_RCU_HEAD(&new_stat->rcu);
>+ spin_lock(&trace_state_lock);
>+ list_add_rcu(&new_stat->list, &hw_stats_list);
>+ spin_unlock(&trace_state_lock);
>+ break;
>+ case NETDEV_UNREGISTER:
>+ rcu_read_lock();
>+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>+ if (new_stat->dev == dev)
>+ new_stat->dev = NULL;
>+ break;
>+ }
>+ rcu_read_unlock();
>+ break;
>+ }
>+out:
>+ return NOTIFY_DONE;
>+}
>
> static struct genl_ops dropmon_ops[] = {
> {
>@@ -220,6 +318,10 @@ static struct genl_ops dropmon_ops[] = {
> },
> };
>
>+static struct notifier_block dropmon_net_notifier = {
>+ .notifier_call = dropmon_net_event
>+};
>+
> static int __init init_net_drop_monitor(void)
> {
> int cpu;
>@@ -243,12 +345,18 @@ static int __init init_net_drop_monitor(void)
> ret = genl_register_ops(&net_drop_monitor_family,
> &dropmon_ops[i]);
> if (ret) {
>- printk(KERN_CRIT "failed to register operation %d\n",
>+ printk(KERN_CRIT "Failed to register operation %d\n",
> dropmon_ops[i].cmd);
> goto out_unreg;
> }
> }
>
>+ rc = register_netdevice_notifier(&dropmon_net_notifier);
>+ if (rc < 0) {
>+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
>+ goto out_unreg;
>+ }
>+
> rc = 0;
>
> for_each_present_cpu(cpu) {
>@@ -259,6 +367,7 @@ static int __init init_net_drop_monitor(void)
> data->send_timer.data = cpu;
> data->send_timer.function = sched_send_work;
> }
>+
> goto out;
>
> out_unreg:
>diff --git a/net/core/net-traces.c b/net/core/net-traces.c
>index c8fb456..b07b25b 100644
>--- a/net/core/net-traces.c
>+++ b/net/core/net-traces.c
>@@ -20,6 +20,7 @@
> #include <linux/netlink.h>
> #include <linux/net_dropmon.h>
> #include <trace/skb.h>
>+#include <trace/napi.h>
>
> #include <asm/unaligned.h>
> #include <asm/bitops.h>
>@@ -27,3 +28,6 @@
>
> DEFINE_TRACE(kfree_skb);
> EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
>+
>+DEFINE_TRACE(napi_poll);
>+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
>diff --git a/net/core/netpoll.c b/net/core/netpoll.c
>index b5873bd..446d3ba 100644
>--- a/net/core/netpoll.c
>+++ b/net/core/netpoll.c
>@@ -24,6 +24,7 @@
> #include <net/tcp.h>
> #include <net/udp.h>
> #include <asm/unaligned.h>
>+#include <trace/napi.h>
>
> /*
> * We maintain a small pool of fully-sized skbs, to make sure the
>@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
> set_bit(NAPI_STATE_NPSVC, &napi->state);
>
> work = napi->poll(napi, budget);
>+ trace_napi_poll(napi->dev);
>
> clear_bit(NAPI_STATE_NPSVC, &napi->state);
> atomic_dec(&trapped);
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 6:51 ` Jiri Pirko
@ 2009-05-15 7:37 ` Eric Dumazet
2009-05-15 11:12 ` Neil Horman
2009-05-15 10:59 ` Neil Horman
1 sibling, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2009-05-15 7:37 UTC (permalink / raw)
To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem
Jiri Pirko a écrit :
> Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote:
>> On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
>>>> +
>>>> + /*
>>>> + * Clean the device list
>>>> + */
>>>> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>>> ^^^^^^^^^^^^^^^^^^^^^^^
>>> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
>>> Also it would be good to use list_for_each_entry_safe here since you're
>>> modifying the list.
>>>
>> The definition of list_for_each_entry_rcu specifically says its safe against
>> list-mutation primitives, so its fine. Although you are correct, in that its
>> safety is dependent on the protection of rcu_read_lock(), so I'll add that in.
>
> You are right that list_for_each_entry_rcu is safe against list-mutation
> primitives. But there's no need for this on the update side when you hold a writer
> spinlock. Here I think it's better (and also less confusing) to use ordinary
> list_for_each_entry which in this case must be list_for_each_entry_safe.
Absolutely.
RCU is tricky, and must be well understood. Using the right verbs is really needed
to help everybody read the code and be able to understand it quickly and maintain it
if needed.
In this particular case, the list_for_each_entry_rcu() is a little bit more
expensive than regular list_for_each_entry(), as it includes additional barriers.
Just reading code like this :
+ spin_lock(&trace_state_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
+ rcu_read_unlock();
+ spin_unlock(&trace_state_lock);
We *know* something is wrong, as rcu_read_lock() is supposed to guard a read side,
so having both rcu_read_lock() and list_del_rcu() is certainly wrong; we don't have
to actually understand what is really done by the algorithm.
So following is much better : (but maybe not correct... I let you
check if list_for_each_entry_safe() would not be better here, since
you delete elements in a list while iterating in it)
Maybe you can add a break; after call_rcu() if you know only one element
can match the "if (new_stat->dev == NULL) " condition, and use normal list_for_each_entry()
+ spin_lock(&trace_state_lock);
+ list_for_each_entry(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
+ spin_unlock(&trace_state_lock);
Even if the 'wrong' version was not buggy (no crashes or corruption), the 'right' one
is a commonly used construct in the kernel that most devs are able to read without asking
themselves "is it correct or not? Don't we have something strange here?"
Neil, you need to read more code playing with RCU to get familiar with it; we all made the same
errors in the past :)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 6:51 ` Jiri Pirko
2009-05-15 7:37 ` Eric Dumazet
@ 2009-05-15 10:59 ` Neil Horman
2009-05-15 11:27 ` Jiri Pirko
1 sibling, 1 reply; 36+ messages in thread
From: Neil Horman @ 2009-05-15 10:59 UTC (permalink / raw)
To: Jiri Pirko; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem
On Fri, May 15, 2009 at 08:51:02AM +0200, Jiri Pirko wrote:
> Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote:
> >On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
> >> >+
> >> >+ /*
> >> >+ * Clean the device list
> >> >+ */
> >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> >> ^^^^^^^^^^^^^^^^^^^^^^^
> >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
> >> Also it would be good to use list_for_each_entry_safe here since you're
> >> modifying the list.
> >>
> >
> >The definition of list_for_each_entry_rcu specifically says its safe against
> >list-mutation primitives, so its fine. Although you are correct, in that its
> >safety is dependent on the protection of rcu_read_lock(), so I'll add that in.
>
> You are right that list_for_each_entry_rcu is safe against list-mutation
> primitives. But there's no need for this on the update side when you hold a writer
> spinlock. Here I think it's better (and also less confusing) to use ordinary
> list_for_each_entry which in this case must be list_for_each_entry_safe.
>
> Jirka
>
I disagree. I think visually it's more understandable with the rcu variant, and
that spinlock isn't a writer spinlock; the other uses of this list don't use
that lock (to improve performance). The spinlock is there simply to serialize
changes to the trace state (so that one cpu doesn't try to turn dropmon on, while
another tries to turn it off). So we need the rcu variant to protect the list
removal against concurrent use on the reader side in the new registered trace
point function.
Neil
> >Thanks for the catch! New patch attached
> >
> >Change notes:
> >1) Add rcu_read_lock/unlock protection around TRACE_OFF event
> >
> >Neil
> >
> >
> >Patch to add the ability to detect drops in hardware interfaces via dropwatch.
> >Adds a tracepoint to net_rx_action to signal everytime a napi instance is
> >polled. The dropmon code then periodically checks to see if the rx_frames
> >counter has changed, and if so, adds a drop notification to the netlink
> >protocol, using the reserved all-0's vector to indicate the drop location was in
> >hardware, rather than somewhere in the code.
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >
> >
> > include/linux/net_dropmon.h | 7 ++
> > include/trace/napi.h | 11 ++++
> > net/core/dev.c | 5 +
> > net/core/drop_monitor.c | 117 ++++++++++++++++++++++++++++++++++++++++++--
> > net/core/net-traces.c | 4 +
> > net/core/netpoll.c | 2
> > 6 files changed, 141 insertions(+), 5 deletions(-)
> >
> >diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
> >index 0217fb8..9addc62 100644
> >--- a/include/linux/net_dropmon.h
> >+++ b/include/linux/net_dropmon.h
> >@@ -8,6 +8,13 @@ struct net_dm_drop_point {
> > __u32 count;
> > };
> >
> >+#define is_drop_point_hw(x) do {\
> >+ int ____i, ____j;\
> >+ for (____i = 0; ____i < 8; i ____i++)\
> >+ ____j |= x[____i];\
> >+ ____j;\
> >+} while (0)
> >+
> > #define NET_DM_CFG_VERSION 0
> > #define NET_DM_CFG_ALERT_COUNT 1
> > #define NET_DM_CFG_ALERT_DELAY 2
> >diff --git a/include/trace/napi.h b/include/trace/napi.h
> >new file mode 100644
> >index 0000000..7c39eb7
> >--- /dev/null
> >+++ b/include/trace/napi.h
> >@@ -0,0 +1,11 @@
> >+#ifndef _TRACE_NAPI_H_
> >+#define _TRACE_NAPI_H_
> >+
> >+#include <linux/skbuff.h>
> >+#include <linux/tracepoint.h>
> >+
> >+DECLARE_TRACE(napi_poll,
> >+ TP_PROTO(struct napi_struct *napi),
> >+ TP_ARGS(napi));
> >+
> >+#endif
> >diff --git a/net/core/dev.c b/net/core/dev.c
> >index 637ea71..165979c 100644
> >--- a/net/core/dev.c
> >+++ b/net/core/dev.c
> >@@ -126,6 +126,7 @@
> > #include <linux/in.h>
> > #include <linux/jhash.h>
> > #include <linux/random.h>
> >+#include <trace/napi.h>
> >
> > #include "net-sysfs.h"
> >
> >@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
> > * accidently calling ->poll() when NAPI is not scheduled.
> > */
> > work = 0;
> >- if (test_bit(NAPI_STATE_SCHED, &n->state))
> >+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> > work = n->poll(n, weight);
> >+ trace_napi_poll(n);
> >+ }
> >
> > WARN_ON_ONCE(work > weight);
> >
> >diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
> >index 2797b71..f9130d5 100644
> >--- a/net/core/drop_monitor.c
> >+++ b/net/core/drop_monitor.c
> >@@ -22,8 +22,10 @@
> > #include <linux/timer.h>
> > #include <linux/bitops.h>
> > #include <net/genetlink.h>
> >+#include <net/netevent.h>
> >
> > #include <trace/skb.h>
> >+#include <trace/napi.h>
> >
> > #include <asm/unaligned.h>
> >
> >@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
> > * and the work handle that will send up
> > * netlink alerts
> > */
> >-struct sock *dm_sock;
> >+static int trace_state = TRACE_OFF;
> >+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
> >
> > struct per_cpu_dm_data {
> > struct work_struct dm_alert_work;
> >@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
> > struct timer_list send_timer;
> > };
> >
> >+struct dm_hw_stat_delta {
> >+ struct net_device *dev;
> >+ struct list_head list;
> >+ struct rcu_head rcu;
> >+ unsigned long last_drop_val;
> >+};
> >+
> > static struct genl_family net_drop_monitor_family = {
> > .id = GENL_ID_GENERATE,
> > .hdrsize = 0,
> >@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
> >
> > static int dm_hit_limit = 64;
> > static int dm_delay = 1;
> >-
> >+static unsigned long dm_hw_check_delta = 2*HZ;
> >+static LIST_HEAD(hw_stats_list);
> >
> > static void reset_per_cpu_data(struct per_cpu_dm_data *data)
> > {
> >@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
> > schedule_work(&data->dm_alert_work);
> > }
> >
> >-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> >+static void trace_drop_common(struct sk_buff *skb, void *location)
> > {
> > struct net_dm_alert_msg *msg;
> > struct nlmsghdr *nlh;
> >@@ -159,26 +170,81 @@ out:
> > return;
> > }
> >
> >+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
> >+{
> >+ trace_drop_common(skb, location);
> >+}
> >+
> >+static void trace_napi_poll_hit(struct napi_struct *napi)
> >+{
> >+ struct dm_hw_stat_delta *new_stat;
> >+
> >+ /*
> >+ * Ratelimit our check time to dm_hw_check_delta jiffies
> >+ */
> >+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
> >+ return;
> >+
> >+ rcu_read_lock();
> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> >+ if ((new_stat->dev == napi->dev) &&
> >+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
> >+ trace_drop_common(NULL, NULL);
> >+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
> >+ break;
> >+ }
> >+ }
> >+ rcu_read_unlock();
> >+}
> >+
> >+
> >+static void free_dm_hw_stat(struct rcu_head *head)
> >+{
> >+ struct dm_hw_stat_delta *n;
> >+ n = container_of(head, struct dm_hw_stat_delta, rcu);
> >+ kfree(n);
> >+}
> >+
> > static int set_all_monitor_traces(int state)
> > {
> > int rc = 0;
> >+ struct dm_hw_stat_delta *new_stat = NULL;
> >+
> >+ spin_lock(&trace_state_lock);
> >
> > switch (state) {
> > case TRACE_ON:
> > rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
> >+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
> > break;
> > case TRACE_OFF:
> > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
> >+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
> >
> > tracepoint_synchronize_unregister();
> >+
> >+ /*
> >+ * Clean the device list
> >+ */
> >+ rcu_read_lock();
> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> >+ if (new_stat->dev == NULL) {
> >+ list_del_rcu(&new_stat->list);
> >+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
> >+ }
> >+ }
> >+ rcu_read_unlock();
> > break;
> > default:
> > rc = 1;
> > break;
> > }
> >
> >+ spin_unlock(&trace_state_lock);
> >+
> > if (rc)
> > return -EINPROGRESS;
> >+ trace_state = state;
> > return rc;
> > }
> >
> >@@ -204,6 +270,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
> > return -ENOTSUPP;
> > }
> >
> >+static int dropmon_net_event(struct notifier_block *ev_block,
> >+ unsigned long event, void *ptr)
> >+{
> >+ struct net_device *dev = ptr;
> >+ struct dm_hw_stat_delta *new_stat = NULL;
> >+
> >+ switch (event) {
> >+ case NETDEV_REGISTER:
> >+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> >+
> >+ if (!new_stat)
> >+ goto out;
> >+
> >+ new_stat->dev = dev;
> >+ INIT_RCU_HEAD(&new_stat->rcu);
> >+ spin_lock(&trace_state_lock);
> >+ list_add_rcu(&new_stat->list, &hw_stats_list);
> >+ spin_unlock(&trace_state_lock);
> >+ break;
> >+ case NETDEV_UNREGISTER:
> >+ rcu_read_lock();
> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> >+ if (new_stat->dev == dev)
> >+ new_stat->dev = NULL;
> >+ break;
> >+ }
> >+ rcu_read_unlock();
> >+ break;
> >+ }
> >+out:
> >+ return NOTIFY_DONE;
> >+}
> >
> > static struct genl_ops dropmon_ops[] = {
> > {
> >@@ -220,6 +318,10 @@ static struct genl_ops dropmon_ops[] = {
> > },
> > };
> >
> >+static struct notifier_block dropmon_net_notifier = {
> >+ .notifier_call = dropmon_net_event
> >+};
> >+
> > static int __init init_net_drop_monitor(void)
> > {
> > int cpu;
> >@@ -243,12 +345,18 @@ static int __init init_net_drop_monitor(void)
> > ret = genl_register_ops(&net_drop_monitor_family,
> > &dropmon_ops[i]);
> > if (ret) {
> >- printk(KERN_CRIT "failed to register operation %d\n",
> >+ printk(KERN_CRIT "Failed to register operation %d\n",
> > dropmon_ops[i].cmd);
> > goto out_unreg;
> > }
> > }
> >
> >+ rc = register_netdevice_notifier(&dropmon_net_notifier);
> >+ if (rc < 0) {
> >+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
> >+ goto out_unreg;
> >+ }
> >+
> > rc = 0;
> >
> > for_each_present_cpu(cpu) {
> >@@ -259,6 +367,7 @@ static int __init init_net_drop_monitor(void)
> > data->send_timer.data = cpu;
> > data->send_timer.function = sched_send_work;
> > }
> >+
> > goto out;
> >
> > out_unreg:
> >diff --git a/net/core/net-traces.c b/net/core/net-traces.c
> >index c8fb456..b07b25b 100644
> >--- a/net/core/net-traces.c
> >+++ b/net/core/net-traces.c
> >@@ -20,6 +20,7 @@
> > #include <linux/netlink.h>
> > #include <linux/net_dropmon.h>
> > #include <trace/skb.h>
> >+#include <trace/napi.h>
> >
> > #include <asm/unaligned.h>
> > #include <asm/bitops.h>
> >@@ -27,3 +28,6 @@
> >
> > DEFINE_TRACE(kfree_skb);
> > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
> >+
> >+DEFINE_TRACE(napi_poll);
> >+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
> >diff --git a/net/core/netpoll.c b/net/core/netpoll.c
> >index b5873bd..446d3ba 100644
> >--- a/net/core/netpoll.c
> >+++ b/net/core/netpoll.c
> >@@ -24,6 +24,7 @@
> > #include <net/tcp.h>
> > #include <net/udp.h>
> > #include <asm/unaligned.h>
> >+#include <trace/napi.h>
> >
> > /*
> > * We maintain a small pool of fully-sized skbs, to make sure the
> >@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
> > set_bit(NAPI_STATE_NPSVC, &napi->state);
> >
> > work = napi->poll(napi, budget);
> >+ trace_napi_poll(napi->dev);
> >
> > clear_bit(NAPI_STATE_NPSVC, &napi->state);
> > atomic_dec(&trapped);
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 5:49 ` Jarek Poplawski
@ 2009-05-15 11:01 ` Neil Horman
2009-05-15 11:12 ` Jarek Poplawski
0 siblings, 1 reply; 36+ messages in thread
From: Neil Horman @ 2009-05-15 11:01 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem
On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote:
> On 14-05-2009 19:29, Neil Horman wrote:
> > On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
> >>> +
> >>> + /*
> >>> + * Clean the device list
> >>> + */
> >>> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> >> ^^^^^^^^^^^^^^^^^^^^^^^
> >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
> >> Also it would be good to use list_for_each_entry_safe here since you're
> >> modifying the list.
> >>
> >
> > The definition of list_for_each_entry_rcu specifically says its safe against
> > list-mutation primitives, so its fine. Although you are correct, in that its
> > safety is dependent on the protection of rcu_read_lock(), so I'll add that in.
> > Thanks for the catch! New patch attached
> >
> > Change notes:
> > 1) Add rcu_read_lock/unlock protection around TRACE_OFF event
> >
> > Neil
> ...
> > static int set_all_monitor_traces(int state)
> > {
> > int rc = 0;
> > + struct dm_hw_stat_delta *new_stat = NULL;
> > +
> > + spin_lock(&trace_state_lock);
> >
> > switch (state) {
> > case TRACE_ON:
> > rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
> > + rc |= register_trace_napi_poll(trace_napi_poll_hit);
> > break;
> > case TRACE_OFF:
> > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
> > + rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
> >
> > tracepoint_synchronize_unregister();
> > +
> > + /*
> > + * Clean the device list
> > + */
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > + if (new_stat->dev == NULL) {
> > + list_del_rcu(&new_stat->list);
> > + call_rcu(&new_stat->rcu, free_dm_hw_stat);
> > + }
> > + }
> > + rcu_read_unlock();
>
> IMHO it looks worse now. rcu_read_lock() suggests it's a read side,
> and spin_lock(&trace_state_lock) protects something else.
>
the read lock is required (according to the comments for the list loop
primitive) to protect against the embedded mutation primitive, so it's required.
I understand that it's a bit counterintuitive, but intuition takes a backseat to
functionality. :)
Neil
> Jarek P.
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 11:01 ` Neil Horman
@ 2009-05-15 11:12 ` Jarek Poplawski
2009-05-15 11:15 ` Neil Horman
0 siblings, 1 reply; 36+ messages in thread
From: Jarek Poplawski @ 2009-05-15 11:12 UTC (permalink / raw)
To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem
On Fri, May 15, 2009 at 07:01:41AM -0400, Neil Horman wrote:
> On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote:
...
> > IMHO it looks worse now. rcu_read_lock() suggests it's a read side,
> > and spin_lock(&trace_state_lock) protects something else.
> >
> the read lock is required (according to the comments for the list loop
> primitive) to protect against the embedded mutation primitive, so its required.
> I understand that its a bit counterintuitive, but intuition takes a backseat to
> functionality. :)
> Neil
>
I guess, you missed:
> Looks good from an RCU viewpoint!
>
> Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
for the previous version...
Jarek P.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 7:37 ` Eric Dumazet
@ 2009-05-15 11:12 ` Neil Horman
0 siblings, 0 replies; 36+ messages in thread
From: Neil Horman @ 2009-05-15 11:12 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem
On Fri, May 15, 2009 at 09:37:28AM +0200, Eric Dumazet wrote:
> Jiri Pirko a écrit :
> > Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote:
> >> On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
> >>>> +
> >>>> + /*
> >>>> + * Clean the device list
> >>>> + */
> >>>> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> >>> ^^^^^^^^^^^^^^^^^^^^^^^
> >>> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
> >>> Also it would be good to use list_for_each_entry_safe here since you're
> >>> modifying the list.
> >>>
> >> The definition of list_for_each_entry_rcu specifically says its safe against
> >> list-mutation primitives, so its fine. Although you are correct, in that its
> >> safety is dependent on the protection of rcu_read_lock(), so I'll add that in.
> >
> > You are right that list_for_each_entry_rcu is safe against list-mutation
> > primitives. But there's no need for this on the update side when you hold a writer
> > spinlock. Here I think it's better (and also less confusing) to use ordinary
> > list_for_each_entry which in this case must be list_for_each_entry_safe.
>
> Absolutely.
>
> RCU is tricky, and must be well understood. Using the right verbs is really needed
> to help everybody read the code and be able to understand it quickly and maintain it
> if needed.
>
> In this particular case, the list_for_each_entry_rcu() is a little bit more
> expensive than regular list_for_each_entry(), as it includes additional barriers.
>
> Just reading code like this :
>
> + spin_lock(&trace_state_lock);
> + rcu_read_lock();
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == NULL) {
> + list_del_rcu(&new_stat->list);
> + call_rcu(&new_stat->rcu, free_dm_hw_stat);
> + }
> + }
> + rcu_read_unlock();
> + spin_unlock(&trace_state_lock);
>
> We *know* something is wrong, as rcu_read_lock() is supposed to guard a readside,
> so having both rcu_read_lock() and list_del_rcu() is certainly wrong, we don't have
> to actually understand what is really done by the algorithm.
>
> So following is much better : (but maybe not correct... I let you
> check if list_for_each_entry_safe() would not be better here, since
> you delete elements in a list while iterating in it)
>
> Maybe you can add a break; after call_rcu() if you know only one element
> can match the "if (new_stat->dev == NULL) " condition, and use normal list_for_each_entry()
>
> + spin_lock(&trace_state_lock);
> + list_for_each_entry(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == NULL) {
> + list_del_rcu(&new_stat->list);
> + call_rcu(&new_stat->rcu, free_dm_hw_stat);
> + }
> + }
> + spin_unlock(&trace_state_lock);
>
> Even if the 'wrong' version was not buggy (no crashes or corruption), the 'right' one
> is a commonly used construct in kernel that most dev are able to read without asking
> themselves "is is correct or not ? Dont we have something strange here ?"
>
> Neil, you need to read more code playing with RCU to get familiar with it, we all made the same
> errors in the past :)
>
Thanks :)
So help me understand something here then. The trace_state_lock has no
intention of protecting the actual read side of this list (the napi trace
hook that this patch adds). My understanding of list_for_each_entry_rcu and
list_del_rcu, was that even with a call to list_del_rcu, the list remained
unchanged until a quiescent point was passed by all cpus (which is detected via
the counter in rcu_read_[un]lock. How is it insufficient to use
rcu_read_[un]lock here to implement that protection? I agree it looks
counterintuitive, but it makes sense to me from a function standpoint.
Further to that point, given the example that you have above of what you think
_should_ work, is exactly what I have in the set_all_monitor_trace function
currently (saving for the conversion of list_for_each_entry_rcu to
list_for_each_entry_safe and the removal of the read_lock). If those changes
improve performance, then I can get behind them, but I don't see what problem
they avoid if I don't use them. Can you clarify that?
Regards
Neil
>
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 11:12 ` Jarek Poplawski
@ 2009-05-15 11:15 ` Neil Horman
2009-05-15 11:40 ` Jarek Poplawski
0 siblings, 1 reply; 36+ messages in thread
From: Neil Horman @ 2009-05-15 11:15 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem
On Fri, May 15, 2009 at 11:12:14AM +0000, Jarek Poplawski wrote:
> On Fri, May 15, 2009 at 07:01:41AM -0400, Neil Horman wrote:
> > On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote:
> ...
> > > IMHO it looks worse now. rcu_read_lock() suggests it's a read side,
> > > and spin_lock(&trace_state_lock) protects something else.
> > >
> > the read lock is required (according to the comments for the list loop
> > primitive) to protect against the embedded mutation primitive, so its required.
> > I understand that its a bit counterintuitive, but intuition takes a backseat to
> > functionality. :)
> > Neil
> >
>
> I guess, you missed:
>
> > Looks good from an RCU viewpoint!
> >
> > Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>
> for the previous version...
>
I didn't, our comments passed in flight. Nevertheless, I'm not sure what this
adds (other than additional overhead), which I agree is bad and so should perhaps
be removed, but there are some outstanding questions regarding whether it is needed
in relation to the list primitives I'm using here. According to Eric,
list_for_each_entry_safe might be less intrusive here, and I'm trying to figure
out if I agree. :)
Neil
> Jarek P.
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 10:59 ` Neil Horman
@ 2009-05-15 11:27 ` Jiri Pirko
2009-05-15 16:07 ` Neil Horman
0 siblings, 1 reply; 36+ messages in thread
From: Jiri Pirko @ 2009-05-15 11:27 UTC (permalink / raw)
To: Neil Horman; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem
Fri, May 15, 2009 at 12:59:09PM CEST, nhorman@tuxdriver.com wrote:
>On Fri, May 15, 2009 at 08:51:02AM +0200, Jiri Pirko wrote:
>> Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote:
>> >On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
>> >> >+
>> >> >+ /*
>> >> >+ * Clean the device list
>> >> >+ */
>> >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>> >> ^^^^^^^^^^^^^^^^^^^^^^^
>> >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
>> >> Also it would be good to use list_for_each_entry_safe here since you're
>> >> modifying the list.
>> >>
>> >
>> >The definition of list_for_each_entry_rcu specifically says its safe against
>> >list-mutation primitives, so its fine. Although you are correct, in that its
>> >safety is dependent on the protection of rcu_read_lock(), so I'll add that in.
>>
>> You are right that list_for_each_entry_rcu is safe against list-mutation
>> primitives. But there's no need for this on the update side when you hold a writer
>> spinlock. Here I think it's better (and also less confusing) to use ordinary
>> list_for_each_entry which in this case must be list_for_each_entry_safe.
>>
>> Jirka
>>
>I disagree. I think visually it's more understandable with the rcu variant, and
>that spinlock isn't a writer spinlock, the other uses of this list don't use
>that lock (to improve performance). The spinlock is there simply to serialize
>changes to the trace state (so that one cpu doesn't try to turn dropmon on, while
>another tries to turn it off). So we need the rcu variant to protect the list
>removal against concurrent use on the reader side in the new registered trace
>point function.
Ok, since your spinlock is not a writer lock, you need to protect writers from
concurrent list updates anyway! And this spinlock does that work. Then you
_do not need_ rcu_read_lock to read from the list; you work with it as with an
ordinary list while reading. As Eric wrote, this approach is common.
Jirka
>
>Neil
>
>> >Thanks for the catch! New patch attached
>> >
>> >Change notes:
>> >1) Add rcu_read_lock/unlock protection around TRACE_OFF event
>> >
>> >Neil
>> >
>> >
>> >Patch to add the ability to detect drops in hardware interfaces via dropwatch.
>> >Adds a tracepoint to net_rx_action to signal every time a napi instance is
>> >polled. The dropmon code then periodically checks to see if the rx_frames
>> >counter has changed, and if so, adds a drop notification to the netlink
>> >protocol, using the reserved all-0's vector to indicate the drop location was in
>> >hardware, rather than somewhere in the code.
>> >
>> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>> >
>> >
>> > include/linux/net_dropmon.h | 7 ++
>> > include/trace/napi.h | 11 ++++
>> > net/core/dev.c | 5 +
>> > net/core/drop_monitor.c | 117 ++++++++++++++++++++++++++++++++++++++++++--
>> > net/core/net-traces.c | 4 +
>> > net/core/netpoll.c | 2
>> > 6 files changed, 141 insertions(+), 5 deletions(-)
>> >
>> >diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
>> >index 0217fb8..9addc62 100644
>> >--- a/include/linux/net_dropmon.h
>> >+++ b/include/linux/net_dropmon.h
>> >@@ -8,6 +8,13 @@ struct net_dm_drop_point {
>> > __u32 count;
>> > };
>> >
>> >+#define is_drop_point_hw(x) do {\
>> >+ int ____i, ____j;\
>> >+ for (____i = 0; ____i < 8; i ____i++)\
>> >+ ____j |= x[____i];\
>> >+ ____j;\
>> >+} while (0)
>> >+
>> > #define NET_DM_CFG_VERSION 0
>> > #define NET_DM_CFG_ALERT_COUNT 1
>> > #define NET_DM_CFG_ALERT_DELAY 2
>> >diff --git a/include/trace/napi.h b/include/trace/napi.h
>> >new file mode 100644
>> >index 0000000..7c39eb7
>> >--- /dev/null
>> >+++ b/include/trace/napi.h
>> >@@ -0,0 +1,11 @@
>> >+#ifndef _TRACE_NAPI_H_
>> >+#define _TRACE_NAPI_H_
>> >+
>> >+#include <linux/skbuff.h>
>> >+#include <linux/tracepoint.h>
>> >+
>> >+DECLARE_TRACE(napi_poll,
>> >+ TP_PROTO(struct napi_struct *napi),
>> >+ TP_ARGS(napi));
>> >+
>> >+#endif
>> >diff --git a/net/core/dev.c b/net/core/dev.c
>> >index 637ea71..165979c 100644
>> >--- a/net/core/dev.c
>> >+++ b/net/core/dev.c
>> >@@ -126,6 +126,7 @@
>> > #include <linux/in.h>
>> > #include <linux/jhash.h>
>> > #include <linux/random.h>
>> >+#include <trace/napi.h>
>> >
>> > #include "net-sysfs.h"
>> >
>> >@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
>> > * accidently calling ->poll() when NAPI is not scheduled.
>> > */
>> > work = 0;
>> >- if (test_bit(NAPI_STATE_SCHED, &n->state))
>> >+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
>> > work = n->poll(n, weight);
>> >+ trace_napi_poll(n);
>> >+ }
>> >
>> > WARN_ON_ONCE(work > weight);
>> >
>> >diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
>> >index 2797b71..f9130d5 100644
>> >--- a/net/core/drop_monitor.c
>> >+++ b/net/core/drop_monitor.c
>> >@@ -22,8 +22,10 @@
>> > #include <linux/timer.h>
>> > #include <linux/bitops.h>
>> > #include <net/genetlink.h>
>> >+#include <net/netevent.h>
>> >
>> > #include <trace/skb.h>
>> >+#include <trace/napi.h>
>> >
>> > #include <asm/unaligned.h>
>> >
>> >@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
>> > * and the work handle that will send up
>> > * netlink alerts
>> > */
>> >-struct sock *dm_sock;
>> >+static int trace_state = TRACE_OFF;
>> >+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
>> >
>> > struct per_cpu_dm_data {
>> > struct work_struct dm_alert_work;
>> >@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
>> > struct timer_list send_timer;
>> > };
>> >
>> >+struct dm_hw_stat_delta {
>> >+ struct net_device *dev;
>> >+ struct list_head list;
>> >+ struct rcu_head rcu;
>> >+ unsigned long last_drop_val;
>> >+};
>> >+
>> > static struct genl_family net_drop_monitor_family = {
>> > .id = GENL_ID_GENERATE,
>> > .hdrsize = 0,
>> >@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
>> >
>> > static int dm_hit_limit = 64;
>> > static int dm_delay = 1;
>> >-
>> >+static unsigned long dm_hw_check_delta = 2*HZ;
>> >+static LIST_HEAD(hw_stats_list);
>> >
>> > static void reset_per_cpu_data(struct per_cpu_dm_data *data)
>> > {
>> >@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
>> > schedule_work(&data->dm_alert_work);
>> > }
>> >
>> >-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
>> >+static void trace_drop_common(struct sk_buff *skb, void *location)
>> > {
>> > struct net_dm_alert_msg *msg;
>> > struct nlmsghdr *nlh;
>> >@@ -159,26 +170,81 @@ out:
>> > return;
>> > }
>> >
>> >+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
>> >+{
>> >+ trace_drop_common(skb, location);
>> >+}
>> >+
>> >+static void trace_napi_poll_hit(struct napi_struct *napi)
>> >+{
>> >+ struct dm_hw_stat_delta *new_stat;
>> >+
>> >+ /*
>> >+ * Ratelimit our check time to dm_hw_check_delta jiffies
>> >+ */
>> >+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
>> >+ return;
>> >+
>> >+ rcu_read_lock();
>> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>> >+ if ((new_stat->dev == napi->dev) &&
>> >+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
>> >+ trace_drop_common(NULL, NULL);
>> >+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
>> >+ break;
>> >+ }
>> >+ }
>> >+ rcu_read_unlock();
>> >+}
>> >+
>> >+
>> >+static void free_dm_hw_stat(struct rcu_head *head)
>> >+{
>> >+ struct dm_hw_stat_delta *n;
>> >+ n = container_of(head, struct dm_hw_stat_delta, rcu);
>> >+ kfree(n);
>> >+}
>> >+
>> > static int set_all_monitor_traces(int state)
>> > {
>> > int rc = 0;
>> >+ struct dm_hw_stat_delta *new_stat = NULL;
>> >+
>> >+ spin_lock(&trace_state_lock);
>> >
>> > switch (state) {
>> > case TRACE_ON:
>> > rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
>> >+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
>> > break;
>> > case TRACE_OFF:
>> > rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
>> >+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
>> >
>> > tracepoint_synchronize_unregister();
>> >+
>> >+ /*
>> >+ * Clean the device list
>> >+ */
>> >+ rcu_read_lock();
>> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>> >+ if (new_stat->dev == NULL) {
>> >+ list_del_rcu(&new_stat->list);
>> >+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
>> >+ }
>> >+ }
>> >+ rcu_read_unlock();
>> > break;
>> > default:
>> > rc = 1;
>> > break;
>> > }
>> >
>> >+ spin_unlock(&trace_state_lock);
>> >+
>> > if (rc)
>> > return -EINPROGRESS;
>> >+ trace_state = state;
>> > return rc;
>> > }
>> >
>> >@@ -204,6 +270,38 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
>> > return -ENOTSUPP;
>> > }
>> >
>> >+static int dropmon_net_event(struct notifier_block *ev_block,
>> >+ unsigned long event, void *ptr)
>> >+{
>> >+ struct net_device *dev = ptr;
>> >+ struct dm_hw_stat_delta *new_stat = NULL;
>> >+
>> >+ switch (event) {
>> >+ case NETDEV_REGISTER:
>> >+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
>> >+
>> >+ if (!new_stat)
>> >+ goto out;
>> >+
>> >+ new_stat->dev = dev;
>> >+ INIT_RCU_HEAD(&new_stat->rcu);
>> >+ spin_lock(&trace_state_lock);
>> >+ list_add_rcu(&new_stat->list, &hw_stats_list);
>> >+ spin_unlock(&trace_state_lock);
>> >+ break;
>> >+ case NETDEV_UNREGISTER:
>> >+ rcu_read_lock();
>> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
>> >+ if (new_stat->dev == dev)
>> >+ new_stat->dev = NULL;
>> >+ break;
>> >+ }
>> >+ rcu_read_unlock();
>> >+ break;
>> >+ }
>> >+out:
>> >+ return NOTIFY_DONE;
>> >+}
>> >
>> > static struct genl_ops dropmon_ops[] = {
>> > {
>> >@@ -220,6 +318,10 @@ static struct genl_ops dropmon_ops[] = {
>> > },
>> > };
>> >
>> >+static struct notifier_block dropmon_net_notifier = {
>> >+ .notifier_call = dropmon_net_event
>> >+};
>> >+
>> > static int __init init_net_drop_monitor(void)
>> > {
>> > int cpu;
>> >@@ -243,12 +345,18 @@ static int __init init_net_drop_monitor(void)
>> > ret = genl_register_ops(&net_drop_monitor_family,
>> > &dropmon_ops[i]);
>> > if (ret) {
>> >- printk(KERN_CRIT "failed to register operation %d\n",
>> >+ printk(KERN_CRIT "Failed to register operation %d\n",
>> > dropmon_ops[i].cmd);
>> > goto out_unreg;
>> > }
>> > }
>> >
>> >+ rc = register_netdevice_notifier(&dropmon_net_notifier);
>> >+ if (rc < 0) {
>> >+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
>> >+ goto out_unreg;
>> >+ }
>> >+
>> > rc = 0;
>> >
>> > for_each_present_cpu(cpu) {
>> >@@ -259,6 +367,7 @@ static int __init init_net_drop_monitor(void)
>> > data->send_timer.data = cpu;
>> > data->send_timer.function = sched_send_work;
>> > }
>> >+
>> > goto out;
>> >
>> > out_unreg:
>> >diff --git a/net/core/net-traces.c b/net/core/net-traces.c
>> >index c8fb456..b07b25b 100644
>> >--- a/net/core/net-traces.c
>> >+++ b/net/core/net-traces.c
>> >@@ -20,6 +20,7 @@
>> > #include <linux/netlink.h>
>> > #include <linux/net_dropmon.h>
>> > #include <trace/skb.h>
>> >+#include <trace/napi.h>
>> >
>> > #include <asm/unaligned.h>
>> > #include <asm/bitops.h>
>> >@@ -27,3 +28,6 @@
>> >
>> > DEFINE_TRACE(kfree_skb);
>> > EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
>> >+
>> >+DEFINE_TRACE(napi_poll);
>> >+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
>> >diff --git a/net/core/netpoll.c b/net/core/netpoll.c
>> >index b5873bd..446d3ba 100644
>> >--- a/net/core/netpoll.c
>> >+++ b/net/core/netpoll.c
>> >@@ -24,6 +24,7 @@
>> > #include <net/tcp.h>
>> > #include <net/udp.h>
>> > #include <asm/unaligned.h>
>> >+#include <trace/napi.h>
>> >
>> > /*
>> > * We maintain a small pool of fully-sized skbs, to make sure the
>> >@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
>> > set_bit(NAPI_STATE_NPSVC, &napi->state);
>> >
>> > work = napi->poll(napi, budget);
>> >+ trace_napi_poll(napi->dev);
>> >
>> > clear_bit(NAPI_STATE_NPSVC, &napi->state);
>> > atomic_dec(&trapped);
>>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 11:15 ` Neil Horman
@ 2009-05-15 11:40 ` Jarek Poplawski
2009-05-16 0:07 ` Paul E. McKenney
0 siblings, 1 reply; 36+ messages in thread
From: Jarek Poplawski @ 2009-05-15 11:40 UTC (permalink / raw)
To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem
On Fri, May 15, 2009 at 07:15:30AM -0400, Neil Horman wrote:
> On Fri, May 15, 2009 at 11:12:14AM +0000, Jarek Poplawski wrote:
> > On Fri, May 15, 2009 at 07:01:41AM -0400, Neil Horman wrote:
> > > On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote:
> > ...
> > > > IMHO it looks worse now. rcu_read_lock() suggests it's a read side,
> > > > and spin_lock(&trace_state_lock) protects something else.
> > > >
> > > the read lock is required (according to the comments for the list loop
> > > primitive) to protect against the embedded mutation primitive, so its required.
> > > I understand that its a bit counterintuitive, but intuition takes a backseat to
> > > functionality. :)
> > > Neil
> > >
> >
> > I guess, you missed:
> >
> > > Looks good from an RCU viewpoint!
> > >
> > > Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> >
> > for the previous version...
> >
> I didn't, our comments passed in flight. Nevertheless, I'm not sure what this
> adds (other than additional overhead), which I agree is bad and so might should
> be removed, but there are some outstanding questions regarding if it is needed
> in relation to the list primitives I'm using here. According to Eric,
> list_for_each_entry_safe might be less intrusive here, and I'm trying to figure
> out if I agree. :)
> Neil
Paul "acked" two variants, and Eric prefers one of them. Adding
rcu_read_lock() makes sense only "If this code was shared between the
read side and the update side". Anyway it would need additional
comment. Otherwise it's misleading (but not wrong). And, since Paul
reviewed this, it's definitely not needed here because Paul is simply
always right ;-)
Jarek P.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 11:27 ` Jiri Pirko
@ 2009-05-15 16:07 ` Neil Horman
2009-05-15 18:11 ` Eric Dumazet
2009-05-15 18:18 ` Stephen Hemminger
0 siblings, 2 replies; 36+ messages in thread
From: Neil Horman @ 2009-05-15 16:07 UTC (permalink / raw)
To: Jiri Pirko; +Cc: Paul E. McKenney, Eric Dumazet, netdev, davem
On Fri, May 15, 2009 at 01:27:36PM +0200, Jiri Pirko wrote:
> Fri, May 15, 2009 at 12:59:09PM CEST, nhorman@tuxdriver.com wrote:
> >On Fri, May 15, 2009 at 08:51:02AM +0200, Jiri Pirko wrote:
> >> Thu, May 14, 2009 at 07:29:54PM CEST, nhorman@tuxdriver.com wrote:
> >> >On Thu, May 14, 2009 at 02:44:08PM +0200, Jiri Pirko wrote:
> >> >> >+
> >> >> >+ /*
> >> >> >+ * Clean the device list
> >> >> >+ */
> >> >> >+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> >> >> ^^^^^^^^^^^^^^^^^^^^^^^
> >> >> This is meaningless here. Use list_for_each_entry_rcu only under rcu_read_lock.
> >> >> Also it would be good to use list_for_each_entry_safe here since you're
> >> >> modifying the list.
> >> >>
> >> >
> >> >The definition of list_for_each_entry_rcu specifically says its safe against
> >> >list-mutation primitives, so its fine. Although you are correct, in that its
> >> >safety is dependent on the protection of rcu_read_lock(), so I'll add that in.
> >>
> >> You are right that list_for_each_entry_rcu is safe against list-mutation
> >> primitives. But there's no need for this on the update side when you hold a writer
> >> spinlock. Here I think it's better (and also less confusing) to use ordinary
> >> list_for_each_entry which in this case must be list_for_each_entry_safe.
> >>
> >> Jirka
> >>
> >I disagree. I think visually it's more understandable with the rcu variant, and
> >that spinlock isn't a writer spinlock, the other uses of this list don't use
> >that lock (to improve performance). The spinlock is there simply to serialize
> >changes to the trace state (so that one cpu doesn't try to turn dropmon on, while
> >another tries to turn it off). So we need the rcu variant to protect the list
> >removal against concurrent use on the reader side in the new registered trace
> >point function.
>
> Ok, since your spinlock is not a writer lock, you need to protect writers from
> concurrent list updates anyway! And this spinlock does that work. Then you
> _do not need_ rcu_read_lock to read from the list; you work with it as with an
> ordinary list while reading. As Eric wrote, this approach is common.
>
> Jirka
>
Ok, I've spent the morning looking at several other examples, and re-reading
your comments and Erics, and agree, the rcu_read_lock doesn't need to be there,
and a writer mutex does (which I can use trace_state_lock for). As such, I've
got a new patch below, tested and working. I think it fully takes your and
Eric's comments into account.
V4 Change Notes:
1) Remove rcu_read_lock from TRACE_OFF case in set_all_monitor_traces, since it's
not needed in light of the use of the trace_state_lock to protect list mutations
2) Change list_for_each_entry_rcu to list_for_each_entry_safe, since rcu list
traversal isn't needed here.
3) Fully protect global tracer state with trace_state lock
4) Add freeing code to the NETDEV_UNREGISTER event notification to save on
memory if the tracer is never used (using trace_state_lock properly here too)
Patch to add the ability to detect drops in hardware interfaces via dropwatch.
Adds a tracepoint to net_rx_action to signal every time a napi instance is
polled. The dropmon code then periodically checks to see if the rx_frames
counter has changed, and if so, adds a drop notification to the netlink
protocol, using the reserved all-0's vector to indicate the drop location was in
hardware, rather than somewhere in the code.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
include/linux/net_dropmon.h | 7 ++
include/trace/napi.h | 11 +++
net/core/dev.c | 5 +
net/core/drop_monitor.c | 127 ++++++++++++++++++++++++++++++++++++++++++--
net/core/net-traces.c | 4 +
net/core/netpoll.c | 2
6 files changed, 151 insertions(+), 5 deletions(-)
diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
index 0217fb8..9addc62 100644
--- a/include/linux/net_dropmon.h
+++ b/include/linux/net_dropmon.h
@@ -8,6 +8,13 @@ struct net_dm_drop_point {
__u32 count;
};
+#define is_drop_point_hw(x) do {\
+ int ____i, ____j;\
+ for (____i = 0; ____i < 8; i ____i++)\
+ ____j |= x[____i];\
+ ____j;\
+} while (0)
+
#define NET_DM_CFG_VERSION 0
#define NET_DM_CFG_ALERT_COUNT 1
#define NET_DM_CFG_ALERT_DELAY 2
diff --git a/include/trace/napi.h b/include/trace/napi.h
new file mode 100644
index 0000000..7c39eb7
--- /dev/null
+++ b/include/trace/napi.h
@@ -0,0 +1,11 @@
+#ifndef _TRACE_NAPI_H_
+#define _TRACE_NAPI_H_
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(napi_poll,
+ TP_PROTO(struct napi_struct *napi),
+ TP_ARGS(napi));
+
+#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 637ea71..165979c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -126,6 +126,7 @@
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <trace/napi.h>
#include "net-sysfs.h"
@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
* accidently calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state))
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
WARN_ON_ONCE(work > weight);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 2797b71..4f7e915 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -22,8 +22,10 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <net/genetlink.h>
+#include <net/netevent.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
* and the work handle that will send up
* netlink alerts
*/
-struct sock *dm_sock;
+static int trace_state = TRACE_OFF;
+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
struct per_cpu_dm_data {
struct work_struct dm_alert_work;
@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
struct timer_list send_timer;
};
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
-
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
schedule_work(&data->dm_alert_work);
}
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
@@ -159,24 +170,80 @@ out:
return;
}
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Ratelimit our check time to dm_hw_check_delta jiffies
+ */
+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if ((new_stat->dev == napi->dev) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+static void free_dm_hw_stat(struct rcu_head *head)
+{
+ struct dm_hw_stat_delta *n;
+ n = container_of(head, struct dm_hw_stat_delta, rcu);
+ kfree(n);
+}
+
static int set_all_monitor_traces(int state)
{
int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *temp;
+
+ spin_lock(&trace_state_lock);
switch (state) {
case TRACE_ON:
rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
break;
default:
rc = 1;
break;
}
+ if (!rc)
+ trace_state = state;
+
+ spin_unlock(&trace_state_lock);
+
if (rc)
return -EINPROGRESS;
return rc;
@@ -204,6 +271,47 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
return -ENOTSUPP;
}
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ int found = 0;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ INIT_RCU_HEAD(&new_stat->rcu);
+ spin_lock(&trace_state_lock);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ spin_unlock(&trace_state_lock);
+ break;
+ case NETDEV_UNREGISTER:
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if (new_stat->dev == dev)
+ new_stat->dev = NULL;
+ found = 1;
+ break;
+ }
+ rcu_read_unlock();
+
+ spin_lock(&trace_state_lock);
+ if (found && (trace_state == TRACE_OFF)) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ spin_unlock(&trace_state_lock);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
static struct genl_ops dropmon_ops[] = {
{
@@ -220,6 +328,10 @@ static struct genl_ops dropmon_ops[] = {
},
};
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
static int __init init_net_drop_monitor(void)
{
int cpu;
@@ -243,12 +355,18 @@ static int __init init_net_drop_monitor(void)
ret = genl_register_ops(&net_drop_monitor_family,
&dropmon_ops[i]);
if (ret) {
- printk(KERN_CRIT "failed to register operation %d\n",
+ printk(KERN_CRIT "Failed to register operation %d\n",
dropmon_ops[i].cmd);
goto out_unreg;
}
}
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
rc = 0;
for_each_present_cpu(cpu) {
@@ -259,6 +377,7 @@ static int __init init_net_drop_monitor(void)
data->send_timer.data = cpu;
data->send_timer.function = sched_send_work;
}
+
goto out;
out_unreg:
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb456..b07b25b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
@@ -27,3 +28,6 @@
DEFINE_TRACE(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+DEFINE_TRACE(napi_poll);
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b5873bd..446d3ba 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -24,6 +24,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
+#include <trace/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ trace_napi_poll(napi->dev);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
atomic_dec(&trapped);
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 16:07 ` Neil Horman
@ 2009-05-15 18:11 ` Eric Dumazet
2009-05-15 18:23 ` Stephen Hemminger
` (2 more replies)
2009-05-15 18:18 ` Stephen Hemminger
1 sibling, 3 replies; 36+ messages in thread
From: Eric Dumazet @ 2009-05-15 18:11 UTC (permalink / raw)
To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem
Neil Horman a écrit :
> +static int dropmon_net_event(struct notifier_block *ev_block,
> + unsigned long event, void *ptr)
> +{
> + struct net_device *dev = ptr;
> + struct dm_hw_stat_delta *new_stat = NULL;
> + int found = 0;
> +
> + switch (event) {
> + case NETDEV_REGISTER:
> + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> +
> + if (!new_stat)
> + goto out;
> +
> + new_stat->dev = dev;
> + INIT_RCU_HEAD(&new_stat->rcu);
> + spin_lock(&trace_state_lock);
> + list_add_rcu(&new_stat->list, &hw_stats_list);
> + spin_unlock(&trace_state_lock);
> + break;
> + case NETDEV_UNREGISTER:
> + rcu_read_lock();
> + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> + if (new_stat->dev == dev)
> + new_stat->dev = NULL;
> + found = 1;
> + break;
> + }
> + rcu_read_unlock();
This is racy, unless caller already owns a lock.
If caller already owns a lock, you don't need:
rcu_read_lock()
list_for_each_entry_rcu()
rcu_read_unlock();
> +
> + spin_lock(&trace_state_lock);
> + if (found && (trace_state == TRACE_OFF)) {
> + list_del_rcu(&new_stat->list);
> + call_rcu(&new_stat->rcu, free_dm_hw_stat);
> + }
> + spin_unlock(&trace_state_lock);
> + break;
Its not clear that you use trace_state_lock as the lock guarding all this.
If this is the case I suggest a plain and straight forward :
case NETDEV_UNREGISTER:
spin_lock(&trace_state_lock);
if (trace_state == TRACE_OFF) {
list_for_each_entry(new_stat, &hw_stats_list, list) {
if (new_stat->dev == dev) {
new_stat->dev = NULL;
list_del_rcu(&new_stat->list);
call_rcu(&new_stat->rcu, free_dm_hw_stat);
break;
}
}
}
spin_unlock(&trace_state_lock);
break;
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 16:07 ` Neil Horman
2009-05-15 18:11 ` Eric Dumazet
@ 2009-05-15 18:18 ` Stephen Hemminger
2009-05-15 19:12 ` Neil Horman
1 sibling, 1 reply; 36+ messages in thread
From: Stephen Hemminger @ 2009-05-15 18:18 UTC (permalink / raw)
To: Neil Horman; +Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem
> +#define is_drop_point_hw(x) do {\
> + int ____i, ____j;\
> + for (____i = 0; ____i < 8; i ____i++)\
> + ____j |= x[____i];\
> + ____j;\
> +} while (0)
Would this code be less ugly if it were an inline function?
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 18:11 ` Eric Dumazet
@ 2009-05-15 18:23 ` Stephen Hemminger
2009-05-15 19:53 ` Neil Horman
2009-05-15 19:23 ` Neil Horman
2009-05-16 12:40 ` Neil Horman
2 siblings, 1 reply; 36+ messages in thread
From: Stephen Hemminger @ 2009-05-15 18:23 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Neil Horman, Jiri Pirko, Paul E. McKenney, netdev, davem
On Fri, 15 May 2009 20:11:57 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:
> Neil Horman a écrit :
> > +static int dropmon_net_event(struct notifier_block *ev_block,
> > + unsigned long event, void *ptr)
> > +{
> > + struct net_device *dev = ptr;
> > + struct dm_hw_stat_delta *new_stat = NULL;
> > + int found = 0;
> > +
> > + switch (event) {
> > + case NETDEV_REGISTER:
> > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> > +
> > + if (!new_stat)
> > + goto out;
> > +
> > + new_stat->dev = dev;
> > + INIT_RCU_HEAD(&new_stat->rcu);
> > + spin_lock(&trace_state_lock);
> > + list_add_rcu(&new_stat->list, &hw_stats_list);
> > + spin_unlock(&trace_state_lock);
> > + break;
> > + case NETDEV_UNREGISTER:
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > + if (new_stat->dev == dev)
> > + new_stat->dev = NULL;
> > + found = 1;
> > + break;
> > + }
> > + rcu_read_unlock();
>
> This is racy, unless caller already owns a lock.
>
> If caller already owns a lock, you don't need:
>
> rcu_read_lock()
> list_for_each_entry_rcu()
> rcu_read_unlock();
RTNL mutex is always held on notification call backs.
Actually why is trace_state_lock needed at all? Why not
just use the RTNL mutex?
--
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 18:18 ` Stephen Hemminger
@ 2009-05-15 19:12 ` Neil Horman
0 siblings, 0 replies; 36+ messages in thread
From: Neil Horman @ 2009-05-15 19:12 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Jiri Pirko, Paul E. McKenney, Eric Dumazet, netdev, davem
On Fri, May 15, 2009 at 11:18:52AM -0700, Stephen Hemminger wrote:
>
> > +#define is_drop_point_hw(x) do {\
> > + int ____i, ____j;\
> > + for (____i = 0; ____i < 8; i ____i++)\
> > + ____j |= x[____i];\
> > + ____j;\
> > +} while (0)
>
> Would this code be less ugly if it were an inline function?
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Possibly, I don't actually use that yet, but was planning on it shortly. I'll
fiddle with it and update it in a later patch if another format makes more sense
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 18:11 ` Eric Dumazet
2009-05-15 18:23 ` Stephen Hemminger
@ 2009-05-15 19:23 ` Neil Horman
2009-05-16 12:40 ` Neil Horman
2 siblings, 0 replies; 36+ messages in thread
From: Neil Horman @ 2009-05-15 19:23 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem
On Fri, May 15, 2009 at 08:11:57PM +0200, Eric Dumazet wrote:
> Neil Horman a écrit :
> > +static int dropmon_net_event(struct notifier_block *ev_block,
> > + unsigned long event, void *ptr)
> > +{
> > + struct net_device *dev = ptr;
> > + struct dm_hw_stat_delta *new_stat = NULL;
> > + int found = 0;
> > +
> > + switch (event) {
> > + case NETDEV_REGISTER:
> > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> > +
> > + if (!new_stat)
> > + goto out;
> > +
> > + new_stat->dev = dev;
> > + INIT_RCU_HEAD(&new_stat->rcu);
> > + spin_lock(&trace_state_lock);
> > + list_add_rcu(&new_stat->list, &hw_stats_list);
> > + spin_unlock(&trace_state_lock);
> > + break;
> > + case NETDEV_UNREGISTER:
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > + if (new_stat->dev == dev)
> > + new_stat->dev = NULL;
> > + found = 1;
> > + break;
> > + }
> > + rcu_read_unlock();
>
> This is racy, unless caller already owns a lock.
>
Racy in what way? The rcu_read_lock (as I understand it) prevents list
mutation while we're traversing it here, so we're not going to dereference a
bogus pointer while walking it. If you're worried about changing the value of a
struct member without an exclusive lock, thats safe in this case, as the worst
case scenario is that we miss detecting a hardware frame drop right before we
stop monitoring the device anyway, and I think thats better than the performance
impact of taking a lock here.
> If caller already owns a lock, you don't need:
>
> rcu_read_lock()
> list_for_each_entry_rcu()
> rcu_read_unlock();
>
Stephen below points out that notification callbacks always hold the rtnl_lock,
which I wasn't aware of, but I'm not sure thats ok for me to rely on here (at
least I'd rather not)
> > +
> > + spin_lock(&trace_state_lock);
> > + if (found && (trace_state == TRACE_OFF)) {
> > + list_del_rcu(&new_stat->list);
> > + call_rcu(&new_stat->rcu, free_dm_hw_stat);
> > + }
> > + spin_unlock(&trace_state_lock);
> > + break;
>
>
>
>
> Its not clear that you use trace_state_lock as the lock guarding all this.
>
How so?
> If this is the case I suggest a plain and straight forward :
>
> case NETDEV_UNREGISTER:
> spin_lock(&trace_state_lock);
> if (trace_state == TRACE_OFF) {
> list_for_each_entry(new_stat, &hw_stats_list, list) {
> if (new_stat->dev == dev) {
> new_stat->dev = NULL;
> list_del_rcu(&new_stat->list);
> call_rcu(&new_stat->rcu, free_dm_hw_stat);
> break;
> }
> }
> }
> spin_unlock(&trace_state_lock);
> break;
>
I was hoping to avoid holding the lock while I traversed the entire list. Not a
huge deal I suppose, but I'd like to avoid doing that if I can.
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 18:23 ` Stephen Hemminger
@ 2009-05-15 19:53 ` Neil Horman
0 siblings, 0 replies; 36+ messages in thread
From: Neil Horman @ 2009-05-15 19:53 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Eric Dumazet, Jiri Pirko, Paul E. McKenney, netdev, davem
On Fri, May 15, 2009 at 11:23:45AM -0700, Stephen Hemminger wrote:
> On Fri, 15 May 2009 20:11:57 +0200
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
> > Neil Horman a écrit :
> > > +static int dropmon_net_event(struct notifier_block *ev_block,
> > > + unsigned long event, void *ptr)
> > > +{
> > > + struct net_device *dev = ptr;
> > > + struct dm_hw_stat_delta *new_stat = NULL;
> > > + int found = 0;
> > > +
> > > + switch (event) {
> > > + case NETDEV_REGISTER:
> > > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> > > +
> > > + if (!new_stat)
> > > + goto out;
> > > +
> > > + new_stat->dev = dev;
> > > + INIT_RCU_HEAD(&new_stat->rcu);
> > > + spin_lock(&trace_state_lock);
> > > + list_add_rcu(&new_stat->list, &hw_stats_list);
> > > + spin_unlock(&trace_state_lock);
> > > + break;
> > > + case NETDEV_UNREGISTER:
> > > + rcu_read_lock();
> > > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > > + if (new_stat->dev == dev)
> > > + new_stat->dev = NULL;
> > > + found = 1;
> > > + break;
> > > + }
> > > + rcu_read_unlock();
> >
> > This is racy, unless caller already owns a lock.
> >
> > If caller already owns a lock, you don't need:
> >
> > rcu_read_lock()
> > list_for_each_entry_rcu()
> > rcu_read_unlock();
>
> RTNL mutex is always held on notification call backs.
> Actually why is trace_state_lock needed at all? Why not
> just use the RTNL mutex?
>
There's also a need for this mutex to protect simultaneous state changes to the
tracer from userspace. I suppose I could use the rtnl lock for that, but I
think this is more readable.
Neil
>
>
> --
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 11:40 ` Jarek Poplawski
@ 2009-05-16 0:07 ` Paul E. McKenney
0 siblings, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2009-05-16 0:07 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: Neil Horman, Jiri Pirko, Eric Dumazet, netdev, davem
On Fri, May 15, 2009 at 11:40:29AM +0000, Jarek Poplawski wrote:
> On Fri, May 15, 2009 at 07:15:30AM -0400, Neil Horman wrote:
> > On Fri, May 15, 2009 at 11:12:14AM +0000, Jarek Poplawski wrote:
> > > On Fri, May 15, 2009 at 07:01:41AM -0400, Neil Horman wrote:
> > > > On Fri, May 15, 2009 at 05:49:47AM +0000, Jarek Poplawski wrote:
> > > ...
> > > > > IMHO it looks worse now. rcu_read_lock() suggests it's a read side,
> > > > > and spin_lock(&trace_state_lock) protects something else.
> > > > >
> > > > the read lock is required (according to the comments for the list loop
> > > > primitive) to protect against the embedded mutation primitive, so its required.
> > > > I understand that its a bit counterintuitive, but intuition takes a backseat to
> > > > functionality. :)
> > > > Neil
> > > >
> > >
> > > I guess, you missed:
> > >
> > > > Looks good from an RCU viewpoint!
> > > >
> > > > Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > >
> > > for the previous version...
> > >
> > I didn't, our comments passed in flight. Nevertheless, I'm not sure what this
> > adds (other than additional overhead), which I agree is bad and so might should
> > be removed, but there are some outstanding questions regarding if it is needed
> > in relation to the list primitives I'm using here. According to Eric,
> > list_for_each_entry_safe might be less intrusive here, and I'm trying to figure
> > out if I agree. :)
> > Neil
>
> Paul "acked" two variants, and Eric prefers one of them. Adding
> rcu_read_lock() makes sense only "If this code was shared between the
> read side and the update side". Anyway it would need additional
> comment. Otherwise it's misleading (but not wrong). And, since Paul
> reviewed this, it's definitely not needed here because Paul is simply
> always right ;-)
Much as I appreciate the vote of confidence... ;-)
I believe that both versions work correctly, and that the difference
is therefore a matter of style. My mild preference would be to use
rcu_read_lock() only if there was some possibility that a reader (some
task not holding the update-side lock) would execute this code.
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-15 18:11 ` Eric Dumazet
2009-05-15 18:23 ` Stephen Hemminger
2009-05-15 19:23 ` Neil Horman
@ 2009-05-16 12:40 ` Neil Horman
2009-05-18 14:46 ` Neil Horman
2 siblings, 1 reply; 36+ messages in thread
From: Neil Horman @ 2009-05-16 12:40 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem
On Fri, May 15, 2009 at 08:11:57PM +0200, Eric Dumazet wrote:
> Neil Horman a écrit :
> > +static int dropmon_net_event(struct notifier_block *ev_block,
> > + unsigned long event, void *ptr)
> > +{
> > + struct net_device *dev = ptr;
> > + struct dm_hw_stat_delta *new_stat = NULL;
> > + int found = 0;
> > +
> > + switch (event) {
> > + case NETDEV_REGISTER:
> > + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
> > +
> > + if (!new_stat)
> > + goto out;
> > +
> > + new_stat->dev = dev;
> > + INIT_RCU_HEAD(&new_stat->rcu);
> > + spin_lock(&trace_state_lock);
> > + list_add_rcu(&new_stat->list, &hw_stats_list);
> > + spin_unlock(&trace_state_lock);
> > + break;
> > + case NETDEV_UNREGISTER:
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
> > + if (new_stat->dev == dev)
> > + new_stat->dev = NULL;
> > + found = 1;
> > + break;
> > + }
> > + rcu_read_unlock();
>
> This is racy, unless caller already owns a lock.
>
> If caller already owns a lock, you don't need:
>
> rcu_read_lock()
> list_for_each_entry_rcu()
> rcu_read_unlock();
>
> > +
> > + spin_lock(&trace_state_lock);
> > + if (found && (trace_state == TRACE_OFF)) {
> > + list_del_rcu(&new_stat->list);
> > + call_rcu(&new_stat->rcu, free_dm_hw_stat);
> > + }
> > + spin_unlock(&trace_state_lock);
> > + break;
>
>
>
>
> Its not clear that you use trace_state_lock as the lock guarding all this.
>
> If this is the case I suggest a plain and straight forward :
>
> case NETDEV_UNREGISTER:
> spin_lock(&trace_state_lock);
> if (trace_state == TRACE_OFF) {
> list_for_each_entry(new_stat, &hw_stats_list, list) {
> if (new_stat->dev == dev) {
> new_stat->dev = NULL;
> list_del_rcu(&new_stat->list);
> call_rcu(&new_stat->rcu, free_dm_hw_stat);
> break;
> }
> }
> }
> spin_unlock(&trace_state_lock);
> break;
>
>
>
I was thinking about this last night, and I agree, that I think your solution
would be better here (my solution might be racy with regards to a trace state
change resulting in leaked memory). I'll post a new patch Monday. Sorry for the
trouble.
Neil
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-16 12:40 ` Neil Horman
@ 2009-05-18 14:46 ` Neil Horman
2009-05-21 7:17 ` David Miller
0 siblings, 1 reply; 36+ messages in thread
From: Neil Horman @ 2009-05-18 14:46 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Jiri Pirko, Paul E. McKenney, netdev, davem
On Sat, May 16, 2009 at 08:40:29AM -0400, Neil Horman wrote:
>
Ok, new version of the patch (v5 I think), fixes up Erics comments regarding the
locking in the NETDEV unregister event:
Change notes:
1) Modify the NETDEV_UNREGISTER event handling so that its clear the
trace_state_lock protects the integrity of the stats list while we are
traversing (and possibly deleting) members
Regards
Neil
Patch to add the ability to detect drops in hardware interfaces via dropwatch.
Adds a tracepoint to net_rx_action to signal every time a napi instance is
polled. The dropmon code then periodically checks to see if the rx_frames
counter has changed, and if so, adds a drop notification to the netlink
protocol, using the reserved all-0's vector to indicate the drop location was in
hardware, rather than somewhere in the code.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
include/linux/net_dropmon.h | 8 ++
include/trace/napi.h | 11 +++
net/core/dev.c | 5 +
net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++++++--
net/core/net-traces.c | 4 +
net/core/netpoll.c | 2
6 files changed, 149 insertions(+), 5 deletions(-)
diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
index 0217fb8..e8a8b5c 100644
--- a/include/linux/net_dropmon.h
+++ b/include/linux/net_dropmon.h
@@ -2,12 +2,20 @@
#define __NET_DROPMON_H
#include <linux/netlink.h>
+#include <linux/types.h>
struct net_dm_drop_point {
__u8 pc[8];
__u32 count;
};
+#define is_drop_point_hw(x) do {\
+ int ____i, ____j;\
+ for (____i = 0; ____i < 8; i ____i++)\
+ ____j |= x[____i];\
+ ____j;\
+} while (0)
+
#define NET_DM_CFG_VERSION 0
#define NET_DM_CFG_ALERT_COUNT 1
#define NET_DM_CFG_ALERT_DELAY 2
diff --git a/include/trace/napi.h b/include/trace/napi.h
new file mode 100644
index 0000000..7c39eb7
--- /dev/null
+++ b/include/trace/napi.h
@@ -0,0 +1,11 @@
+#ifndef _TRACE_NAPI_H_
+#define _TRACE_NAPI_H_
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(napi_poll,
+ TP_PROTO(struct napi_struct *napi),
+ TP_ARGS(napi));
+
+#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 637ea71..165979c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -126,6 +126,7 @@
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <trace/napi.h>
#include "net-sysfs.h"
@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
* accidently calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state))
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
WARN_ON_ONCE(work > weight);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 2797b71..a6c2ac2 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -22,8 +22,10 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <net/genetlink.h>
+#include <net/netevent.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
* and the work handle that will send up
* netlink alerts
*/
-struct sock *dm_sock;
+static int trace_state = TRACE_OFF;
+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
struct per_cpu_dm_data {
struct work_struct dm_alert_work;
@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
struct timer_list send_timer;
};
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
-
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
schedule_work(&data->dm_alert_work);
}
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
@@ -159,24 +170,80 @@ out:
return;
}
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Ratelimit our check time to dm_hw_check_delta jiffies
+ */
+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if ((new_stat->dev == napi->dev) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+static void free_dm_hw_stat(struct rcu_head *head)
+{
+ struct dm_hw_stat_delta *n;
+ n = container_of(head, struct dm_hw_stat_delta, rcu);
+ kfree(n);
+}
+
static int set_all_monitor_traces(int state)
{
int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *temp;
+
+ spin_lock(&trace_state_lock);
switch (state) {
case TRACE_ON:
rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
break;
default:
rc = 1;
break;
}
+ if (!rc)
+ trace_state = state;
+
+ spin_unlock(&trace_state_lock);
+
if (rc)
return -EINPROGRESS;
return rc;
@@ -204,6 +271,44 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
return -ENOTSUPP;
}
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *tmp;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ INIT_RCU_HEAD(&new_stat->rcu);
+ spin_lock(&trace_state_lock);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ spin_unlock(&trace_state_lock);
+ break;
+ case NETDEV_UNREGISTER:
+ spin_lock(&trace_state_lock);
+ list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
+ if (new_stat->dev == dev) {
+ new_stat->dev = NULL;
+ if (trace_state == TRACE_OFF) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ break;
+ }
+ }
+ }
+ spin_unlock(&trace_state_lock);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
static struct genl_ops dropmon_ops[] = {
{
@@ -220,6 +325,10 @@ static struct genl_ops dropmon_ops[] = {
},
};
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
static int __init init_net_drop_monitor(void)
{
int cpu;
@@ -243,12 +352,18 @@ static int __init init_net_drop_monitor(void)
ret = genl_register_ops(&net_drop_monitor_family,
&dropmon_ops[i]);
if (ret) {
- printk(KERN_CRIT "failed to register operation %d\n",
+ printk(KERN_CRIT "Failed to register operation %d\n",
dropmon_ops[i].cmd);
goto out_unreg;
}
}
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
rc = 0;
for_each_present_cpu(cpu) {
@@ -259,6 +374,7 @@ static int __init init_net_drop_monitor(void)
data->send_timer.data = cpu;
data->send_timer.function = sched_send_work;
}
+
goto out;
out_unreg:
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb456..b07b25b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
@@ -27,3 +28,6 @@
DEFINE_TRACE(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+DEFINE_TRACE(napi_poll);
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b5873bd..446d3ba 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -24,6 +24,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
+#include <trace/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ trace_napi_poll(napi->dev);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
atomic_dec(&trapped);
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-18 14:46 ` Neil Horman
@ 2009-05-21 7:17 ` David Miller
2009-05-21 17:36 ` Neil Horman
0 siblings, 1 reply; 36+ messages in thread
From: David Miller @ 2009-05-21 7:17 UTC (permalink / raw)
To: nhorman; +Cc: dada1, jpirko, paulmck, netdev
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 18 May 2009 10:46:22 -0400
> @@ -0,0 +1,11 @@
> +#ifndef _TRACE_NAPI_H_
> +#define _TRACE_NAPI_H_
> +
> +#include <linux/skbuff.h>
> +#include <linux/tracepoint.h>
> +
> +DECLARE_TRACE(napi_poll,
> + TP_PROTO(struct napi_struct *napi),
> + TP_ARGS(napi));
> +
> +#endif
If skbuff.h gets you the declaration for "struct napi_struct"
it's via some indirect accident rather than it actually being
in linux/skbuff.h
Please include the correct headers, thanks.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-21 7:17 ` David Miller
@ 2009-05-21 17:36 ` Neil Horman
2009-05-21 22:15 ` David Miller
0 siblings, 1 reply; 36+ messages in thread
From: Neil Horman @ 2009-05-21 17:36 UTC (permalink / raw)
To: David Miller; +Cc: dada1, jpirko, paulmck, netdev
On Thu, May 21, 2009 at 12:17:36AM -0700, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Mon, 18 May 2009 10:46:22 -0400
>
> > @@ -0,0 +1,11 @@
> > +#ifndef _TRACE_NAPI_H_
> > +#define _TRACE_NAPI_H_
> > +
> > +#include <linux/skbuff.h>
> > +#include <linux/tracepoint.h>
> > +
> > +DECLARE_TRACE(napi_poll,
> > + TP_PROTO(struct napi_struct *napi),
> > + TP_ARGS(napi));
> > +
> > +#endif
>
> If skbuff.h gets you the declaration for "struct napi_struct"
> it's via some indirect accident rather than it actually being
> in linux/skbuff.h
>
> Please include the correct headers, thanks.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Thank you Dave, copy n' paste bit me.
Patch v6
Change notes:
1) updated header file in trace/napi.h to include netdevice.h so that I
correctly pull in struct napi_struct definition
Neil
Patch to add the ability to detect drops in hardware interfaces via dropwatch.
Adds a tracepoint to net_rx_action to signal every time a napi instance is
polled. The dropmon code then periodically checks to see if the rx_frames
counter has changed, and if so, adds a drop notification to the netlink
protocol, using the reserved all-0's vector to indicate the drop location was in
hardware, rather than somewhere in the code.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
include/linux/net_dropmon.h | 8 ++
include/trace/napi.h | 11 +++
net/core/dev.c | 5 +
net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++++++--
net/core/net-traces.c | 4 +
net/core/netpoll.c | 2
6 files changed, 149 insertions(+), 5 deletions(-)
diff --git a/include/linux/net_dropmon.h b/include/linux/net_dropmon.h
index 0217fb8..e8a8b5c 100644
--- a/include/linux/net_dropmon.h
+++ b/include/linux/net_dropmon.h
@@ -2,12 +2,20 @@
#define __NET_DROPMON_H
#include <linux/netlink.h>
+#include <linux/types.h>
struct net_dm_drop_point {
__u8 pc[8];
__u32 count;
};
+#define is_drop_point_hw(x) do {\
+ int ____i, ____j;\
+ for (____i = 0; ____i < 8; i ____i++)\
+ ____j |= x[____i];\
+ ____j;\
+} while (0)
+
#define NET_DM_CFG_VERSION 0
#define NET_DM_CFG_ALERT_COUNT 1
#define NET_DM_CFG_ALERT_DELAY 2
diff --git a/include/trace/napi.h b/include/trace/napi.h
new file mode 100644
index 0000000..a8989c4
--- /dev/null
+++ b/include/trace/napi.h
@@ -0,0 +1,11 @@
+#ifndef _TRACE_NAPI_H_
+#define _TRACE_NAPI_H_
+
+#include <linux/netdevice.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(napi_poll,
+ TP_PROTO(struct napi_struct *napi),
+ TP_ARGS(napi));
+
+#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 637ea71..165979c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -126,6 +126,7 @@
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <trace/napi.h>
#include "net-sysfs.h"
@@ -2763,8 +2764,10 @@ static void net_rx_action(struct softirq_action *h)
* accidently calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state))
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
WARN_ON_ONCE(work > weight);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 2797b71..a6c2ac2 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -22,8 +22,10 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <net/genetlink.h>
+#include <net/netevent.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
* and the work handle that will send up
* netlink alerts
*/
-struct sock *dm_sock;
+static int trace_state = TRACE_OFF;
+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
struct per_cpu_dm_data {
struct work_struct dm_alert_work;
@@ -47,6 +50,13 @@ struct per_cpu_dm_data {
struct timer_list send_timer;
};
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
@@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
-
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
@@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused)
schedule_work(&data->dm_alert_work);
}
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
@@ -159,24 +170,80 @@ out:
return;
}
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Ratelimit our check time to dm_hw_check_delta jiffies
+ */
+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if ((new_stat->dev == napi->dev) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+static void free_dm_hw_stat(struct rcu_head *head)
+{
+ struct dm_hw_stat_delta *n;
+ n = container_of(head, struct dm_hw_stat_delta, rcu);
+ kfree(n);
+}
+
static int set_all_monitor_traces(int state)
{
int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *temp;
+
+ spin_lock(&trace_state_lock);
switch (state) {
case TRACE_ON:
rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
break;
default:
rc = 1;
break;
}
+ if (!rc)
+ trace_state = state;
+
+ spin_unlock(&trace_state_lock);
+
if (rc)
return -EINPROGRESS;
return rc;
@@ -204,6 +271,44 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
return -ENOTSUPP;
}
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *tmp;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ INIT_RCU_HEAD(&new_stat->rcu);
+ spin_lock(&trace_state_lock);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ spin_unlock(&trace_state_lock);
+ break;
+ case NETDEV_UNREGISTER:
+ spin_lock(&trace_state_lock);
+ list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
+ if (new_stat->dev == dev) {
+ new_stat->dev = NULL;
+ if (trace_state == TRACE_OFF) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ break;
+ }
+ }
+ }
+ spin_unlock(&trace_state_lock);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
static struct genl_ops dropmon_ops[] = {
{
@@ -220,6 +325,10 @@ static struct genl_ops dropmon_ops[] = {
},
};
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
static int __init init_net_drop_monitor(void)
{
int cpu;
@@ -243,12 +352,18 @@ static int __init init_net_drop_monitor(void)
ret = genl_register_ops(&net_drop_monitor_family,
&dropmon_ops[i]);
if (ret) {
- printk(KERN_CRIT "failed to register operation %d\n",
+ printk(KERN_CRIT "Failed to register operation %d\n",
dropmon_ops[i].cmd);
goto out_unreg;
}
}
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
rc = 0;
for_each_present_cpu(cpu) {
@@ -259,6 +374,7 @@ static int __init init_net_drop_monitor(void)
data->send_timer.data = cpu;
data->send_timer.function = sched_send_work;
}
+
goto out;
out_unreg:
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb456..b07b25b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
@@ -27,3 +28,6 @@
DEFINE_TRACE(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+DEFINE_TRACE(napi_poll);
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b5873bd..446d3ba 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -24,6 +24,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
+#include <trace/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ trace_napi_poll(napi->dev);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
atomic_dec(&trapped);
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-21 17:36 ` Neil Horman
@ 2009-05-21 22:15 ` David Miller
2009-05-22 0:09 ` Neil Horman
0 siblings, 1 reply; 36+ messages in thread
From: David Miller @ 2009-05-21 22:15 UTC (permalink / raw)
To: nhorman; +Cc: dada1, jpirko, paulmck, netdev
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 21 May 2009 13:36:08 -0400
> Thank you Dave, copy n' paste bit me.
>
> Patch v6
Applied to net-next-2.6, thanks!
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH] dropmon: add ability to detect when hardware drops rx packets
2009-05-21 22:15 ` David Miller
@ 2009-05-22 0:09 ` Neil Horman
0 siblings, 0 replies; 36+ messages in thread
From: Neil Horman @ 2009-05-22 0:09 UTC (permalink / raw)
To: David Miller; +Cc: dada1, jpirko, paulmck, netdev
On Thu, May 21, 2009 at 03:15:03PM -0700, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Thu, 21 May 2009 13:36:08 -0400
>
> > Thank you Dave, copy n' paste bit me.
> >
> > Patch v6
>
> Applied to net-next-2.6, thanks!
>
Thanks Dave. I'll try get it right the first try next time around.
Ok, maybe the third try :)
Neil
^ permalink raw reply [flat|nested] 36+ messages in thread
end of thread, other threads:[~2009-05-22 0:09 UTC | newest]
Thread overview: 36+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-05-08 19:50 [PATCH] dropmon: add ability to detect when hardware drops rx packets Neil Horman
2009-05-09 6:30 ` Eric Dumazet
2009-05-09 18:07 ` Neil Horman
2009-05-12 16:30 ` Neil Horman
2009-05-13 18:23 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Paul E. McKenney
2009-05-14 0:45 ` Neil Horman
2009-05-14 1:03 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Paul E. McKenney
2009-05-14 12:33 ` Neil Horman
2009-05-14 12:44 ` Jiri Pirko
2009-05-14 16:17 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Paul E. McKenney
2009-05-14 17:29 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Neil Horman
2009-05-15 5:49 ` Jarek Poplawski
2009-05-15 11:01 ` Neil Horman
2009-05-15 11:12 ` Jarek Poplawski
2009-05-15 11:15 ` Neil Horman
2009-05-15 11:40 ` Jarek Poplawski
2009-05-16 0:07 ` Paul E. McKenney
2009-05-15 6:51 ` Jiri Pirko
2009-05-15 7:37 ` Eric Dumazet
2009-05-15 11:12 ` Neil Horman
2009-05-15 10:59 ` Neil Horman
2009-05-15 11:27 ` Jiri Pirko
2009-05-15 16:07 ` Neil Horman
2009-05-15 18:11 ` Eric Dumazet
2009-05-15 18:23 ` Stephen Hemminger
2009-05-15 19:53 ` Neil Horman
2009-05-15 19:23 ` Neil Horman
2009-05-16 12:40 ` Neil Horman
2009-05-18 14:46 ` Neil Horman
2009-05-21 7:17 ` David Miller
2009-05-21 17:36 ` Neil Horman
2009-05-21 22:15 ` David Miller
2009-05-22 0:09 ` Neil Horman
2009-05-15 18:18 ` Stephen Hemminger
2009-05-15 19:12 ` Neil Horman
2009-05-14 16:18 ` [PATCH] dropmon: add ability to detect when hardware drops rx packets Paul E. McKenney
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).