* [PATCH v4 1/1] rps: core implementation
@ 2009-11-20 23:28 Tom Herbert
From: Tom Herbert @ 2009-11-20 23:28 UTC (permalink / raw)
To: David Miller, Linux Netdev List
Version 4 of RPS patch. I think this addresses most of the comments
on the previous versions. Thanks for all the insightful comments and
patience!
Changes from previous version:
- Added rxhash to kernel-doc for struct sk_buff
- Removed /** on comments not for kernel-doc
- Change get_token declaration to kernel style
- Added struct dev_rps_maps. Each netdevice now has a pointer to this
structure, which contains the array of per-NAPI rps maps and the number
of those maps; RCU is used to protect the pointer
- In store_rps_cpus a new map set is allocated each call.
- Removed dev_isalive check and other locks since the rps structures in
the netdevice are protected by RCU
- Removed NET_RPS_SOFTIRQ and call net_rps_action from net_rx_action instead
- Queue to remote backlog queues only in NAPI case. This means
rps_remote_softirq_cpus does not need to be accessed with interrupts
disabled and __smp_call_function_single will not be called with
interrupts disabled
- Limit the number of entries in an rps map to min(256, num_possible_cpus())
- Removed unnecessary irq_local_disable
- Renamed skb_tx_hashrnd to just hashrnd and use that for the rps hash as well
- Make index u16 in index=skb_get_rx_queue() and don't check index<0 now
- Replace spin_lock_irqsave with simpler spin_lock_irq in process_backlog
Signed-off-by: Tom Herbert <therbert@google.com>
---
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97873e3..9b84a38 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -676,6 +676,29 @@ struct net_device_ops {
};
/*
+ * Structure for Receive Packet Steering. Length of map and array of CPU ID's.
+ */
+struct rps_map {
+ int len;
+ u16 map[0];
+};
+
+/*
+ * Structure that contains the rps maps for various NAPI instances of a device.
+ */
+struct dev_rps_maps {
+ int num_maps;
+ struct rcu_head rcu;
+ struct rps_map maps[0];
+};
+
+/* Bound number of CPUs that can be in an rps map */
+#define MAX_RPS_CPUS (num_possible_cpus() < 256 ? num_possible_cpus() : 256)
+
+/* Maximum size of RPS map (for allocation) */
+#define RPS_MAP_SIZE (sizeof(struct rps_map) + (MAX_RPS_CPUS * sizeof(u16)))
+
+/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
@@ -861,6 +884,9 @@ struct net_device {
struct netdev_queue rx_queue;
+ struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
+ receive packet steering */
+
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
@@ -1276,6 +1302,9 @@ struct softnet_data {
struct Qdisc *output_queue;
struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
+
+ struct call_single_data csd;
+
struct sk_buff *completion_queue;
struct napi_struct backlog;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 63f4742..f188301 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
* @mac_header: Link layer header
* @_skb_dst: destination entry
* @sp: the security path, used for xfrm
+ * @rxhash: the packet hash computed on receive
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @len: Length of actual data
* @data_len: Data length
@@ -323,6 +324,8 @@ struct sk_buff {
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
+ __u32 rxhash;
+
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
diff --git a/net/core/dev.c b/net/core/dev.c
index 9977288..f2c199c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1834,7 +1834,7 @@ out_kfree_skb:
return rc;
}
-static u32 skb_tx_hashrnd;
+static u32 hashrnd;
u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
@@ -1852,7 +1852,7 @@ u16 skb_tx_hash(const struct net_device *dev,
const struct sk_buff *skb)
else
hash = skb->protocol;
- hash = jhash_1word(hash, skb_tx_hashrnd);
+ hash = jhash_1word(hash, hashrnd);
return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
@@ -2073,6 +2073,146 @@ int weight_p __read_mostly = 64; /*
old backlog weight */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving NAPI instance for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2, ports;
+ struct ipv6hdr *ip6;
+ struct iphdr *ip;
+ u32 hash, ihl;
+ u8 ip_proto;
+ int cpu = -1;
+ struct dev_rps_maps *drmap;
+ struct rps_map *map = NULL;
+ u16 index;
+
+ rcu_read_lock();
+
+ drmap = rcu_dereference(dev->dev_rps_maps);
+ if (!drmap)
+ goto done;
+
+ index = skb_get_rx_queue(skb);
+ if (index >= drmap->num_maps)
+ index = 0;
+
+ map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ if (!map->len)
+ goto done;
+
+ hash = skb->rxhash;
+ if (hash)
+ goto got_hash; /* Skip hash computation on packet header */
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(*ip)))
+ goto done;
+
+ ip = (struct iphdr *) skb->data;
+ ip_proto = ip->protocol;
+ addr1 = ip->saddr;
+ addr2 = ip->daddr;
+ ihl = ip->ihl;
+ break;
+ case __constant_htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(*ip6)))
+ goto done;
+
+ ip6 = (struct ipv6hdr *) skb->data;
+ ip_proto = ip6->nexthdr;
+ addr1 = ip6->saddr.s6_addr32[3];
+ addr2 = ip6->daddr.s6_addr32[3];
+ ihl = (40 >> 2);
+ break;
+ default:
+ goto done;
+ }
+ ports = 0;
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ if (pskb_may_pull(skb, (ihl * 4) + 4))
+ ports = *((u32 *) (skb->data + (ihl * 4)));
+ break;
+
+ default:
+ break;
+ }
+
+ hash = jhash_3words(addr1, addr2, ports, hashrnd);
+
+got_hash:
+ cpu = map->map[((u64) hash * map->len) >> 32];
+ if (!cpu_online(cpu))
+ cpu = -1;
+done:
+ rcu_read_unlock();
+ return cpu;
+}
+
+static DEFINE_PER_CPU(cpumask_t, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+ struct softnet_data *queue = data;
+ __napi_schedule(&queue->backlog);
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+ struct softnet_data *queue;
+ unsigned long flags;
+
+ queue = &per_cpu(softnet_data, cpu);
+
+ local_irq_save(flags);
+ __get_cpu_var(netdev_rx_stat).total++;
+
+ spin_lock(&queue->input_pkt_queue.lock);
+ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+ if (queue->input_pkt_queue.qlen) {
+enqueue:
+ __skb_queue_tail(&queue->input_pkt_queue, skb);
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+ flags);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Schedule NAPI for backlog device */
+ if (napi_schedule_prep(&queue->backlog)) {
+ if (cpu != smp_processor_id()) {
+ cpu_set(cpu,
+ get_cpu_var(rps_remote_softirq_cpus));
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ } else
+ __napi_schedule(&queue->backlog);
+ }
+ goto enqueue;
+ }
+
+ spin_unlock(&queue->input_pkt_queue.lock);
+
+ __get_cpu_var(netdev_rx_stat).dropped++;
+ local_irq_restore(flags);
+
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
/**
* netif_rx - post buffer to the network code
@@ -2091,9 +2231,6 @@ DEFINE_PER_CPU(struct netif_rx_stats,
netdev_rx_stat) = { 0, };
int netif_rx(struct sk_buff *skb)
{
- struct softnet_data *queue;
- unsigned long flags;
-
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
return NET_RX_DROP;
@@ -2101,31 +2238,8 @@ int netif_rx(struct sk_buff *skb)
if (!skb->tstamp.tv64)
net_timestamp(skb);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
- local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
-
- __get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
-enqueue:
- __skb_queue_tail(&queue->input_pkt_queue, skb);
- local_irq_restore(flags);
- return NET_RX_SUCCESS;
- }
-
- napi_schedule(&queue->backlog);
- goto enqueue;
- }
-
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
-
- kfree_skb(skb);
- return NET_RX_DROP;
+ /* Always send to local backlog, no RPS for non NAPI */
+ return enqueue_to_backlog(skb, smp_processor_id());
}
EXPORT_SYMBOL(netif_rx);
@@ -2363,10 +2477,10 @@ void netif_nit_deliver(struct sk_buff *skb)
}
/**
- * netif_receive_skb - process receive buffer from network
+ * __netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
- * netif_receive_skb() is the main receive data processing function.
+ * __netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
@@ -2377,7 +2491,8 @@ void netif_nit_deliver(struct sk_buff *skb)
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb(struct sk_buff *skb)
+
+int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
@@ -2475,6 +2590,16 @@ out:
}
EXPORT_SYMBOL(netif_receive_skb);
+int netif_receive_skb(struct sk_buff *skb)
+{
+ int cpu = get_rps_cpu(skb->dev, skb);
+
+ if (cpu < 0)
+ return __netif_receive_skb(skb);
+ else
+ return enqueue_to_backlog(skb, cpu);
+}
+
/* Network device is going away, flush any packets still pending */
static void flush_backlog(void *arg)
{
@@ -2518,7 +2643,7 @@ static int napi_gro_complete(struct sk_buff *skb)
}
out:
- return netif_receive_skb(skb);
+ return __netif_receive_skb(skb);
}
void napi_gro_flush(struct napi_struct *napi)
@@ -2650,7 +2775,7 @@ gro_result_t napi_skb_finish(gro_result_t ret,
struct sk_buff *skb)
{
switch (ret) {
case GRO_NORMAL:
- if (netif_receive_skb(skb))
+ if (__netif_receive_skb(skb))
ret = GRO_DROP;
break;
@@ -2724,7 +2849,7 @@ gro_result_t napi_frags_finish(struct
napi_struct *napi, struct sk_buff *skb,
if (ret == GRO_HELD)
skb_gro_pull(skb, -ETH_HLEN);
- else if (netif_receive_skb(skb))
+ else if (__netif_receive_skb(skb))
ret = GRO_DROP;
break;
@@ -2799,16 +2924,16 @@ static int process_backlog(struct napi_struct
*napi, int quota)
do {
struct sk_buff *skb;
- local_irq_disable();
+ spin_lock_irq(&queue->input_pkt_queue.lock);
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
__napi_complete(napi);
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
break;
}
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
- netif_receive_skb(skb);
+ __netif_receive_skb(skb);
} while (++work < quota && jiffies == start_time);
return work;
@@ -2897,6 +3022,21 @@ void netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(netif_napi_del);
+/*
+ * net_rps_action sends any pending IPI's for rps. This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(void)
+{
+ int cpu;
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+ struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+ __smp_call_function_single(cpu, &queue->csd, 0);
+ }
+ cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
+}
static void net_rx_action(struct softirq_action *h)
{
@@ -2968,6 +3108,8 @@ static void net_rx_action(struct softirq_action *h)
out:
local_irq_enable();
+ net_rps_action();
+
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
@@ -5341,6 +5483,8 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
+ kfree(dev->dev_rps_maps);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
@@ -5793,6 +5937,10 @@ static int __init net_dev_init(void)
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
+ queue->csd.func = trigger_softirq;
+ queue->csd.info = queue;
+ queue->csd.flags = 0;
+
queue->backlog.poll = process_backlog;
queue->backlog.weight = weight_p;
queue->backlog.gro_list = NULL;
@@ -5831,7 +5979,7 @@ subsys_initcall(net_dev_init);
static int __init initialize_hashrnd(void)
{
- get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+ get_random_bytes(&hashrnd, sizeof(hashrnd));
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 157645c..be78dfb 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -18,6 +18,9 @@
#include <linux/wireless.h>
#include <net/wext.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+
#include "net-sysfs.h"
#ifdef CONFIG_SYSFS
@@ -253,6 +256,132 @@ static ssize_t store_tx_queue_len(struct device *dev,
return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
+static char *get_token(const char **cp, size_t *len)
+{
+ const char *bp = *cp;
+ char *start;
+
+ while (isspace(*bp))
+ bp++;
+
+ start = (char *)bp;
+ while (!isspace(*bp) && *bp != '\0')
+ bp++;
+
+ if (start != bp)
+ *len = bp - start;
+ else
+ start = NULL;
+
+ *cp = bp;
+ return start;
+}
+
+static void dev_map_release(struct rcu_head *rcu)
+{
+ struct dev_rps_maps *drmap =
+ container_of(rcu, struct dev_rps_maps, rcu);
+
+ kfree(drmap);
+}
+
+static ssize_t store_rps_cpus(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct net_device *net = to_net_dev(dev);
+ struct napi_struct *napi;
+ cpumask_t mask;
+ int err, cpu, index, i;
+ int cnt = 0;
+ char *token;
+ const char *cp = buf;
+ size_t tlen;
+ struct dev_rps_maps *drmap, *old_drmap;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ cnt = 0;
+ list_for_each_entry(napi, &net->napi_list, dev_list)
+ cnt++;
+
+ drmap = kzalloc(sizeof (struct dev_rps_maps) +
+ RPS_MAP_SIZE * cnt, GFP_KERNEL);
+ if (!drmap)
+ return -ENOMEM;
+
+ drmap->num_maps = cnt;
+
+ cp = buf;
+ for (index = 0; index < cnt &&
+ (token = get_token(&cp, &tlen)); index++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ err = bitmap_parse(token, tlen, cpumask_bits(&mask),
+ nr_cpumask_bits);
+
+ if (err) {
+ kfree(drmap);
+ return err;
+ }
+
+ cpus_and(mask, mask, cpu_online_map);
+ i = 0;
+ for_each_cpu_mask(cpu, mask) {
+ if (i >= MAX_RPS_CPUS)
+ break;
+ map->map[i++] = cpu;
+ }
+ map->len = i;
+ }
+
+ rcu_read_lock_bh();
+ old_drmap = rcu_dereference(net->dev_rps_maps);
+ rcu_assign_pointer(net->dev_rps_maps, drmap);
+ rcu_read_unlock_bh();
+
+ if (old_drmap)
+ call_rcu(&old_drmap->rcu, dev_map_release);
+
+ return len;
+}
+
+static ssize_t show_rps_cpus(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *net = to_net_dev(dev);
+ size_t len = 0;
+ cpumask_t mask;
+ int i, j;
+ struct dev_rps_maps *drmap;
+
+ rcu_read_lock_bh();
+ drmap = rcu_dereference(net->dev_rps_maps);
+
+ if (drmap) {
+ for (j = 0; j < drmap->num_maps; j++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * j));
+ cpus_clear(mask);
+ for (i = 0; i < map->len; i++)
+ cpu_set(map->map[i], mask);
+
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
+ if (PAGE_SIZE - len < 3) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ if (j < drmap->num_maps)
+ len += sprintf(buf + len, " ");
+ }
+ }
+
+ rcu_read_unlock_bh();
+
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -309,6 +438,7 @@ static struct device_attribute net_class_attributes[] = {
__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
store_tx_queue_len),
+ __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_cpus, store_rps_cpus),
{}
};
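
A side note on the map lookup in get_rps_cpu() above: the CPU is picked
with map->map[((u64) hash * map->len) >> 32], the same multiply-and-shift
scaling that skb_tx_hash() already uses to avoid a modulo. A minimal
user-space sketch of just that index calculation (illustration only, not
part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	/* Scale a 32-bit hash into [0, len): conceptually
	 * (hash / 2^32) * len, computed in fixed point. */
	static unsigned int pick_index(uint32_t hash, unsigned int len)
	{
		return (unsigned int)(((uint64_t)hash * len) >> 32);
	}

	int main(void)
	{
		/* with a 4-entry rps map, hashes spread over indices 0..3 */
		printf("%u %u %u\n",
		       pick_index(0x00000000u, 4),
		       pick_index(0x80000000u, 4),
		       pick_index(0xffffffffu, 4));	/* prints: 0 2 3 */
		return 0;
	}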
* Re: [PATCH v4 1/1] rps: core implementation
From: David Miller @ 2009-11-20 23:39 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Fri, 20 Nov 2009 15:28:58 -0800
> + /* Schedule NAPI for backlog device */
> + if (napi_schedule_prep(&queue->backlog)) {
> + if (cpu != smp_processor_id()) {
> + cpu_set(cpu,
> + get_cpu_var(rps_remote_softirq_cpus));
> + __raise_softirq_irqoff(NET_RX_SOFTIRQ);
> + } else
> + __napi_schedule(&queue->backlog);
> + }
> + goto enqueue;
> + }
I still think, like Jared, this should occur at the end of the NAPI
->poll() run.
Otherwise show us numbers indicating that it makes a big difference.
:-)
* Re: [PATCH v4 1/1] rps: core implementation
From: Stephen Hemminger @ 2009-11-20 23:40 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List
On Fri, 20 Nov 2009 15:28:58 -0800
Tom Herbert <therbert@google.com> wrote:
> +static char *get_token(const char **cp, size_t *len)
> +{
> + const char *bp = *cp;
> + char *start;
> +
> + while (isspace(*bp))
> + bp++;
> +
> + start = (char *)bp;
> + while (!isspace(*bp) && *bp != '\0')
> + bp++;
> +
> + if (start != bp)
> + *len = bp - start;
> + else
> + start = NULL;
> +
> + *cp = bp;
> + return start;
> +}
Sysfs is intentionally one value per file. If you need multiple
values, then it is the wrong interface.
--
* Re: [PATCH v4 1/1] rps: core implementation
From: Stephen Hemminger @ 2009-11-20 23:42 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List
On Fri, 20 Nov 2009 15:28:58 -0800
Tom Herbert <therbert@google.com> wrote:
> @@ -861,6 +884,9 @@ struct net_device {
>
> struct netdev_queue rx_queue;
>
> + struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
> + receive packet steering */
> +
How does this work for devices with:
* multiqueue - one device has multiple NAPI instances
* mulitport - one NAPI shared by multiple devices
--
* Re: [PATCH v4 1/1] rps: core implementation
From: Tom Herbert @ 2009-11-20 23:50 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Fri, Nov 20, 2009 at 3:39 PM, David Miller <davem@davemloft.net> wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Fri, 20 Nov 2009 15:28:58 -0800
>
>> + /* Schedule NAPI for backlog device */
>> + if (napi_schedule_prep(&queue->backlog)) {
>> + if (cpu != smp_processor_id()) {
>> + cpu_set(cpu,
>> + get_cpu_var(rps_remote_softirq_cpus));
>> + __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>> + } else
>> + __napi_schedule(&queue->backlog);
>> + }
>> + goto enqueue;
>> + }
>
> I still think, like Jared, this should occur at the end of the NAPI
> ->poll() run.
>
We only set the bit in remote_softirq_cpus in here. The actual IPIs
are sent at the end net_rx_action. I'm not exactly sure what you're
thinking on this...
> Otherwise show us numbers indicating that it makes a big difference.
> :-)
>
* Re: [PATCH v4 1/1] rps: core implementation
From: Tom Herbert @ 2009-11-20 23:53 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, Linux Netdev List
On Fri, Nov 20, 2009 at 3:40 PM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> On Fri, 20 Nov 2009 15:28:58 -0800
> Tom Herbert <therbert@google.com> wrote:
>
>> +static char *get_token(const char **cp, size_t *len)
>> +{
>> + const char *bp = *cp;
>> + char *start;
>> +
>> + while (isspace(*bp))
>> + bp++;
>> +
>> + start = (char *)bp;
>> + while (!isspace(*bp) && *bp != '\0')
>> + bp++;
>> +
>> + if (start != bp)
>> + *len = bp - start;
>> + else
>> + start = NULL;
>> +
>> + *cp = bp;
>> + return start;
>> +}
>
>
> Sysfs is intentionally one value per file. If you need multiple
> values, then it is the wrong interface.
>
What would be a better interface?
> --
>
* Re: [PATCH v4 1/1] rps: core implementation
From: David Miller @ 2009-11-20 23:56 UTC (permalink / raw)
To: shemminger; +Cc: therbert, netdev
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 20 Nov 2009 15:40:46 -0800
> Sysfs is intentionally one value per file. If you need multiple
> values, then it is the wrong interface.
This is probably right.
* Re: [PATCH v4 1/1] rps: core implementation
From: Tom Herbert @ 2009-11-21 0:04 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, Linux Netdev List
On Fri, Nov 20, 2009 at 3:42 PM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> On Fri, 20 Nov 2009 15:28:58 -0800
> Tom Herbert <therbert@google.com> wrote:
>
>> @@ -861,6 +884,9 @@ struct net_device {
>>
>> struct netdev_queue rx_queue;
>>
>> + struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
>> + receive packet steering */
>> +
>
> How does this work for devices with:
> * multiqueue - one device has multiple NAPI instances
Each NAPI instance has its own map (dev_rps_maps hold the array of these maps)
> * mulitport - one NAPI shared by multiple devices
>
I have not tested that, so I'm not sure. But, I believe since the per
NAPI maps are kept in the netdevice (not in napi structure) this would
mean that each of those devices has its own per NAPI rps map(s).
Tom
* Re: [PATCH v4 1/1] rps: core implementation
From: David Miller @ 2009-11-21 0:05 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Fri, 20 Nov 2009 15:50:03 -0800
> We only set the bit in remote_softirq_cpus in here. The actual IPIs
> are sent at the end net_rx_action. I'm not exactly sure what you're
> thinking on this...
You're scheduling a softirq so you can schedule remote softirqs.
Just do that "for each rps cpu" loop at the end of NAPI ->poll()
processing. In fact that seems to be what you're doing :-)
I guess my confusion is from the:
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
you are doing as you set the cpus in rps_remote_softirq_cpus.
Why do you need to schedule the local RX softirq, when we know we're
in a NAPI poll loop and thus that we're in a softirq, and thus that we
will fire off the IPIs at the end of net_rx_action()?
That's what you're doing, the softirq raising just seems superfluous.
* Re: [PATCH v4 1/1] rps: core implementation
From: Tom Herbert @ 2009-11-21 0:12 UTC (permalink / raw)
To: David Miller; +Cc: netdev
> I guess my confusion is from the:
>
> __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>
> you are doing as you set the cpus in rps_remote_softirq_cpus.
>
> Why do you need to schedule the local RX softirq, when we know we're
> in a NAPI poll loop and thus that we're in a softirq, and thus that we
> will fire off the IPIs at the end of net_rx_action()?
>
> That's what you're doing, the softirq raising just seems superfluous.
>
Ah, right. If RPS can't be used in the non-NAPI case, this line is now
superfluous. It can be removed.
Thanks
* Re: [PATCH v4 1/1] rps: core implementation
From: Jarek Poplawski @ 2009-11-21 0:40 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, netdev
Tom Herbert wrote, On 11/21/2009 01:12 AM:
>> I guess my confusion is from the:
>>
>> __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>>
>> you are doing as you set the cpus in rps_remote_softirq_cpus.
>>
>> Why do you need to schedule the local RX softirq, when we know we're
>> in a NAPI poll loop and thus that we're in a softirq, and thus that we
>> will fire off the IPIs at the end of net_rx_action()?
>>
>> That's what you're doing, the softirq raising just seems superfluous.
>>
>
> Ah, right. If RPS can't be used in the non-NAPI case, this line is now
> superfluous. It can be removed.
Hmm... mostly right, at least with respect to my previous opinion. I meant
only that the non-NAPI case would be handled less optimally without this
dedicated softirq, but __raise_softirq_irqoff(NET_RX_SOFTIRQ) or
napi_schedule(&queue->backlog) could be called from netif_rx().
Jarek P.
* Re: [PATCH v4 1/1] rps: core implementation
From: Eric Dumazet @ 2009-11-21 8:07 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List, Andi Kleen
Tom Herbert wrote:
> Version 4 of RPS patch. I think this addresses most of the comments
> on the previous versions. Thanks for all the insightful comments and
> patience!
>
> Changes from previous version:
>
> - Added rxhash to kernel-doc for struct sk_buff
> - Removed /** on comments not for kernel-doc
> - Change get_token declaration to kernel style
> - Added struct dev_rps_maps. Each netdevice now has a pointer to this
> structure, which contains the array of per-NAPI rps maps and the number
> of those maps; RCU is used to protect the pointer
> - In store_rps_cpus a new map set is allocated each call.
> - Removed dev_isalive check and other locks since the rps structures in
> the netdevice are protected by RCU
> - Removed NET_RPS_SOFTIRQ and call net_rps_action from net_rx_action instead
> - Queue to remote backlog queues only in NAPI case. This means
> rps_remote_softirq_cpus does not need to be accessed with interrupts
> disabled and __smp_call_function_single will not be called with
> interrupts disabled
> - Limit the number of entries in an rps map to min(256, num_possible_cpus())
> - Removed unnecessary irq_local_disable
> - Renamed skb_tx_hashrnd to just hashrnd and use that for the rps hash as well
> - Make index u16 in index=skb_get_rx_queue() and don't check index<0 now
> - Replace spin_lock_irqsave with simpler spin_lock_irq in process_backlog
Excellent !
> * The DEVICE structure.
> * Actually, this whole structure is a big mistake. It mixes I/O
> * data with strictly "high-level" data, and it has to know about
> @@ -861,6 +884,9 @@ struct net_device {
>
> struct netdev_queue rx_queue;
>
> + struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
> + receive packet steering */
> +
> struct netdev_queue *_tx ____cacheline_aligned_in_smp;
>
> /* Number of TX queues allocated at alloc_netdev_mq() time */
> @@ -1276,6 +1302,9 @@ struct softnet_data {
> struct Qdisc *output_queue;
> struct sk_buff_head input_pkt_queue;
> struct list_head poll_list;
> +
> + struct call_single_data csd;
This is problematic. softnet_data used to be only used by the local cpu.
With RPS, other cpus are going to access csd, input_pkt_queue and backlog,
and dirty cache lines.
Maybe we should split softnet_data in two cache lines, one private,
one shared, and
DEFINE_PER_CPU(struct softnet_data, softnet_data);
->
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
Also you need to change comment at start of struct softnet_data,
since it is wrong after RPS.
/*
* Incoming packets are placed on per-cpu queues so that
* no locking is needed.
*/
> +
> struct sk_buff *completion_queue;
>
> struct napi_struct backlog;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 63f4742..f188301 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
> * @mac_header: Link layer header
> * @_skb_dst: destination entry
> * @sp: the security path, used for xfrm
> + * @rxhash: the packet hash computed on receive
> * @cb: Control buffer. Free for use by every layer. Put private vars here
> * @len: Length of actual data
> * @data_len: Data length
> @@ -323,6 +324,8 @@ struct sk_buff {
> #ifdef CONFIG_XFRM
> struct sec_path *sp;
> #endif
> + __u32 rxhash;
> +
> /*
> * This is the control buffer. It is free to use for every
> * layer. Please put your private variables there. If you
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 9977288..f2c199c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1834,7 +1834,7 @@ out_kfree_skb:
> return rc;
> }
>
> -static u32 skb_tx_hashrnd;
> +static u32 hashrnd;
adding a __read_mostly here could help :)
>
> u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
> {
> @@ -1852,7 +1852,7 @@ u16 skb_tx_hash(const struct net_device *dev,
> const struct sk_buff *skb)
> else
> hash = skb->protocol;
>
> - hash = jhash_1word(hash, skb_tx_hashrnd);
> + hash = jhash_1word(hash, hashrnd);
>
> return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
> }
> @@ -2073,6 +2073,146 @@ int weight_p __read_mostly = 64; /*
> old backlog weight */
>
> DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
>
> +/*
> + * get_rps_cpu is called from netif_receive_skb and returns the target
> + * CPU from the RPS map of the receiving NAPI instance for a given skb.
> + */
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> + u32 addr1, addr2, ports;
> + struct ipv6hdr *ip6;
> + struct iphdr *ip;
> + u32 hash, ihl;
> + u8 ip_proto;
> + int cpu = -1;
> + struct dev_rps_maps *drmap;
> + struct rps_map *map = NULL;
> + u16 index;
> +
> + rcu_read_lock();
If called from netif_receive_skb(), we are already in an rcu protected section,
but this could be a cleanup patch, because many other parts of the stack could
be changed as well.
> +/*
> + * enqueue_to_backlog is called to queue an skb to a per CPU backlog
> + * queue (may be a remote CPU queue).
> + */
> +static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
> +{
> + struct softnet_data *queue;
> + unsigned long flags;
> +
> + queue = &per_cpu(softnet_data, cpu);
> +
> + local_irq_save(flags);
> + __get_cpu_var(netdev_rx_stat).total++;
> +
> + spin_lock(&queue->input_pkt_queue.lock);
could reduce to :
percpu_add(netdev_rx_stat.total, 1);
spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
> + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
> + if (queue->input_pkt_queue.qlen) {
> +enqueue:
> + __skb_queue_tail(&queue->input_pkt_queue, skb);
> + spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
> + flags);
> + return NET_RX_SUCCESS;
> + }
> +
> + /* Schedule NAPI for backlog device */
> + if (napi_schedule_prep(&queue->backlog)) {
Is it legal (or wanted at all) to call napi_schedule_prep() on a remote cpu backlog?
> + if (cpu != smp_processor_id()) {
> + cpu_set(cpu,
> + get_cpu_var(rps_remote_softirq_cpus));
> + __raise_softirq_irqoff(NET_RX_SOFTIRQ);
> + } else
> + __napi_schedule(&queue->backlog);
Why not the more easy :
if (cpu != smp_processor_id())
cpu_set(cpu, get_cpu_var(rps_remote_softirq_cpus));
else
napi_schedule(&queue->backlog);
This way we don't touch the remote backlog (and this backlog could stay in the private
cache line of the remote cpu)
> + }
> + goto enqueue;
> + }
> +
> + spin_unlock(&queue->input_pkt_queue.lock);
> +
> + __get_cpu_var(netdev_rx_stat).dropped++;
> + local_irq_restore(flags);
->
spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);
percpu_add(netdev_rx_stat.dropped, 1);
+/*
+ * net_rps_action sends any pending IPI's for rps. This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(void)
+{
+ int cpu;
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+ struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+ __smp_call_function_single(cpu, &queue->csd, 0);
+ }
+ cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
+}
net_rps_action() might not be a very descriptive name, and it is a bit expensive...
Did you try smp_call_function_many()?
(I suspect you did, but found it was not that optimized?)
CC Andi to get feedback from him :)
static void net_rps_remcpus_fire(void)
{
smp_call_function_many(__get_cpu_var(rps_remote_softirq_cpus),
trigger_softirq, NULL, 0);
}
Of course you would have to use following code as well :
(eg ignore void *data argument)
static void trigger_softirq(void *data)
{
/* kind of :
* __napi_schedule(__get_cpu_var(softnet_data).backlog);
* without local_irq_save(flags);/local_irq_restore(flags);
*/
list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
Thanks Herbert
* Re: [PATCH v4 1/1] rps: core implementation
From: Tom Herbert @ 2009-11-21 9:03 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, Linux Netdev List, Andi Kleen
>> * The DEVICE structure.
>> * Actually, this whole structure is a big mistake. It mixes I/O
>> * data with strictly "high-level" data, and it has to know about
>> @@ -861,6 +884,9 @@ struct net_device {
>>
>> struct netdev_queue rx_queue;
>>
>> + struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
>> + receive packet steering */
>> +
>> struct netdev_queue *_tx ____cacheline_aligned_in_smp;
>>
>> /* Number of TX queues allocated at alloc_netdev_mq() time */
>> @@ -1276,6 +1302,9 @@ struct softnet_data {
>> struct Qdisc *output_queue;
>> struct sk_buff_head input_pkt_queue;
>> struct list_head poll_list;
>> +
>> + struct call_single_data csd;
>
> This is problematic. softnet_data used to be only used by local cpu.
> With RPS, other cpus are going to access csd, input_pkt_queue, backlog
> and dirty cache lines.
>
> Maybe we should split sofnet_data in two cache lines, one private,
> one chared, and
> DEFINE_PER_CPU(struct softnet_data, softnet_data);
> ->
> DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
>
That would make sense.
> Also you need to change comment at start of struct softnet_data,
> since it is wrong after RPS.
>
> /*
> * Incoming packets are placed on per-cpu queues so that
> * no locking is needed.
> */
>
Okay.
>> -static u32 skb_tx_hashrnd;
>> +static u32 hashrnd;
> adding a __read_mostly here could help :)
>
Yep.
>>
>> u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
>> {
>> @@ -1852,7 +1852,7 @@ u16 skb_tx_hash(const struct net_device *dev,
>> const struct sk_buff *skb)
>> else
>> hash = skb->protocol;
>>
>> - hash = jhash_1word(hash, skb_tx_hashrnd);
>> + hash = jhash_1word(hash, hashrnd);
>>
>> return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
>> }
>> @@ -2073,6 +2073,146 @@ int weight_p __read_mostly = 64; /*
>> old backlog weight */
>>
>> DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
>>
>> +/*
>> + * get_rps_cpu is called from netif_receive_skb and returns the target
>> + * CPU from the RPS map of the receiving NAPI instance for a given skb.
>> + */
>> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
>> +{
>> + u32 addr1, addr2, ports;
>> + struct ipv6hdr *ip6;
>> + struct iphdr *ip;
>> + u32 hash, ihl;
>> + u8 ip_proto;
>> + int cpu = -1;
>> + struct dev_rps_maps *drmap;
>> + struct rps_map *map = NULL;
>> + u16 index;
>> +
>
>> + rcu_read_lock();
> If called from netif_receive_skb(), we already are in a rcu protected section,
> but this could be a cleanup patch, because many other parts in stack could
> be changed as well.
>
So convention would be to not call these then?
>
>> +/*
>> + * enqueue_to_backlog is called to queue an skb to a per CPU backlog
>> + * queue (may be a remote CPU queue).
>> + */
>> +static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
>> +{
>> + struct softnet_data *queue;
>> + unsigned long flags;
>> +
>> + queue = &per_cpu(softnet_data, cpu);
>> +
>
>
>> + local_irq_save(flags);
>> + __get_cpu_var(netdev_rx_stat).total++;
>> +
>> + spin_lock(&queue->input_pkt_queue.lock);
>
> could reduce to :
> percpu_add(netdev_rx_stat.total, 1);
> spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
>
Would it make sense to use percpu_add in dev.c just for this when other
parts of dev.c would still use __get_cpu_var(stat)++? Also, I think
this results in more instructions...
>> + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
>> + if (queue->input_pkt_queue.qlen) {
>> +enqueue:
>> + __skb_queue_tail(&queue->input_pkt_queue, skb);
>> + spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
>> + flags);
>> + return NET_RX_SUCCESS;
>> + }
>> +
>> + /* Schedule NAPI for backlog device */
>
>> + if (napi_schedule_prep(&queue->backlog)) {
>
> Is it legal (or wanted at all) to call napi_schedule_prep() on a remote cpu backlog?
>
I think it is useful. If the remote backlog is scheduled then there
is no need to do an IPI, so this means there would only ever be one IPI
pending for a backlog queue and hence we can get away with only having
one call_function_data per backlog. Without the napi schedule, we
would probably need a call structure for each backlog per CPU-- and
also we'd still need some shared bits between source and target CPUs
anyway for when the IPI is completed (want to avoid blocking on
call_function_data).
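
To make that concrete, here is a rough model of the gate (a sketch only;
napi_schedule_prep() is essentially a test-and-set of the NAPI_STATE_SCHED
bit, so it is written out that way here rather than using the exact
mainline definition):

	/* Sketch: the first enqueuer that finds the remote backlog idle
	 * wins the test-and-set and marks that CPU for an IPI; later
	 * enqueuers see the bit already set and only append the skb to
	 * input_pkt_queue, so no further IPI is needed. */
	if (!test_and_set_bit(NAPI_STATE_SCHED, &queue->backlog.state)) {
		if (cpu != smp_processor_id())
			cpu_set(cpu, get_cpu_var(rps_remote_softirq_cpus));
		else
			__napi_schedule(&queue->backlog);
	}
	/* trigger_softirq() then does __napi_schedule() on the remote
	 * CPU, and the bit is only cleared by __napi_complete() in
	 * process_backlog(), so at most one IPI per backlog can be
	 * outstanding. */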
>> + if (cpu != smp_processor_id()) {
>> + cpu_set(cpu,
>> + get_cpu_var(rps_remote_softirq_cpus));
>> + __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>> + } else
>> + __napi_schedule(&queue->backlog);
>
> Why not the more easy :
> if (cpu != smp_processor_id())
> cpu_set(cpu, get_cpu_var(rps_remote_softirq_cpus));
> else
> napi_schedule(&queue->backlog);
>
> This way we don't touch the remote backlog (and this backlog could stay in the private
> cache line of the remote cpu)
>
>> + }
>> + goto enqueue;
>> + }
>> +
>
>> + spin_unlock(&queue->input_pkt_queue.lock);
>> +
>> + __get_cpu_var(netdev_rx_stat).dropped++;
>> + local_irq_restore(flags);
>
> ->
> spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);
> percpu_add(netdev_rx_stat.dropped, 1);
>
>
> +/*
> + * net_rps_action sends any pending IPI's for rps. This is only called from
> + * softirq and interrupts must be enabled.
> + */
> +static void net_rps_action(void)
> +{
> + int cpu;
> +
> + /* Send pending IPI's to kick RPS processing on remote cpus. */
> + for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
> + struct softnet_data *queue = &per_cpu(softnet_data, cpu);
> + __smp_call_function_single(cpu, &queue->csd, 0);
> + }
> + cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
> +}
>
>
> net_rps_action() might not be a very descriptive name, and it is a bit expensive...
>
> Did you try smp_call_function_many()?
> (I suspect you did, but found it was not that optimized?)
>
> CC Andi to get feedback from him :)
>
> static void net_rps_remcpus_fire(void)
> {
> smp_call_function_many(__get_cpu_var(rps_remote_softirq_cpus),
> trigger_softirq, NULL, 0);
> }
>
I'm thinking it is better to avoid possibly blocking on locked cfd_data.
Tom
* Re: [PATCH v4 1/1] rps: core implementation
From: Eric Dumazet @ 2009-11-21 9:31 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List, Andi Kleen
>> percpu_add(netdev_rx_stat.total, 1);
>> spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
>>
> Would it make sense to percpu_add into dev.c just for this when other
> parts in dev.c would still use __get_cpu_var(stat)++? Also, I think
> this results in more instructions...
Don't worry, this is out of RPS scope anyway, but percpu_add() is
better on x86 at least.
__get_cpu_var(netdev_rx_stat).total++;
->
mov $0xc17aa6b8,%eax // per_cpu__netdev_rx_stat
mov %fs:0xc17a77c0,%edx // per_cpu__this_cpu_off
incl (%edx,%eax,1)
While
percpu_add(netdev_rx_stat.total, 1);
->
addl $0x1,%fs:0xc17aa6b8 // per_cpu__netdev_rx_stat
The latter can be done in any context, and uses no register, so:
1) we reduce window with disabled interrupts.
2) allow compiler to not scratch two registers.
* Re: [PATCH v4 1/1] rps: core implementation
From: Tom Herbert @ 2009-12-17 21:04 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, Linux Netdev List
On Fri, Nov 20, 2009 at 3:40 PM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
>
> On Fri, 20 Nov 2009 15:28:58 -0800
> Tom Herbert <therbert@google.com> wrote:
>
> > +static char *get_token(const char **cp, size_t *len)
> > +{
> > + const char *bp = *cp;
> > + char *start;
> > +
> > + while (isspace(*bp))
> > + bp++;
> > +
> > + start = (char *)bp;
> > + while (!isspace(*bp) && *bp != '\0')
> > + bp++;
> > +
> > + if (start != bp)
> > + *len = bp - start;
> > + else
> > + start = NULL;
> > +
> > + *cp = bp;
> > + return start;
> > +}
>
>
> Sysfs is intentionally one value per file. If you need multiple
> values, then it is the wrong interface.
Are there any good recommendations for a more appropriate interface to
use? Would it make sense to expose the NAPI instances in sysfs...
like eth0/rx-0/, eth0/rx-1/,... which would hopefully be consistent
with interrupts in /proc/interrupts.
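
Just to sketch what that could look like (the layout and names here are
hypothetical, not something implemented by this patch): one kobject per
receive queue with a single-valued attribute underneath, e.g.

	/sys/class/net/eth0/rx-0/rps_cpus
	/sys/class/net/eth0/rx-1/rps_cpus

where each rps_cpus file holds one CPU mask, which would also satisfy
the sysfs one-value-per-file rule raised earlier in the thread.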
Thanks,
Tom
* Re: [PATCH v4 1/1] rps: core implementation
From: Tom Herbert @ 2010-01-06 1:32 UTC (permalink / raw)
To: David Miller; +Cc: Linux Netdev List
Here's an updated RPS patch with some minor fixes; sorry for the long
turnaround. This addresses most of the comments on the last patch:
- Moved shared fields in softnet_data into a separate cacheline
- Make hashrnd __read_mostly
- Removed extra "hash" variable in get_rps_cpu
- Allow use of RPS from netif_rx (we have a use case where this is needed)
- In net_rps_action clear each cpu in the mask before calling the
function; I believe this prevents a race condition
I still don't have a better way to do a per-napi RPS mask than using a
single variable in sysfs under the device. It still seems like we'd
want a file or even directory for each napi instance, but that looks
like some major changes.
Also, we found that a few drivers are calling napi_gro_receive in lieu
of netif_receive_skb (tg3 and e1000e, for example). The patch does not
support that, so there is no benefit for them with RPS :-(. The GRO
path looks pretty intertwined with the receive path all the way through
TCP, so I'm not sure it will be easy to retrofit. We changed e1000e to
call netif_receive_skb and top netperf RR throughput went from 85K tps
to 241K tps, and for our workloads at least this may be the bigger
win.
Tom
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97873e3..7107b13 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -676,6 +676,29 @@ struct net_device_ops {
};
/*
+ * Structure for Receive Packet Steering. Length of map and array of CPU ID's.
+ */
+struct rps_map {
+ int len;
+ u16 map[0];
+};
+
+/*
+ * Structure that contains the rps maps for various NAPI instances of a device.
+ */
+struct dev_rps_maps {
+ int num_maps;
+ struct rcu_head rcu;
+ struct rps_map maps[0];
+};
+
+/* Bound number of CPUs that can be in an rps map */
+#define MAX_RPS_CPUS (num_possible_cpus() < 256 ? num_possible_cpus() : 256)
+
+/* Maximum size of RPS map (for allocation) */
+#define RPS_MAP_SIZE (sizeof(struct rps_map) + (MAX_RPS_CPUS * sizeof(u16)))
+
+/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
@@ -861,6 +884,9 @@ struct net_device {
struct netdev_queue rx_queue;
+ struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
+ receive packet steering */
+
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
@@ -1274,10 +1300,12 @@ static inline int unregister_gifconf(unsigned
int family)
*/
struct softnet_data {
struct Qdisc *output_queue;
- struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
struct sk_buff *completion_queue;
+ /* Elements below can be accessed between CPUs for RPS */
+ struct call_single_data csd ____cacheline_aligned_in_smp;
+ struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 63f4742..f188301 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
* @mac_header: Link layer header
* @_skb_dst: destination entry
* @sp: the security path, used for xfrm
+ * @rxhash: the packet hash computed on receive
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @len: Length of actual data
* @data_len: Data length
@@ -323,6 +324,8 @@ struct sk_buff {
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
+ __u32 rxhash;
+
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
diff --git a/net/core/dev.c b/net/core/dev.c
index 9977288..77c3d48 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1834,7 +1834,7 @@ out_kfree_skb:
return rc;
}
-static u32 skb_tx_hashrnd;
+static u32 hashrnd __read_mostly;
u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
@@ -1852,7 +1852,7 @@ u16 skb_tx_hash(const struct net_device *dev,
const struct sk_buff *skb)
else
hash = skb->protocol;
- hash = jhash_1word(hash, skb_tx_hashrnd);
+ hash = jhash_1word(hash, hashrnd);
return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
@@ -2073,6 +2073,148 @@ int weight_p __read_mostly = 64; /*
old backlog weight */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving NAPI instance for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2, ports;
+ struct ipv6hdr *ip6;
+ struct iphdr *ip;
+ u32 ihl;
+ u8 ip_proto;
+ int cpu = -1;
+ struct dev_rps_maps *drmap;
+ struct rps_map *map = NULL;
+ u16 index;
+
+ rcu_read_lock();
+
+ drmap = rcu_dereference(dev->dev_rps_maps);
+ if (!drmap)
+ goto done;
+
+ index = skb_get_rx_queue(skb);
+ if (index >= drmap->num_maps)
+ index = 0;
+
+ map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ if (!map->len)
+ goto done;
+
+ if (skb->rxhash)
+ goto got_hash; /* Skip hash computation on packet header */
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(*ip)))
+ goto done;
+
+ ip = (struct iphdr *) skb->data;
+ ip_proto = ip->protocol;
+ addr1 = ip->saddr;
+ addr2 = ip->daddr;
+ ihl = ip->ihl;
+ break;
+ case __constant_htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(*ip6)))
+ goto done;
+
+ ip6 = (struct ipv6hdr *) skb->data;
+ ip_proto = ip6->nexthdr;
+ addr1 = ip6->saddr.s6_addr32[3];
+ addr2 = ip6->daddr.s6_addr32[3];
+ ihl = (40 >> 2);
+ break;
+ default:
+ goto done;
+ }
+ ports = 0;
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ if (pskb_may_pull(skb, (ihl * 4) + 4))
+ ports = *((u32 *) (skb->data + (ihl * 4)));
+ break;
+
+ default:
+ break;
+ }
+
+ skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+ if (!skb->rxhash)
+ skb->rxhash = 1;
+
+got_hash:
+ cpu = map->map[((u64) skb->rxhash * map->len) >> 32];
+
+ if (!cpu_online(cpu))
+ cpu = -1;
+done:
+ rcu_read_unlock();
+ return cpu;
+}
+
+static DEFINE_PER_CPU(cpumask_t, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+ struct softnet_data *queue = data;
+ __napi_schedule(&queue->backlog);
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+ struct softnet_data *queue;
+ unsigned long flags;
+
+ queue = &per_cpu(softnet_data, cpu);
+
+ local_irq_save(flags);
+ __get_cpu_var(netdev_rx_stat).total++;
+
+ spin_lock(&queue->input_pkt_queue.lock);
+ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+ if (queue->input_pkt_queue.qlen) {
+enqueue:
+ __skb_queue_tail(&queue->input_pkt_queue, skb);
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+ flags);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Schedule NAPI for backlog device */
+ if (napi_schedule_prep(&queue->backlog)) {
+ if (cpu != smp_processor_id()) {
+ cpu_set(cpu,
+ get_cpu_var(rps_remote_softirq_cpus));
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ } else
+ __napi_schedule(&queue->backlog);
+ }
+ goto enqueue;
+ }
+
+ spin_unlock(&queue->input_pkt_queue.lock);
+
+ __get_cpu_var(netdev_rx_stat).dropped++;
+ local_irq_restore(flags);
+
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
/**
* netif_rx - post buffer to the network code
@@ -2091,8 +2233,7 @@ DEFINE_PER_CPU(struct netif_rx_stats,
netdev_rx_stat) = { 0, };
int netif_rx(struct sk_buff *skb)
{
- struct softnet_data *queue;
- unsigned long flags;
+ int cpu;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
@@ -2101,31 +2242,12 @@ int netif_rx(struct sk_buff *skb)
if (!skb->tstamp.tv64)
net_timestamp(skb);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
- local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
-
- __get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
-enqueue:
- __skb_queue_tail(&queue->input_pkt_queue, skb);
- local_irq_restore(flags);
- return NET_RX_SUCCESS;
- }
-
- napi_schedule(&queue->backlog);
- goto enqueue;
- }
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
-
- kfree_skb(skb);
- return NET_RX_DROP;
+ cpu = get_rps_cpu(skb->dev, skb);
+ if (cpu < 0)
+ cpu = smp_processor_id();
+
+ return enqueue_to_backlog(skb, cpu);
}
EXPORT_SYMBOL(netif_rx);
@@ -2363,10 +2485,10 @@ void netif_nit_deliver(struct sk_buff *skb)
}
/**
- * netif_receive_skb - process receive buffer from network
+ * __netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
- * netif_receive_skb() is the main receive data processing function.
+ * __netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
@@ -2377,7 +2499,8 @@ void netif_nit_deliver(struct sk_buff *skb)
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb(struct sk_buff *skb)
+
+int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
@@ -2475,6 +2598,16 @@ out:
}
EXPORT_SYMBOL(netif_receive_skb);
+int netif_receive_skb(struct sk_buff *skb)
+{
+ int cpu = get_rps_cpu(skb->dev, skb);
+
+ if (cpu < 0)
+ return __netif_receive_skb(skb);
+ else
+ return enqueue_to_backlog(skb, cpu);
+}
+
/* Network device is going away, flush any packets still pending */
static void flush_backlog(void *arg)
{
@@ -2518,7 +2651,7 @@ static int napi_gro_complete(struct sk_buff *skb)
}
out:
- return netif_receive_skb(skb);
+ return __netif_receive_skb(skb);
}
void napi_gro_flush(struct napi_struct *napi)
@@ -2650,7 +2783,7 @@ gro_result_t napi_skb_finish(gro_result_t ret,
struct sk_buff *skb)
{
switch (ret) {
case GRO_NORMAL:
- if (netif_receive_skb(skb))
+ if (__netif_receive_skb(skb))
ret = GRO_DROP;
break;
@@ -2724,7 +2857,7 @@ gro_result_t napi_frags_finish(struct
napi_struct *napi, struct sk_buff *skb,
if (ret == GRO_HELD)
skb_gro_pull(skb, -ETH_HLEN);
- else if (netif_receive_skb(skb))
+ else if (__netif_receive_skb(skb))
ret = GRO_DROP;
break;
@@ -2799,16 +2932,16 @@ static int process_backlog(struct napi_struct
*napi, int quota)
do {
struct sk_buff *skb;
- local_irq_disable();
+ spin_lock_irq(&queue->input_pkt_queue.lock);
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
__napi_complete(napi);
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
break;
}
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
- netif_receive_skb(skb);
+ __netif_receive_skb(skb);
} while (++work < quota && jiffies == start_time);
return work;
@@ -2897,6 +3030,21 @@ void netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(netif_napi_del);
+/*
+ * net_rps_action sends any pending IPI's for rps. This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(void)
+{
+ int cpu;
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+ struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+ cpu_clear(cpu, __get_cpu_var(rps_remote_softirq_cpus));
+ __smp_call_function_single(cpu, &queue->csd, 0);
+ }
+}
static void net_rx_action(struct softirq_action *h)
{
@@ -2968,6 +3116,8 @@ static void net_rx_action(struct softirq_action *h)
out:
local_irq_enable();
+ net_rps_action();
+
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
@@ -5341,6 +5491,8 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
+ kfree(dev->dev_rps_maps);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
@@ -5793,6 +5945,10 @@ static int __init net_dev_init(void)
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
+ queue->csd.func = trigger_softirq;
+ queue->csd.info = queue;
+ queue->csd.flags = 0;
+
queue->backlog.poll = process_backlog;
queue->backlog.weight = weight_p;
queue->backlog.gro_list = NULL;
@@ -5831,7 +5987,7 @@ subsys_initcall(net_dev_init);
static int __init initialize_hashrnd(void)
{
- get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+ get_random_bytes(&hashrnd, sizeof(hashrnd));
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 157645c..be78dfb 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -18,6 +18,9 @@
#include <linux/wireless.h>
#include <net/wext.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+
#include "net-sysfs.h"
#ifdef CONFIG_SYSFS
@@ -253,6 +256,132 @@ static ssize_t store_tx_queue_len(struct device *dev,
return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
+static char *get_token(const char **cp, size_t *len)
+{
+ const char *bp = *cp;
+ char *start;
+
+ while (isspace(*bp))
+ bp++;
+
+ start = (char *)bp;
+ while (!isspace(*bp) && *bp != '\0')
+ bp++;
+
+ if (start != bp)
+ *len = bp - start;
+ else
+ start = NULL;
+
+ *cp = bp;
+ return start;
+}
+
+static void dev_map_release(struct rcu_head *rcu)
+{
+ struct dev_rps_maps *drmap =
+ container_of(rcu, struct dev_rps_maps, rcu);
+
+ kfree(drmap);
+}
+
+static ssize_t store_rps_cpus(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct net_device *net = to_net_dev(dev);
+ struct napi_struct *napi;
+ cpumask_t mask;
+ int err, cpu, index, i;
+ int cnt = 0;
+ char *token;
+ const char *cp = buf;
+ size_t tlen;
+ struct dev_rps_maps *drmap, *old_drmap;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ cnt = 0;
+ list_for_each_entry(napi, &net->napi_list, dev_list)
+ cnt++;
+
+ drmap = kzalloc(sizeof(struct dev_rps_maps) +
+ RPS_MAP_SIZE * cnt, GFP_KERNEL);
+ if (!drmap)
+ return -ENOMEM;
+
+ drmap->num_maps = cnt;
+
+ cp = buf;
+ for (index = 0; index < cnt &&
+ (token = get_token(&cp, &tlen)); index++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ err = bitmap_parse(token, tlen, cpumask_bits(&mask),
+ nr_cpumask_bits);
+
+ if (err) {
+ kfree(drmap);
+ return err;
+ }
+
+ cpus_and(mask, mask, cpu_online_map);
+ i = 0;
+ for_each_cpu_mask(cpu, mask) {
+ if (i >= MAX_RPS_CPUS)
+ break;
+ map->map[i++] = cpu;
+ }
+ map->len = i;
+ }
+
+ rcu_read_lock_bh();
+ old_drmap = rcu_dereference(net->dev_rps_maps);
+ rcu_assign_pointer(net->dev_rps_maps, drmap);
+ rcu_read_unlock_bh();
+
+ if (old_drmap)
+ call_rcu(&old_drmap->rcu, dev_map_release);
+
+ return len;
+}
+
+static ssize_t show_rps_cpus(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *net = to_net_dev(dev);
+ size_t len = 0;
+ cpumask_t mask;
+ int i, j;
+ struct dev_rps_maps *drmap;
+
+ rcu_read_lock_bh();
+ drmap = rcu_dereference(net->dev_rps_maps);
+
+ if (drmap) {
+ for (j = 0; j < drmap->num_maps; j++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * j));
+ cpus_clear(mask);
+ for (i = 0; i < map->len; i++)
+ cpu_set(map->map[i], mask);
+
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
+ if (PAGE_SIZE - len < 3) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ if (j < drmap->num_maps)
+ len += sprintf(buf + len, " ");
+ }
+ }
+
+ rcu_read_unlock_bh();
+
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -309,6 +438,7 @@ static struct device_attribute net_class_attributes[] = {
__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
store_tx_queue_len),
+ __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_cpus, store_rps_cpus),
{}
};
* Re: [PATCH v4 1/1] rps: core implementation
2010-01-06 1:32 ` Tom Herbert
@ 2010-01-06 5:54 ` Eric Dumazet
2010-01-06 7:56 ` Tom Herbert
2010-01-06 18:38 ` Eric Dumazet
0 siblings, 2 replies; 36+ messages in thread
From: Eric Dumazet @ 2010-01-06 5:54 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List
On 06/01/2010 02:32, Tom Herbert wrote:
> Here's an RPS updated patch with some minor fixes, sorry for the long
> turnaround. This addresses most of the comments for last patch:
>
> - Moved shared fields in softnet_data into a separate cacheline
> - Make hashrnd __read_mostly
> - Removed extra "hash" variable in get_rps_cpu
> - Allow use of RPS from netif_rx (we have a use case where this is needed)
> - In net_rps_action clear each cpu in the mask before calling the
> function, I believe this prevents race condition
Hmm, I can't see a race condition here, could you elaborate on this?
The mask is local to this cpu, and we cannot re-enter a function that could
change some bits under us (we are called from net_rx_action()).
If you believe there is a race condition, I suspect the race is still there.
>
> I still don't have a better way to do a per-napi RPS mask than using a
> single variable in sysfs under the device. It still seems like we'd
> want a file or even directory for each napi instance, but that looks
> like some major changes.
>
> Also, we found that a few drivers call napi_gro_receive in lieu
> of netif_receive_skb (tg3 and e1000e, for example). The patch does not
> support that, so there is no benefit for them with RPS :-(. The GRO
> path looks pretty intertwined with the receive path all the way through
> TCP, so I'm not sure it will be easy to retrofit. We changed e1000e to
> call netif_receive_skb and top netperf RR throughput went from 85K tps
> to 241K tps, and for our workloads at least this may be the bigger
> win.
Did you test with VLANs too? (with/without hardware support)
>
> Tom
Excellent, but I suspect the big win comes from using few NICs
(number_of(NICs) < num_online_cpus).
(In the reverse case there is possible contention on queue->csd.)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 97873e3..7107b13 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -676,6 +676,29 @@ struct net_device_ops {
> };
>
> /*
> + * Structure for Receive Packet Steering. Length of map and array of CPU ID's.
> + */
> +struct rps_map {
> + int len;
> + u16 map[0];
> +};
> +
> +/*
> + * Structure that contains the rps maps for various NAPI instances of a device.
> + */
> +struct dev_rps_maps {
> + int num_maps;
> + struct rcu_head rcu;
> + struct rps_map maps[0];
> +};
I feel uneasy about this, because of kmalloc() max size and rounding to power of two effects.
It also uses a single node in NUMA setups.
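To put rough numbers on it (assuming a 64-bit build and the 256-entry cap
below): RPS_MAP_SIZE is sizeof(struct rps_map) + 256 * sizeof(u16) =
4 + 512 = 516 bytes, so a device with, say, 16 NAPI instances asks kmalloc
for roughly 24 + 16 * 516 ≈ 8.3KB in a single allocation, which the slab
allocator rounds up to a 16KB object on one node.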
> +
> +/* Bound number of CPUs that can be in an rps map */
> +#define MAX_RPS_CPUS (num_possible_cpus() < 256 ? num_possible_cpus() : 256)
> +
> +/* Maximum size of RPS map (for allocation) */
> +#define RPS_MAP_SIZE (sizeof(struct rps_map) + (MAX_RPS_CPUS * sizeof(u16)))
> +
> +/*
> * The DEVICE structure.
> * Actually, this whole structure is a big mistake. It mixes I/O
> * data with strictly "high-level" data, and it has to know about
> @@ -861,6 +884,9 @@ struct net_device {
>
> struct netdev_queue rx_queue;
>
> + struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
> + receive packet steeing */
> +
If you store rps_map pointer into napi itself, you could avoid this MAX_RPS_CPUS thing
and really dynamically allocate the structure with the number of online cpus mentioned
in the map.
But yes, it makes store_rps_cpus() more complex :(
This probably can be done later, this Version 4 of RPS looks very good, thanks !
I am going to test it today on my dev machine before giving an Acked-by :)
Reviewed-by: Eric Dumazet <eric.dumazet@gmail.com>
* Re: [PATCH v4 1/1] rps: core implementation
2010-01-06 5:54 ` Eric Dumazet
@ 2010-01-06 7:56 ` Tom Herbert
2010-01-06 18:38 ` Eric Dumazet
1 sibling, 0 replies; 36+ messages in thread
From: Tom Herbert @ 2010-01-06 7:56 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, Linux Netdev List
Eric, thanks for the comments.
> Hmm, I cant see a race condition here, could you elaborate on this ?
> mask is local to this cpu, and we cannot re-enter a function that could
> change some bits under us (we are called from net_rx_action())
> If you believe there is a race condition, I suspect race is still there.
>
We're allowing bits in the map to be set in netif_rx out of interrupt,
and __smp_call_function_single needs to be called with interrupts
disabled. I guess an alternative would be to copy the mask to a local
variable, then clear the mask and scan over the local copy. Would
there be complaints about the stack space for a local cpumask_t
variable?
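For concreteness, a rough (untested) sketch of that copy-then-clear
alternative, with the snapshot cpumask_t living on the stack:

static void net_rps_action(void)
{
	cpumask_t mask;
	int cpu;

	/* Snapshot and clear the pending mask with interrupts off, so a
	 * netif_rx from hardirq on this cpu cannot set bits under us. */
	local_irq_disable();
	mask = __get_cpu_var(rps_remote_softirq_cpus);
	cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
	local_irq_enable();

	/* Walk the private copy and send the IPIs. */
	for_each_cpu_mask_nr(cpu, mask) {
		struct softnet_data *queue = &per_cpu(softnet_data, cpu);
		__smp_call_function_single(cpu, &queue->csd, 0);
	}
}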
>
> Did you tested with VLANS too ? (with/without hardware support)
>
No. Looks like vlan functions eventually call netif_receive_skb
though... I suppose I could test that.
>>
>> Tom
>
> Excellent, but I suspect big win comes from using few NICS.
> (number_of(NICS) < num_online_cpus)
>
Yes, our primary motivation for developing this was a single NIC with
a single queue on a multicore system.
> (in the reverse case, possible contention on queue->csd)
>
Actually there should never be contention on that; only one cpu at a
time will access it, namely the one that successfully schedules napi
on the remote CPU from enqueue_to_backlog.
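To spell that out, the gate is napi_schedule_prep() in enqueue_to_backlog
(excerpted from the patch, comment added):

	if (napi_schedule_prep(&queue->backlog)) {
		/* We won the test_and_set of NAPI_STATE_SCHED, so no other
		 * cpu can win it again (and thus reach queue->csd) until
		 * the remote backlog poll completes and clears the bit. */
		if (cpu != smp_processor_id()) {
			cpu_set(cpu, get_cpu_var(rps_remote_softirq_cpus));
			__raise_softirq_irqoff(NET_RX_SOFTIRQ);
		} else
			__napi_schedule(&queue->backlog);
	}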
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 97873e3..7107b13 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -676,6 +676,29 @@ struct net_device_ops {
>> };
>>
>> /*
>> + * Structure for Receive Packet Steering. Length of map and array of CPU ID's.
>> + */
>> +struct rps_map {
>> + int len;
>> + u16 map[0];
>> +};
>> +
>> +/*
>> + * Structure that contains the rps maps for various NAPI instances of a device.
>> + */
>> +struct dev_rps_maps {
>> + int num_maps;
>> + struct rcu_head rcu;
>> + struct rps_map maps[0];
>> +};
>
> I feel uneasy about this, because of kmalloc() max size and rounding to power of two effects.
Other than some wasted memory what is bad about that?
> It also uses a single node in NUMA setups.
I suppose we should allocate from the device's NUMA node... I'd really
like to store the maps with the napi instances themselves, and this
would work well if the napi structs were allocated on the NUMA node
where the interrupt is handled (I think this is in Peter Waskiewicz's irq
patch). Unfortunately, that information is lost by the time
netif_receive_skb is called anyway.
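For illustration, a minimal sketch of node-local allocation in
store_rps_cpus(), assuming dev_to_node() on the netdev's embedded struct
device is an acceptable proxy for the NIC's node:

	/* Illustrative only: put the map array on the device's NUMA node. */
	drmap = kzalloc_node(sizeof(struct dev_rps_maps) + RPS_MAP_SIZE * cnt,
			     GFP_KERNEL, dev_to_node(&net->dev));
	if (!drmap)
		return -ENOMEM;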
>> +
>> +/* Bound number of CPUs that can be in an rps map */
>> +#define MAX_RPS_CPUS (num_possible_cpus() < 256 ? num_possible_cpus() : 256)
>> +
>> +/* Maximum size of RPS map (for allocation) */
>> +#define RPS_MAP_SIZE (sizeof(struct rps_map) + (MAX_RPS_CPUS * sizeof(u16)))
>> +
>> +/*
>> * The DEVICE structure.
>> * Actually, this whole structure is a big mistake. It mixes I/O
>> * data with strictly "high-level" data, and it has to know about
>> @@ -861,6 +884,9 @@ struct net_device {
>>
>> struct netdev_queue rx_queue;
>>
>> + struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
>> + receive packet steeing */
>> +
>
>
> If you store rps_map pointer into napi itself, you could avoid this MAX_RPS_CPUS thing
> and really dynamically allocate the structure with the number of online cpus mentioned
> in the map.
>
> But yes, it makes store_rps_cpus() more complex :(
>
As I pointed out above, I would like to do that. It would probably
involve adding a pointer to the napi_struct in the skb...
> This probably can be done later, this Version 4 of RPS looks very good, thanks !
> I am going to test it today on my dev machine before giving an Acked-by :)
>
Thanks!
> Reviewed-by: Eric Dumazet <eric.dumazet@gmail.com>
>
>
>
* Re: [PATCH v4 1/1] rps: core implementation
2010-01-06 5:54 ` Eric Dumazet
2010-01-06 7:56 ` Tom Herbert
@ 2010-01-06 18:38 ` Eric Dumazet
2010-01-06 21:10 ` [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation Eric Dumazet
2010-01-06 22:54 ` [PATCH v4 1/1] rps: core implementation Tom Herbert
1 sibling, 2 replies; 36+ messages in thread
From: Eric Dumazet @ 2010-01-06 18:38 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List
On 06/01/2010 06:54, Eric Dumazet wrote:
>
> This probably can be done later, this Version 4 of RPS looks very good, thanks !
> I am going to test it today on my dev machine before giving an Acked-by :)
>
Hmm, I had to make some changes to get RPS working a bit on my setup
(bonding + vlans).
Some devices don't have napi contexts at all, so force cnt to be non-zero
in store_rps_cpus():
+ list_for_each_entry(napi, &net->napi_list, dev_list)
+ cnt++;
+ if (cnt == 0)
+ cnt = 1;
BTW, the following sequence is not safe:
rcu_read_lock_bh();
old_drmap = rcu_dereference(net->dev_rps_maps);
rcu_assign_pointer(net->dev_rps_maps, drmap);
rcu_read_unlock_bh();
rcu_read_lock/unlock is not the right paradigm for writers :)
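The usual writer-side shape would be to serialize updaters with a real
lock (the RTNL is a natural fit for a sysfs store) and leave RCU to the
readers, roughly:

	rtnl_lock();
	old_drmap = net->dev_rps_maps;	/* updaters serialized by RTNL */
	rcu_assign_pointer(net->dev_rps_maps, drmap);
	rtnl_unlock();

	if (old_drmap)
		call_rcu(&old_drmap->rcu, dev_map_release);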
I also noticed these messages; there is a lock imbalance somewhere...
(caused by the cpu handling the original soft interrupt)
[ 442.636637] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 442.652122] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 442.735065] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 442.752136] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 442.966726] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 442.982126] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 450.794525] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 451.808901] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 454.610304] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 511.825160] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 571.840765] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 631.856645] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 691.869030] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 721.847106] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 721.860279] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
[ 722.223053] huh, cpu 0 entered softirq 3 NET_RX c051f0a0 preempt_count 00000102, exited with 00000103?
Here is the patch I currently use against linux-2.6.
(net-next-2.6 doesn't work well on my bond/vlan setup, I suspect I need a bisection)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a3fccc8..7218970 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -216,12 +216,20 @@ enum {
struct neighbour;
struct neigh_parms;
struct sk_buff;
-
+/**
+ * struct netif_rx_stats - softnet stats for rx on each cpu
+ * @total: number of times the softirq was triggered
+ * @dropped: number of dropped frames
+ * @time_squeeze: number of times the rx softirq was delayed
+ * @cpu_collision:
+ * @rps: number of times this cpu was given work by Receive Packet Steering
+ */
struct netif_rx_stats {
unsigned total;
unsigned dropped;
unsigned time_squeeze;
unsigned cpu_collision;
+ unsigned rps;
};
DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
@@ -676,6 +684,29 @@ struct net_device_ops {
};
/*
+ * Structure for Receive Packet Steering. Length of map and array of CPU ID's.
+ */
+struct rps_map {
+ int len;
+ u16 map[0];
+};
+
+/*
+ * Structure that contains the rps maps for various NAPI instances of a device.
+ */
+struct dev_rps_maps {
+ int num_maps;
+ struct rcu_head rcu;
+ struct rps_map maps[0];
+};
+
+/* Bound number of CPUs that can be in an rps map */
+#define MAX_RPS_CPUS (num_possible_cpus() < 256 ? num_possible_cpus() : 256)
+
+/* Maximum size of RPS map (for allocation) */
+#define RPS_MAP_SIZE (sizeof(struct rps_map) + (MAX_RPS_CPUS * sizeof(u16)))
+
+/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
@@ -861,6 +892,9 @@ struct net_device {
struct netdev_queue rx_queue;
+ struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
+ receive packet steering */
+
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
@@ -1276,10 +1310,12 @@ static inline int unregister_gifconf(unsigned int family)
*/
struct softnet_data {
struct Qdisc *output_queue;
- struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
struct sk_buff *completion_queue;
+ /* Elements below can be accessed between CPUs for RPS */
+ struct call_single_data csd ____cacheline_aligned_in_smp;
+ struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ae836fd..8ed3f66 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
* @mac_header: Link layer header
* @_skb_dst: destination entry
* @sp: the security path, used for xfrm
+ * @rxhash: the packet hash computed on receive
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @len: Length of actual data
* @data_len: Data length
@@ -323,6 +324,8 @@ struct sk_buff {
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
+ __u32 rxhash;
+
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e..5bde7bc 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -219,9 +219,10 @@ restart:
h->action(h);
trace_softirq_exit(h, softirq_vec);
if (unlikely(prev_count != preempt_count())) {
- printk(KERN_ERR "huh, entered softirq %td %s %p"
+ pr_err("huh, cpu %d entered softirq %td %s %p"
"with preempt_count %08x,"
- " exited with %08x?\n", h - softirq_vec,
+ " exited with %08x?\n", cpu,
+ h - softirq_vec,
softirq_to_name[h - softirq_vec],
h->action, prev_count, preempt_count());
preempt_count() = prev_count;
diff --git a/net/core/dev.c b/net/core/dev.c
index be9924f..9c92f10 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1874,7 +1874,7 @@ out_kfree_skb:
return rc;
}
-static u32 skb_tx_hashrnd;
+static u32 hashrnd __read_mostly;
u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
@@ -1892,7 +1892,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
else
hash = skb->protocol;
- hash = jhash_1word(hash, skb_tx_hashrnd);
+ hash = jhash_1word(hash, hashrnd);
return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
@@ -2113,6 +2113,149 @@ int weight_p __read_mostly = 64; /* old backlog weight */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving NAPI instance for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2, ports;
+ struct ipv6hdr *ip6;
+ struct iphdr *ip;
+ u32 ihl;
+ u8 ip_proto;
+ int cpu = -1;
+ struct dev_rps_maps *drmap;
+ struct rps_map *map = NULL;
+ u16 index;
+
+ rcu_read_lock();
+
+ drmap = rcu_dereference(dev->dev_rps_maps);
+ if (!drmap)
+ goto done;
+
+ index = skb_get_rx_queue(skb);
+ if (index >= drmap->num_maps)
+ index = 0;
+
+ map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ if (!map->len)
+ goto done;
+
+ if (skb->rxhash)
+ goto got_hash; /* Skip hash computation on packet header */
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(*ip)))
+ goto done;
+
+ ip = (struct iphdr *) skb->data;
+ ip_proto = ip->protocol;
+ addr1 = ip->saddr;
+ addr2 = ip->daddr;
+ ihl = ip->ihl;
+ break;
+ case __constant_htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(*ip6)))
+ goto done;
+
+ ip6 = (struct ipv6hdr *) skb->data;
+ ip_proto = ip6->nexthdr;
+ addr1 = ip6->saddr.s6_addr32[3];
+ addr2 = ip6->daddr.s6_addr32[3];
+ ihl = (40 >> 2);
+ break;
+ default:
+ goto done;
+ }
+ ports = 0;
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ if (pskb_may_pull(skb, (ihl * 4) + 4))
+ ports = *((u32 *) (skb->data + (ihl * 4)));
+ break;
+
+ default:
+ break;
+ }
+
+ skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+ if (!skb->rxhash)
+ skb->rxhash = 1;
+
+got_hash:
+ cpu = map->map[((u64) skb->rxhash * map->len) >> 32];
+
+ if (!cpu_online(cpu))
+ cpu = -1;
+done:
+ rcu_read_unlock();
+ return cpu;
+}
+
+static DEFINE_PER_CPU(cpumask_t, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+ struct softnet_data *queue = data;
+ __napi_schedule(&queue->backlog);
+ __get_cpu_var(netdev_rx_stat).rps++;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+ struct softnet_data *queue;
+ unsigned long flags;
+
+ queue = &per_cpu(softnet_data, cpu);
+
+ local_irq_save(flags);
+ __get_cpu_var(netdev_rx_stat).total++;
+
+ spin_lock(&queue->input_pkt_queue.lock);
+ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+ if (queue->input_pkt_queue.qlen) {
+enqueue:
+ __skb_queue_tail(&queue->input_pkt_queue, skb);
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+ flags);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Schedule NAPI for backlog device */
+ if (napi_schedule_prep(&queue->backlog)) {
+ if (cpu != smp_processor_id()) {
+ cpu_set(cpu,
+ get_cpu_var(rps_remote_softirq_cpus));
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ } else
+ __napi_schedule(&queue->backlog);
+ }
+ goto enqueue;
+ }
+
+ spin_unlock(&queue->input_pkt_queue.lock);
+
+ __get_cpu_var(netdev_rx_stat).dropped++;
+ local_irq_restore(flags);
+
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
/**
* netif_rx - post buffer to the network code
@@ -2131,8 +2274,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
int netif_rx(struct sk_buff *skb)
{
- struct softnet_data *queue;
- unsigned long flags;
+ int cpu;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
@@ -2141,31 +2283,12 @@ int netif_rx(struct sk_buff *skb)
if (!skb->tstamp.tv64)
net_timestamp(skb);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
- local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
-
- __get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
-enqueue:
- __skb_queue_tail(&queue->input_pkt_queue, skb);
- local_irq_restore(flags);
- return NET_RX_SUCCESS;
- }
-
- napi_schedule(&queue->backlog);
- goto enqueue;
- }
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
+ cpu = get_rps_cpu(skb->dev, skb);
+ if (cpu < 0)
+ cpu = smp_processor_id();
- kfree_skb(skb);
- return NET_RX_DROP;
+ return enqueue_to_backlog(skb, cpu);
}
EXPORT_SYMBOL(netif_rx);
@@ -2403,10 +2526,10 @@ void netif_nit_deliver(struct sk_buff *skb)
}
/**
- * netif_receive_skb - process receive buffer from network
+ * __netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
- * netif_receive_skb() is the main receive data processing function.
+ * __netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
@@ -2417,7 +2540,8 @@ void netif_nit_deliver(struct sk_buff *skb)
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb(struct sk_buff *skb)
+
+int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
@@ -2515,6 +2639,16 @@ out:
}
EXPORT_SYMBOL(netif_receive_skb);
+int netif_receive_skb(struct sk_buff *skb)
+{
+ int cpu = get_rps_cpu(skb->dev, skb);
+
+ if (cpu < 0)
+ return __netif_receive_skb(skb);
+ else
+ return enqueue_to_backlog(skb, cpu);
+}
+
/* Network device is going away, flush any packets still pending */
static void flush_backlog(void *arg)
{
@@ -2558,7 +2692,7 @@ static int napi_gro_complete(struct sk_buff *skb)
}
out:
- return netif_receive_skb(skb);
+ return __netif_receive_skb(skb);
}
void napi_gro_flush(struct napi_struct *napi)
@@ -2691,7 +2825,7 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
switch (ret) {
case GRO_NORMAL:
- if (netif_receive_skb(skb))
+ if (__netif_receive_skb(skb))
ret = GRO_DROP;
break;
@@ -2765,7 +2899,7 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
if (ret == GRO_HELD)
skb_gro_pull(skb, -ETH_HLEN);
- else if (netif_receive_skb(skb))
+ else if (__netif_receive_skb(skb))
ret = GRO_DROP;
break;
@@ -2840,16 +2974,16 @@ static int process_backlog(struct napi_struct *napi, int quota)
do {
struct sk_buff *skb;
- local_irq_disable();
+ spin_lock_irq(&queue->input_pkt_queue.lock);
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
__napi_complete(napi);
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
break;
}
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
- netif_receive_skb(skb);
+ __netif_receive_skb(skb);
} while (++work < quota && jiffies == start_time);
return work;
@@ -2938,6 +3072,21 @@ void netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(netif_napi_del);
+/*
+ * net_rps_action sends any pending IPI's for rps. This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(void)
+{
+ int cpu;
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+ struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+ cpu_clear(cpu, __get_cpu_var(rps_remote_softirq_cpus));
+ __smp_call_function_single(cpu, &queue->csd, 0);
+ }
+}
static void net_rx_action(struct softirq_action *h)
{
@@ -3009,6 +3158,8 @@ static void net_rx_action(struct softirq_action *h)
out:
local_irq_enable();
+ net_rps_action();
+
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
@@ -3255,7 +3406,7 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
s->total, s->dropped, s->time_squeeze, 0,
- 0, 0, 0, 0, /* was fastroute */
+ 0, 0, 0, s->rps, /* was fastroute */
s->cpu_collision);
return 0;
}
@@ -5403,6 +5554,8 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
+ kfree(dev->dev_rps_maps);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
@@ -5877,6 +6030,10 @@ static int __init net_dev_init(void)
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
+ queue->csd.func = trigger_softirq;
+ queue->csd.info = queue;
+ queue->csd.flags = 0;
+
queue->backlog.poll = process_backlog;
queue->backlog.weight = weight_p;
queue->backlog.gro_list = NULL;
@@ -5915,7 +6072,7 @@ subsys_initcall(net_dev_init);
static int __init initialize_hashrnd(void)
{
- get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+ get_random_bytes(&hashrnd, sizeof(hashrnd));
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index fbc1c74..5ca8731 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -18,6 +18,9 @@
#include <linux/wireless.h>
#include <net/wext.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+
#include "net-sysfs.h"
#ifdef CONFIG_SYSFS
@@ -253,6 +256,137 @@ static ssize_t store_tx_queue_len(struct device *dev,
return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
+static char *get_token(const char **cp, size_t *len)
+{
+ const char *bp = *cp;
+ char *start;
+
+ while (isspace(*bp))
+ bp++;
+
+ start = (char *)bp;
+ while (!isspace(*bp) && *bp != '\0')
+ bp++;
+
+ if (start != bp)
+ *len = bp - start;
+ else
+ start = NULL;
+
+ *cp = bp;
+ return start;
+}
+
+static void dev_map_release(struct rcu_head *rcu)
+{
+ struct dev_rps_maps *drmap =
+ container_of(rcu, struct dev_rps_maps, rcu);
+
+ kfree(drmap);
+}
+
+static ssize_t store_rps_cpus(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct net_device *net = to_net_dev(dev);
+ struct napi_struct *napi;
+ cpumask_t mask;
+ int err, cpu, index, i;
+ int cnt = 0;
+ char *token;
+ const char *cp = buf;
+ size_t tlen;
+ struct dev_rps_maps *drmap, *old_drmap;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ cnt = 0;
+ list_for_each_entry(napi, &net->napi_list, dev_list)
+ cnt++;
+ if (cnt == 0) {
+ pr_err("force a non null cnt on device %s\n", net->name);
+ cnt = 1;
+ }
+ pr_err("cnt=%d for device %s\n", cnt, net->name);
+
+ drmap = kzalloc(sizeof(struct dev_rps_maps) +
+ RPS_MAP_SIZE * cnt, GFP_KERNEL);
+ if (!drmap)
+ return -ENOMEM;
+
+ drmap->num_maps = cnt;
+
+ cp = buf;
+ for (index = 0; index < cnt &&
+ (token = get_token(&cp, &tlen)); index++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ err = bitmap_parse(token, tlen, cpumask_bits(&mask),
+ nr_cpumask_bits);
+
+ if (err) {
+ kfree(drmap);
+ return err;
+ }
+
+ cpus_and(mask, mask, cpu_online_map);
+ i = 0;
+ for_each_cpu_mask(cpu, mask) {
+ if (i >= MAX_RPS_CPUS)
+ break;
+ map->map[i++] = cpu;
+ }
+ map->len = i;
+ }
+
+ rcu_read_lock_bh();
+ old_drmap = rcu_dereference(net->dev_rps_maps);
+ rcu_assign_pointer(net->dev_rps_maps, drmap);
+ rcu_read_unlock_bh();
+
+ if (old_drmap)
+ call_rcu(&old_drmap->rcu, dev_map_release);
+
+ return len;
+}
+
+static ssize_t show_rps_cpus(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *net = to_net_dev(dev);
+ size_t len = 0;
+ cpumask_t mask;
+ int i, j;
+ struct dev_rps_maps *drmap;
+
+ rcu_read_lock_bh();
+ drmap = rcu_dereference(net->dev_rps_maps);
+
+ if (drmap) {
+ for (j = 0; j < drmap->num_maps; j++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * j));
+ cpus_clear(mask);
+ for (i = 0; i < map->len; i++)
+ cpu_set(map->map[i], mask);
+
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
+ if (PAGE_SIZE - len < 3) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ if (j < drmap->num_maps)
+ len += sprintf(buf + len, " ");
+ }
+ }
+
+ rcu_read_unlock_bh();
+
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -309,6 +443,7 @@ static struct device_attribute net_class_attributes[] = {
__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
store_tx_queue_len),
+ __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_cpus, store_rps_cpus),
{}
};
* [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation
2010-01-06 18:38 ` Eric Dumazet
@ 2010-01-06 21:10 ` Eric Dumazet
2010-01-06 21:28 ` Jay Vosburgh
` (2 more replies)
2010-01-06 22:54 ` [PATCH v4 1/1] rps: core implementation Tom Herbert
1 sibling, 3 replies; 36+ messages in thread
From: Eric Dumazet @ 2010-01-06 21:10 UTC (permalink / raw)
Cc: Tom Herbert, David Miller, Linux Netdev List, Jay Vosburgh,
Andy Gospodarek
On 06/01/2010 19:38, Eric Dumazet wrote:
>
> (net-next-2.6 doesnt work well on my bond/vlan setup, I suspect I need a bisection)
David, I had to revert 1f3c8804acba841b5573b953f5560d2683d2db0d
(bonding: allow arp_ip_targets on separate vlans to use arp validation)
Or else, my vlan devices don't work (unfortunately I don't have much time
these days to debug the thing).
My config :
+---------+
vlan.103 -----+ bond0 +--- eth1 (bnx2)
| +
vlan.825 -----+ +--- eth2 (tg3)
+---------+
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009)
Bonding Mode: fault-tolerance (active-backup)
Primary Slave: None
Currently Active Slave: eth2
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 0
Down Delay (ms): 0
Slave Interface: eth1 (bnx2)
MII Status: down
Link Failure Count: 1
Permanent HW addr: 00:1e:0b:ec:d3:d2
Slave Interface: eth2 (tg3)
MII Status: up
Link Failure Count: 0
Permanent HW addr: 00:1e:0b:92:78:50
author Andy Gospodarek <andy@greyhouse.net>
Mon, 14 Dec 2009 10:48:58 +0000 (10:48 +0000)
committer David S. Miller <davem@davemloft.net>
Mon, 4 Jan 2010 05:17:16 +0000 (21:17 -0800)
commit 1f3c8804acba841b5573b953f5560d2683d2db0d
tree 453fae141b4a37e72ee0513ea1816fbff8f1cf8a
parent 3a999e6eb5d277cd6a321dcda3fc43c3d9e4e4b8
bonding: allow arp_ip_targets on separate vlans to use arp validation
This allows a bond device to specify an arp_ip_target as a host that is
not on the same vlan as the base bond device and still use arp
validation. A configuration like this, now works:
BONDING_OPTS="mode=active-backup arp_interval=1000 arp_ip_target=10.0.100.1 arp_validate=3"
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 16436 qdisc noqueue
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth1: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast master bond0 qlen 1000
link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff
3: eth0: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast master bond0 qlen 1000
link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff
8: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue
link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff
inet6 fe80::213:21ff:febe:33e9/64 scope link
valid_lft forever preferred_lft forever
9: bond0.100@bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue
link/ether 00:13:21:be:33:e9 brd ff:ff:ff:ff:ff:ff
inet 10.0.100.2/24 brd 10.0.100.255 scope global bond0.100
inet6 fe80::213:21ff:febe:33e9/64 scope link
valid_lft forever preferred_lft forever
Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009)
Bonding Mode: fault-tolerance (active-backup)
Primary Slave: None
Currently Active Slave: eth1
MII Status: up
MII Polling Interval (ms): 0
Up Delay (ms): 0
Down Delay (ms): 0
ARP Polling Interval (ms): 1000
ARP IP target/s (n.n.n.n form): 10.0.100.1
Slave Interface: eth1
MII Status: up
Link Failure Count: 1
Permanent HW addr: 00:40:05:30:ff:30
Slave Interface: eth0
MII Status: up
Link Failure Count: 0
Permanent HW addr: 00:13:21:be:33:e9
Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
* Re: [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation
2010-01-06 21:10 ` [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation Eric Dumazet
@ 2010-01-06 21:28 ` Jay Vosburgh
2010-01-06 21:34 ` Eric Dumazet
2010-01-06 21:38 ` David Miller
2010-01-06 22:56 ` [PATCH net-next-2.6] fix " Andy Gospodarek
2 siblings, 1 reply; 36+ messages in thread
From: Jay Vosburgh @ 2010-01-06 21:28 UTC (permalink / raw)
To: Eric Dumazet
Cc: Tom Herbert, David Miller, Linux Netdev List, Andy Gospodarek
Eric Dumazet <eric.dumazet@gmail.com> wrote:
>On 06/01/2010 19:38, Eric Dumazet wrote:
>>
>> (net-next-2.6 doesnt work well on my bond/vlan setup, I suspect I need a bisection)
>
>David, I had to revert 1f3c8804acba841b5573b953f5560d2683d2db0d
>(bonding: allow arp_ip_targets on separate vlans to use arp validation)
>
>Or else, my vlan devices dont work (unfortunatly I dont have much time
>these days to debug the thing)
>
>My config :
>
> +---------+
>vlan.103 -----+ bond0 +--- eth1 (bnx2)
> | +
>vlan.825 -----+ +--- eth2 (tg3)
> +---------+
I'm looking into this right now; I'm seeing what I suspect is
the same thing: the ARP traffic for the probes is processed, and the
bonding slaves are marked up, but any other incoming traffic on the VLAN
is dropped. It might be that just the incoming ARP replies are lost;
I'm not sure yet. Tcpdump clearly shows the traffic from the peer
arriving.
This is the patch we put in last week that worked for Andy, but
not for me. Earlier versions worked fine, so this might be something in
the last version. With Eric now having issues, perhaps this isn't just
my problem. Perhaps there is something in our configurations that
differs from what Andy has.
-J
---
-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
* Re: [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation
2010-01-06 21:28 ` Jay Vosburgh
@ 2010-01-06 21:34 ` Eric Dumazet
0 siblings, 0 replies; 36+ messages in thread
From: Eric Dumazet @ 2010-01-06 21:34 UTC (permalink / raw)
To: Jay Vosburgh
Cc: Tom Herbert, David Miller, Linux Netdev List, Andy Gospodarek
On 06/01/2010 22:28, Jay Vosburgh wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>> On 06/01/2010 19:38, Eric Dumazet wrote:
>>>
>>> (net-next-2.6 doesnt work well on my bond/vlan setup, I suspect I need a bisection)
>>
>> David, I had to revert 1f3c8804acba841b5573b953f5560d2683d2db0d
>> (bonding: allow arp_ip_targets on separate vlans to use arp validation)
>>
>> Or else, my vlan devices dont work (unfortunatly I dont have much time
>> these days to debug the thing)
>>
>> My config :
>>
>> +---------+
>> vlan.103 -----+ bond0 +--- eth1 (bnx2)
>> | +
>> vlan.825 -----+ +--- eth2 (tg3)
>> +---------+
>
> I'm looking into this right now; I'm seeing what I suspect is
> the same thing: the ARP traffic for the probes is processed, and the
> bonding slaves are marked up, but any other incoming traffic on the VLAN
> is dropped. It might be that just the incoming ARP replies are lost;
> I'm not sure yet. Tcpdump clearly shows the traffic from the peer
> arriving.
>
> This is the patch we put in last week that worked for Andy, but
> not for me. Earlier versions worked fine, so this might be something in
> the last version. With Eric now having issues, perhaps this isn't just
> my problem. Perhaps there's some difference in our configurations that
> differs from what Andy has.
>
Before going to sleep, I can confirm ARP traffic was going out and coming in, but the ARP
table entries stay in the incomplete state.
I just had the time to try one single revert (no time for a bisect), and this commit
was an obvious candidate :)
Thanks
* Re: [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation
2010-01-06 21:10 ` [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation Eric Dumazet
2010-01-06 21:28 ` Jay Vosburgh
@ 2010-01-06 21:38 ` David Miller
2010-01-06 21:45 ` Andy Gospodarek
2010-01-06 22:56 ` [PATCH net-next-2.6] fix " Andy Gospodarek
2 siblings, 1 reply; 36+ messages in thread
From: David Miller @ 2010-01-06 21:38 UTC (permalink / raw)
To: eric.dumazet; +Cc: therbert, netdev, fubar, andy
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 06 Jan 2010 22:10:03 +0100
> On 06/01/2010 19:38, Eric Dumazet wrote:
>>
>> (net-next-2.6 doesnt work well on my bond/vlan setup, I suspect I need a bisection)
>
> David, I had to revert 1f3c8804acba841b5573b953f5560d2683d2db0d
> (bonding: allow arp_ip_targets on separate vlans to use arp validation)
>
> Or else, my vlan devices dont work (unfortunatly I dont have much time
> these days to debug the thing)
I bet this is the same issue Jay was running into, and he
ACK'd the patch anyways. :-)
This is why I wanted full testing and feedback before committing this
change.
Unless I see a fix in the next day, I'm reverting it from net-next-2.6.
* Re: [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation
2010-01-06 21:38 ` David Miller
@ 2010-01-06 21:45 ` Andy Gospodarek
0 siblings, 0 replies; 36+ messages in thread
From: Andy Gospodarek @ 2010-01-06 21:45 UTC (permalink / raw)
To: David Miller; +Cc: eric.dumazet, therbert, netdev, fubar
On Wed, Jan 6, 2010 at 4:38 PM, David Miller <davem@davemloft.net> wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 06 Jan 2010 22:10:03 +0100
>
>> On 06/01/2010 19:38, Eric Dumazet wrote:
>>>
>>> (net-next-2.6 doesnt work well on my bond/vlan setup, I suspect I need a bisection)
>>
>> David, I had to revert 1f3c8804acba841b5573b953f5560d2683d2db0d
>> (bonding: allow arp_ip_targets on separate vlans to use arp validation)
>>
>> Or else, my vlan devices dont work (unfortunatly I dont have much time
>> these days to debug the thing)
>
> I bet this is the same issue Jay was running into, and he
> ACK'd the patch anyways. :-)
>
> This is why I wanted full testing and feedback before committing this
> change.
>
> Unless I see a fix in the next day I'm reverting from net-next-2.6
>
>
Seems reasonable. I'm working on it now trying to understand what
might be different about my setup and why I'm not seeing any problems.
* Re: [PATCH v4 1/1] rps: core implementation
2010-01-06 18:38 ` Eric Dumazet
2010-01-06 21:10 ` [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation Eric Dumazet
@ 2010-01-06 22:54 ` Tom Herbert
2010-01-07 9:15 ` Eric Dumazet
1 sibling, 1 reply; 36+ messages in thread
From: Tom Herbert @ 2010-01-06 22:54 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, Linux Netdev List
Eric, thanks again for your good comments. Here is my patch that
addresses them, including:
- Added softnet counter for number of rps softirq triggers
- Force at least one map entry for devices with no napi's
- Replace rcu_read_lock_bh with rtnl_lock when assigning dev_rps_maps
pointer in store_rps_cpus
- Replaced get_cpu_var with __get_cpu_var in enqueue_to_backlog (fix for an
unmatched preempt_disable; see the note after this list)
- Restored calling netif_receive_skb in napi_gro_complete,
napi_skb_finish, and napi_frags_finish. This fixes the problem with
GRO that I had described previously. The patch should now work with
drivers that call napi_gro_receive (verified with e1000e)
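As a side note on the get_cpu_var change, a minimal illustration of the
two forms (illustrative only): get_cpu_var() bumps the preempt count and
must be paired with put_cpu_var(), while __get_cpu_var() does no preempt
accounting, which is fine in enqueue_to_backlog since it already runs
under local_irq_save(). The unpaired get_cpu_var() is presumably what
produced the "entered softirq ... exited with 00000103?" warnings:

	/* Paired form: implies preempt_disable()/preempt_enable() */
	cpu_set(cpu, get_cpu_var(rps_remote_softirq_cpus));
	put_cpu_var(rps_remote_softirq_cpus);

	/* Raw form: no preempt accounting; caller must already be pinned */
	cpu_set(cpu, __get_cpu_var(rps_remote_softirq_cpus));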
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97873e3..078cb55 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -222,6 +222,7 @@ struct netif_rx_stats {
unsigned dropped;
unsigned time_squeeze;
unsigned cpu_collision;
+ unsigned received_rps;
};
DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
@@ -676,6 +677,29 @@ struct net_device_ops {
};
/*
+ * Structure for Receive Packet Steering. Length of map and array of CPU ID's.
+ */
+struct rps_map {
+ int len;
+ u16 map[0];
+};
+
+/*
+ * Structure that contains the rps maps for various NAPI instances of a device.
+ */
+struct dev_rps_maps {
+ int num_maps;
+ struct rcu_head rcu;
+ struct rps_map maps[0];
+};
+
+/* Bound number of CPUs that can be in an rps map */
+#define MAX_RPS_CPUS (num_possible_cpus() < 256 ? num_possible_cpus() : 256)
+
+/* Maximum size of RPS map (for allocation) */
+#define RPS_MAP_SIZE (sizeof(struct rps_map) + (MAX_RPS_CPUS * sizeof(u16)))
+
+/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
@@ -861,6 +885,9 @@ struct net_device {
struct netdev_queue rx_queue;
+ struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
+ receive packet steering */
+
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
@@ -1274,10 +1301,12 @@ static inline int unregister_gifconf(unsigned int family)
*/
struct softnet_data {
struct Qdisc *output_queue;
- struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
struct sk_buff *completion_queue;
+ /* Elements below can be accessed between CPUs for RPS */
+ struct call_single_data csd ____cacheline_aligned_in_smp;
+ struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 63f4742..f188301 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
* @mac_header: Link layer header
* @_skb_dst: destination entry
* @sp: the security path, used for xfrm
+ * @rxhash: the packet hash computed on receive
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @len: Length of actual data
* @data_len: Data length
@@ -323,6 +324,8 @@ struct sk_buff {
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
+ __u32 rxhash;
+
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
diff --git a/net/core/dev.c b/net/core/dev.c
index 9977288..732efe3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1834,7 +1834,7 @@ out_kfree_skb:
return rc;
}
-static u32 skb_tx_hashrnd;
+static u32 hashrnd __read_mostly;
u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
@@ -1852,7 +1852,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
else
hash = skb->protocol;
- hash = jhash_1word(hash, skb_tx_hashrnd);
+ hash = jhash_1word(hash, hashrnd);
return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
@@ -2073,6 +2073,149 @@ int weight_p __read_mostly = 64; /* old backlog weight */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving NAPI instance for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2, ports;
+ struct ipv6hdr *ip6;
+ struct iphdr *ip;
+ u32 ihl;
+ u8 ip_proto;
+ int cpu = -1;
+ struct dev_rps_maps *drmap;
+ struct rps_map *map = NULL;
+ u16 index;
+
+ rcu_read_lock();
+
+ drmap = rcu_dereference(dev->dev_rps_maps);
+ if (!drmap)
+ goto done;
+
+ index = skb_get_rx_queue(skb);
+ if (index >= drmap->num_maps)
+ index = 0;
+
+ map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ if (!map->len)
+ goto done;
+
+ if (skb->rxhash)
+ goto got_hash; /* Skip hash computation on packet header */
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(*ip)))
+ goto done;
+
+ ip = (struct iphdr *) skb->data;
+ ip_proto = ip->protocol;
+ addr1 = ip->saddr;
+ addr2 = ip->daddr;
+ ihl = ip->ihl;
+ break;
+ case __constant_htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(*ip6)))
+ goto done;
+
+ ip6 = (struct ipv6hdr *) skb->data;
+ ip_proto = ip6->nexthdr;
+ addr1 = ip6->saddr.s6_addr32[3];
+ addr2 = ip6->daddr.s6_addr32[3];
+ ihl = (40 >> 2);
+ break;
+ default:
+ goto done;
+ }
+ ports = 0;
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ if (pskb_may_pull(skb, (ihl * 4) + 4))
+ ports = *((u32 *) (skb->data + (ihl * 4)));
+ break;
+
+ default:
+ break;
+ }
+
+ skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+ if (!skb->rxhash)
+ skb->rxhash = 1;
+
+got_hash:
+ cpu = map->map[((u64) skb->rxhash * map->len) >> 32];
+
+ if (!cpu_online(cpu))
+ cpu = -1;
+done:
+ rcu_read_unlock();
+ return cpu;
+}
+
+static DEFINE_PER_CPU(cpumask_t, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+ struct softnet_data *queue = data;
+ __napi_schedule(&queue->backlog);
+ __get_cpu_var(netdev_rx_stat).received_rps++;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+ struct softnet_data *queue;
+ unsigned long flags;
+
+ queue = &per_cpu(softnet_data, cpu);
+
+ local_irq_save(flags);
+ __get_cpu_var(netdev_rx_stat).total++;
+
+ spin_lock(&queue->input_pkt_queue.lock);
+ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+ if (queue->input_pkt_queue.qlen) {
+enqueue:
+ __skb_queue_tail(&queue->input_pkt_queue, skb);
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+ flags);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Schedule NAPI for backlog device */
+ if (napi_schedule_prep(&queue->backlog)) {
+ if (cpu != smp_processor_id()) {
+ cpu_set(cpu,
+ __get_cpu_var(rps_remote_softirq_cpus));
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ } else
+ __napi_schedule(&queue->backlog);
+ }
+ goto enqueue;
+ }
+
+ spin_unlock(&queue->input_pkt_queue.lock);
+
+ __get_cpu_var(netdev_rx_stat).dropped++;
+ local_irq_restore(flags);
+
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
/**
* netif_rx - post buffer to the network code
@@ -2091,8 +2234,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
int netif_rx(struct sk_buff *skb)
{
- struct softnet_data *queue;
- unsigned long flags;
+ int cpu;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
@@ -2101,31 +2243,12 @@ int netif_rx(struct sk_buff *skb)
if (!skb->tstamp.tv64)
net_timestamp(skb);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
- local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
-
- __get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
-enqueue:
- __skb_queue_tail(&queue->input_pkt_queue, skb);
- local_irq_restore(flags);
- return NET_RX_SUCCESS;
- }
-
- napi_schedule(&queue->backlog);
- goto enqueue;
- }
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
+ cpu = get_rps_cpu(skb->dev, skb);
+ if (cpu < 0)
+ cpu = smp_processor_id();
- kfree_skb(skb);
- return NET_RX_DROP;
+ return enqueue_to_backlog(skb, cpu);
}
EXPORT_SYMBOL(netif_rx);
@@ -2363,10 +2486,10 @@ void netif_nit_deliver(struct sk_buff *skb)
}
/**
- * netif_receive_skb - process receive buffer from network
+ * __netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
- * netif_receive_skb() is the main receive data processing function.
+ * __netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
@@ -2377,7 +2500,8 @@ void netif_nit_deliver(struct sk_buff *skb)
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb(struct sk_buff *skb)
+
+int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
@@ -2475,6 +2599,16 @@ out:
}
EXPORT_SYMBOL(netif_receive_skb);
+int netif_receive_skb(struct sk_buff *skb)
+{
+ int cpu = get_rps_cpu(skb->dev, skb);
+
+ if (cpu < 0)
+ return __netif_receive_skb(skb);
+ else
+ return enqueue_to_backlog(skb, cpu);
+}
+
/* Network device is going away, flush any packets still pending */
static void flush_backlog(void *arg)
{
@@ -2799,16 +2933,16 @@ static int process_backlog(struct napi_struct *napi, int quota)
do {
struct sk_buff *skb;
- local_irq_disable();
+ spin_lock_irq(&queue->input_pkt_queue.lock);
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
__napi_complete(napi);
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
break;
}
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
- netif_receive_skb(skb);
+ __netif_receive_skb(skb);
} while (++work < quota && jiffies == start_time);
return work;
@@ -2897,6 +3031,21 @@ void netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(netif_napi_del);
+/*
+ * net_rps_action sends any pending IPI's for rps. This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(void)
+{
+ int cpu;
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+ struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+ cpu_clear(cpu, __get_cpu_var(rps_remote_softirq_cpus));
+ __smp_call_function_single(cpu, &queue->csd, 0);
+ }
+}
static void net_rx_action(struct softirq_action *h)
{
@@ -2968,6 +3117,8 @@ static void net_rx_action(struct softirq_action *h)
out:
local_irq_enable();
+ net_rps_action();
+
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
@@ -3212,10 +3363,10 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
{
struct netif_rx_stats *s = v;
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
s->total, s->dropped, s->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
- s->cpu_collision);
+ s->cpu_collision, s->received_rps);
return 0;
}
@@ -5341,6 +5492,8 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
+ kfree(dev->dev_rps_maps);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
@@ -5793,6 +5946,10 @@ static int __init net_dev_init(void)
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
+ queue->csd.func = trigger_softirq;
+ queue->csd.info = queue;
+ queue->csd.flags = 0;
+
queue->backlog.poll = process_backlog;
queue->backlog.weight = weight_p;
queue->backlog.gro_list = NULL;
@@ -5831,7 +5988,7 @@ subsys_initcall(net_dev_init);
static int __init initialize_hashrnd(void)
{
- get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+ get_random_bytes(&hashrnd, sizeof(hashrnd));
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 157645c..4bf4851 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -18,6 +18,9 @@
#include <linux/wireless.h>
#include <net/wext.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+
#include "net-sysfs.h"
#ifdef CONFIG_SYSFS
@@ -253,6 +256,134 @@ static ssize_t store_tx_queue_len(struct device *dev,
return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
+static char *get_token(const char **cp, size_t *len)
+{
+ const char *bp = *cp;
+ char *start;
+
+ while (isspace(*bp))
+ bp++;
+
+ start = (char *)bp;
+ while (!isspace(*bp) && *bp != '\0')
+ bp++;
+
+ if (start != bp)
+ *len = bp - start;
+ else
+ start = NULL;
+
+ *cp = bp;
+ return start;
+}
+
+static void dev_map_release(struct rcu_head *rcu)
+{
+ struct dev_rps_maps *drmap =
+ container_of(rcu, struct dev_rps_maps, rcu);
+
+ kfree(drmap);
+}
+
+static ssize_t store_rps_cpus(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct net_device *net = to_net_dev(dev);
+ struct napi_struct *napi;
+ cpumask_t mask;
+ int err, cpu, index, i;
+ int cnt = 0;
+ char *token;
+ const char *cp = buf;
+ size_t tlen;
+ struct dev_rps_maps *drmap, *old_drmap;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ cnt = 0;
+ list_for_each_entry(napi, &net->napi_list, dev_list)
+ cnt++;
+ if (cnt == 0)
+ cnt = 1; /* For devices with no napi instances */
+
+ drmap = kzalloc(sizeof(struct dev_rps_maps) +
+ RPS_MAP_SIZE * cnt, GFP_KERNEL);
+ if (!drmap)
+ return -ENOMEM;
+
+ drmap->num_maps = cnt;
+
+ cp = buf;
+ for (index = 0; index < cnt &&
+ (token = get_token(&cp, &tlen)); index++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ err = bitmap_parse(token, tlen, cpumask_bits(&mask),
+ nr_cpumask_bits);
+
+ if (err) {
+ kfree(drmap);
+ return err;
+ }
+
+ cpus_and(mask, mask, cpu_online_map);
+ i = 0;
+ for_each_cpu_mask(cpu, mask) {
+ if (i >= MAX_RPS_CPUS)
+ break;
+ map->map[i++] = cpu;
+ }
+ map->len = i;
+ }
+
+ rtnl_lock();
+ old_drmap = rcu_dereference(net->dev_rps_maps);
+ rcu_assign_pointer(net->dev_rps_maps, drmap);
+ rtnl_unlock();
+
+ if (old_drmap)
+ call_rcu(&old_drmap->rcu, dev_map_release);
+
+ return len;
+}
+
+static ssize_t show_rps_cpus(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *net = to_net_dev(dev);
+ size_t len = 0;
+ cpumask_t mask;
+ int i, j;
+ struct dev_rps_maps *drmap;
+
+ rcu_read_lock_bh();
+ drmap = rcu_dereference(net->dev_rps_maps);
+
+ if (drmap) {
+ for (j = 0; j < drmap->num_maps; j++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * j));
+ cpus_clear(mask);
+ for (i = 0; i < map->len; i++)
+ cpu_set(map->map[i], mask);
+
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
+ if (PAGE_SIZE - len < 3) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ if (j < drmap->num_maps)
+ len += sprintf(buf + len, " ");
+ }
+ }
+
+ rcu_read_unlock_bh();
+
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -309,6 +440,7 @@ static struct device_attribute net_class_attributes[] = {
__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
store_tx_queue_len),
+ __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_cpus, store_rps_cpus),
{}
};
* [PATCH net-next-2.6] fix bonding: allow arp_ip_targets on separate vlans to use arp validation
2010-01-06 21:10 ` [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation Eric Dumazet
2010-01-06 21:28 ` Jay Vosburgh
2010-01-06 21:38 ` David Miller
@ 2010-01-06 22:56 ` Andy Gospodarek
2010-01-06 23:53 ` Jay Vosburgh
2 siblings, 1 reply; 36+ messages in thread
From: Andy Gospodarek @ 2010-01-06 22:56 UTC (permalink / raw)
To: Eric Dumazet
Cc: Tom Herbert, David Miller, Linux Netdev List, Jay Vosburgh,
Andy Gospodarek
On Wed, Jan 06, 2010 at 10:10:03PM +0100, Eric Dumazet wrote:
> Le 06/01/2010 19:38, Eric Dumazet a écrit :
> >
> > (net-next-2.6 doesn't work well on my bond/vlan setup, I suspect I need a bisection)
>
> David, I had to revert 1f3c8804acba841b5573b953f5560d2683d2db0d
> (bonding: allow arp_ip_targets on separate vlans to use arp validation)
>
> Or else, my vlan devices don't work (unfortunately I don't have much time
> these days to debug the thing)
>
> My config :
>
> +---------+
> vlan.103 -----+ bond0 +--- eth1 (bnx2)
> | +
> vlan.825 -----+ +--- eth2 (tg3)
> +---------+
>
> $ cat /proc/net/bonding/bond0
> Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009)
>
> Bonding Mode: fault-tolerance (active-backup)
> Primary Slave: None
> Currently Active Slave: eth2
> MII Status: up
> MII Polling Interval (ms): 100
> Up Delay (ms): 0
> Down Delay (ms): 0
>
> Slave Interface: eth1 (bnx2)
> MII Status: down
> Link Failure Count: 1
> Permanent HW addr: 00:1e:0b:ec:d3:d2
>
> Slave Interface: eth2 (tg3)
> MII Status: up
> Link Failure Count: 0
> Permanent HW addr: 00:1e:0b:92:78:50
>
This patch fixes a problem found with commit
1f3c8804acba841b5573b953f5560d2683d2db0d. The original change
overloaded null_or_orig, but doing that prevented any packet handlers
that were not tied to a specific device (i.e. ptype->dev == NULL) from
ever receiving any frames.
The null_or_orig variable cannot be overloaded, and must be kept as NULL
to prevent the frame from being ignored by packet handlers designed to
accept frames on any interface.
Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
---
dev.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index f9aa699..d9ab9be 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2430,6 +2430,7 @@ int netif_receive_skb(struct sk_buff *skb)
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
struct net_device *null_or_orig;
+ struct net_device *null_or_bond;
int ret = NET_RX_DROP;
__be16 type;
@@ -2500,21 +2501,19 @@ ncls:
* bonding interfaces still make their way to any base bonding
* device that may have registered for a specific ptype. The
* handler may have to adjust skb->dev and orig_dev.
- *
- * null_or_orig can be overloaded since it will not be set when
- * using VLANs on top of bonding. Putting it here prevents
- * disturbing the ptype_all handlers above.
*/
+ null_or_bond = NULL;
if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
(vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
- null_or_orig = vlan_dev_real_dev(skb->dev);
+ null_or_bond = vlan_dev_real_dev(skb->dev);
}
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type && (ptype->dev == null_or_orig ||
- ptype->dev == skb->dev || ptype->dev == orig_dev)) {
+ ptype->dev == skb->dev || ptype->dev == orig_dev ||
+ ptype->dev == null_or_bond)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH net-next-2.6] fix bonding: allow arp_ip_targets on separate vlans to use arp validation
2010-01-06 22:56 ` [PATCH net-next-2.6] fix " Andy Gospodarek
@ 2010-01-06 23:53 ` Jay Vosburgh
2010-01-07 8:37 ` Eric Dumazet
0 siblings, 1 reply; 36+ messages in thread
From: Jay Vosburgh @ 2010-01-06 23:53 UTC (permalink / raw)
To: Andy Gospodarek
Cc: Eric Dumazet, Tom Herbert, David Miller, Linux Netdev List
Andy Gospodarek <andy@greyhouse.net> wrote:
>This patch fixes a problem found with commit
>1f3c8804acba841b5573b953f5560d2683d2db0d. The original change
>overloaded null_or_orig, but doing that prevented any packet handlers
>that were not tied to a specific device (i.e. ptype->dev == NULL) from
>ever receiving any frames.
>
>The null_or_orig variable cannot be overloaded, and must be kept as NULL
>to prevent the frame from being ignored by packet handlers designed to
>accept frames on any interface.
>
>Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
Tested, and it appears to work fine. Tcpdump also appears to
behave rationally.
-J
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
>---
> dev.c | 11 +++++------
> 1 file changed, 5 insertions(+), 6 deletions(-)
>
>diff --git a/net/core/dev.c b/net/core/dev.c
>index f9aa699..d9ab9be 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -2430,6 +2430,7 @@ int netif_receive_skb(struct sk_buff *skb)
> struct packet_type *ptype, *pt_prev;
> struct net_device *orig_dev;
> struct net_device *null_or_orig;
>+ struct net_device *null_or_bond;
> int ret = NET_RX_DROP;
> __be16 type;
>
>@@ -2500,21 +2501,19 @@ ncls:
> * bonding interfaces still make their way to any base bonding
> * device that may have registered for a specific ptype. The
> * handler may have to adjust skb->dev and orig_dev.
>- *
>- * null_or_orig can be overloaded since it will not be set when
>- * using VLANs on top of bonding. Putting it here prevents
>- * disturbing the ptype_all handlers above.
> */
>+ null_or_bond = NULL;
> if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
> (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
>- null_or_orig = vlan_dev_real_dev(skb->dev);
>+ null_or_bond = vlan_dev_real_dev(skb->dev);
> }
>
> type = skb->protocol;
> list_for_each_entry_rcu(ptype,
> &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
> if (ptype->type == type && (ptype->dev == null_or_orig ||
>- ptype->dev == skb->dev || ptype->dev == orig_dev)) {
>+ ptype->dev == skb->dev || ptype->dev == orig_dev ||
>+ ptype->dev == null_or_bond)) {
> if (pt_prev)
> ret = deliver_skb(skb, pt_prev, orig_dev);
> pt_prev = ptype;
>--
>To unsubscribe from this list: send the line "unsubscribe netdev" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next-2.6] fix bonding: allow arp_ip_targets on separate vlans to use arp validation
2010-01-06 23:53 ` Jay Vosburgh
@ 2010-01-07 8:37 ` Eric Dumazet
2010-01-07 8:41 ` David Miller
0 siblings, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2010-01-07 8:37 UTC (permalink / raw)
To: Jay Vosburgh
Cc: Andy Gospodarek, Tom Herbert, David Miller, Linux Netdev List
Le 07/01/2010 00:53, Jay Vosburgh a écrit :
> Andy Gospodarek <andy@greyhouse.net> wrote:
>
>> This patch fixes a problem found with commit
>> 1f3c8804acba841b5573b953f5560d2683d2db0d. The original change
>> overloaded null_or_orig, but doing that prevented any packet handlers
>> that were not tied to a specific device (i.e. ptype->dev == NULL) from
>> ever receiving any frames.
>>
>> The null_or_orig variable cannot be overloaded, and must be kept as NULL
>> to prevent the frame from being ignored by packet handlers designed to
>> accept frames on any interface.
>>
>> Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
>
> Tested, and it appears to work fine. Tcpdump also appears to
> behave rationally.
>
> -J
>
> Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Tested as well on my machine, I confirm this fixes the problem I had.
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Thanks guys !
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH net-next-2.6] fix bonding: allow arp_ip_targets on separate vlans to use arp validation
2010-01-07 8:37 ` Eric Dumazet
@ 2010-01-07 8:41 ` David Miller
0 siblings, 0 replies; 36+ messages in thread
From: David Miller @ 2010-01-07 8:41 UTC (permalink / raw)
To: eric.dumazet; +Cc: fubar, andy, therbert, netdev
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 07 Jan 2010 09:37:58 +0100
> Le 07/01/2010 00:53, Jay Vosburgh a écrit :
>> Andy Gospodarek <andy@greyhouse.net> wrote:
>>
>>> This patch fixes a problem found with commit
>>> 1f3c8804acba841b5573b953f5560d2683d2db0d. The original change
>>> overloaded null_or_orig, but doing that prevented any packet handlers
>>> that were not tied to a specific device (i.e. ptype->dev == NULL) from
>>> ever receiving any frames.
>>>
>>> The null_or_orig variable cannot be overloaded, and must be kept as NULL
>>> to prevent the frame from being ignored by packet handlers designed to
>>> accept frames on any interface.
>>>
>>> Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
>>
>> Tested, and it appears to work fine. Tcpdump also appears to
>> behave rationally.
>>
>> -J
>>
>> Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
>
> Tested as well on my machine, I confirm this fixes the problem I had.
>
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Thanks, I'll add this to net-next-2.6 to fix the problem.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v4 1/1] rps: core implementation
2010-01-06 22:54 ` [PATCH v4 1/1] rps: core implementation Tom Herbert
@ 2010-01-07 9:15 ` Eric Dumazet
2010-01-07 17:42 ` rps: some comments Eric Dumazet
2010-01-11 6:25 ` [PATCH v4 1/1] rps: core implementation Tom Herbert
0 siblings, 2 replies; 36+ messages in thread
From: Eric Dumazet @ 2010-01-07 9:15 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List
Le 06/01/2010 23:54, Tom Herbert a écrit :
> Eric, thanks again for your good comments. Here is my patch that
> addresses them, including:
>
> - Added softnet counter for number of rps softirq triggers
> - Force at least one map entry for devices with no napi's
> - Replace rcu_read_lock_bh with rtnl_lock when assigning dev_rps_maps
> pointer in store_rps_cpus
> - Replaced get_cpu_var with __get_cpu_var in enqueue_to_backlog (fix
> unmatched preempt_disable)
Ah good :)
> - Restored calling napi_receive_skb in napi_gro_complete,
> napi_skb_finish, and napi_frags_finish. This fixes the problem with
> GRO that I had described previously. Patch should now work with
> drivers that call napi_gro_receive (verified with e1000e)
Seems your v4/v5 patches are mangled by your mailer, I had to apply them manually...
>
> /* Number of TX queues allocated at alloc_netdev_mq() time */
> @@ -1274,10 +1301,12 @@ static inline int unregister_gifconf(unsigned
> int family)
(line wrap above, and some others later...)
> @@ -2091,8 +2234,7 @@ DEFINE_PER_CPU(struct netif_rx_stats,
> netdev_rx_stat) = { 0, };
>
> /**
> - * netif_receive_skb - process receive buffer from network
> + * __netif_receive_skb - process receive buffer from network
> * @skb: buffer to process
> *
> - * netif_receive_skb() is the main receive data processing function.
> + * __netif__napireceive_skb() is the main receive data processing function.
Please remove '_napi' from __netif__napireceive_skb(), this is a leftover ...
+ rtnl_lock();
+ old_drmap = rcu_dereference(net->dev_rps_maps);
+ rcu_assign_pointer(net->dev_rps_maps, drmap);
+ rtnl_unlock();
You don't need the rcu_dereference() ->
+ rtnl_lock();
+ old_drmap = net->dev_rps_maps;
+ rcu_assign_pointer(net->dev_rps_maps, drmap);
+ rtnl_unlock();
I wonder if a small spinlock would be better than rtnl here (rtnl is so overloaded these days... :) )
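Something along these lines, just as a sketch (the lock name is invented for
the example, it is not part of the posted patch); readers keep using RCU
exactly as they do now, only the writer side changes:

	/* hypothetical lock, serializes writers of dev->dev_rps_maps only */
	static DEFINE_SPINLOCK(rps_map_lock);

	spin_lock(&rps_map_lock);
	old_drmap = net->dev_rps_maps;	/* writers serialized, plain load is enough */
	rcu_assign_pointer(net->dev_rps_maps, drmap);
	spin_unlock(&rps_map_lock);

	if (old_drmap)
		call_rcu(&old_drmap->rcu, dev_map_release);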
In show_rps_cpus(), I don't believe you need to disable BH.
rcu_read_lock_bh() -> rcu_read_lock()
Patch works very well on my machine (original soft irqs handled by CPU 0, and RPS
distributes packets to eight cpus). This is an RTP server (many UDP messages on many sockets)
# grep eth /proc/interrupts ; cat /proc/net/softnet_stat
34: 589363 0 0 0 0 0 0 0 PCI-MSI-edge eth0
35: 63 0 0 0 0 0 0 0 PCI-MSI-edge eth1
36: 1267129 0 0 0 0 0 0 0 PCI-MSI-edge eth2
001ceff8 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 0000000e
0001eeee 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 0001ed70
0002ab18 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 0002a768
00041cb7 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 000415d1
0003d79b 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 0003d459
00031ea5 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00031c36
0003705f 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00036e5b
00026010 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00025d94
# grep . ` find /sys -name rps_cpus`
/sys/class/net/eth0/rps_cpus:ff ff ff ff ff ff ff ff 00
/sys/class/net/eth1/rps_cpus:ff ff ff ff ff ff ff ff 00
/sys/class/net/bond0/rps_cpus:ff
/sys/class/net/eth2/rps_cpus:ff
/sys/class/net/eth3/rps_cpus:ff
/sys/class/net/vlan.103/rps_cpus:ff
/sys/class/net/vlan.825/rps_cpus:ff
If somebody wants to play with RPS, here is the patch I use on top of net-next-2.6
(plus last patch from Andy Gospodarek)
Many thanks Tom !
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a3fccc8..6d79458 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -222,6 +222,7 @@ struct netif_rx_stats {
unsigned dropped;
unsigned time_squeeze;
unsigned cpu_collision;
+ unsigned received_rps;
};
DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
@@ -676,6 +677,29 @@ struct net_device_ops {
};
/*
+ * Structure for Receive Packet Steering. Length of map and array of CPU ID's.
+ */
+struct rps_map {
+ int len;
+ u16 map[0];
+};
+
+/*
+ * Structure that contains the rps maps for various NAPI instances of a device.
+ */
+struct dev_rps_maps {
+ int num_maps;
+ struct rcu_head rcu;
+ struct rps_map maps[0];
+};
+
+/* Bound number of CPUs that can be in an rps map */
+#define MAX_RPS_CPUS (num_possible_cpus() < 256 ? num_possible_cpus() : 256)
+
+/* Maximum size of RPS map (for allocation) */
+#define RPS_MAP_SIZE (sizeof(struct rps_map) + (MAX_RPS_CPUS * sizeof(u16)))
+
+/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
@@ -861,6 +885,9 @@ struct net_device {
struct netdev_queue rx_queue;
+ struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
+ receive packet steering */
+
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
@@ -1276,10 +1303,12 @@ static inline int unregister_gifconf(unsigned int family)
*/
struct softnet_data {
struct Qdisc *output_queue;
- struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
struct sk_buff *completion_queue;
+ /* Elements below can be accessed between CPUs for RPS */
+ struct call_single_data csd ____cacheline_aligned_in_smp;
+ struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ae836fd..8ed3f66 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
* @mac_header: Link layer header
* @_skb_dst: destination entry
* @sp: the security path, used for xfrm
+ * @rxhash: the packet hash computed on receive
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @len: Length of actual data
* @data_len: Data length
@@ -323,6 +324,8 @@ struct sk_buff {
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
+ __u32 rxhash;
+
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
diff --git a/net/core/dev.c b/net/core/dev.c
index d9ab9be..6260fd8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1882,7 +1882,7 @@ out_kfree_skb:
return rc;
}
-static u32 skb_tx_hashrnd;
+static u32 hashrnd __read_mostly;
u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
@@ -1900,7 +1900,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
else
hash = skb->protocol;
- hash = jhash_1word(hash, skb_tx_hashrnd);
+ hash = jhash_1word(hash, hashrnd);
return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
@@ -2121,6 +2121,149 @@ int weight_p __read_mostly = 64; /* old backlog weight */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving NAPI instance for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2, ports;
+ struct ipv6hdr *ip6;
+ struct iphdr *ip;
+ u32 ihl;
+ u8 ip_proto;
+ int cpu = -1;
+ struct dev_rps_maps *drmap;
+ struct rps_map *map = NULL;
+ u16 index;
+
+ rcu_read_lock();
+
+ drmap = rcu_dereference(dev->dev_rps_maps);
+ if (!drmap)
+ goto done;
+
+ index = skb_get_rx_queue(skb);
+ if (index >= drmap->num_maps)
+ index = 0;
+
+ map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ if (!map->len)
+ goto done;
+
+ if (skb->rxhash)
+ goto got_hash; /* Skip hash computation on packet header */
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(*ip)))
+ goto done;
+
+ ip = (struct iphdr *) skb->data;
+ ip_proto = ip->protocol;
+ addr1 = ip->saddr;
+ addr2 = ip->daddr;
+ ihl = ip->ihl;
+ break;
+ case __constant_htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(*ip6)))
+ goto done;
+
+ ip6 = (struct ipv6hdr *) skb->data;
+ ip_proto = ip6->nexthdr;
+ addr1 = ip6->saddr.s6_addr32[3];
+ addr2 = ip6->daddr.s6_addr32[3];
+ ihl = (40 >> 2);
+ break;
+ default:
+ goto done;
+ }
+ ports = 0;
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ if (pskb_may_pull(skb, (ihl * 4) + 4))
+ ports = *((u32 *) (skb->data + (ihl * 4)));
+ break;
+
+ default:
+ break;
+ }
+
+ skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+ if (!skb->rxhash)
+ skb->rxhash = 1;
+
+got_hash:
+ cpu = map->map[((u64) skb->rxhash * map->len) >> 32];
+
+ if (!cpu_online(cpu))
+ cpu = -1;
+done:
+ rcu_read_unlock();
+ return cpu;
+}
+
+static DEFINE_PER_CPU(cpumask_t, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+ struct softnet_data *queue = data;
+ __napi_schedule(&queue->backlog);
+ __get_cpu_var(netdev_rx_stat).received_rps++;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+ struct softnet_data *queue;
+ unsigned long flags;
+
+ queue = &per_cpu(softnet_data, cpu);
+
+ local_irq_save(flags);
+ __get_cpu_var(netdev_rx_stat).total++;
+
+ spin_lock(&queue->input_pkt_queue.lock);
+ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+ if (queue->input_pkt_queue.qlen) {
+enqueue:
+ __skb_queue_tail(&queue->input_pkt_queue, skb);
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+ flags);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Schedule NAPI for backlog device */
+ if (napi_schedule_prep(&queue->backlog)) {
+ if (cpu != smp_processor_id()) {
+ cpu_set(cpu,
+ __get_cpu_var(rps_remote_softirq_cpus));
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ } else
+ __napi_schedule(&queue->backlog);
+ }
+ goto enqueue;
+ }
+
+ spin_unlock(&queue->input_pkt_queue.lock);
+
+ __get_cpu_var(netdev_rx_stat).dropped++;
+ local_irq_restore(flags);
+
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
/**
* netif_rx - post buffer to the network code
@@ -2139,8 +2282,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
int netif_rx(struct sk_buff *skb)
{
- struct softnet_data *queue;
- unsigned long flags;
+ int cpu;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
@@ -2149,31 +2291,12 @@ int netif_rx(struct sk_buff *skb)
if (!skb->tstamp.tv64)
net_timestamp(skb);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
- local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
-
- __get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
-enqueue:
- __skb_queue_tail(&queue->input_pkt_queue, skb);
- local_irq_restore(flags);
- return NET_RX_SUCCESS;
- }
-
- napi_schedule(&queue->backlog);
- goto enqueue;
- }
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
+ cpu = get_rps_cpu(skb->dev, skb);
+ if (cpu < 0)
+ cpu = smp_processor_id();
- kfree_skb(skb);
- return NET_RX_DROP;
+ return enqueue_to_backlog(skb, cpu);
}
EXPORT_SYMBOL(netif_rx);
@@ -2411,10 +2534,10 @@ void netif_nit_deliver(struct sk_buff *skb)
}
/**
- * netif_receive_skb - process receive buffer from network
+ * __netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
- * netif_receive_skb() is the main receive data processing function.
+ * __netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
@@ -2425,7 +2548,8 @@ void netif_nit_deliver(struct sk_buff *skb)
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb(struct sk_buff *skb)
+
+int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
@@ -2536,6 +2660,16 @@ out:
}
EXPORT_SYMBOL(netif_receive_skb);
+int netif_receive_skb(struct sk_buff *skb)
+{
+ int cpu = get_rps_cpu(skb->dev, skb);
+
+ if (cpu < 0)
+ return __netif_receive_skb(skb);
+ else
+ return enqueue_to_backlog(skb, cpu);
+}
+
/* Network device is going away, flush any packets still pending */
static void flush_backlog(void *arg)
{
@@ -2861,16 +2995,16 @@ static int process_backlog(struct napi_struct *napi, int quota)
do {
struct sk_buff *skb;
- local_irq_disable();
+ spin_lock_irq(&queue->input_pkt_queue.lock);
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
__napi_complete(napi);
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
break;
}
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
- netif_receive_skb(skb);
+ __netif_receive_skb(skb);
} while (++work < quota && jiffies == start_time);
return work;
@@ -2959,6 +3093,21 @@ void netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(netif_napi_del);
+/*
+ * net_rps_action sends any pending IPI's for rps. This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(void)
+{
+ int cpu;
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+ struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+ cpu_clear(cpu, __get_cpu_var(rps_remote_softirq_cpus));
+ __smp_call_function_single(cpu, &queue->csd, 0);
+ }
+}
static void net_rx_action(struct softirq_action *h)
{
@@ -3030,6 +3179,8 @@ static void net_rx_action(struct softirq_action *h)
out:
local_irq_enable();
+ net_rps_action();
+
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
@@ -3274,10 +3425,10 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
{
struct netif_rx_stats *s = v;
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
s->total, s->dropped, s->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
- s->cpu_collision);
+ s->cpu_collision, s->received_rps);
return 0;
}
@@ -5424,6 +5575,8 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
+ kfree(dev->dev_rps_maps);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
@@ -5898,6 +6051,10 @@ static int __init net_dev_init(void)
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
+ queue->csd.func = trigger_softirq;
+ queue->csd.info = queue;
+ queue->csd.flags = 0;
+
queue->backlog.poll = process_backlog;
queue->backlog.weight = weight_p;
queue->backlog.gro_list = NULL;
@@ -5936,7 +6093,7 @@ subsys_initcall(net_dev_init);
static int __init initialize_hashrnd(void)
{
- get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+ get_random_bytes(&hashrnd, sizeof(hashrnd));
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index fbc1c74..a7e4db3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -18,6 +18,9 @@
#include <linux/wireless.h>
#include <net/wext.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+
#include "net-sysfs.h"
#ifdef CONFIG_SYSFS
@@ -253,6 +256,134 @@ static ssize_t store_tx_queue_len(struct device *dev,
return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
+static char *get_token(const char **cp, size_t *len)
+{
+ const char *bp = *cp;
+ char *start;
+
+ while (isspace(*bp))
+ bp++;
+
+ start = (char *)bp;
+ while (!isspace(*bp) && *bp != '\0')
+ bp++;
+
+ if (start != bp)
+ *len = bp - start;
+ else
+ start = NULL;
+
+ *cp = bp;
+ return start;
+}
+
+static void dev_map_release(struct rcu_head *rcu)
+{
+ struct dev_rps_maps *drmap =
+ container_of(rcu, struct dev_rps_maps, rcu);
+
+ kfree(drmap);
+}
+
+static ssize_t store_rps_cpus(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct net_device *net = to_net_dev(dev);
+ struct napi_struct *napi;
+ cpumask_t mask;
+ int err, cpu, index, i;
+ int cnt = 0;
+ char *token;
+ const char *cp = buf;
+ size_t tlen;
+ struct dev_rps_maps *drmap, *old_drmap;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ cnt = 0;
+ list_for_each_entry(napi, &net->napi_list, dev_list)
+ cnt++;
+ if (cnt == 0)
+ cnt = 1; /* For devices with no napi instances */
+
+ drmap = kzalloc(sizeof(struct dev_rps_maps) +
+ RPS_MAP_SIZE * cnt, GFP_KERNEL);
+ if (!drmap)
+ return -ENOMEM;
+
+ drmap->num_maps = cnt;
+
+ cp = buf;
+ for (index = 0; index < cnt &&
+ (token = get_token(&cp, &tlen)); index++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * index));
+ err = bitmap_parse(token, tlen, cpumask_bits(&mask),
+ nr_cpumask_bits);
+
+ if (err) {
+ kfree(drmap);
+ return err;
+ }
+
+ cpus_and(mask, mask, cpu_online_map);
+ i = 0;
+ for_each_cpu_mask(cpu, mask) {
+ if (i >= MAX_RPS_CPUS)
+ break;
+ map->map[i++] = cpu;
+ }
+ map->len = i;
+ }
+
+ rtnl_lock();
+ old_drmap = net->dev_rps_maps;
+ rcu_assign_pointer(net->dev_rps_maps, drmap);
+ rtnl_unlock();
+
+ if (old_drmap)
+ call_rcu(&old_drmap->rcu, dev_map_release);
+
+ return len;
+}
+
+static ssize_t show_rps_cpus(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *net = to_net_dev(dev);
+ size_t len = 0;
+ cpumask_t mask;
+ int i, j;
+ struct dev_rps_maps *drmap;
+
+ rcu_read_lock_bh();
+ drmap = rcu_dereference(net->dev_rps_maps);
+
+ if (drmap) {
+ for (j = 0; j < drmap->num_maps; j++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (RPS_MAP_SIZE * j));
+ cpus_clear(mask);
+ for (i = 0; i < map->len; i++)
+ cpu_set(map->map[i], mask);
+
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
+ if (PAGE_SIZE - len < 3) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ if (j < drmap->num_maps)
+ len += sprintf(buf + len, " ");
+ }
+ }
+
+ rcu_read_unlock_bh();
+
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -309,6 +440,7 @@ static struct device_attribute net_class_attributes[] = {
__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
store_tx_queue_len),
+ __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_cpus, store_rps_cpus),
{}
};
^ permalink raw reply related [flat|nested] 36+ messages in thread
* rps: some comments
2010-01-07 9:15 ` Eric Dumazet
@ 2010-01-07 17:42 ` Eric Dumazet
2010-01-08 0:07 ` Tom Herbert
2010-01-11 6:25 ` [PATCH v4 1/1] rps: core implementation Tom Herbert
1 sibling, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2010-01-07 17:42 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List
Hi Tom
1) netif_receive_skb() might enqueue skb instead of handling them directly.
int netif_receive_skb(struct sk_buff *skb)
{
int cpu = get_rps_cpu(skb->dev, skb);
if (cpu < 0)
return __netif_receive_skb(skb);
else
return enqueue_to_backlog(skb, cpu);
}
If get_rps_cpu() happens to select the current cpu, we enqueue to backlog the
skb instead of directly calling __netif_receive_skb().
One way to solve this is to make get_rps_cpu() returns -1 in this case :
...
if (cpu == smp_processor_id() || !cpu_online(cpu))
cpu = -1;
done:
rcu_read_unlock();
return cpu;
} /* get_rps_cpu() */
2) RPS_MAP_SIZE might be too expensive to use in fast path, on big (NR_CPUS=4096) machines :
num_possible_cpus() has to count all bits set in a bitmap.
#define MAX_RPS_CPUS (num_possible_cpus() < 256 ? num_possible_cpus() : 256)
#define RPS_MAP_SIZE (sizeof(struct rps_map) + (MAX_RPS_CPUS * sizeof(u16))
You can use nr_cpu_ids, or even better a new variable that you init _once_ to
unsigned int rps_map_size = sizeof(struct rps_map) + (min(256, nr_cpu_ids) * sizeof(u16));
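Roughly, as a sketch (the variable names are only illustrative), computed a
single time at init instead of on every packet:

	/* e.g. from net_dev_init(), once at boot */
	rps_cpus_in_map = min_t(unsigned int, 256, nr_cpu_ids);
	rps_map_size = sizeof(struct rps_map) +
		       rps_cpus_in_map * sizeof(u16);

The fast path then indexes the per-NAPI maps with rps_map_size instead of
evaluating the RPS_MAP_SIZE macro for each skb.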
3) cpu hotplug.
I cannot find how cpu unplug can be safe.
if (!cpu_online(cpu))
cpu = -1;
rcu_read_unlock();
...
cpu_set(cpu, __get_cpu_var(rps_remote_softirq_cpus));
...
__smp_call_function_single(cpu, &queue->csd, 0);
Thanks
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: rps: some comments
2010-01-07 17:42 ` rps: some comments Eric Dumazet
@ 2010-01-08 0:07 ` Tom Herbert
2010-01-08 6:27 ` Eric Dumazet
0 siblings, 1 reply; 36+ messages in thread
From: Tom Herbert @ 2010-01-08 0:07 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, Linux Netdev List
On Thu, Jan 7, 2010 at 9:42 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Hi Tom
>
> 1) netif_receive_skb() might enqueue skb instead of handling them directly.
>
> int netif_receive_skb(struct sk_buff *skb)
> {
> int cpu = get_rps_cpu(skb->dev, skb);
>
> if (cpu < 0)
> return __netif_receive_skb(skb);
> else
> return enqueue_to_backlog(skb, cpu);
> }
>
> If get_rps_cpu() happens to select the current cpu, we enqueue to backlog the
> skb instead of directly calling __netif_receive_skb().
>
> One way to solve this is to make get_rps_cpu() returns -1 in this case :
>
> ...
> if (cpu == smp_processor_id() || !cpu_online(cpu))
> cpu = -1;
>
> done:
> rcu_read_unlock();
> return cpu;
> } /* get_rps_cpu() */
>
I kind of like the way it is right now, which is to enqueue even on
the local CPU. The advantage of that is that all the packets received
in the napi run are dispatched to all the participating CPUs before
any are processed, this is good for average and worst case latency on
a packet burst.
>
> 2) RPS_MAP_SIZE might be too expensive to use in fast path, on big (NR_CPUS=4096) machines :
> num_possible_cpus() has to count all bits set in a bitmap.
>
> #define MAX_RPS_CPUS (num_possible_cpus() < 256 ? num_possible_cpus() : 256)
> #define RPS_MAP_SIZE (sizeof(struct rps_map) + (MAX_RPS_CPUS * sizeof(u16))
>
> You can use nr_cpu_ids, or even better a new variable, that you init _once_ to
>
> unsigned int rps_map_size = sizeof(struct rps_map) + (min(256, nr_cpu_ids) * sizeof(u16));
>
Yes that's probably better.
>
> 3) cpu hotplug.
>
> I cannot find how cpu unplug can be safe.
>
> if (!cpu_online(cpu))
> cpu = -1;
> rcu_read_unlock();
> ...
> cpu_set(cpu, __get_cpu_var(rps_remote_softirq_cpus));
> ...
> __smp_call_function_single(cpu, &queue->csd, 0);
I believe we're dealing with remote CPUs with preemption disabled (from
softirqs), so maybe that is sufficient to prevent the CPU from going away
under us. We probably need an "if cpu_online(cpu)" check before
__smp_call_function_single though.
Thanks,
Tom
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: rps: some comments
2010-01-08 0:07 ` Tom Herbert
@ 2010-01-08 6:27 ` Eric Dumazet
0 siblings, 0 replies; 36+ messages in thread
From: Eric Dumazet @ 2010-01-08 6:27 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List
Le 08/01/2010 01:07, Tom Herbert a écrit :
> I kind of like the way it is right now, which is to enqueue even on
> the local CPU. The advantage of that is that all the packets received
> in the napi run are dispatched to all the participating CPUs before
> any are processed, this is good for average and worst case latency on
> a packet burst.
I see... this makes sense, thanks.
Do you know if some non-multiqueue NIC would be able to provide us
a suitable hash to fill skb->rxhash in the NIC driver itself?
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v4 1/1] rps: core implementation
2010-01-07 9:15 ` Eric Dumazet
2010-01-07 17:42 ` rps: some comments Eric Dumazet
@ 2010-01-11 6:25 ` Tom Herbert
2010-01-11 9:00 ` Eric Dumazet
1 sibling, 1 reply; 36+ messages in thread
From: Tom Herbert @ 2010-01-11 6:25 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, Linux Netdev List
Eric, patch below has some more minor fixes per your latest comments.
- added variables for rps_map_size and rps_cpus_in_map for efficiency
- added preempt_disable/enable around __smp_call_function_single to
prevent CPUs from being removed during this action (hotplug fix)
- check cpu_online before calling __smp_call_function_single (also
hotplug related)
- do rcu_read_lock instead of rcu_read_lock_bh in show_rps_cpus
- don't do rcu_dereference in store_rps_cpus
Thanks,
Tom
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97873e3..5ea2569 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -222,6 +222,7 @@ struct netif_rx_stats {
unsigned dropped;
unsigned time_squeeze;
unsigned cpu_collision;
+ unsigned received_rps;
};
DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
@@ -676,6 +677,27 @@ struct net_device_ops {
};
/*
+ * Structure for Receive Packet Steering. Length of map and array of CPU ID's.
+ */
+struct rps_map {
+ int len;
+ u16 map[0];
+};
+
+#define MAX_RPS_CPUS 256 /* Limit maximum number of CPUs in a map */
+extern int rps_map_size; /* Size of an RPS map */
+extern int rps_cpus_in_map; /* Number of CPUs in a map */
+
+/*
+ * Structure that contains the rps maps for various NAPI instances of a device.
+ */
+struct dev_rps_maps {
+ int num_maps;
+ struct rcu_head rcu;
+ struct rps_map maps[0];
+};
+
+/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
@@ -861,6 +883,9 @@ struct net_device {
struct netdev_queue rx_queue;
+ struct dev_rps_maps *dev_rps_maps; /* Per-NAPI maps for
+ receive packet steering */
+
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
@@ -1274,10 +1299,12 @@ static inline int unregister_gifconf(unsigned int family)
*/
struct softnet_data {
struct Qdisc *output_queue;
- struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
struct sk_buff *completion_queue;
+ /* Elements below can be accessed between CPUs for RPS */
+ struct call_single_data csd ____cacheline_aligned_in_smp;
+ struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 63f4742..f188301 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
* @mac_header: Link layer header
* @_skb_dst: destination entry
* @sp: the security path, used for xfrm
+ * @rxhash: the packet hash computed on receive
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @len: Length of actual data
* @data_len: Data length
@@ -323,6 +324,8 @@ struct sk_buff {
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
+ __u32 rxhash;
+
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
diff --git a/net/core/dev.c b/net/core/dev.c
index 9977288..988c747 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1834,7 +1834,7 @@ out_kfree_skb:
return rc;
}
-static u32 skb_tx_hashrnd;
+static u32 hashrnd __read_mostly;
u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
@@ -1852,7 +1852,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
else
hash = skb->protocol;
- hash = jhash_1word(hash, skb_tx_hashrnd);
+ hash = jhash_1word(hash, hashrnd);
return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
@@ -2070,9 +2070,154 @@ EXPORT_SYMBOL(dev_queue_xmit);
int netdev_max_backlog __read_mostly = 1000;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64; /* old backlog weight */
+int rps_cpus_in_map __read_mostly;
+int rps_map_size __read_mostly;
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving NAPI instance for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2, ports;
+ struct ipv6hdr *ip6;
+ struct iphdr *ip;
+ u32 ihl;
+ u8 ip_proto;
+ int cpu = -1;
+ struct dev_rps_maps *drmap;
+ struct rps_map *map = NULL;
+ u16 index;
+
+ rcu_read_lock();
+
+ drmap = rcu_dereference(dev->dev_rps_maps);
+ if (!drmap)
+ goto done;
+
+ index = skb_get_rx_queue(skb);
+ if (index >= drmap->num_maps)
+ index = 0;
+
+ map = (struct rps_map *)
+ ((void *)drmap->maps + (rps_map_size * index));
+ if (!map->len)
+ goto done;
+
+ if (skb->rxhash)
+ goto got_hash; /* Skip hash computation on packet header */
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(*ip)))
+ goto done;
+
+ ip = (struct iphdr *) skb->data;
+ ip_proto = ip->protocol;
+ addr1 = ip->saddr;
+ addr2 = ip->daddr;
+ ihl = ip->ihl;
+ break;
+ case __constant_htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(*ip6)))
+ goto done;
+
+ ip6 = (struct ipv6hdr *) skb->data;
+ ip_proto = ip6->nexthdr;
+ addr1 = ip6->saddr.s6_addr32[3];
+ addr2 = ip6->daddr.s6_addr32[3];
+ ihl = (40 >> 2);
+ break;
+ default:
+ goto done;
+ }
+ ports = 0;
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ if (pskb_may_pull(skb, (ihl * 4) + 4))
+ ports = *((u32 *) (skb->data + (ihl * 4)));
+ break;
+
+ default:
+ break;
+ }
+
+ skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+ if (!skb->rxhash)
+ skb->rxhash = 1;
+
+got_hash:
+ cpu = map->map[((u64) skb->rxhash * map->len) >> 32];
+
+ if (!cpu_online(cpu))
+ cpu = -1;
+done:
+ rcu_read_unlock();
+ return cpu;
+}
+
+static DEFINE_PER_CPU(cpumask_t, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+ struct softnet_data *queue = data;
+ __napi_schedule(&queue->backlog);
+ __get_cpu_var(netdev_rx_stat).received_rps++;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+ struct softnet_data *queue;
+ unsigned long flags;
+
+ queue = &per_cpu(softnet_data, cpu);
+
+ local_irq_save(flags);
+ __get_cpu_var(netdev_rx_stat).total++;
+
+ spin_lock(&queue->input_pkt_queue.lock);
+ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+ if (queue->input_pkt_queue.qlen) {
+enqueue:
+ __skb_queue_tail(&queue->input_pkt_queue, skb);
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+ flags);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Schedule NAPI for backlog device */
+ if (napi_schedule_prep(&queue->backlog)) {
+ if (cpu != smp_processor_id()) {
+ cpu_set(cpu,
+ __get_cpu_var(rps_remote_softirq_cpus));
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ } else
+ __napi_schedule(&queue->backlog);
+ }
+ goto enqueue;
+ }
+
+ spin_unlock(&queue->input_pkt_queue.lock);
+
+ __get_cpu_var(netdev_rx_stat).dropped++;
+ local_irq_restore(flags);
+
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
/**
* netif_rx - post buffer to the network code
@@ -2091,8 +2236,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
int netif_rx(struct sk_buff *skb)
{
- struct softnet_data *queue;
- unsigned long flags;
+ int cpu;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
@@ -2101,31 +2245,11 @@ int netif_rx(struct sk_buff *skb)
if (!skb->tstamp.tv64)
net_timestamp(skb);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
- local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
-
- __get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
-enqueue:
- __skb_queue_tail(&queue->input_pkt_queue, skb);
- local_irq_restore(flags);
- return NET_RX_SUCCESS;
- }
-
- napi_schedule(&queue->backlog);
- goto enqueue;
- }
-
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
+ cpu = get_rps_cpu(skb->dev, skb);
+ if (cpu < 0)
+ cpu = smp_processor_id();
- kfree_skb(skb);
- return NET_RX_DROP;
+ return enqueue_to_backlog(skb, cpu);
}
EXPORT_SYMBOL(netif_rx);
@@ -2363,10 +2487,10 @@ void netif_nit_deliver(struct sk_buff *skb)
}
/**
- * netif_receive_skb - process receive buffer from network
+ * __netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
- * netif_receive_skb() is the main receive data processing function.
+ * __netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
@@ -2377,7 +2501,8 @@ void netif_nit_deliver(struct sk_buff *skb)
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb(struct sk_buff *skb)
+
+int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
@@ -2475,6 +2600,16 @@ out:
}
EXPORT_SYMBOL(netif_receive_skb);
+int netif_receive_skb(struct sk_buff *skb)
+{
+ int cpu = get_rps_cpu(skb->dev, skb);
+
+ if (cpu < 0)
+ return __netif_receive_skb(skb);
+ else
+ return enqueue_to_backlog(skb, cpu);
+}
+
/* Network device is going away, flush any packets still pending */
static void flush_backlog(void *arg)
{
@@ -2799,16 +2934,16 @@ static int process_backlog(struct napi_struct *napi, int quota)
do {
struct sk_buff *skb;
- local_irq_disable();
+ spin_lock_irq(&queue->input_pkt_queue.lock);
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
__napi_complete(napi);
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
break;
}
- local_irq_enable();
+ spin_unlock_irq(&queue->input_pkt_queue.lock);
- netif_receive_skb(skb);
+ __netif_receive_skb(skb);
} while (++work < quota && jiffies == start_time);
return work;
@@ -2897,6 +3032,26 @@ void netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(netif_napi_del);
+/*
+ * net_rps_action sends any pending IPI's for rps. This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(void)
+{
+ int cpu;
+
+ preempt_disable();
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+ struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+ cpu_clear(cpu, __get_cpu_var(rps_remote_softirq_cpus));
+ if (cpu_online(cpu))
+ __smp_call_function_single(cpu, &queue->csd, 0);
+ }
+
+ preempt_enable();
+}
static void net_rx_action(struct softirq_action *h)
{
@@ -2968,6 +3123,8 @@ static void net_rx_action(struct softirq_action *h)
out:
local_irq_enable();
+ net_rps_action();
+
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
@@ -3212,10 +3369,10 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
{
struct netif_rx_stats *s = v;
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
s->total, s->dropped, s->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
- s->cpu_collision);
+ s->cpu_collision, s->received_rps);
return 0;
}
@@ -5341,6 +5498,8 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
+ kfree(dev->dev_rps_maps);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
@@ -5793,12 +5952,20 @@ static int __init net_dev_init(void)
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
+ queue->csd.func = trigger_softirq;
+ queue->csd.info = queue;
+ queue->csd.flags = 0;
+
queue->backlog.poll = process_backlog;
queue->backlog.weight = weight_p;
queue->backlog.gro_list = NULL;
queue->backlog.gro_count = 0;
}
+ rps_cpus_in_map = num_possible_cpus() < MAX_RPS_CPUS ?
+ num_possible_cpus() : MAX_RPS_CPUS;
+ rps_map_size = sizeof(struct rps_map) + (rps_cpus_in_map * sizeof(u16));
+
dev_boot_phase = 0;
/* The loopback device is special if any other network devices
@@ -5831,7 +5998,7 @@ subsys_initcall(net_dev_init);
static int __init initialize_hashrnd(void)
{
- get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+ get_random_bytes(&hashrnd, sizeof(hashrnd));
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 157645c..a390c07 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -18,6 +18,9 @@
#include <linux/wireless.h>
#include <net/wext.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+
#include "net-sysfs.h"
#ifdef CONFIG_SYSFS
@@ -253,6 +256,134 @@ static ssize_t store_tx_queue_len(struct device *dev,
return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
+static char *get_token(const char **cp, size_t *len)
+{
+ const char *bp = *cp;
+ char *start;
+
+ while (isspace(*bp))
+ bp++;
+
+ start = (char *)bp;
+ while (!isspace(*bp) && *bp != '\0')
+ bp++;
+
+ if (start != bp)
+ *len = bp - start;
+ else
+ start = NULL;
+
+ *cp = bp;
+ return start;
+}
+
+static void dev_map_release(struct rcu_head *rcu)
+{
+ struct dev_rps_maps *drmap =
+ container_of(rcu, struct dev_rps_maps, rcu);
+
+ kfree(drmap);
+}
+
+static ssize_t store_rps_cpus(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct net_device *net = to_net_dev(dev);
+ struct napi_struct *napi;
+ cpumask_t mask;
+ int err, cpu, index, i;
+ int cnt = 0;
+ char *token;
+ const char *cp = buf;
+ size_t tlen;
+ struct dev_rps_maps *drmap, *old_drmap;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ cnt = 0;
+ list_for_each_entry(napi, &net->napi_list, dev_list)
+ cnt++;
+ if (cnt == 0)
+ cnt = 1; /* For devices with no napi instances */
+
+ drmap = kzalloc(sizeof(struct dev_rps_maps) +
+ rps_map_size * cnt, GFP_KERNEL);
+ if (!drmap)
+ return -ENOMEM;
+
+ drmap->num_maps = cnt;
+
+ cp = buf;
+ for (index = 0; index < cnt &&
+ (token = get_token(&cp, &tlen)); index++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (rps_map_size * index));
+ err = bitmap_parse(token, tlen, cpumask_bits(&mask),
+ nr_cpumask_bits);
+
+ if (err) {
+ kfree(drmap);
+ return err;
+ }
+
+ cpus_and(mask, mask, cpu_online_map);
+ i = 0;
+ for_each_cpu_mask(cpu, mask) {
+ if (i >= rps_cpus_in_map)
+ break;
+ map->map[i++] = cpu;
+ }
+ map->len = i;
+ }
+
+ rtnl_lock();
+ old_drmap = net->dev_rps_maps;
+ rcu_assign_pointer(net->dev_rps_maps, drmap);
+ rtnl_unlock();
+
+ if (old_drmap)
+ call_rcu(&old_drmap->rcu, dev_map_release);
+
+ return len;
+}
+
+static ssize_t show_rps_cpus(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *net = to_net_dev(dev);
+ size_t len = 0;
+ cpumask_t mask;
+ int i, j;
+ struct dev_rps_maps *drmap;
+
+ rcu_read_lock();
+ drmap = rcu_dereference(net->dev_rps_maps);
+
+ if (drmap) {
+ for (j = 0; j < drmap->num_maps; j++) {
+ struct rps_map *map = (struct rps_map *)
+ ((void *)drmap->maps + (rps_map_size * j));
+ cpus_clear(mask);
+ for (i = 0; i < map->len; i++)
+ cpu_set(map->map[i], mask);
+
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
+ if (PAGE_SIZE - len < 3) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ if (j < drmap->num_maps)
+ len += sprintf(buf + len, " ");
+ }
+ }
+
+ rcu_read_unlock();
+
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -309,6 +440,7 @@ static struct device_attribute net_class_attributes[] = {
__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
store_tx_queue_len),
+ __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_cpus, store_rps_cpus),
{}
};
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH v4 1/1] rps: core implementation
2010-01-11 6:25 ` [PATCH v4 1/1] rps: core implementation Tom Herbert
@ 2010-01-11 9:00 ` Eric Dumazet
2010-01-14 4:40 ` David Miller
0 siblings, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2010-01-11 9:00 UTC (permalink / raw)
To: Tom Herbert; +Cc: David Miller, Linux Netdev List
Le 11/01/2010 07:25, Tom Herbert a écrit :
> Eric, patch below has some more minor fixes per your latest comments.
>
> - added variables for rps_map_size and rps_cpus_in_map for efficiency
> - added preempt_disable/enable around __smp_call_function_single to
> prevent CPUs from being removed during this action (hotplug fix)
> - check cpu_online before calling __smp_call_function_single (also
> hotplug related)
> - do rcu_read_lock instead of rcu_read_lock_bh in store_rps_cpus
> - don't do rcu_derefence in store_rps_cpus
>
> Thanks,
> Tom
Tom, I am currently running one production server with this version,
everything seems fine so far.
(Only problem is with your mail program, some lines were folded, and
I had to manually adjust your patch to apply it...)
I also did one small change :
Since struct softnet_data is now aligned to a cache line boundary, it's better
to move it to the appropriate section (to avoid adding holes in the percpu
section, because the linker is not very smart)
-DEFINE_PER_CPU(struct softnet_data, softnet_data);
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
I believe it's ready for inclusion, but of course David should take
a look before :)
Could you post it formally with the appropriate ChangeLog and a nice
RPS description, so that other people can ack it?
Thanks !
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v4 1/1] rps: core implementation
2010-01-11 9:00 ` Eric Dumazet
@ 2010-01-14 4:40 ` David Miller
0 siblings, 0 replies; 36+ messages in thread
From: David Miller @ 2010-01-14 4:40 UTC (permalink / raw)
To: eric.dumazet; +Cc: therbert, netdev
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 11 Jan 2010 10:00:53 +0100
> I believe its ready for inclusion, but of course David should take
> a look before :)
>
> Could you post it formally with the appropriate ChangeLog with a nice
> RPS description, so that other people can ack it ?
Yes Tom, please do, I think I'm ready to apply this thing.
^ permalink raw reply [flat|nested] 36+ messages in thread
end of thread, other threads:[~2010-01-14 4:40 UTC | newest]
Thread overview: 36+ messages
2009-11-20 23:28 [PATCH v4 1/1] rps: core implementation Tom Herbert
2009-11-20 23:39 ` David Miller
2009-11-20 23:50 ` Tom Herbert
2009-11-21 0:05 ` David Miller
2009-11-21 0:12 ` Tom Herbert
2009-11-21 0:40 ` Jarek Poplawski
2009-11-20 23:40 ` Stephen Hemminger
2009-11-20 23:53 ` Tom Herbert
2009-11-20 23:56 ` David Miller
2009-12-17 21:04 ` Tom Herbert
2010-01-06 1:32 ` Tom Herbert
2010-01-06 5:54 ` Eric Dumazet
2010-01-06 7:56 ` Tom Herbert
2010-01-06 18:38 ` Eric Dumazet
2010-01-06 21:10 ` [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation Eric Dumazet
2010-01-06 21:28 ` Jay Vosburgh
2010-01-06 21:34 ` Eric Dumazet
2010-01-06 21:38 ` David Miller
2010-01-06 21:45 ` Andy Gospodarek
2010-01-06 22:56 ` [PATCH net-next-2.6] fix " Andy Gospodarek
2010-01-06 23:53 ` Jay Vosburgh
2010-01-07 8:37 ` Eric Dumazet
2010-01-07 8:41 ` David Miller
2010-01-06 22:54 ` [PATCH v4 1/1] rps: core implementation Tom Herbert
2010-01-07 9:15 ` Eric Dumazet
2010-01-07 17:42 ` rps: some comments Eric Dumazet
2010-01-08 0:07 ` Tom Herbert
2010-01-08 6:27 ` Eric Dumazet
2010-01-11 6:25 ` [PATCH v4 1/1] rps: core implementation Tom Herbert
2010-01-11 9:00 ` Eric Dumazet
2010-01-14 4:40 ` David Miller
2009-11-20 23:42 ` Stephen Hemminger
2009-11-21 0:04 ` Tom Herbert
2009-11-21 8:07 ` Eric Dumazet
2009-11-21 9:03 ` Tom Herbert
2009-11-21 9:31 ` Eric Dumazet