* [PATCH] xps-mq: Transmit Packet Steering for multiqueue
@ 2010-08-23 5:39 Tom Herbert
2010-08-23 17:09 ` Ben Hutchings
` (2 more replies)
0 siblings, 3 replies; 28+ messages in thread
From: Tom Herbert @ 2010-08-23 5:39 UTC (permalink / raw)
To: davem, netdev; +Cc: eric.dumazet
This patch implements transmit packet steering (XPS) for multiqueue
devices. XPS selects a transmit queue during packet transmission based
on configuration. This is done by mapping the CPU transmitting the
packet to a queue. This is the transmit-side analogue to RPS: where
RPS selects a CPU based on the receive queue, XPS selects a queue
based on the CPU (previously there was an XPS patch from Eric
Dumazet, but that might more appropriately be called transmit completion
steering).
Each transmit queue can be associated with a number of CPUs which will
use the queue to send packets. This is configured as a CPU mask on a
per queue basis in:
/sys/class/net/eth<n>/queues/tx-<n>/xps_cpus
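For illustration only (not part of the patch): a minimal userspace sketch that
binds a queue to CPUs 0-3 by writing a hex CPU mask to this attribute. The
device name eth0 and the queue tx-0 are assumptions.

/* Hypothetical example: write CPU mask 0xf (CPUs 0-3) to the xps_cpus
 * attribute of eth0's first TX queue.  Requires CAP_NET_ADMIN and a
 * kernel carrying this patch. */
#include <stdio.h>

int main(void)
{
        const char *path = "/sys/class/net/eth0/queues/tx-0/xps_cpus";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return 1;
        }
        fprintf(f, "f\n");      /* hex bitmap: CPUs 0-3 */
        return fclose(f) ? 1 : 0;
}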
The mappings are stored per device in an inverted data structure that
maps CPUs to queues. In the netdevice structure this is an array of
num_possible_cpu structures where each array entry contains a bit map
of queues which that CPU can use.
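As a rough standalone sketch of that layout (example values only; this mirrors
the struct xps_map added below rather than reusing the kernel code): one flat
array of per-CPU queue bitmaps, indexed by CPU number.

#include <limits.h>
#include <stdio.h>

#define NCPUS           16      /* stand-in for num_possible_cpus() */
#define NTXQ            16      /* stand-in for dev->num_tx_queues */
#define BITS_PER_LONG   (sizeof(unsigned long) * CHAR_BIT)
#define MASK_LONGS      ((NTXQ + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* one queue bitmap per possible CPU, laid out contiguously */
static unsigned long queue_map[NCPUS * MASK_LONGS];

/* bitmap of queues that @cpu may transmit on */
static unsigned long *queues_for_cpu(unsigned int cpu)
{
        return &queue_map[cpu * MASK_LONGS];
}

int main(void)
{
        unsigned int cpu = 2, queue = 5;

        /* allow CPU 2 to use TX queue 5 */
        queues_for_cpu(cpu)[queue / BITS_PER_LONG] |= 1UL << (queue % BITS_PER_LONG);
        printf("cpu %u may use queue %u: %lu\n", cpu, queue,
               (queues_for_cpu(cpu)[queue / BITS_PER_LONG] >> (queue % BITS_PER_LONG)) & 1);
        return 0;
}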
We also allow the mapping of a socket to queue to be modified, for
instance if a thread is scheduled on a different CPU the desired queue
for transmitting packets would likely change. To maintain in-order
packet transmission, a flag (ooo_okay) has been added to the sk_buff
structure. If a transport layer sets this flag on a packet, the
transmit queue can be changed for this socket. Presumably, the
transport would set this if there were no possibility of creating
out-of-order packets (for instance, when there are no packets in flight
for the socket).
This patch includes the modification in TCP output for setting this
flag.
The benefits of XPS are improved locality in the per-queue data
structures. Also, transmit completions are more likely to be done
nearer to the sending thread, so this should promote locality back
to the socket (e.g. UDP). The benefits of XPS depend on the
cache hierarchy, application load, and other factors. XPS would
nominally be configured so that a queue is only shared by CPUs
which share a cache; the degenerate configuration would be that
each CPU has its own queue.
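For example (a hypothetical layout, not taken from the patch or the benchmark
below): on a 16-CPU machine with four TX queues and four CPUs per shared cache,
the per-queue masks might be:

tx-0/xps_cpus = 000f    (CPUs 0-3)
tx-1/xps_cpus = 00f0    (CPUs 4-7)
tx-2/xps_cpus = 0f00    (CPUs 8-11)
tx-3/xps_cpus = f000    (CPUs 12-15)

while the degenerate one-queue-per-CPU case would use a single-bit mask for
each queue.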
Below are some benchmark results which show the potential benefit of
this patch. The netperf test runs 500 instances of the netperf TCP_RR
test with 1-byte requests and responses.
bnx2x on 16 core AMD
XPS (16 queues, 1 TX queue per CPU) 1015K at 99% CPU
No XPS (16 queues) 1127K at 98% CPU
Signed-off-by: Tom Herbert <therbert@google.com>
---
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 46c36ff..0ff6c9f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -497,6 +497,12 @@ struct netdev_queue {
struct Qdisc *qdisc;
unsigned long state;
struct Qdisc *qdisc_sleeping;
+#ifdef CONFIG_RPS
+ struct kobject kobj;
+ struct netdev_queue *first;
+ atomic_t count;
+#endif
+
/*
* write mostly part
*/
@@ -524,6 +530,22 @@ struct rps_map {
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
/*
+ * This structure holds an XPS map which can be of variable length. queues
+ * is an array of num_possible_cpus entries, where each entry is a mask of
+ * queues for that CPU (up to num_tx_queues bits for device).
+ */
+struct xps_map {
+ struct rcu_head rcu;
+ unsigned long queues[0];
+};
+
+#define QUEUE_MASK_SIZE(dev) (BITS_TO_LONGS(dev->num_tx_queues))
+#define XPS_MAP_SIZE(dev) (sizeof(struct xps_map) + (num_possible_cpus() * \
+ QUEUE_MASK_SIZE(dev) * sizeof(unsigned long)))
+#define XPS_ENTRY(map, offset, dev) \
+ (&map->queues[offset * QUEUE_MASK_SIZE(dev)])
+
+/*
* The rps_dev_flow structure contains the mapping of a flow to a CPU and the
* tail pointer for that CPU's input queue at the time of last enqueue.
*/
@@ -978,6 +1000,9 @@ struct net_device {
void *rx_handler_data;
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
+#ifdef CONFIG_RPS
+ struct xps_map *xps_maps;
+#endif
/* Number of TX queues allocated at alloc_netdev_mq() time */
unsigned int num_tx_queues;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f067c95..146df6f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -381,6 +381,7 @@ struct sk_buff {
#else
__u8 deliver_no_wcard:1;
#endif
+ __u8 ooo_okay:1;
kmemcheck_bitfield_end(flags2);
/* 0/14 bit hole */
diff --git a/net/core/dev.c b/net/core/dev.c
index da584f5..d23f9c4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2054,6 +2054,60 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
return queue_index;
}
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb,
+ int queue_index)
+{
+ struct xps_map *maps;
+ int cpu = smp_processor_id();
+ u32 hash;
+ unsigned long *queues;
+ int weight, select;
+
+ rcu_read_lock();
+ maps = rcu_dereference(dev->xps_maps);
+
+ if (!maps) {
+ rcu_read_unlock();
+ return queue_index;
+ }
+
+ queues = XPS_ENTRY(maps, cpu, dev);
+
+ if (queue_index >= 0) {
+ if (test_bit(queue_index, queues)) {
+ rcu_read_unlock();
+ return queue_index;
+ }
+ }
+
+ weight = bitmap_weight(queues, dev->real_num_tx_queues);
+ switch (weight) {
+ case 0:
+ break;
+ case 1:
+ queue_index =
+ find_first_bit(queues, dev->real_num_tx_queues);
+ break;
+ default:
+ if (skb->sk && skb->sk->sk_hash)
+ hash = skb->sk->sk_hash;
+ else
+ hash = (__force u16) skb->protocol ^ skb->rxhash;
+ hash = jhash_1word(hash, hashrnd);
+
+ select = ((u64) hash * weight) >> 32;
+ queue_index =
+ find_first_bit(queues, dev->real_num_tx_queues);
+ while (select--)
+ queue_index = find_next_bit(queues,
+ dev->real_num_tx_queues, queue_index);
+ break;
+ }
+
+ rcu_read_unlock();
+ return queue_index;
+}
+
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
struct sk_buff *skb)
{
@@ -2061,23 +2115,30 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
struct sock *sk = skb->sk;
queue_index = sk_tx_queue_get(sk);
- if (queue_index < 0) {
+
+ if (queue_index < 0 || (skb->ooo_okay && dev->real_num_tx_queues > 1)) {
const struct net_device_ops *ops = dev->netdev_ops;
+ int old_index = queue_index;
if (ops->ndo_select_queue) {
queue_index = ops->ndo_select_queue(dev, skb);
queue_index = dev_cap_txqueue(dev, queue_index);
} else {
- queue_index = 0;
- if (dev->real_num_tx_queues > 1)
- queue_index = skb_tx_hash(dev, skb);
+ if (dev->real_num_tx_queues > 1) {
+ queue_index = get_xps_queue(dev,
+ skb, queue_index);
+ if (queue_index < 0)
+ queue_index = skb_tx_hash(dev, skb);
+ } else
+ queue_index = 0;
+ }
- if (sk) {
- struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
+ if ((queue_index != old_index) && sk) {
+ struct dst_entry *dst =
+ rcu_dereference_check(sk->sk_dst_cache, 1);
- if (dst && skb_dst(skb) == dst)
- sk_tx_queue_set(sk, queue_index);
- }
+ if (dst && skb_dst(skb) == dst)
+ sk_tx_queue_set(sk, queue_index);
}
}
@@ -5429,6 +5490,15 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
}
#ifdef CONFIG_RPS
+ atomic_set(&tx->count, queue_count);
+
+ /*
+ * Set a pointer to first element in the array which holds the
+ * reference count.
+ */
+ for (i = 0; i < queue_count; i++)
+ tx[i].first = tx;
+
rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
if (!rx) {
printk(KERN_ERR "alloc_netdev: Unable to allocate "
@@ -5506,7 +5576,9 @@ void free_netdev(struct net_device *dev)
release_net(dev_net(dev));
+#ifndef CONFIG_RPS
kfree(dev->_tx);
+#endif
/* Flush device addresses */
dev_addr_flush(dev);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index af4dfba..661c481 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -742,34 +742,295 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
return error;
}
-static int rx_queue_register_kobjects(struct net_device *net)
+/*
+ * netdev_queue sysfs structures and functions.
+ */
+struct netdev_queue_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr, char *buf);
+ ssize_t (*store)(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_netdev_queue_attr(_attr) container_of(_attr, \
+ struct netdev_queue_attribute, attr)
+
+#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
+
+static ssize_t netdev_queue_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t netdev_queue_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(queue, attribute, buf, count);
+}
+
+static struct sysfs_ops netdev_queue_sysfs_ops = {
+ .show = netdev_queue_attr_show,
+ .store = netdev_queue_attr_store,
+};
+
+static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
+ struct net_device *dev = queue->dev;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++)
+ if (queue == &dev->_tx[i])
+ break;
+
+ BUG_ON(i >= dev->num_tx_queues);
+
+ return i;
+}
+
+static ssize_t show_xps_map(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ struct xps_map *maps;
+ cpumask_var_t mask;
+ unsigned long *qmask, index;
+ size_t len = 0;
int i;
+ unsigned int qmask_size = QUEUE_MASK_SIZE(dev);
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ rcu_read_lock();
+ maps = rcu_dereference(dev->xps_maps);
+ if (maps) {
+ qmask = maps->queues;
+ for (i = 0; i < num_possible_cpus(); i++) {
+ if (test_bit(index, qmask))
+ cpumask_set_cpu(i, mask);
+ qmask += qmask_size;
+ }
+ }
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+ if (PAGE_SIZE - len < 3) {
+ rcu_read_unlock();
+ free_cpumask_var(mask);
+ return -EINVAL;
+ }
+ rcu_read_unlock();
+
+ free_cpumask_var(mask);
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
+static void xps_map_release(struct rcu_head *rcu)
+{
+ struct xps_map *map = container_of(rcu, struct xps_map, rcu);
+
+ kfree(map);
+}
+
+static DEFINE_MUTEX(xps_map_lock);
+
+static ssize_t store_xps_map(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct net_device *dev = queue->dev;
+ struct xps_map *maps;
+ cpumask_var_t mask;
+ int err, i, nonempty = 0;
+ unsigned long *qmask, index;
+ unsigned int qmask_size = QUEUE_MASK_SIZE(dev);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ mutex_lock(&xps_map_lock);
+
+ maps = dev->xps_maps;
+ if (!maps) {
+ if (!cpumask_weight(mask)) {
+ mutex_unlock(&xps_map_lock);
+ free_cpumask_var(mask);
+ return 0;
+ }
+ maps = kzalloc(XPS_MAP_SIZE(dev), GFP_KERNEL);
+ if (!maps) {
+ mutex_unlock(&xps_map_lock);
+ free_cpumask_var(mask);
+ return -ENOMEM;
+ }
+ rcu_assign_pointer(dev->xps_maps, maps);
+ }
+
+ index = get_netdev_queue_index(queue);
+
+ qmask = maps->queues;
+ for (i = 0; i < num_possible_cpus(); i++) {
+ if (cpu_isset(i, *mask) && cpu_online(i)) {
+ set_bit(index, qmask);
+ nonempty = 1;
+ } else
+ clear_bit(index, qmask);
+ if (!nonempty &&
+ bitmap_weight(qmask, dev->real_num_tx_queues))
+ nonempty = 1;
+ qmask += qmask_size;
+ }
+
+ if (!nonempty) {
+ rcu_assign_pointer(dev->xps_maps, NULL);
+ call_rcu(&maps->rcu, xps_map_release);
+ }
+
+ mutex_unlock(&xps_map_lock);
+
+ free_cpumask_var(mask);
+ return len;
+}
+
+static struct netdev_queue_attribute xps_cpus_attribute =
+ __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+
+static struct attribute *netdev_queue_default_attrs[] = {
+ &xps_cpus_attribute.attr,
+ NULL
+};
+
+static void netdev_queue_release(struct kobject *kobj)
+{
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+ struct net_device *dev = queue->dev;
+ struct netdev_queue *first = queue->first;
+ struct xps_map *maps;
+ unsigned long *qmask, index;
+ int i, nonempty = 0;
+ unsigned int qmask_size = QUEUE_MASK_SIZE(dev);
+
+ index = get_netdev_queue_index(queue);
+
+ mutex_lock(&xps_map_lock);
+
+ maps = dev->xps_maps;
+
+ if (maps) {
+ qmask = maps->queues;
+ for (i = 0; i < num_possible_cpus(); i++) {
+ clear_bit(index, qmask);
+ if (!nonempty &&
+ bitmap_weight(qmask, dev->real_num_tx_queues))
+ nonempty = 1;
+ qmask += qmask_size;
+ }
+
+ if (!nonempty) {
+ rcu_assign_pointer(dev->xps_maps, NULL);
+ call_rcu(&maps->rcu, xps_map_release);
+ }
+ }
+ mutex_unlock(&xps_map_lock);
+
+ if (atomic_dec_and_test(&first->count)) {
+ kfree(first);
+ dev_put(dev);
+ }
+}
+
+static struct kobj_type netdev_queue_ktype = {
+ .sysfs_ops = &netdev_queue_sysfs_ops,
+ .release = netdev_queue_release,
+ .default_attrs = netdev_queue_default_attrs,
+};
+
+static int netdev_queue_add_kobject(struct net_device *net, int index)
+{
+ struct netdev_queue *queue = net->_tx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ kobj->kset = net->queues_kset;
+ error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
+ "tx-%u", index);
+ if (error) {
+ kobject_put(kobj);
+ return error;
+ }
+
+ kobject_uevent(kobj, KOBJ_ADD);
+
+ return error;
+}
+
+static int register_queue_kobjects(struct net_device *net)
+{
+ int rx = 0, tx = 0;
int error = 0;
net->queues_kset = kset_create_and_add("queues",
NULL, &net->dev.kobj);
if (!net->queues_kset)
return -ENOMEM;
- for (i = 0; i < net->num_rx_queues; i++) {
- error = rx_queue_add_kobject(net, i);
+
+ for (rx = 0; rx < net->num_rx_queues; rx++) {
+ error = rx_queue_add_kobject(net, rx);
if (error)
- break;
+ goto error;
}
- if (error)
- while (--i >= 0)
- kobject_put(&net->_rx[i].kobj);
+ for (tx = 0; tx < net->num_tx_queues; tx++) {
+ error = netdev_queue_add_kobject(net, tx);
+ if (error)
+ goto error;
+ }
+ dev_hold(net);
+
+ return error;
+
+error:
+ while (--rx >= 0)
+ kobject_put(&net->_rx[rx].kobj);
+
+ while (--tx >= 0)
+ kobject_put(&net->_tx[tx].kobj);
return error;
}
-static void rx_queue_remove_kobjects(struct net_device *net)
+static void remove_queue_kobjects(struct net_device *net)
{
int i;
for (i = 0; i < net->num_rx_queues; i++)
kobject_put(&net->_rx[i].kobj);
+ for (i = 0; i < net->num_tx_queues; i++)
+ kobject_put(&net->_tx[i].kobj);
kset_unregister(net->queues_kset);
}
#endif /* CONFIG_RPS */
@@ -871,7 +1132,7 @@ void netdev_unregister_kobject(struct net_device * net)
kobject_get(&dev->kobj);
#ifdef CONFIG_RPS
- rx_queue_remove_kobjects(net);
+ remove_queue_kobjects(net);
#endif
device_del(dev);
@@ -912,7 +1173,7 @@ int netdev_register_kobject(struct net_device *net)
return error;
#ifdef CONFIG_RPS
- error = rx_queue_register_kobjects(net);
+ error = register_queue_kobjects(net);
if (error) {
device_del(dev);
return error;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de3bd84..80c1928 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -828,8 +828,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
&md5);
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
- if (tcp_packets_in_flight(tp) == 0)
+ if (tcp_packets_in_flight(tp) == 0) {
tcp_ca_event(sk, CA_EVENT_TX_START);
+ skb->ooo_okay = 1;
+ }
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
^ permalink raw reply related [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-08-23 5:39 [PATCH] xps-mq: Transmit Packet Steering for multiqueue Tom Herbert
@ 2010-08-23 17:09 ` Ben Hutchings
[not found] ` <AANLkTinST5zaS0NtBjrzyLbsg=w_EVsHE3DCDcrmQNc6@mail.gmail.com>
2010-08-23 17:59 ` Stephen Hemminger
2010-08-24 4:31 ` Bill Fink
2 siblings, 1 reply; 28+ messages in thread
From: Ben Hutchings @ 2010-08-23 17:09 UTC (permalink / raw)
To: Tom Herbert; +Cc: davem, netdev, eric.dumazet
On Sun, 2010-08-22 at 22:39 -0700, Tom Herbert wrote:
[...]
> Each transmit queue can be associated with a number of CPUs which will
> use the queue to send packets. This is configured as a CPU mask on a
> per queue basis in:
>
> /sys/class/net/eth<n>/queues/tx-<n>/xps_cpus
>
> The mappings are stored per device in an inverted data structure that
> maps CPUs to queues. In the netdevice structure this is an array of
> num_possible_cpu structures where each array entry contains a bit map
> of queues which that CPU can use.
[...]
The mapping of TX queue to CPU should match the affinity of the
completion IRQ for that queue. It should not be a separate setting.
Ben.
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
[not found] ` <AANLkTinST5zaS0NtBjrzyLbsg=w_EVsHE3DCDcrmQNc6@mail.gmail.com>
@ 2010-08-23 17:50 ` Ben Hutchings
0 siblings, 0 replies; 28+ messages in thread
From: Ben Hutchings @ 2010-08-23 17:50 UTC (permalink / raw)
To: Tom Herbert; +Cc: davem, netdev, eric.dumazet
On Mon, 2010-08-23 at 10:30 -0700, Tom Herbert wrote:
>
>
> On Mon, Aug 23, 2010 at 10:09 AM, Ben Hutchings
> <bhutchings@solarflare.com> wrote:
> On Sun, 2010-08-22 at 22:39 -0700, Tom Herbert wrote:
> [...]
> > Each transmit queue can be associated with a number of CPUs which will
> > use the queue to send packets. This is configured as a CPU mask on a
> > per queue basis in:
> >
> > /sys/class/net/eth<n>/queues/tx-<n>/xps_cpus
> >
> > The mappings are stored per device in an inverted data structure that
> > maps CPUs to queues. In the netdevice structure this is an array of
> > num_possible_cpu structures where each array entry contains a bit map
> > of queues which that CPU can use.
>
> [...]
>
> The mapping of TX queue to CPU should match the affinity of the
> completion IRQ for that queue. It should not be a separate setting.
>
>
>
> That implies one possible configuration, but there are others. For
> instance, there may be fewer queues than CPUs in which case a TX queue
> would be used by more than just the CPU handling the IRQ.
The affinity of IRQs is not restricted to a single CPU either, but I
take your point.
The IRQ affinity and the mapping of sender to queue do at least need to
be coordinated, and I think that continuing to add independent knobs for
CPU affinity of closely-related objects makes it too hard for
administrators to get this right.
Ben.
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-08-23 5:39 [PATCH] xps-mq: Transmit Packet Steering for multiqueue Tom Herbert
2010-08-23 17:09 ` Ben Hutchings
@ 2010-08-23 17:59 ` Stephen Hemminger
2010-09-01 15:41 ` Tom Herbert
2010-08-24 4:31 ` Bill Fink
2 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2010-08-23 17:59 UTC (permalink / raw)
To: Tom Herbert; +Cc: davem, netdev, eric.dumazet
On Sun, 22 Aug 2010 22:39:57 -0700 (PDT)
Tom Herbert <therbert@google.com> wrote:
> This patch implements transmit packet steering (XPS) for multiqueue
> devices. XPS selects a transmit queue during packet transmission based
> on configuration. This is done by mapping the CPU transmitting the
> packet to a queue. This is the transmit side analogue to RPS-- where
> RPS is selecting a CPU based on receive queue, XPS selects a queue
> based on the CPU (previously there was an XPS patch from Eric
> Dumazet, but that might more appropriately be called transmit completion
> steering).
>
> Each transmit queue can be associated with a number of CPUs which will
> used the queue to send packets. This is configured as a CPU mask on a
> per queue basis in:
>
> /sys/class/net/eth<n>/queues/tx-<n>/xps_cpus
>
> The mappings are stored per device in an inverted data structure that
> maps CPUs to queues. In the netdevice structure this is an array of
> num_possible_cpu structures where each array entry contains a bit map
> of queues which that CPU can use.
>
> We also allow the mapping of a socket to queue to be modified, for
> instance if a thread is scheduled on a different CPU the desired queue
> for transmitting packets would likely change. To maintain in order
> packet transmission a flag (ooo_okay) has been added to the sk_buf
> structure. If a transport layer sets this flag on a packet, the
> transmit queue can be changed for this socket. Presumably, the
> transport would set this is there was no possbility of creating ooo
> packets (for instance there are no packets in flight for the socket).
> This patch includes the modification in TCP output for setting this
> flag.
>
> The benefits of XPS are improved locality in the per queue data
> strutures. Also, transmit completions are more likely to be done
> nearer to the sending thread so this should promote locality back
> to the socket (e.g. UDP). The benefits of XPS are dependent on
> cache hierarchy, application load, and other factors. XPS would
> nominally be configured so that a queue would only be shared by CPUs
> which are sharing a cache, the degenerative configuration woud be that
> each CPU has it's own queue.
>
> Below are some benchmark results which show the potential benfit of
> this patch. The netperf test has 500 instances of netperf TCP_RR test
> with 1 byte req. and resp.
>
> bnx2x on 16 core AMD
> XPS (16 queues, 1 TX queue per CPU) 1015K at 99% CPU
> No XPS (16 queues) 1127K at 98% CPU
>
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 46c36ff..0ff6c9f 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -497,6 +497,12 @@ struct netdev_queue {
> struct Qdisc *qdisc;
> unsigned long state;
> struct Qdisc *qdisc_sleeping;
> +#ifdef CONFIG_RPS
> + struct kobject kobj;
> + struct netdev_queue *first;
> + atomic_t count;
> +#endif
> +
> /*
> * write mostly part
> */
> @@ -524,6 +530,22 @@ struct rps_map {
> #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
>
> /*
> + * This structure holds an XPS map which can be of variable length. queues
> + * is an array of num_possible_cpus entries, where each entry is a mask of
> + * queues for that CPU (up to num_tx_queues bits for device).
> + */
> +struct xps_map {
> + struct rcu_head rcu;
> + unsigned long queues[0];
> +};
> +
> +#define QUEUE_MASK_SIZE(dev) (BITS_TO_LONGS(dev->num_tx_queues))
> +#define XPS_MAP_SIZE(dev) (sizeof(struct xps_map) + (num_possible_cpus() * \
> + QUEUE_MASK_SIZE(dev) * sizeof(unsigned long)))
> +#define XPS_ENTRY(map, offset, dev) \
> + (&map->queues[offset * QUEUE_MASK_SIZE(dev)])
> +
> +/*
> * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
> * tail pointer for that CPU's input queue at the time of last enqueue.
> */
> @@ -978,6 +1000,9 @@ struct net_device {
> void *rx_handler_data;
>
> struct netdev_queue *_tx ____cacheline_aligned_in_smp;
> +#ifdef CONFIG_RPS
> + struct xps_map *xps_maps;
> +#endif
>
> /* Number of TX queues allocated at alloc_netdev_mq() time */
> unsigned int num_tx_queues;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index f067c95..146df6f 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -381,6 +381,7 @@ struct sk_buff {
> #else
> __u8 deliver_no_wcard:1;
> #endif
> + __u8 ooo_okay:1;
> kmemcheck_bitfield_end(flags2);
>
> /* 0/14 bit hole */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index da584f5..d23f9c4 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2054,6 +2054,60 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
> return queue_index;
> }
>
> +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb,
> + int queue_index)
> +{
> + struct xps_map *maps;
> + int cpu = smp_processor_id();
> + u32 hash;
> + unsigned long *queues;
> + int weight, select;
> +
> + rcu_read_lock();
> + maps = rcu_dereference(dev->xps_maps);
> +
> + if (!maps) {
> + rcu_read_unlock();
> + return queue_index;
> + }
> +
> + queues = XPS_ENTRY(maps, cpu, dev);
> +
> + if (queue_index >= 0) {
> + if (test_bit(queue_index, queues)) {
> + rcu_read_unlock();
> + return queue_index;
> + }
> + }
> +
> + weight = bitmap_weight(queues, dev->real_num_tx_queues);
> + switch (weight) {
> + case 0:
> + break;
> + case 1:
> + queue_index =
> + find_first_bit(queues, dev->real_num_tx_queues);
> + break;
> + default:
> + if (skb->sk && skb->sk->sk_hash)
> + hash = skb->sk->sk_hash;
> + else
> + hash = (__force u16) skb->protocol ^ skb->rxhash;
> + hash = jhash_1word(hash, hashrnd);
> +
> + select = ((u64) hash * weight) >> 32;
> + queue_index =
> + find_first_bit(queues, dev->real_num_tx_queues);
> + while (select--)
> + queue_index = find_next_bit(queues,
> + dev->real_num_tx_queues, queue_index);
> + break;
> + }
> +
> + rcu_read_unlock();
> + return queue_index;
> +}
> +
> static struct netdev_queue *dev_pick_tx(struct net_device *dev,
> struct sk_buff *skb)
> {
> @@ -2061,23 +2115,30 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
> struct sock *sk = skb->sk;
>
> queue_index = sk_tx_queue_get(sk);
> - if (queue_index < 0) {
> +
> + if (queue_index < 0 || (skb->ooo_okay && dev->real_num_tx_queues > 1)) {
> const struct net_device_ops *ops = dev->netdev_ops;
> + int old_index = queue_index;
>
> if (ops->ndo_select_queue) {
> queue_index = ops->ndo_select_queue(dev, skb);
> queue_index = dev_cap_txqueue(dev, queue_index);
> } else {
> - queue_index = 0;
> - if (dev->real_num_tx_queues > 1)
> - queue_index = skb_tx_hash(dev, skb);
> + if (dev->real_num_tx_queues > 1) {
> + queue_index = get_xps_queue(dev,
> + skb, queue_index);
> + if (queue_index < 0)
> + queue_index = skb_tx_hash(dev, skb);
> + } else
> + queue_index = 0;
> + }
>
> - if (sk) {
> - struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
> + if ((queue_index != old_index) && sk) {
> + struct dst_entry *dst =
> + rcu_dereference_check(sk->sk_dst_cache, 1);
>
> - if (dst && skb_dst(skb) == dst)
> - sk_tx_queue_set(sk, queue_index);
> - }
> + if (dst && skb_dst(skb) == dst)
> + sk_tx_queue_set(sk, queue_index);
> }
> }
>
> @@ -5429,6 +5490,15 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> }
>
> #ifdef CONFIG_RPS
> + atomic_set(&tx->count, queue_count);
> +
> + /*
> + * Set a pointer to first element in the array which holds the
> + * reference count.
> + */
> + for (i = 0; i < queue_count; i++)
> + tx[i].first = tx;
> +
> rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
> if (!rx) {
> printk(KERN_ERR "alloc_netdev: Unable to allocate "
> @@ -5506,7 +5576,9 @@ void free_netdev(struct net_device *dev)
>
> release_net(dev_net(dev));
>
> +#ifndef CONFIG_RPS
> kfree(dev->_tx);
> +#endif
>
> /* Flush device addresses */
> dev_addr_flush(dev);
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index af4dfba..661c481 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -742,34 +742,295 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
> return error;
> }
>
> -static int rx_queue_register_kobjects(struct net_device *net)
> +/*
> + * netdev_queue sysfs structures and functions.
> + */
> +struct netdev_queue_attribute {
> + struct attribute attr;
> + ssize_t (*show)(struct netdev_queue *queue,
> + struct netdev_queue_attribute *attr, char *buf);
> + ssize_t (*store)(struct netdev_queue *queue,
> + struct netdev_queue_attribute *attr, const char *buf, size_t len);
> +};
> +#define to_netdev_queue_attr(_attr) container_of(_attr, \
> + struct netdev_queue_attribute, attr)
> +
> +#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
> +
> +static ssize_t netdev_queue_attr_show(struct kobject *kobj,
> + struct attribute *attr, char *buf)
> +{
> + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
> + struct netdev_queue *queue = to_netdev_queue(kobj);
> +
> + if (!attribute->show)
> + return -EIO;
> +
> + return attribute->show(queue, attribute, buf);
> +}
> +
> +static ssize_t netdev_queue_attr_store(struct kobject *kobj,
> + struct attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
> + struct netdev_queue *queue = to_netdev_queue(kobj);
> +
> + if (!attribute->store)
> + return -EIO;
> +
> + return attribute->store(queue, attribute, buf, count);
> +}
> +
> +static struct sysfs_ops netdev_queue_sysfs_ops = {
> + .show = netdev_queue_attr_show,
> + .store = netdev_queue_attr_store,
> +};
> +
> +static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
> {
> + struct net_device *dev = queue->dev;
> + int i;
> +
> + for (i = 0; i < dev->num_tx_queues; i++)
> + if (queue == &dev->_tx[i])
> + break;
> +
> + BUG_ON(i >= dev->num_tx_queues);
> +
> + return i;
> +}
> +
> +static ssize_t show_xps_map(struct netdev_queue *queue,
> + struct netdev_queue_attribute *attribute, char *buf)
> +{
> + struct net_device *dev = queue->dev;
> + struct xps_map *maps;
> + cpumask_var_t mask;
> + unsigned long *qmask, index;
> + size_t len = 0;
> int i;
> + unsigned int qmask_size = QUEUE_MASK_SIZE(dev);
> +
> + if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + index = get_netdev_queue_index(queue);
> +
> + rcu_read_lock();
> + maps = rcu_dereference(dev->xps_maps);
> + if (maps) {
> + qmask = maps->queues;
> + for (i = 0; i < num_possible_cpus(); i++) {
> + if (test_bit(index, qmask))
> + cpumask_set_cpu(i, mask);
> + qmask += qmask_size;
> + }
> + }
> + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
> + if (PAGE_SIZE - len < 3) {
> + rcu_read_unlock();
> + free_cpumask_var(mask);
> + return -EINVAL;
> + }
> + rcu_read_unlock();
> +
> + free_cpumask_var(mask);
> + len += sprintf(buf + len, "\n");
> + return len;
> +}
> +
> +static void xps_map_release(struct rcu_head *rcu)
> +{
> + struct xps_map *map = container_of(rcu, struct xps_map, rcu);
> +
> + kfree(map);
> +}
> +
> +static DEFINE_MUTEX(xps_map_lock);
> +
> +static ssize_t store_xps_map(struct netdev_queue *queue,
> + struct netdev_queue_attribute *attribute,
> + const char *buf, size_t len)
> +{
> + struct net_device *dev = queue->dev;
> + struct xps_map *maps;
> + cpumask_var_t mask;
> + int err, i, nonempty = 0;
> + unsigned long *qmask, index;
> + unsigned int qmask_size = QUEUE_MASK_SIZE(dev);
> +
> + if (!capable(CAP_NET_ADMIN))
> + return -EPERM;
> +
> + if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
> + if (err) {
> + free_cpumask_var(mask);
> + return err;
> + }
> +
> + mutex_lock(&xps_map_lock);
> +
> + maps = dev->xps_maps;
> + if (!maps) {
> + if (!cpumask_weight(mask)) {
> + mutex_unlock(&xps_map_lock);
> + free_cpumask_var(mask);
> + return 0;
> + }
> + maps = kzalloc(XPS_MAP_SIZE(dev), GFP_KERNEL);
> + if (!maps) {
> + mutex_unlock(&xps_map_lock);
> + free_cpumask_var(mask);
> + return -ENOMEM;
> + }
> + rcu_assign_pointer(dev->xps_maps, maps);
> + }
> +
> + index = get_netdev_queue_index(queue);
> +
> + qmask = maps->queues;
> + for (i = 0; i < num_possible_cpus(); i++) {
> + if (cpu_isset(i, *mask) && cpu_online(i)) {
> + set_bit(index, qmask);
> + nonempty = 1;
> + } else
> + clear_bit(index, qmask);
> + if (!nonempty &&
> + bitmap_weight(qmask, dev->real_num_tx_queues))
> + nonempty = 1;
> + qmask += qmask_size;
> + }
> +
> + if (!nonempty) {
> + rcu_assign_pointer(dev->xps_maps, NULL);
> + call_rcu(&maps->rcu, xps_map_release);
> + }
> +
> + mutex_unlock(&xps_map_lock);
> +
> + free_cpumask_var(mask);
> + return len;
> +}
> +
> +static struct netdev_queue_attribute xps_cpus_attribute =
> + __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
> +
> +static struct attribute *netdev_queue_default_attrs[] = {
> + &xps_cpus_attribute.attr,
> + NULL
> +};
> +
> +static void netdev_queue_release(struct kobject *kobj)
> +{
> + struct netdev_queue *queue = to_netdev_queue(kobj);
> + struct net_device *dev = queue->dev;
> + struct netdev_queue *first = queue->first;
> + struct xps_map *maps;
> + unsigned long *qmask, index;
> + int i, nonempty = 0;
> + unsigned int qmask_size = QUEUE_MASK_SIZE(dev);
> +
> + index = get_netdev_queue_index(queue);
> +
> + mutex_lock(&xps_map_lock);
> +
> + maps = dev->xps_maps;
> +
> + if (maps) {
> + qmask = maps->queues;
> + for (i = 0; i < num_possible_cpus(); i++) {
> + clear_bit(index, qmask);
> + if (!nonempty &&
> + bitmap_weight(qmask, dev->real_num_tx_queues))
> + nonempty = 1;
> + qmask += qmask_size;
> + }
> +
> + if (!nonempty) {
> + rcu_assign_pointer(dev->xps_maps, NULL);
> + call_rcu(&maps->rcu, xps_map_release);
> + }
> + }
> + mutex_unlock(&xps_map_lock);
> +
> + if (atomic_dec_and_test(&first->count)) {
> + kfree(first);
> + dev_put(dev);
> + }
> +}
> +
> +static struct kobj_type netdev_queue_ktype = {
> + .sysfs_ops = &netdev_queue_sysfs_ops,
> + .release = netdev_queue_release,
> + .default_attrs = netdev_queue_default_attrs,
> +};
> +
> +static int netdev_queue_add_kobject(struct net_device *net, int index)
> +{
> + struct netdev_queue *queue = net->_tx + index;
> + struct kobject *kobj = &queue->kobj;
> + int error = 0;
> +
> + kobj->kset = net->queues_kset;
> + error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
> + "tx-%u", index);
> + if (error) {
> + kobject_put(kobj);
> + return error;
> + }
> +
> + kobject_uevent(kobj, KOBJ_ADD);
> +
> + return error;
> +}
> +
> +static int register_queue_kobjects(struct net_device *net)
> +{
> + int rx = 0, tx = 0;
> int error = 0;
>
> net->queues_kset = kset_create_and_add("queues",
> NULL, &net->dev.kobj);
> if (!net->queues_kset)
> return -ENOMEM;
> - for (i = 0; i < net->num_rx_queues; i++) {
> - error = rx_queue_add_kobject(net, i);
> +
> + for (rx = 0; rx < net->num_rx_queues; rx++) {
> + error = rx_queue_add_kobject(net, rx);
> if (error)
> - break;
> + goto error;
> }
>
> - if (error)
> - while (--i >= 0)
> - kobject_put(&net->_rx[i].kobj);
> + for (tx = 0; tx < net->num_tx_queues; tx++) {
> + error = netdev_queue_add_kobject(net, tx);
> + if (error)
> + goto error;
> + }
> + dev_hold(net);
> +
> + return error;
> +
> +error:
> + while (--rx >= 0)
> + kobject_put(&net->_rx[rx].kobj);
> +
> + while (--tx >= 0)
> + kobject_put(&net->_tx[tx].kobj);
>
> return error;
> }
>
> -static void rx_queue_remove_kobjects(struct net_device *net)
> +static void remove_queue_kobjects(struct net_device *net)
> {
> int i;
>
> for (i = 0; i < net->num_rx_queues; i++)
> kobject_put(&net->_rx[i].kobj);
> + for (i = 0; i < net->num_tx_queues; i++)
> + kobject_put(&net->_tx[i].kobj);
> kset_unregister(net->queues_kset);
> }
> #endif /* CONFIG_RPS */
> @@ -871,7 +1132,7 @@ void netdev_unregister_kobject(struct net_device * net)
> kobject_get(&dev->kobj);
>
> #ifdef CONFIG_RPS
> - rx_queue_remove_kobjects(net);
> + remove_queue_kobjects(net);
> #endif
>
> device_del(dev);
> @@ -912,7 +1173,7 @@ int netdev_register_kobject(struct net_device *net)
> return error;
>
> #ifdef CONFIG_RPS
> - error = rx_queue_register_kobjects(net);
> + error = register_queue_kobjects(net);
> if (error) {
> device_del(dev);
> return error;
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index de3bd84..80c1928 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -828,8 +828,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
> &md5);
> tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
>
> - if (tcp_packets_in_flight(tp) == 0)
> + if (tcp_packets_in_flight(tp) == 0) {
> tcp_ca_event(sk, CA_EVENT_TX_START);
> + skb->ooo_okay = 1;
> + }
>
> skb_push(skb, tcp_header_size);
> skb_reset_transport_header(skb);
Why don't we do this in the normal transmit processing?
There is already so much policy mechanism (filters/actions/qdisc) that
doing it at a higher level is fighting against these.
--
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-08-23 5:39 [PATCH] xps-mq: Transmit Packet Steering for multiqueue Tom Herbert
2010-08-23 17:09 ` Ben Hutchings
2010-08-23 17:59 ` Stephen Hemminger
@ 2010-08-24 4:31 ` Bill Fink
2010-08-24 4:37 ` Tom Herbert
2 siblings, 1 reply; 28+ messages in thread
From: Bill Fink @ 2010-08-24 4:31 UTC (permalink / raw)
To: Tom Herbert; +Cc: davem, netdev, eric.dumazet
On Sun, 22 Aug 2010, Tom Herbert wrote:
> This patch implements transmit packet steering (XPS) for multiqueue
> devices. XPS selects a transmit queue during packet transmission based
> on configuration. This is done by mapping the CPU transmitting the
> packet to a queue. This is the transmit side analogue to RPS-- where
> RPS is selecting a CPU based on receive queue, XPS selects a queue
> based on the CPU (previously there was an XPS patch from Eric
> Dumazet, but that might more appropriately be called transmit completion
> steering).
>
> Each transmit queue can be associated with a number of CPUs which will
> used the queue to send packets. This is configured as a CPU mask on a
> per queue basis in:
>
> /sys/class/net/eth<n>/queues/tx-<n>/xps_cpus
>
> The mappings are stored per device in an inverted data structure that
> maps CPUs to queues. In the netdevice structure this is an array of
> num_possible_cpu structures where each array entry contains a bit map
> of queues which that CPU can use.
>
> We also allow the mapping of a socket to queue to be modified, for
> instance if a thread is scheduled on a different CPU the desired queue
> for transmitting packets would likely change. To maintain in order
> packet transmission a flag (ooo_okay) has been added to the sk_buf
> structure. If a transport layer sets this flag on a packet, the
> transmit queue can be changed for this socket. Presumably, the
> transport would set this is there was no possbility of creating ooo
> packets (for instance there are no packets in flight for the socket).
> This patch includes the modification in TCP output for setting this
> flag.
>
> The benefits of XPS are improved locality in the per queue data
> strutures. Also, transmit completions are more likely to be done
> nearer to the sending thread so this should promote locality back
> to the socket (e.g. UDP). The benefits of XPS are dependent on
> cache hierarchy, application load, and other factors. XPS would
> nominally be configured so that a queue would only be shared by CPUs
> which are sharing a cache, the degenerative configuration woud be that
> each CPU has it's own queue.
>
> Below are some benchmark results which show the potential benfit of
> this patch. The netperf test has 500 instances of netperf TCP_RR test
> with 1 byte req. and resp.
>
> bnx2x on 16 core AMD
> XPS (16 queues, 1 TX queue per CPU) 1015K at 99% CPU
> No XPS (16 queues) 1127K at 98% CPU
I don't grok your performance numbers. What do the 1015K and 1127K
numbers represent? I was originally guessing that they were basically
transactions per second, but that would seem to imply that the No XPS
case was better. Please clarify.
-Thanks
-Bill
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-08-24 4:31 ` Bill Fink
@ 2010-08-24 4:37 ` Tom Herbert
0 siblings, 0 replies; 28+ messages in thread
From: Tom Herbert @ 2010-08-24 4:37 UTC (permalink / raw)
To: Bill Fink; +Cc: davem, netdev, eric.dumazet
> > with 1 byte req. and resp.
> >
> > bnx2x on 16 core AMD
> > XPS (16 queues, 1 TX queue per CPU) 1015K at 99% CPU
> > No XPS (16 queues) 1127K at 98% CPU
>
> I don't grok your performance numbers. What do the 1015K and 1127K
> numbers represent? I was originally guessing that they were basically
> transactions per second, but that would seem to imply that the No XPS
> case was better. Please clarify.
>
Yes, TPS and the numbers were switched... XPS case was better!
Thanks for pointing that out.
Tom
> -Thanks
>
> -Bill
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-08-23 17:59 ` Stephen Hemminger
@ 2010-09-01 15:41 ` Tom Herbert
2010-09-01 15:54 ` Eric Dumazet
2010-09-01 16:09 ` David Miller
0 siblings, 2 replies; 28+ messages in thread
From: Tom Herbert @ 2010-09-01 15:41 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: davem, netdev, eric.dumazet
> Why don't we do this in the normal transmit processing.
> There is already so much policy mechanism filters/actions/qdisc that
> doing it in higher level is fighting against these.
>
Are you proposing that TX queue selection be done in the qdiscs? The
queue has to be selected before taking the lock (cannot afford taking
a lock over the whole interface). This would necessitate moving the
locking and probably rearranging a lot of the xmit code around that.
Tom
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-01 15:41 ` Tom Herbert
@ 2010-09-01 15:54 ` Eric Dumazet
2010-09-01 16:24 ` Tom Herbert
2010-09-01 16:09 ` David Miller
1 sibling, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2010-09-01 15:54 UTC (permalink / raw)
To: Tom Herbert; +Cc: Stephen Hemminger, davem, netdev
On Wednesday, 1 September 2010 at 08:41 -0700, Tom Herbert wrote:
> > Why don't we do this in the normal transmit processing.
> > There is already so much policy mechanism filters/actions/qdisc that
> > doing it in higher level is fighting against these.
> >
> Are you proposing that TX queue selection be done in the qdiscs? The
> queue has to be selected before taking the lock (cannot afford taking
> a lock over the whole interface). This would necessitate moving the
> locking and probably rearranging a lot of the xmit code around that.
Stephen's point is to not add yet another layer 'before' the qdisc layer.
I would like something not as complex as your patch.
1) Why current selection fails ?
2) Could we change current selection to :
- Use a lightweight selection, with no special configuration.
- Use driver RX multiqueue information if available, in a one-to-one
relationship.
3) Eventually have a user selectable selection (socket option, or system
wide, but one sysctl, not many bitmasks ;) ).
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-01 15:41 ` Tom Herbert
2010-09-01 15:54 ` Eric Dumazet
@ 2010-09-01 16:09 ` David Miller
1 sibling, 0 replies; 28+ messages in thread
From: David Miller @ 2010-09-01 16:09 UTC (permalink / raw)
To: therbert; +Cc: shemminger, netdev, eric.dumazet
From: Tom Herbert <therbert@google.com>
Date: Wed, 1 Sep 2010 08:41:14 -0700
>> Why don't we do this in the normal transmit processing.
>> There is already so much policy mechanism filters/actions/qdisc that
>> doing it in higher level is fighting against these.
>>
> Are you proposing that TX queue selection be done in the qdiscs? The
> queue has to be selected before taking the lock (cannot afford taking
> a lock over the whole interface). This would necessitate moving the
> locking and probably rearranging a lot of the xmit code around that.
Right, we really have to pick queues before entering the qdisc for
the full benefit of lock separation.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-01 15:54 ` Eric Dumazet
@ 2010-09-01 16:24 ` Tom Herbert
2010-09-02 1:32 ` David Miller
0 siblings, 1 reply; 28+ messages in thread
From: Tom Herbert @ 2010-09-01 16:24 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Stephen Hemminger, davem, netdev
On Wed, Sep 1, 2010 at 8:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wednesday, 1 September 2010 at 08:41 -0700, Tom Herbert wrote:
>> > Why don't we do this in the normal transmit processing.
>> > There is already so much policy mechanism filters/actions/qdisc that
>> > doing it in higher level is fighting against these.
>> >
>> Are you proposing that TX queue selection be done in the qdiscs? The
>> queue has to be selected before taking the lock (cannot afford taking
>> a lock over the whole interface). This would necessitate moving the
>> locking and probably rearranging a lot of the xmit code around that.
>
> Stephen point is not adding yet another layer 'before' qdisc layer.
>
> I would like something not as complex as your patch.
>
> 1) Why current selection fails ?
>
The current selection does a hash on the 4-tuple to map packets to
queues, so any CPU can send on any queue, which leads to cache line
bouncing of transmit structures. Also, sending from one CPU to a queue
whose transmit interrupt is handled on a CPU in another cache domain
causes more cache line bouncing on transmit completion. So while the
current scheme nicely distributes load across the queues, it does
nothing to promote locality. Getting some reasonable locality is where
the benefits we are demonstrating come from.
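For reference, a minimal standalone sketch of the scaled-hash mapping used by
skb_tx_hash() and by get_xps_queue() in the patch: a 32-bit hash is mapped onto
n queues by taking the upper 32 bits of hash * n, avoiding a modulo. The hash
values below are made up.

#include <stdint.h>
#include <stdio.h>

/* map a 32-bit hash uniformly onto [0, n) without a modulo */
static unsigned int pick_queue(uint32_t hash, unsigned int n)
{
        return (unsigned int)(((uint64_t)hash * n) >> 32);
}

int main(void)
{
        uint32_t flows[] = { 0x12345678, 0xdeadbeef, 0x00000001 };
        unsigned int i;

        for (i = 0; i < 3; i++)
                printf("hash %08x -> queue %u of 4\n",
                       (unsigned int)flows[i], pick_queue(flows[i], 4));
        return 0;
}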
> 2) Could we change current selection to :
>
> - Use a lightweight selection, with no special configuration.
>
> - Use driver RX multiqueue information if available, in a one-to-one
> relationship.
>
Not generally. It's very possible that only a subset of CPUs are
getting RX interrupts in multiqueue (consider when #queues < #CPUs),
so there's really not an obvious 1-1 relationship. But each CPU can
send and should be mapped to at least one transmit queue; the most
obvious plan would be to send on a queue in the same cache domain.
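A tiny sketch of that plan, with all numbers assumed for illustration: CPUs
that share a cache are grouped onto the same TX queue when there are fewer
queues than CPUs.

#include <stdio.h>

/* e.g. 16 CPUs, 4 TX queues, 4 CPUs per shared cache:
 * CPUs 0-3 -> tx-0, CPUs 4-7 -> tx-1, and so on. */
static unsigned int cpu_to_txq(unsigned int cpu,
                               unsigned int cpus_per_cache,
                               unsigned int num_txq)
{
        return (cpu / cpus_per_cache) % num_txq;
}

int main(void)
{
        unsigned int cpu;

        for (cpu = 0; cpu < 16; cpu++)
                printf("cpu %2u -> tx-%u\n", cpu, cpu_to_txq(cpu, 4, 4));
        return 0;
}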
> 3) Eventually have a user selectable selection (socket option, or system
> wide, but one sysctl, not many bitmasks ;) ).
>
Right, but it would also be nice if a single sysctl could optimally
set up multiqueue, RSS, RPS, and all my interrupt affinities for me
;-)
Tom
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-01 16:24 ` Tom Herbert
@ 2010-09-02 1:32 ` David Miller
2010-09-02 1:48 ` Stephen Hemminger
` (4 more replies)
0 siblings, 5 replies; 28+ messages in thread
From: David Miller @ 2010-09-02 1:32 UTC (permalink / raw)
To: therbert; +Cc: eric.dumazet, shemminger, netdev
From: Tom Herbert <therbert@google.com>
Date: Wed, 1 Sep 2010 09:24:18 -0700
> On Wed, Sep 1, 2010 at 8:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> 3) Eventually have a user selectable selection (socket option, or system
>> wide, but one sysctl, not many bitmasks ;) ).
>>
> Right, but it would also be nice if a single sysctl could optimally
> set up multiqueue, RSS, RPS, and all my interrupt affinities for me
> ;-)
It's becoming increasingly obvious to me that we need (somewhere,
not necessarily in the kernel) a complete data structure representing
the NUMA, cache, CPU, and device hierarchy.
And that can be used to tweak all of this stuff.
The policy should probably be in userspace, we just need to provide
the knobs in the kernel to tweak it however userspace wants.
Userspace should be able to, for example, move a TX queue into a
NUMA domain and have this invoke several side effects:
1) IRQs for that TX queue get rerouted to a cpu in the NUMA
domain.
2) TX queue datastructures in the driver get reallocated using
memory in that NUMA domain.
3) TX hashing is configured to use the set of cpus in the NUMA
domain.
It's a lot of tedious work and involves some delicate tasks figuring
out where each of these things goes, but then we'd really solve all
of this crap once and for all.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 1:32 ` David Miller
@ 2010-09-02 1:48 ` Stephen Hemminger
2010-09-02 16:00 ` Loke, Chetan
2010-09-02 1:56 ` Stephen Hemminger
` (3 subsequent siblings)
4 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2010-09-02 1:48 UTC (permalink / raw)
To: David Miller; +Cc: therbert, eric.dumazet, netdev
On Wed, 01 Sep 2010 18:32:51 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Wed, 1 Sep 2010 09:24:18 -0700
>
> > On Wed, Sep 1, 2010 at 8:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >> 3) Eventually have a user selectable selection (socket option, or system
> >> wide, but one sysctl, not many bitmasks ;) ).
> >>
> > Right, but it would also be nice if a single sysctl could optimally
> > set up multiqueue, RSS, RPS, and all my interrupt affinities for me
> > ;-)
>
> It's becomming increasingly obvious to me that we need (somewhere,
> not necessarily the kernel) a complete datastructure representing
> the NUMA, cache, cpu, device hierarchy.
>
> And that can be used to tweak all of this stuff.
>
> The policy should probably be in userspace, we just need to provide
> the knobs in the kernel to tweak it however userspace wants.
>
> Userspace should be able to, for example, move a TX queue into a
> NUMA domain and have this invoke several side effects:
>
> 1) IRQs for that TX queue get rerouted to a cpu in the NUMA
> domain.
>
> 2) TX queue datastructures in the driver get reallocated using
> memory in that NUMA domain.
>
> 3) TX hashing is configured to use the set of cpus in the NUMA
> domain.
>
> It's alot of tedious work and involves some delicate tasks figuring
> out where each of these things go, but really then we'd solve all
> of this crap one and for all.
Plus it needs to work with the scheduler (not fight it). All this doesn't
work very well if the process keeps bouncing away from its resources.
--
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 1:32 ` David Miller
2010-09-02 1:48 ` Stephen Hemminger
@ 2010-09-02 1:56 ` Stephen Hemminger
2010-09-02 6:41 ` Greg Lindahl
2010-09-02 16:18 ` Loke, Chetan
2010-09-02 15:55 ` Loke, Chetan
` (2 subsequent siblings)
4 siblings, 2 replies; 28+ messages in thread
From: Stephen Hemminger @ 2010-09-02 1:56 UTC (permalink / raw)
To: David Miller; +Cc: therbert, eric.dumazet, netdev
On Wed, 01 Sep 2010 18:32:51 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Wed, 1 Sep 2010 09:24:18 -0700
>
> > On Wed, Sep 1, 2010 at 8:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >> 3) Eventually have a user selectable selection (socket option, or system
> >> wide, but one sysctl, not many bitmasks ;) ).
> >>
> > Right, but it would also be nice if a single sysctl could optimally
> > set up multiqueue, RSS, RPS, and all my interrupt affinities for me
> > ;-)
>
> It's becomming increasingly obvious to me that we need (somewhere,
> not necessarily the kernel) a complete datastructure representing
> the NUMA, cache, cpu, device hierarchy.
>
> And that can be used to tweak all of this stuff.
>
> The policy should probably be in userspace, we just need to provide
> the knobs in the kernel to tweak it however userspace wants.
>
> Userspace should be able to, for example, move a TX queue into a
> NUMA domain and have this invoke several side effects:
>
> 1) IRQs for that TX queue get rerouted to a cpu in the NUMA
> domain.
>
> 2) TX queue datastructures in the driver get reallocated using
> memory in that NUMA domain.
>
> 3) TX hashing is configured to use the set of cpus in the NUMA
> domain.
>
> It's alot of tedious work and involves some delicate tasks figuring
> out where each of these things go, but really then we'd solve all
> of this crap one and for all.
Just to be contrarian :-) This same idea had started before when IBM
proposed a user-space NUMA API. It never got any traction; the concept
of "let's make the applications NUMA aware" never got accepted because
it is so hard to do right and so fragile that it was the wrong idea
to start with. The only people that can manage it are the engineers
tweaking a one-off database benchmark.
I would rather see a "good enough" policy in the kernel that works
for everything from a single-core embedded system to a 100-core
server environment. Forget the benchmarkers. The ideal solution
should work with a mix of traffic and adapt. Today the application
doesn't have to make a service-level agreement with the kernel
every time it opens a TCP socket.
Doing it in userspace doesn't really help much. The APIs keep changing
and the focus fades (see irqbalance).
--
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 1:56 ` Stephen Hemminger
@ 2010-09-02 6:41 ` Greg Lindahl
2010-09-02 16:18 ` Loke, Chetan
1 sibling, 0 replies; 28+ messages in thread
From: Greg Lindahl @ 2010-09-02 6:41 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, therbert, eric.dumazet, netdev
On Wed, Sep 01, 2010 at 06:56:27PM -0700, Stephen Hemminger wrote:
> Just to be contrarian :-) This same idea had started before when IBM
> proposed a user-space NUMA API. It never got any traction, the concept
> of "lets make the applications NUMA aware" never got accepted because
> it is so hard to do right and fragile that it was the wrong idea
> to start with. The only people that can manage it are the engineers
> tweeking a one off database benchmark.
As a non-database user-space example, there are many applications
which know about the typical 'first touch' locality policy for pages
and use that to be NUMA-aware. Just about every OpenMP program ever
written does that; it's even fairly portable among OSes.
A second user-level example is MPI implementations such as OpenMPI.
Those guys run 1 process per core and they don't need to move around,
so getting each process locked to a core and all the pages in the right
place is a nice win without the MPI programmer doing anything.
For kernel (but non-Ethernet) networking examples, HPC interconnects
typically go out of their way to ensure locality of kernel pages
related to a given core's workload. Examples include Myrinet's
OpenMX+MPI and the InfiniPath InfiniBand adapter, whatever QLogic
renamed it to this week (TrueScale, I suppose). How can you get ~1
microsecond messages if you've got a buffer in the wrong place? Or
achieve extremely high messaging rates when you're waiting for remote
memory all the time?
> I would rather see a "good enough" policy in the kernel that works
> for everything from a single-core embedded system to a 100 core
> server environment.
I'd like a pony. Yes, it's challenging to directly apply the above
networking example to Ethernet networking, but there's a pony in there
somewhere.
-- greg
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 1:32 ` David Miller
2010-09-02 1:48 ` Stephen Hemminger
2010-09-02 1:56 ` Stephen Hemminger
@ 2010-09-02 15:55 ` Loke, Chetan
2010-09-16 21:52 ` Ben Hutchings
2011-02-21 18:19 ` [PATCH] xps-mq: Transmit Packet Steering for multiqueue Ben Hutchings
4 siblings, 0 replies; 28+ messages in thread
From: Loke, Chetan @ 2010-09-02 15:55 UTC (permalink / raw)
To: David Miller, therbert; +Cc: eric.dumazet, shemminger, netdev, Lee.Schermerhorn
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of David Miller
> It's becomming increasingly obvious to me that we need (somewhere,
> not necessarily the kernel) a complete datastructure representing
> the NUMA, cache, cpu, device hierarchy.
>
> And that can be used to tweak all of this stuff.
>
> The policy should probably be in userspace, we just need to provide
> the knobs in the kernel to tweak it however userspace wants.
>
I agree. But only if we have all the knobs in the kernel.
http://www.spinics.net/lists/linux-numa/msg00709.html
I had to make some changes manually. Thanks to Lee for pointing out the
dma-call.
> Userspace should be able to, for example, move a TX queue into a
> NUMA domain and have this invoke several side effects:
>
> 1) IRQs for that TX queue get rerouted to a cpu in the NUMA
> domain.
>
> 2) TX queue datastructures in the driver get reallocated using
> memory in that NUMA domain.
>
> 3) TX hashing is configured to use the set of cpus in the NUMA
> domain.
>
> It's a lot of tedious work and involves some delicate tasks figuring
> out where each of these things go, but really then we'd solve all
> of this crap once and for all.
> --
The MSI-X+MQ combo is something that should be taken care of. We could
come up with an automatic node-binding shim in the kernel and then all
the sub-systems could use it.
Basically, userland should:
i) query the adapter capabilities,
ii) get the node-binding info,
iii) and then start affinitizing everything (a rough sketch follows).
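A rough userland sketch of that flow, using only what exists today
(the device's node is visible in sysfs for PCI adapters, and libnuma
can move the current thread there); the per-queue capability and
binding queries would need the kind of shim described above:

#include <stdio.h>
#include <numa.h>               /* link with -lnuma */

static int affinitize_to_device(const char *ifname)
{
        char path[128];
        FILE *f;
        int node = -1;

        snprintf(path, sizeof(path),
                 "/sys/class/net/%s/device/numa_node", ifname);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%d", &node) != 1)
                node = -1;
        fclose(f);

        if (node < 0)           /* no binding info available */
                return -1;

        return numa_run_on_node(node);  /* affinitize to that node */
}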
Some older MSI-X (non-Ethernet) adapters assume the MSI-X info remains
static once the adapter is initialized. If we had an auto-node-binding
shim, then that would mean re-initializing the adapter, correct?
Chetan Loke
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 1:48 ` Stephen Hemminger
@ 2010-09-02 16:00 ` Loke, Chetan
2010-09-02 19:52 ` Tom Herbert
0 siblings, 1 reply; 28+ messages in thread
From: Loke, Chetan @ 2010-09-02 16:00 UTC (permalink / raw)
To: Stephen Hemminger, David Miller; +Cc: therbert, eric.dumazet, netdev
> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Stephen Hemminger
> Sent: September 01, 2010 9:48 PM
> To: David Miller
> Cc: therbert@google.com; eric.dumazet@gmail.com; netdev@vger.kernel.org
> Subject: Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
> Plus it needs to work with scheduler (not fight it). All this doesn't
> work very well if process keeps bouncing away from its resources.
>
Userland folks who actually try to exploit MQ/MSI-X will almost always
pin down their high-priority threads/processes (or a subset of them).
Chetan Loke
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 1:56 ` Stephen Hemminger
2010-09-02 6:41 ` Greg Lindahl
@ 2010-09-02 16:18 ` Loke, Chetan
1 sibling, 0 replies; 28+ messages in thread
From: Loke, Chetan @ 2010-09-02 16:18 UTC (permalink / raw)
To: Stephen Hemminger, David Miller; +Cc: therbert, eric.dumazet, netdev
> Just to be contrarian :-) This same idea had started before when IBM
> proposed a user-space NUMA API. It never got any traction; the concept
> of "let's make the applications NUMA aware" never got accepted because
> it is so hard to do right and so fragile that it was the wrong idea
> to start with. The only people that can manage it are the engineers
> tweaking a one-off database benchmark.
>
If you design an appliance then this would be one of the knobs that
people tinker with. To reap the actual benefits, the whole stack (from
the adapter's f/w to the userland thread) needs to be astro-aligned.
Almost all the user-space guys I've talked to never got it right,
because they never understood the concepts behind NUMA, IOH, mempolicy,
MSI-X, etc. So it's a little difficult for them to get it right the
first time.
Also, Stoakley/Nehalem rigs only went mainstream ~2-3 years back; before
that, only a handful of engineers could experiment because the mobos
were expensive. Plus, you didn't really have a 10G fabric then.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 16:00 ` Loke, Chetan
@ 2010-09-02 19:52 ` Tom Herbert
2010-09-02 23:17 ` Loke, Chetan
0 siblings, 1 reply; 28+ messages in thread
From: Tom Herbert @ 2010-09-02 19:52 UTC (permalink / raw)
To: Loke, Chetan; +Cc: Stephen Hemminger, David Miller, eric.dumazet, netdev
> Userland folks who actually try to exploit MQ/MSI-X will almost always
> pin down their high-priority threads/processes (or a subset of them).
>
I don't really see that. Pinning is a last resort, and in this context
we could only do that on a dedicated server. On a shared server with
many different apps, pinning for MQ/MSI-X is not an easy option;
meeting scheduler constraints will be the first priority, and it's up to
networking to work with the scheduler to do the right thing.
Scheduler-aware networking (or vice versa) is important.
Tom
^ permalink raw reply [flat|nested] 28+ messages in thread
* RE: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 19:52 ` Tom Herbert
@ 2010-09-02 23:17 ` Loke, Chetan
0 siblings, 0 replies; 28+ messages in thread
From: Loke, Chetan @ 2010-09-02 23:17 UTC (permalink / raw)
To: Tom Herbert; +Cc: Stephen Hemminger, David Miller, eric.dumazet, netdev
> From: Tom Herbert [mailto:therbert@google.com]
> Sent: September 02, 2010 3:53 PM
> To: Loke, Chetan
> Cc: Stephen Hemminger; David Miller; eric.dumazet@gmail.com;
> netdev@vger.kernel.org
> Subject: Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
>
> > Userland folks who actually try to exploit MQ/MSI-X will almost
> > always pin down their high-priority threads/processes (or a subset
> > of them).
> >
> I don't really see that. Pinning is a last resort and in this context
> we could only do that on a dedicated server. On a shared server, with
> many different apps, pinning for MQ/MSIX is not an easy option;
> meeting scheduler constraints will be the first priority and its up to
> networking to work with the scheduler to to the right thing.
> Scheduler aware networking (or vice versa) is important.
>
For my use-case it's an appliance. Newer adapters might have 64+ (?)
h/w queues, and not just f/w-emulated queues. With that many queues you
could partition your threads/queues, no?
It's easier to get started that way. All you need is a shim (or just a
driver stub, so you can load it on any box running an older kernel) in
the kernel that will tell you which queue-set (for an MQ-capable
adapter) is still under the high-watermark. If all are full then it
should just round-robin (across queues and nodes). So make a syscall (or
shoot a mbx-cmd, or pick your trick), find out which queue you could
use, get the binding info and then launch your threads. Once you narrow
down the scope, the scheduler will have less work to do.
If the worker threads are short-lived then there's no point in this
binding. And for long-lived tasks, a couple of initial prep-calls will
not hurt performance that much. And if you still care about syscalls at
runtime, you could have a dedicated mgmt-thread that receives
async events from the shim, and all other user-land logic could consult
this mgmt-thread.
> Tom
Chetan Loke
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 1:32 ` David Miller
` (2 preceding siblings ...)
2010-09-02 15:55 ` Loke, Chetan
@ 2010-09-16 21:52 ` Ben Hutchings
2010-09-19 17:24 ` Michael S. Tsirkin
2011-02-21 18:19 ` [PATCH] xps-mq: Transmit Packet Steering for multiqueue Ben Hutchings
4 siblings, 1 reply; 28+ messages in thread
From: Ben Hutchings @ 2010-09-16 21:52 UTC (permalink / raw)
To: David Miller; +Cc: therbert, eric.dumazet, shemminger, netdev
On Wed, 2010-09-01 at 18:32 -0700, David Miller wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Wed, 1 Sep 2010 09:24:18 -0700
>
> > On Wed, Sep 1, 2010 at 8:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >> 3) Eventually have a user selectable selection (socket option, or system
> >> wide, but one sysctl, not many bitmasks ;) ).
> >>
> > Right, but it would also be nice if a single sysctl could optimally
> > set up multiqueue, RSS, RPS, and all my interrupt affinities for me
> > ;-)
>
> It's becoming increasingly obvious to me that we need (somewhere,
> not necessarily the kernel) a complete datastructure representing
> the NUMA, cache, cpu, device hierarchy.
And ideally a cheap way (not O(N^2)) to find the distance between 2 CPU
threads (not just nodes).
> And that can be used to tweak all of this stuff.
>
> The policy should probably be in userspace, we just need to provide
> the knobs in the kernel to tweak it however userspace wants.
>
> Userspace should be able to, for example, move a TX queue into a
> NUMA domain and have this invoke several side effects:
>
> 1) IRQs for that TX queue get rerouted to a cpu in the NUMA
> domain.
>
> 2) TX queue datastructures in the driver get reallocated using
> memory in that NUMA domain.
I've actually done some work on an interface and implementation of this,
although I didn't include actually setting the IRQ affinity as there has
been pushback whenever people propose letting drivers set this. If they
only do so as directed by the administrator this might be more
acceptable though.
Unfortunately in my limited testing on a 2-node system I didn't see a
whole lot of improvement in performance when the affinities were all
lined up. I should try to get some time on a 4-node system.
> 3) TX hashing is configured to use the set of cpus in the NUMA
> domain.
>
> It's a lot of tedious work and involves some delicate tasks figuring
> out where each of these things go, but really then we'd solve all
> of this crap once and for all.
Right.
The other thing I've been working on lately, which sort of ties into
this, is hardware acceleration of Receive Flow Steering. Multiqueue NICs
such
as ours tend to have RX flow filters as well as hashing. So why not use
those to do a first level of steering? We're going to do some more
internal testing and review but I hope to send out a first version of
this next week.
Ben.
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-16 21:52 ` Ben Hutchings
@ 2010-09-19 17:24 ` Michael S. Tsirkin
2010-09-20 12:44 ` [RFC][PATCH 1/3] IRQ: Add irq_get_numa_node() Ben Hutchings
` (2 more replies)
0 siblings, 3 replies; 28+ messages in thread
From: Michael S. Tsirkin @ 2010-09-19 17:24 UTC (permalink / raw)
To: Ben Hutchings; +Cc: David Miller, therbert, eric.dumazet, shemminger, netdev
On Thu, Sep 16, 2010 at 10:52:41PM +0100, Ben Hutchings wrote:
> On Wed, 2010-09-01 at 18:32 -0700, David Miller wrote:
> > From: Tom Herbert <therbert@google.com>
> > Date: Wed, 1 Sep 2010 09:24:18 -0700
> >
> > > On Wed, Sep 1, 2010 at 8:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > >> 3) Eventually have a user selectable selection (socket option, or system
> > >> wide, but one sysctl, not many bitmasks ;) ).
> > >>
> > > Right, but it would also be nice if a single sysctl could optimally
> > > set up multiqueue, RSS, RPS, and all my interrupt affinities for me
> > > ;-)
> >
> > It's becomming increasingly obvious to me that we need (somewhere,
> > not necessarily the kernel) a complete datastructure representing
> > the NUMA, cache, cpu, device hierarchy.
>
> And ideally a cheap way (not O(N^2)) to find the distance between 2 CPU
> threads (not just nodes).
>
> > And that can be used to tweak all of this stuff.
> >
> > The policy should probably be in userspace, we just need to provide
> > the knobs in the kernel to tweak it however userspace wants.
> >
> > Userspace should be able to, for example, move a TX queue into a
> > NUMA domain and have this invoke several side effects:
> >
> > 1) IRQs for that TX queue get rerouted to a cpu in the NUMA
> > domain.
> >
> > 2) TX queue datastructures in the driver get reallocated using
> > memory in that NUMA domain.
>
> I've actually done some work on an interface and implementation of this,
> although I didn't include actually setting the IRQ affinity as there has
> been pushback whenever people propose letting drivers set this. If they
> only do so as directed by the administrator this might be more
> acceptable though.
>
> Unfortunately in my limited testing on a 2-node system I didn't see a
> whole lot of improvement in performance when the affinities were all
> lined up. I should try to get some time on a 4-node system.
I've been trying to look into this as well.
It'd be very interesting to see the patches even if they don't show
good performance. Could you post them?
> > 3) TX hashing is configured to use the set of cpus in the NUMA
> > domain.
> >
> > It's alot of tedious work and involves some delicate tasks figuring
> > out where each of these things go, but really then we'd solve all
> > of this crap one and for all.
>
> Right.
>
> The other thing I've been working on lately which sort of ties into this
> is hardware acceleration of Receive Flow Steering. Multiqueue NICs such
> as ours tend to have RX flow filters as well as hashing. So why not use
> those to do a first level of steering? We're going to do some more
> internal testing and review but I hope to send out a first version of
> this next week.
>
> Ben.
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* [RFC][PATCH 1/3] IRQ: Add irq_get_numa_node()
2010-09-19 17:24 ` Michael S. Tsirkin
@ 2010-09-20 12:44 ` Ben Hutchings
2010-09-20 13:04 ` Eric Dumazet
2010-09-20 12:45 ` [RFC][PATCH 2/3] ethtool: NUMA affinity control Ben Hutchings
2010-09-20 12:48 ` [RFC][PATCH 3/3] sfc: Add support for " Ben Hutchings
2 siblings, 1 reply; 28+ messages in thread
From: Ben Hutchings @ 2010-09-20 12:44 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: David Miller, therbert, eric.dumazet, shemminger, netdev
This will be used to support NUMA-aware memory allocation of
structures used in interrupt handlers, starting with network drivers.
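As an illustration of the intended use (not part of this patch), a
driver could allocate per-queue state on the node that currently
services the queue's IRQ; 'struct my_queue' is just a stand-in:

#include <linux/interrupt.h>
#include <linux/slab.h>

static struct my_queue *my_alloc_queue(unsigned int irq)
{
        /* irq_get_numa_node() returns NUMA_NO_NODE when the affinity
         * spans nodes; kzalloc_node() then falls back to the default
         * policy. */
        return kzalloc_node(sizeof(struct my_queue), GFP_KERNEL,
                            irq_get_numa_node(irq));
}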
---
irq_get_numa_node() probably needs to grab desc->lock. Other than that,
this should work.
Ben.
include/linux/interrupt.h | 9 +++++++++
kernel/irq/manage.c | 35 +++++++++++++++++++++++++++++++++++
2 files changed, 44 insertions(+), 0 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a0384a4..82e9a08 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -260,6 +260,15 @@ static inline int irq_set_affinity_hint(unsigned int irq,
}
#endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */
+#if defined(CONFIG_NUMA) && defined(CONFIG_GENERIC_HARDIRQS)
+extern int irq_get_numa_node(unsigned int irq);
+#else
+static inline int irq_get_numa_node(unsigned int irq)
+{
+ return -1;
+}
+#endif
+
#ifdef CONFIG_GENERIC_HARDIRQS
/*
* Special lockdep variants of irq disabling/enabling.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9..03e683e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -154,6 +154,41 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+#ifdef CONFIG_NUMA
+/**
+ * irq_get_numa_node - get the NUMA node of a given IRQ
+ * @irq: Interrupt to get NUMA node for
+ *
+ * If the current SMP affinity mask of the IRQ corresponds to a
+ * single NUMA node, return the node number. Otherwise return
+ * %NUMA_NO_NODE.
+ */
+int irq_get_numa_node(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ const struct cpumask *mask;
+ int cpu, node = NUMA_NO_NODE;
+
+ if (!desc)
+ return node;
+
+ mask = desc->affinity;
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ if (desc->status & IRQ_MOVE_PENDING)
+ mask = desc->pending_mask;
+#endif
+
+ for_each_cpu(cpu, mask) {
+ if (node < 0)
+ node = cpu_to_node(cpu);
+ else if (node != cpu_to_node(cpu))
+ return NUMA_NO_NODE;
+ }
+
+ return node;
+}
+#endif
+
#ifndef CONFIG_AUTO_IRQ_AFFINITY
/*
* Generic version of the affinity autoselector.
--
1.7.2.1
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [RFC][PATCH 2/3] ethtool: NUMA affinity control
2010-09-19 17:24 ` Michael S. Tsirkin
2010-09-20 12:44 ` [RFC][PATCH 1/3] IRQ: Add irq_get_numa_node() Ben Hutchings
@ 2010-09-20 12:45 ` Ben Hutchings
2010-09-20 12:48 ` [RFC][PATCH 3/3] sfc: Add support for " Ben Hutchings
2 siblings, 0 replies; 28+ messages in thread
From: Ben Hutchings @ 2010-09-20 12:45 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: David Miller, therbert, eric.dumazet, shemminger, netdev
Define operations to get and set the numbers of channels belonging to
a net device. A channel is defined as the combination of an IRQ and
all the queues that can trigger that IRQ. Channels are identified by
type and index, similarly to the naming scheme used for IRQ handlers.
Define operations to get and set the NUMA affinity of objects
associated with a channel.
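For illustration, here is a rough sketch of how userspace might
exercise the new commands through the usual SIOCETHTOOL ioctl (not part
of this patch; error handling trimmed):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

/* Query the NUMA affinity of combined channel 0 on @ifname.
 * @fd is any AF_INET datagram socket. */
static int get_affinity(int fd, const char *ifname)
{
        struct ethtool_affinity affin;
        struct ifreq ifr;

        memset(&affin, 0, sizeof(affin));
        affin.cmd = ETHTOOL_GAFFINITY;
        affin.channel_id = ETH_CHAN_TYPE_COMBINED | 0;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
        ifr.ifr_data = (void *)&affin;

        if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
                return -1;

        printf("rx=%d tx=%d evq=%d sw=%d\n", affin.rx_ring_node,
               affin.tx_ring_node, affin.event_ring_node,
               affin.handler_data_node);
        return 0;
}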
---
include/linux/ethtool.h | 56 ++++++++++++++++
net/core/ethtool.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 221 insertions(+), 0 deletions(-)
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index b67af60..0eaae5d 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -517,6 +517,47 @@ struct ethtool_flash {
char data[ETHTOOL_FLASH_MAX_FILENAME];
};
+/* Network channel information. A network channel is the combination of
+ * an IRQ and all the queues that can trigger that IRQ. */
+struct ethtool_channels {
+ __u32 cmd;
+ __u32 combined_count; /* number of multi-purpose channels */
+ __u32 rx_count; /* number of RX-only channels */
+ __u32 tx_count; /* number of TX-only channels */
+ __u32 other_count; /* number of additional channels */
+};
+
+/* Channel ID is made up of a type and an index */
+enum ethtool_channel_id {
+ ETH_CHAN_INDEX_MASK = 0x0fffffff,
+ ETH_CHAN_TYPE_MASK = 0xc0000000,
+ ETH_CHAN_TYPE_COMBINED = 0x00000000,
+ ETH_CHAN_TYPE_RX = 0x40000000,
+ ETH_CHAN_TYPE_TX = 0x80000000,
+ ETH_CHAN_TYPE_OTHER = 0xc0000000,
+ ETH_CHAN_ALL = 0xffffffff /* special: operate on all */
+};
+
+/* Special NUMA node IDs */
+enum ethtool_numa_node {
+ ETH_NUMA_NODE_UNSET = -1, /* get/set: no affinity set */
+ ETH_NUMA_NODE_N_A = -2, /* get/set: not applicable; channel
+ * doesn't have this object */
+ ETH_NUMA_NODE_IRQ = -3, /* set: match current IRQ affinity */
+ ETH_NUMA_NODE_DEV = -4, /* set: match device affinity */
+};
+
+struct ethtool_affinity {
+ __u32 cmd;
+ __u32 channel_id; /* channel type and index; may be
+ * ETH_CHAN_ALL when setting */
+ __s32 rx_ring_node; /* affinity of RX descriptor ring */
+ __s32 tx_ring_node; /* affinity of TX descriptor ring */
+ __s32 event_ring_node; /* affinity of event/completion ring */
+ __s32 handler_data_node; /* affinity of IRQ/NAPI handler's
+ * software structures */
+};
+
#ifdef __KERNEL__
#include <linux/rculist.h>
@@ -551,6 +592,9 @@ int ethtool_op_set_ufo(struct net_device *dev, u32 data);
u32 ethtool_op_get_flags(struct net_device *dev);
int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported);
void ethtool_ntuple_flush(struct net_device *dev);
+#ifdef CONFIG_NUMA
+int ethtool_affinity_resolve(s32 node_id, struct net_device *dev, unsigned irq);
+#endif
/**
* &ethtool_ops - Alter and report network device settings
@@ -672,6 +716,14 @@ struct ethtool_ops {
struct ethtool_rxfh_indir *);
int (*set_rxfh_indir)(struct net_device *,
const struct ethtool_rxfh_indir *);
+ int (*get_channels)(struct net_device *, struct ethtool_channels *);
+ int (*set_channels)(struct net_device *,
+ const struct ethtool_channels *);
+#ifdef CONFIG_NUMA
+ int (*get_affinity)(struct net_device *, struct ethtool_affinity *);
+ int (*set_affinity)(struct net_device *,
+ const struct ethtool_affinity *);
+#endif
};
#endif /* __KERNEL__ */
@@ -735,6 +787,10 @@ struct ethtool_ops {
#define ETHTOOL_GSSET_INFO 0x00000037 /* Get string set info */
#define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */
#define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */
+#define ETHTOOL_GCHANNELS 0x0000003a /* Get numbers of channels */
+#define ETHTOOL_SCHANNELS 0x0000003b /* Set numbers of channels */
+#define ETHTOOL_GAFFINITY 0x0000003c /* Get NUMA affinity */
+#define ETHTOOL_SAFFINITY 0x0000003d /* Set NUMA affinity */
/* compatibility with older code */
#define SPARC_ETH_GSET ETHTOOL_GSET
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 34fae15..753a186 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -20,6 +20,8 @@
#include <linux/bitops.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/numa.h>
/*
* Some useful ethtool_ops methods that're device independent.
@@ -1429,6 +1431,155 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
return dev->ethtool_ops->flash_device(dev, &efl);
}
+static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+ char __user *useraddr)
+{
+ struct ethtool_channels channels;
+ int rc;
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&channels, useraddr, sizeof(channels)))
+ return -EFAULT;
+
+ rc = dev->ethtool_ops->get_channels(dev, &channels);
+ if (rc)
+ return rc;
+
+ if (copy_to_user(useraddr, &channels, sizeof(channels)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
+ char __user *useraddr)
+{
+ struct ethtool_channels channels;
+
+ if (!dev->ethtool_ops->set_channels)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&channels, useraddr, sizeof(channels)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_channels(dev, &channels);
+}
+
+#ifdef CONFIG_NUMA
+
+static int ethtool_check_channel_id(struct net_device *dev, u32 channel_id)
+{
+ struct ethtool_channels channels;
+ u32 channel_count;
+ int rc;
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ memset(&channels, 0, sizeof(channels));
+ channels.cmd = ETHTOOL_GCHANNELS;
+ rc = dev->ethtool_ops->get_channels(dev, &channels);
+ if (rc)
+ return rc;
+
+ switch (channel_id & ETH_CHAN_TYPE_MASK) {
+ case ETH_CHAN_TYPE_COMBINED:
+ channel_count = channels.combined_count;
+ break;
+ case ETH_CHAN_TYPE_RX:
+ channel_count = channels.rx_count;
+ break;
+ case ETH_CHAN_TYPE_TX:
+ channel_count = channels.tx_count;
+ break;
+ case ETH_CHAN_TYPE_OTHER:
+ default:
+ channel_count = channels.other_count;
+ break;
+ }
+ if ((channel_id & ETH_CHAN_INDEX_MASK) >= channel_count)
+ return -EINVAL;
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_affinity(struct net_device *dev,
+ char __user *useraddr)
+{
+ struct ethtool_affinity affin;
+ int rc;
+
+ if (!dev->ethtool_ops->get_affinity)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&affin, useraddr, sizeof(affin)))
+ return -EFAULT;
+
+ rc = ethtool_check_channel_id(dev, affin.channel_id);
+ if (rc)
+ return rc;
+
+ rc = dev->ethtool_ops->get_affinity(dev, &affin);
+ if (rc)
+ return rc;
+
+ if (copy_to_user(useraddr, &affin, sizeof(affin)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_set_affinity(struct net_device *dev,
+ char __user *useraddr)
+{
+ struct ethtool_affinity affin;
+ int rc;
+
+ if (!dev->ethtool_ops->set_affinity)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&affin, useraddr, sizeof(affin)))
+ return -EFAULT;
+
+ if (affin.channel_id != ETH_CHAN_ALL) {
+ rc = ethtool_check_channel_id(dev, affin.channel_id);
+ if (rc)
+ return rc;
+ }
+
+ return dev->ethtool_ops->set_affinity(dev, &affin);
+}
+
+/**
+ * ethtool_affinity_resolve - resolve the NUMA node ID for a channel
+ * @node_id: User-specified node ID
+ * @dev: The channel's net device
+ * @irq: The channel's IRQ
+ *
+ * This resolves the special node IDs %ETH_NUMA_NODE_IRQ,
+ * %ETH_NUMA_NODE_DEV and %ETH_NUMA_NODE_UNSET and validates that any
+ * other specified node ID is in the valid range. It returns either a
+ * specific node ID, %NUMA_NO_NODE or a negative error code (less than
+ * %NUMA_NO_NODE).
+ */
+int ethtool_affinity_resolve(s32 node_id, struct net_device *dev, unsigned irq)
+{
+ if (node_id == ETH_NUMA_NODE_IRQ)
+ return irq_get_numa_node(irq);
+ if (node_id == ETH_NUMA_NODE_DEV && dev->dev.parent)
+ return dev_to_node(dev->dev.parent);
+ if (node_id == ETH_NUMA_NODE_UNSET)
+ return NUMA_NO_NODE;
+ if (node_id >= 0 && node_id < MAX_NUMNODES && nr_cpus_node(node_id))
+ return node_id;
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ethtool_affinity_resolve);
+
+#endif /* CONFIG_NUMA */
+
/* The main entry point in this file. Called from net/core/dev.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -1673,6 +1824,20 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SRXFHINDIR:
rc = ethtool_set_rxfh_indir(dev, useraddr);
break;
+ case ETHTOOL_GCHANNELS:
+ rc = ethtool_get_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SCHANNELS:
+ rc = ethtool_set_channels(dev, useraddr);
+ break;
+#ifdef CONFIG_NUMA
+ case ETHTOOL_GAFFINITY:
+ rc = ethtool_get_affinity(dev, useraddr);
+ break;
+ case ETHTOOL_SAFFINITY:
+ rc = ethtool_set_affinity(dev, useraddr);
+ break;
+#endif
default:
rc = -EOPNOTSUPP;
}
--
1.7.2.1
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [RFC][PATCH 3/3] sfc: Add support for NUMA affinity control
2010-09-19 17:24 ` Michael S. Tsirkin
2010-09-20 12:44 ` [RFC][PATCH 1/3] IRQ: Add irq_get_numa_node() Ben Hutchings
2010-09-20 12:45 ` [RFC][PATCH 2/3] ethtool: NUMA affinity control Ben Hutchings
@ 2010-09-20 12:48 ` Ben Hutchings
2 siblings, 0 replies; 28+ messages in thread
From: Ben Hutchings @ 2010-09-20 12:48 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: David Miller, therbert, eric.dumazet, shemminger, netdev
Allow channel structures and hardware queues to be reallocated with
specific affinity.
---
The reallocation code is admittedly rather complicated and ugly.
Ben.
drivers/net/sfc/efx.c | 115 ++++++++++++++++++++++++++++++++++++------
drivers/net/sfc/efx.h | 4 +-
drivers/net/sfc/ethtool.c | 72 ++++++++++++++++++++++++++-
drivers/net/sfc/net_driver.h | 12 ++++
drivers/net/sfc/nic.c | 47 ++++++++++++-----
5 files changed, 220 insertions(+), 30 deletions(-)
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index f702f1f..a116d2a 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -416,10 +416,74 @@ static void efx_remove_eventq(struct efx_channel *channel)
*
*************************************************************************/
+static struct efx_channel *
+efx_clone_channel_with_affinity(struct efx_nic *efx,
+ struct efx_channel *old_channel,
+ const struct ethtool_affinity *affin)
+{
+#ifdef CONFIG_NUMA
+ struct net_device *net_dev = efx->net_dev;
+ struct efx_channel *channel;
+ unsigned irq = (efx->interrupt_mode == EFX_INT_MODE_LEGACY ?
+ efx->legacy_irq : old_channel->irq);
+ int node_id;
+
+ node_id = ethtool_affinity_resolve(affin->handler_data_node,
+ net_dev, irq);
+ if (node_id < NUMA_NO_NODE)
+ return ERR_PTR(node_id);
+
+ channel = kmalloc_node(sizeof(*channel), GFP_KERNEL, node_id);
+ if (!channel)
+ return ERR_PTR(-ENOMEM);
+ *channel = *old_channel;
+ channel->channel_node = node_id;
+
+ if (efx_channel_get_rx_queue(old_channel)) {
+ node_id = ethtool_affinity_resolve(affin->rx_ring_node,
+ net_dev, irq);
+ if (node_id < NUMA_NO_NODE)
+ goto fail;
+ channel->rxq_node = node_id;
+ } else {
+ if (affin->rx_ring_node != ETH_NUMA_NODE_N_A &&
+ affin->rx_ring_node != ETH_NUMA_NODE_UNSET)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (efx_channel_get_tx_queue(old_channel, 0)) {
+ node_id = ethtool_affinity_resolve(affin->tx_ring_node,
+ net_dev, irq);
+ if (node_id < NUMA_NO_NODE)
+ goto fail;
+ channel->txq_node = node_id;
+ } else {
+ if (affin->tx_ring_node != ETH_NUMA_NODE_N_A &&
+ affin->tx_ring_node != ETH_NUMA_NODE_UNSET)
+ return ERR_PTR(-EINVAL);
+ }
+
+ node_id = ethtool_affinity_resolve(affin->event_ring_node,
+ net_dev, irq);
+ if (node_id < NUMA_NO_NODE)
+ goto fail;
+ channel->evq_node = node_id;
+
+ return channel;
+
+fail:
+ kfree(channel);
+ return ERR_PTR(node_id);
+#else /* !CONFIG_NUMA */
+ return ERR_PTR(-EOPNOTSUPP);
+#endif /* CONFIG_NUMA */
+}
+
/* Allocate and initialise a channel structure, optionally copying
* parameters (but not resources) from an old channel structure. */
static struct efx_channel *
-efx_alloc_channel(struct efx_nic *efx, int i, struct efx_channel *old_channel)
+efx_alloc_channel(struct efx_nic *efx, int i, struct efx_channel *old_channel,
+ const struct ethtool_affinity *new_affin)
{
struct efx_channel *channel;
struct efx_rx_queue *rx_queue;
@@ -427,11 +491,18 @@ efx_alloc_channel(struct efx_nic *efx, int i, struct efx_channel *old_channel)
int j;
if (old_channel) {
- channel = kmalloc(sizeof(*channel), GFP_KERNEL);
- if (!channel)
- return NULL;
-
- *channel = *old_channel;
+ if (new_affin) {
+ channel = efx_clone_channel_with_affinity(
+ efx, old_channel, new_affin);
+ if (IS_ERR(channel))
+ return channel;
+ } else {
+ channel = kmalloc_node(sizeof(*channel), GFP_KERNEL,
+ old_channel->channel_node);
+ if (!channel)
+ return ERR_PTR(-ENOMEM);
+ *channel = *old_channel;
+ }
memset(&channel->eventq, 0, sizeof(channel->eventq));
@@ -449,10 +520,13 @@ efx_alloc_channel(struct efx_nic *efx, int i, struct efx_channel *old_channel)
} else {
channel = kzalloc(sizeof(*channel), GFP_KERNEL);
if (!channel)
- return NULL;
+ return ERR_PTR(-ENOMEM);
channel->efx = efx;
channel->channel = i;
+ channel->rxq_node = channel->txq_node =
+ dev_to_node(&efx->pci_dev->dev);
+ channel->evq_node = channel->channel_node = numa_node_id();
for (j = 0; j < EFX_TXQ_TYPES; j++) {
tx_queue = &channel->tx_queue[j];
@@ -707,7 +781,9 @@ static void efx_remove_channels(struct efx_nic *efx)
}
int
-efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries)
+efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries,
+ struct efx_channel *affin_channel,
+ const struct ethtool_affinity *affin)
{
struct efx_channel *other_channel[EFX_MAX_CHANNELS], *channel;
u32 old_rxq_entries, old_txq_entries;
@@ -720,9 +796,13 @@ efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries)
/* Clone channels */
memset(other_channel, 0, sizeof(other_channel));
for (i = 0; i < efx->n_channels; i++) {
- channel = efx_alloc_channel(efx, i, efx->channel[i]);
- if (!channel) {
- rc = -ENOMEM;
+ const struct ethtool_affinity *new_affin =
+ (affin_channel == NULL ||
+ affin_channel == efx->channel[i]) ? affin : NULL;
+
+ channel = efx_alloc_channel(efx, i, efx->channel[i], new_affin);
+ if (IS_ERR(channel)) {
+ rc = PTR_ERR(channel);
goto out;
}
other_channel[i] = channel;
@@ -1281,13 +1361,16 @@ static void efx_set_channels(struct efx_nic *efx)
unsigned tx_channel_offset =
separate_tx_channels ? efx->n_channels - efx->n_tx_channels : 0;
- /* Channel pointers were set in efx_init_struct() but we now
- * need to clear them for TX queues in any RX-only channels. */
+ /* Invalidate pointers and node IDs for unused RX and TX queue
+ * structures */
efx_for_each_channel(channel, efx) {
+ if (channel->channel >= efx->n_rx_channels)
+ channel->rxq_node = ETH_NUMA_NODE_N_A;
if (channel->channel - tx_channel_offset >=
efx->n_tx_channels) {
efx_for_each_channel_tx_queue(tx_queue, channel)
tx_queue->channel = NULL;
+ channel->txq_node = ETH_NUMA_NODE_N_A;
}
}
}
@@ -2198,6 +2281,7 @@ static struct efx_phy_operations efx_dummy_phy_operations = {
static int efx_init_struct(struct efx_nic *efx, struct efx_nic_type *type,
struct pci_dev *pci_dev, struct net_device *net_dev)
{
+ struct efx_channel *channel;
int i;
/* Initialise common structures */
@@ -2226,9 +2310,10 @@ static int efx_init_struct(struct efx_nic *efx, struct efx_nic_type *type,
INIT_WORK(&efx->mac_work, efx_mac_work);
for (i = 0; i < EFX_MAX_CHANNELS; i++) {
- efx->channel[i] = efx_alloc_channel(efx, i, NULL);
- if (!efx->channel[i])
+ channel = efx_alloc_channel(efx, i, NULL, NULL);
+ if (IS_ERR(channel))
goto fail;
+ efx->channel[i] = channel;
}
efx->type = type;
diff --git a/drivers/net/sfc/efx.h b/drivers/net/sfc/efx.h
index e783c0f..8baad6f 100644
--- a/drivers/net/sfc/efx.h
+++ b/drivers/net/sfc/efx.h
@@ -67,7 +67,9 @@ extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
/* Channels */
extern void efx_process_channel_now(struct efx_channel *channel);
extern int
-efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries);
+efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries,
+ struct efx_channel *channel,
+ const struct ethtool_affinity *affin);
/* Ports */
extern int efx_reconfigure_port(struct efx_nic *efx);
diff --git a/drivers/net/sfc/ethtool.c b/drivers/net/sfc/ethtool.c
index 7f735d8..4029819 100644
--- a/drivers/net/sfc/ethtool.c
+++ b/drivers/net/sfc/ethtool.c
@@ -775,7 +775,8 @@ static int efx_ethtool_set_ringparam(struct net_device *net_dev,
return -EINVAL;
}
- return efx_realloc_channels(efx, ring->rx_pending, ring->tx_pending);
+ return efx_realloc_channels(efx, ring->rx_pending, ring->tx_pending,
+ NULL, NULL);
}
static int efx_ethtool_set_pauseparam(struct net_device *net_dev,
@@ -993,6 +994,70 @@ static int efx_ethtool_set_rxfh_indir(struct net_device *net_dev,
return 0;
}
+static int efx_ethtool_get_channels(struct net_device *net_dev,
+ struct ethtool_channels *channels)
+{
+ struct efx_nic *efx = netdev_priv(net_dev);
+
+ channels->rx_count = efx->n_channels - efx->n_tx_channels;
+ channels->tx_count = efx->n_channels - efx->n_rx_channels;
+ channels->combined_count = (efx->n_channels - channels->rx_count -
+ channels->tx_count);
+ channels->other_count = 0;
+ return 0;
+}
+
+#ifdef CONFIG_NUMA
+
+static struct efx_channel *
+efx_channel_from_id(struct efx_nic *efx, u32 channel_id)
+{
+ u32 channel_index = channel_id & ETH_CHAN_INDEX_MASK;
+
+ switch (channel_id & ETH_CHAN_TYPE_MASK) {
+ case ETH_CHAN_TYPE_RX:
+ return efx_get_channel(efx, channel_index);
+ case ETH_CHAN_TYPE_COMBINED:
+ return efx_get_channel(efx,
+ efx->n_channels - efx->n_tx_channels +
+ channel_index);
+ case ETH_CHAN_TYPE_TX:
+ return efx_get_channel(efx, efx->n_rx_channels + channel_index);
+ default:
+ BUG();
+ }
+}
+
+static int efx_ethtool_get_affinity(struct net_device *net_dev,
+ struct ethtool_affinity *affin)
+{
+ struct efx_nic *efx = netdev_priv(net_dev);
+ struct efx_channel *channel =
+ efx_channel_from_id(efx, affin->channel_id);
+
+ affin->rx_ring_node = channel->rxq_node;
+ affin->tx_ring_node = channel->txq_node;
+ affin->event_ring_node = channel->evq_node;
+ affin->handler_data_node = channel->channel_node;
+ return 0;
+}
+
+static int efx_ethtool_set_affinity(struct net_device *net_dev,
+ const struct ethtool_affinity *affin)
+{
+ struct efx_nic *efx = netdev_priv(net_dev);
+ struct efx_channel *channel;
+
+ if (affin->channel_id == ETH_CHAN_ALL)
+ channel = NULL;
+ else
+ channel = efx_channel_from_id(efx, affin->channel_id);
+ return efx_realloc_channels(efx, efx->rxq_entries, efx->txq_entries,
+ channel, affin);
+}
+
+#endif /* CONFIG_NUMA */
+
const struct ethtool_ops efx_ethtool_ops = {
.get_settings = efx_ethtool_get_settings,
.set_settings = efx_ethtool_set_settings,
@@ -1035,4 +1100,9 @@ const struct ethtool_ops efx_ethtool_ops = {
.get_rxnfc = efx_ethtool_get_rxnfc,
.get_rxfh_indir = efx_ethtool_get_rxfh_indir,
.set_rxfh_indir = efx_ethtool_set_rxfh_indir,
+ .get_channels = efx_ethtool_get_channels,
+#ifdef CONFIG_NUMA
+ .get_affinity = efx_ethtool_get_affinity,
+ .set_affinity = efx_ethtool_set_affinity,
+#endif
};
diff --git a/drivers/net/sfc/net_driver.h b/drivers/net/sfc/net_driver.h
index 152342d..de69ac3 100644
--- a/drivers/net/sfc/net_driver.h
+++ b/drivers/net/sfc/net_driver.h
@@ -69,8 +69,10 @@
/**
* struct efx_special_buffer - An Efx special buffer
+ * @page: Page pointer, iff buffer was allocated with alloc_pages_node()
* @addr: CPU base address of the buffer
* @dma_addr: DMA base address of the buffer
+ * @dma_dir: Direction of DMA mapping
* @len: Buffer length, in bytes
* @index: Buffer index within controller;s buffer table
* @entries: Number of buffer table entries
@@ -80,8 +82,10 @@
* actual transmit and receive buffers.
*/
struct efx_special_buffer {
+ struct page *page;
void *addr;
dma_addr_t dma_addr;
+ enum dma_data_direction dma_dir;
unsigned int len;
int index;
int entries;
@@ -299,6 +303,10 @@ enum efx_rx_alloc_method {
*
* @efx: Associated Efx NIC
* @channel: Channel instance number
+ * @rxq_node: Hardware RX queue NUMA affinity
+ * @txq_node: Hardware TX queue NUMA affinity
+ * @evq_node: Hardware event queue NUMA affinity
+ * @channel_node: NUMA affinity for this structure
* @enabled: Channel enabled indicator
* @irq: IRQ number (MSI and MSI-X only)
* @irq_moderation: IRQ moderation value (in hardware ticks)
@@ -332,6 +340,10 @@ enum efx_rx_alloc_method {
struct efx_channel {
struct efx_nic *efx;
int channel;
+ int rxq_node;
+ int txq_node;
+ int evq_node;
+ int channel_node;
bool enabled;
int irq;
unsigned int irq_moderation;
diff --git a/drivers/net/sfc/nic.c b/drivers/net/sfc/nic.c
index 6c5c0ce..799e881 100644
--- a/drivers/net/sfc/nic.c
+++ b/drivers/net/sfc/nic.c
@@ -259,17 +259,31 @@ efx_fini_special_buffer(struct efx_nic *efx, struct efx_special_buffer *buffer)
*/
static int efx_alloc_special_buffer(struct efx_nic *efx,
struct efx_special_buffer *buffer,
- unsigned int len)
+ unsigned int len,
+ int node_id,
+ enum dma_data_direction dma_dir)
{
+ unsigned int order;
+
len = ALIGN(len, EFX_BUF_SIZE);
+ buffer->dma_dir = dma_dir;
- buffer->addr = dma_alloc_coherent(&efx->pci_dev->dev, len,
- &buffer->dma_addr, GFP_KERNEL);
- if (!buffer->addr)
+ order = order_base_2(DIV_ROUND_UP(len, PAGE_SIZE));
+ buffer->page = alloc_pages_node(node_id, GFP_KERNEL, order);
+ if (!buffer->page)
+ return -ENOMEM;
+
+ buffer->dma_addr = dma_map_page(&efx->pci_dev->dev, buffer->page, 0,
+ len, buffer->dma_dir);
+ if (unlikely(dma_mapping_error(&efx->pci_dev->dev, buffer->dma_addr))) {
+ __free_pages(buffer->page, order);
return -ENOMEM;
+ }
+ EFX_BUG_ON_PARANOID(buffer->dma_addr & (EFX_BUF_SIZE - 1));
+
+ buffer->addr = page_address(buffer->page);
buffer->len = len;
buffer->entries = len / EFX_BUF_SIZE;
- BUG_ON(buffer->dma_addr & (EFX_BUF_SIZE - 1));
/* All zeros is a potentially valid event so memset to 0xff */
memset(buffer->addr, 0xff, len);
@@ -291,6 +305,8 @@ static int efx_alloc_special_buffer(struct efx_nic *efx,
static void
efx_free_special_buffer(struct efx_nic *efx, struct efx_special_buffer *buffer)
{
+ unsigned int order;
+
if (!buffer->addr)
return;
@@ -301,8 +317,10 @@ efx_free_special_buffer(struct efx_nic *efx, struct efx_special_buffer *buffer)
(u64)buffer->dma_addr, buffer->len,
buffer->addr, (u64)virt_to_phys(buffer->addr));
- dma_free_coherent(&efx->pci_dev->dev, buffer->len, buffer->addr,
- buffer->dma_addr);
+ order = order_base_2(DIV_ROUND_UP(buffer->len, PAGE_SIZE));
+ dma_unmap_page(&efx->pci_dev->dev, buffer->dma_addr,
+ buffer->len, buffer->dma_dir);
+ __free_pages(buffer->page, order);
buffer->addr = NULL;
buffer->entries = 0;
}
@@ -401,8 +419,9 @@ int efx_nic_probe_tx(struct efx_tx_queue *tx_queue)
unsigned entries;
entries = tx_queue->ptr_mask + 1;
- return efx_alloc_special_buffer(efx, &tx_queue->txd,
- entries * sizeof(efx_qword_t));
+ return efx_alloc_special_buffer(
+ efx, &tx_queue->txd, entries * sizeof(efx_qword_t),
+ tx_queue->channel->txq_node, DMA_TO_DEVICE);
}
void efx_nic_init_tx(struct efx_tx_queue *tx_queue)
@@ -551,8 +570,9 @@ int efx_nic_probe_rx(struct efx_rx_queue *rx_queue)
unsigned entries;
entries = rx_queue->ptr_mask + 1;
- return efx_alloc_special_buffer(efx, &rx_queue->rxd,
- entries * sizeof(efx_qword_t));
+ return efx_alloc_special_buffer(
+ efx, &rx_queue->rxd, entries * sizeof(efx_qword_t),
+ efx_rx_queue_channel(rx_queue)->rxq_node, DMA_TO_DEVICE);
}
void efx_nic_init_rx(struct efx_rx_queue *rx_queue)
@@ -1080,8 +1100,9 @@ int efx_nic_probe_eventq(struct efx_channel *channel)
unsigned entries;
entries = channel->eventq_mask + 1;
- return efx_alloc_special_buffer(efx, &channel->eventq,
- entries * sizeof(efx_qword_t));
+ return efx_alloc_special_buffer(
+ efx, &channel->eventq, entries * sizeof(efx_qword_t),
+ channel->evq_node, DMA_BIDIRECTIONAL);
}
void efx_nic_init_eventq(struct efx_channel *channel)
--
1.7.2.1
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply related [flat|nested] 28+ messages in thread
* Re: [RFC][PATCH 1/3] IRQ: Add irq_get_numa_node()
2010-09-20 12:44 ` [RFC][PATCH 1/3] IRQ: Add irq_get_numa_node() Ben Hutchings
@ 2010-09-20 13:04 ` Eric Dumazet
0 siblings, 0 replies; 28+ messages in thread
From: Eric Dumazet @ 2010-09-20 13:04 UTC (permalink / raw)
To: Ben Hutchings
Cc: Michael S. Tsirkin, David Miller, therbert, shemminger, netdev
On Monday 20 September 2010 at 13:44 +0100, Ben Hutchings wrote:
> This will be used to support NUMA-aware memory allocation of
> structures used in interrupt handlers, starting with network drivers.
> ---
> irq_get_numa_node() probably needs to grab desc->lock. Other than that,
> this should work.
>
> Ben.
>
> include/linux/interrupt.h | 9 +++++++++
> kernel/irq/manage.c | 35 +++++++++++++++++++++++++++++++++++
> 2 files changed, 44 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index a0384a4..82e9a08 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -260,6 +260,15 @@ static inline int irq_set_affinity_hint(unsigned int irq,
> }
> #endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */
>
> +#if defined(CONFIG_NUMA) && defined(CONFIG_GENERIC_HARDIRQS)
> +extern int irq_get_numa_node(unsigned int irq);
> +#else
> +static inline int irq_get_numa_node(unsigned int irq)
> +{
> + return -1;
> +}
> +#endif
> +
> #ifdef CONFIG_GENERIC_HARDIRQS
> /*
> * Special lockdep variants of irq disabling/enabling.
> diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
> index c3003e9..03e683e 100644
> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -154,6 +154,41 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
> }
> EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
>
> +#ifdef CONFIG_NUMA
> +/**
> + * irq_get_numa_node - get the NUMA node of a given IRQ
> + * @irq: Interrupt to get NUMA node for
> + *
> + * If the current SMP affinity mask of the IRQ corresponds to a
> + * single NUMA node, return the node number. Otherwise return
> + * %NUMA_NO_NODE.
> + */
> +int irq_get_numa_node(unsigned int irq)
> +{
> + struct irq_desc *desc = irq_to_desc(irq);
> + const struct cpumask *mask;
> + int cpu, node = NUMA_NO_NODE;
> +
> + if (!desc)
> + return node;
> +
> + mask = desc->affinity;
> +#ifdef CONFIG_GENERIC_PENDING_IRQ
> + if (desc->status & IRQ_MOVE_PENDING)
> + mask = desc->pending_mask;
> +#endif
> +
> + for_each_cpu(cpu, mask) {
> + if (node < 0)
or : if (node == NUMA_NO_NODE)
> + node = cpu_to_node(cpu);
> + else if (node != cpu_to_node(cpu))
> + return NUMA_NO_NODE;
> + }
> +
> + return node;
> +}
> +#endif
> +
> #ifndef CONFIG_AUTO_IRQ_AFFINITY
> /*
> * Generic version of the affinity autoselector.
> --
> 1.7.2.1
>
>
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2010-09-02 1:32 ` David Miller
` (3 preceding siblings ...)
2010-09-16 21:52 ` Ben Hutchings
@ 2011-02-21 18:19 ` Ben Hutchings
2011-02-21 19:31 ` Jeremy Eder
2011-02-26 7:09 ` David Miller
4 siblings, 2 replies; 28+ messages in thread
From: Ben Hutchings @ 2011-02-21 18:19 UTC (permalink / raw)
To: David Miller; +Cc: therbert, eric.dumazet, shemminger, netdev
On Wed, 2010-09-01 at 18:32 -0700, David Miller wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Wed, 1 Sep 2010 09:24:18 -0700
>
> > On Wed, Sep 1, 2010 at 8:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >> 3) Eventually have a user selectable selection (socket option, or system
> >> wide, but one sysctl, not many bitmasks ;) ).
> >>
> > Right, but it would also be nice if a single sysctl could optimally
> > set up multiqueue, RSS, RPS, and all my interrupt affinities for me
> > ;-)
>
> It's becoming increasingly obvious to me that we need (somewhere,
> not necessarily the kernel) a complete datastructure representing
> the NUMA, cache, cpu, device hierarchy.
>
> And that can be used to tweak all of this stuff.
>
> The policy should probably be in userspace, we just need to provide
> the knobs in the kernel to tweak it however userspace wants.
>
> Userspace should be able to, for example, move a TX queue into a
> NUMA domain and have this invoke several side effects:
I think most of the pieces are now ready:
> 1) IRQs for that TX queue get rerouted to a cpu in the NUMA
> domain.
There is a longstanding procfs interface for IRQ affinity, and userland
infrastructure built on it. Adding a new interface would be contentious,
so I have tried to build on the existing one instead.
> 2) TX queue datastructures in the driver get reallocated using
> memory in that NUMA domain.
I've previously sent patches to add an ethtool API for NUMA control,
which include the option to allocate on the same node where IRQs are
handled. However, there is currently no function to allocate
DMA-coherent memory on a specified NUMA node (rather than the device's
node). This is likely to be beneficial for event rings and might be
good for descriptor rings for some devices. (The implementation I sent
for sfc mistakenly switched it to allocating non-coherent memory, for
which it *is* possible to specify the node.)
> 3) TX hashing is configured to use the set of cpus in the NUMA
> domain.
I posted patches for automatic XPS configuration at the end of last
week. And RFS acceleration covers the other direction.
Ben.
> It's a lot of tedious work and involves some delicate tasks figuring
> out where each of these things go, but really then we'd solve all
> of this crap once and for all.
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2011-02-21 18:19 ` [PATCH] xps-mq: Transmit Packet Steering for multiqueue Ben Hutchings
@ 2011-02-21 19:31 ` Jeremy Eder
2011-02-26 7:09 ` David Miller
1 sibling, 0 replies; 28+ messages in thread
From: Jeremy Eder @ 2011-02-21 19:31 UTC (permalink / raw)
To: netdev
On Mon, 2011-02-21 at 18:19 +0000, Ben Hutchings wrote:
> On Wed, 2010-09-01 at 18:32 -0700, David Miller wrote:
> > From: Tom Herbert <therbert@google.com>
> > Date: Wed, 1 Sep 2010 09:24:18 -0700
> >
> > > On Wed, Sep 1, 2010 at 8:54 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > >> 3) Eventually have a user selectable selection (socket option, or system
> > >> wide, but one sysctl, not many bitmasks ;) ).
> > >>
> > > Right, but it would also be nice if a single sysctl could optimally
> > > set up multiqueue, RSS, RPS, and all my interrupt affinities for me
> > > ;-)
Are cgroups the right place to have the network stack pull "guidance"
from?
If an app is bound to a socket/core/NUMA node, then the network stack
could inherit the cgroup's tuning and adjust kernel knobs accordingly.
This would not replace procfs tuning, but it seems like a natural
extension of how cgroups and RPS/RFS/XPS could be integrated to ease the
management burden that these new technologies might impose.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH] xps-mq: Transmit Packet Steering for multiqueue
2011-02-21 18:19 ` [PATCH] xps-mq: Transmit Packet Steering for multiqueue Ben Hutchings
2011-02-21 19:31 ` Jeremy Eder
@ 2011-02-26 7:09 ` David Miller
1 sibling, 0 replies; 28+ messages in thread
From: David Miller @ 2011-02-26 7:09 UTC (permalink / raw)
To: bhutchings; +Cc: therbert, eric.dumazet, shemminger, netdev
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 21 Feb 2011 18:19:55 +0000
> On Wed, 2010-09-01 at 18:32 -0700, David Miller wrote:
>> 2) TX queue datastructures in the driver get reallocated using
>> memory in that NUMA domain.
>
> I've previously sent patches to add an ethtool API for NUMA control,
> which include the option to allocate on the same node where IRQs are
> handled. However, there is currently no function to allocate
> DMA-coherent memory on a specified NUMA node (rather than the device's
> node). This is likely to be beneficial for event rings and might be
> good for descriptor rings for some devices. (The implementation I sent
> for sfc mistakenly switched it to allocating non-coherent memory, for
> which it *is* possible to specify the node.)
The thing to do is to work with someone like FUJITA Tomonori on this.
It's simply a matter of making new APIs that take the node specifier,
have the implementations either make use of or completely ignore the node,
and have the existing APIs pass in "-1" for the node or whatever the
CPP macro is for this :-)
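Something along these lines, purely as an illustration of the shape
(none of it exists yet):

void *dma_alloc_coherent_node(struct device *dev, size_t size,
                              dma_addr_t *dma_handle, gfp_t gfp,
                              int node);

/* Existing callers keep working:
 *   dma_alloc_coherent(dev, size, handle, gfp)
 * becomes a wrapper for
 *   dma_alloc_coherent_node(dev, size, handle, gfp, NUMA_NO_NODE)
 * and implementations are free to ignore the node entirely. */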
^ permalink raw reply [flat|nested] 28+ messages in thread
end of thread, other threads:[~2011-02-26 7:09 UTC | newest]
Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-23 5:39 [PATCH] xps-mq: Transmit Packet Steering for multiqueue Tom Herbert
2010-08-23 17:09 ` Ben Hutchings
[not found] ` <AANLkTinST5zaS0NtBjrzyLbsg=w_EVsHE3DCDcrmQNc6@mail.gmail.com>
2010-08-23 17:50 ` Ben Hutchings
2010-08-23 17:59 ` Stephen Hemminger
2010-09-01 15:41 ` Tom Herbert
2010-09-01 15:54 ` Eric Dumazet
2010-09-01 16:24 ` Tom Herbert
2010-09-02 1:32 ` David Miller
2010-09-02 1:48 ` Stephen Hemminger
2010-09-02 16:00 ` Loke, Chetan
2010-09-02 19:52 ` Tom Herbert
2010-09-02 23:17 ` Loke, Chetan
2010-09-02 1:56 ` Stephen Hemminger
2010-09-02 6:41 ` Greg Lindahl
2010-09-02 16:18 ` Loke, Chetan
2010-09-02 15:55 ` Loke, Chetan
2010-09-16 21:52 ` Ben Hutchings
2010-09-19 17:24 ` Michael S. Tsirkin
2010-09-20 12:44 ` [RFC][PATCH 1/3] IRQ: Add irq_get_numa_node() Ben Hutchings
2010-09-20 13:04 ` Eric Dumazet
2010-09-20 12:45 ` [RFC][PATCH 2/3] ethtool: NUMA affinity control Ben Hutchings
2010-09-20 12:48 ` [RFC][PATCH 3/3] sfc: Add support for " Ben Hutchings
2011-02-21 18:19 ` [PATCH] xps-mq: Transmit Packet Steering for multiqueue Ben Hutchings
2011-02-21 19:31 ` Jeremy Eder
2011-02-26 7:09 ` David Miller
2010-09-01 16:09 ` David Miller
2010-08-24 4:31 ` Bill Fink
2010-08-24 4:37 ` Tom Herbert