netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
@ 2010-08-03  3:02 Krishna Kumar
  2010-08-03  3:03 ` [PATCH v3 2/2] macvtap: Implement multiqueue macvtap driver Krishna Kumar
  2010-08-03  4:05 ` [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu Changli Gao
  0 siblings, 2 replies; 8+ messages in thread
From: Krishna Kumar @ 2010-08-03  3:02 UTC (permalink / raw)
  To: davem, arnd; +Cc: bhutchings, netdev, therbert, Krishna Kumar, mst

From: Krishna Kumar <krkumar2@in.ibm.com>

Factor out flow calculation code from get_rps_cpu, since macvtap
driver can use the same code.

Revisions:

v2 - Ben: Separate flow calculation out and use in select queue
v3 - Arnd: Don't re-implement MIN

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 include/linux/netdevice.h |    1 
 net/core/dev.c            |   94 ++++++++++++++++++++++--------------
 2 files changed, 59 insertions(+), 36 deletions(-)

diff -ruNp org/include/linux/netdevice.h new/include/linux/netdevice.h
--- org/include/linux/netdevice.h	2010-08-03 08:19:57.000000000 +0530
+++ new/include/linux/netdevice.h	2010-08-03 08:19:57.000000000 +0530
@@ -2253,6 +2253,7 @@ static inline const char *netdev_name(co
 	return dev->name;
 }
 
+extern int skb_calculate_flow(struct net_device *dev, struct sk_buff *skb);
 extern int netdev_printk(const char *level, const struct net_device *dev,
 			 const char *format, ...)
 	__attribute__ ((format (printf, 3, 4)));
diff -ruNp org/net/core/dev.c new/net/core/dev.c
--- org/net/core/dev.c	2010-08-03 08:19:57.000000000 +0530
+++ new/net/core/dev.c	2010-08-03 08:19:57.000000000 +0530
@@ -2263,51 +2263,24 @@ static inline void ____napi_schedule(str
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
 
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
 /*
- * get_rps_cpu is called from netif_receive_skb and returns the target
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * skb_calculate_flow: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. On success, returns a hash number (> 0),
+ * otherwise -1.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-		       struct rps_dev_flow **rflowp)
+int skb_calculate_flow(struct net_device *dev, struct sk_buff *skb)
 {
+	int hash = skb->rxhash;
 	struct ipv6hdr *ip6;
 	struct iphdr *ip;
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
-	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
-	int cpu = -1;
 	u8 ip_proto;
-	u16 tcpu;
 	u32 addr1, addr2, ihl;
 	union {
 		u32 v32;
 		u16 v16[2];
 	} ports;
 
-	if (skb_rx_queue_recorded(skb)) {
-		u16 index = skb_get_rx_queue(skb);
-		if (unlikely(index >= dev->num_rx_queues)) {
-			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
-				"on queue %u, but number of RX queues is %u\n",
-				dev->name, index, dev->num_rx_queues);
-			goto done;
-		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
-
-	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
-		goto done;
-
-	if (skb->rxhash)
+	if (hash)
 		goto got_hash; /* Skip hash computation on packet header */
 
 	switch (skb->protocol) {
@@ -2334,6 +2307,7 @@ static int get_rps_cpu(struct net_device
 	default:
 		goto done;
 	}
+
 	switch (ip_proto) {
 	case IPPROTO_TCP:
 	case IPPROTO_UDP:
@@ -2356,11 +2330,59 @@ static int get_rps_cpu(struct net_device
 	/* get a consistent hash (same value on both flow directions) */
 	if (addr2 < addr1)
 		swap(addr1, addr2);
-	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
-	if (!skb->rxhash)
-		skb->rxhash = 1;
+
+	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+	if (!hash)
+		hash = 1;
 
 got_hash:
+	return hash;
+
+done:
+	return -1;
+}
+EXPORT_SYMBOL(skb_calculate_flow);
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+		       struct rps_dev_flow **rflowp)
+{
+	struct netdev_rx_queue *rxqueue;
+	struct rps_map *map;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_sock_flow_table *sock_flow_table;
+	int cpu = -1;
+	u16 tcpu;
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+		if (unlikely(index >= dev->num_rx_queues)) {
+			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
+				"on queue %u, but number of RX queues is %u\n",
+				dev->name, index, dev->num_rx_queues);
+			goto done;
+		}
+		rxqueue = dev->_rx + index;
+	} else
+		rxqueue = dev->_rx;
+
+	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
+		goto done;
+
+	skb->rxhash = skb_calculate_flow(dev, skb);
+	if (skb->rxhash < 0)
+		goto done;
+
 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v3 2/2] macvtap: Implement multiqueue macvtap driver
  2010-08-03  3:02 [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu Krishna Kumar
@ 2010-08-03  3:03 ` Krishna Kumar
  2010-08-03  4:05 ` [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu Changli Gao
  1 sibling, 0 replies; 8+ messages in thread
From: Krishna Kumar @ 2010-08-03  3:03 UTC (permalink / raw)
  To: davem, arnd; +Cc: bhutchings, netdev, mst, Krishna Kumar, therbert

From: Krishna Kumar <krkumar2@in.ibm.com>

Implement multiqueue facility for macvtap driver. The idea is that
a macvtap device can be opened multiple times and the fd's can be
used to register eg, as backend for vhost.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 drivers/net/macvtap.c      |   89 ++++++++++++++++++++++++++++-------
 include/linux/if_macvlan.h |    9 +++
 2 files changed, 80 insertions(+), 18 deletions(-)

diff -ruNp org/include/linux/if_macvlan.h new/include/linux/if_macvlan.h
--- org/include/linux/if_macvlan.h	2010-08-03 08:19:57.000000000 +0530
+++ new/include/linux/if_macvlan.h	2010-08-03 08:20:39.000000000 +0530
@@ -40,6 +40,12 @@ struct macvlan_rx_stats {
 	unsigned long		rx_errors;
 };
 
+/*
+ * Maximum times a macvtap device can be opened. This can be used to
+ * configure the number of receive queue, e.g. for multiqueue virtio.
+ */
+#define MAX_MACVTAP_QUEUES	(NR_CPUS < 16 ? NR_CPUS : 16)
+
 struct macvlan_dev {
 	struct net_device	*dev;
 	struct list_head	list;
@@ -50,7 +56,8 @@ struct macvlan_dev {
 	enum macvlan_mode	mode;
 	int (*receive)(struct sk_buff *skb);
 	int (*forward)(struct net_device *dev, struct sk_buff *skb);
-	struct macvtap_queue	*tap;
+	struct macvtap_queue	*taps[MAX_MACVTAP_QUEUES];
+	int			numvtaps;
 };
 
 static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
diff -ruNp org/drivers/net/macvtap.c new/drivers/net/macvtap.c
--- org/drivers/net/macvtap.c	2010-08-03 08:19:57.000000000 +0530
+++ new/drivers/net/macvtap.c	2010-08-03 08:19:57.000000000 +0530
@@ -84,26 +84,45 @@ static const struct proto_ops macvtap_so
 static DEFINE_SPINLOCK(macvtap_lock);
 
 /*
- * Choose the next free queue, for now there is only one
+ * get_slot: return a [unused/occupied] slot in vlan->taps[]:
+ *	- if 'q' is NULL, return the first empty slot;
+ *	- otherwise, return the slot this pointer occupies.
  */
+static int get_slot(struct macvlan_dev *vlan, struct macvtap_queue *q)
+{
+	int i;
+
+	for (i = 0; i < MAX_MACVTAP_QUEUES; i++) {
+		if (rcu_dereference(vlan->taps[i]) == q)
+			return i;
+	}
+
+	/* Should never happen */
+	BUG_ON(1);
+}
+
 static int macvtap_set_queue(struct net_device *dev, struct file *file,
 				struct macvtap_queue *q)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
+	int index;
 	int err = -EBUSY;
 
 	spin_lock(&macvtap_lock);
-	if (rcu_dereference(vlan->tap))
+	if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
 		goto out;
 
 	err = 0;
+	index = get_slot(vlan, NULL);
 	rcu_assign_pointer(q->vlan, vlan);
-	rcu_assign_pointer(vlan->tap, q);
+	rcu_assign_pointer(vlan->taps[index], q);
 	sock_hold(&q->sk);
 
 	q->file = file;
 	file->private_data = q;
 
+	vlan->numvtaps++;
+
 out:
 	spin_unlock(&macvtap_lock);
 	return err;
@@ -124,9 +143,12 @@ static void macvtap_put_queue(struct mac
 	spin_lock(&macvtap_lock);
 	vlan = rcu_dereference(q->vlan);
 	if (vlan) {
-		rcu_assign_pointer(vlan->tap, NULL);
+		int index = get_slot(vlan, q);
+
+		rcu_assign_pointer(vlan->taps[index], NULL);
 		rcu_assign_pointer(q->vlan, NULL);
 		sock_put(&q->sk);
+		--vlan->numvtaps;
 	}
 
 	spin_unlock(&macvtap_lock);
@@ -136,39 +158,72 @@ static void macvtap_put_queue(struct mac
 }
 
 /*
- * Since we only support one queue, just dereference the pointer.
+ * Select a queue based on the rxq of the device on which this packet
+ * arrived. If the incoming device is not mq, calculate a flow hash to
+ * select a queue. vlan->numvtaps is cached in case it reduces during
+ * the execution of this function.
  */
 static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
 					       struct sk_buff *skb)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvtap_queue *tap = NULL;
+	int numvtaps = vlan->numvtaps;
+	u16 rxq;
+
+	if (!numvtaps)
+		goto out;
+
+	if (likely(skb_rx_queue_recorded(skb))) {
+		rxq = skb_get_rx_queue(skb);
+
+		while (unlikely(rxq >= numvtaps))
+			rxq -= numvtaps;
 
-	return rcu_dereference(vlan->tap);
+		tap = rcu_dereference(vlan->taps[rxq]);
+		if (tap)
+			goto out;
+	}
+
+	rxq = skb_calculate_flow(dev, skb);
+	if (rxq < 0)
+		rxq = smp_processor_id();
+
+	tap = rcu_dereference(vlan->taps[rxq & (numvtaps - 1)]);
+
+out:
+	return tap;
 }
 
 /*
  * The net_device is going away, give up the reference
- * that it holds on the queue (all the queues one day)
- * and safely set the pointer from the queues to NULL.
+ * that it holds on all queues and safely set the pointer
+ * from the queues to NULL.
  */
 static void macvtap_del_queues(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct macvtap_queue *q;
+	struct macvtap_queue *q, *qlist[MAX_MACVTAP_QUEUES];
+	int i, j = 0;
 
+	/* macvtap_put_queue can free some slots, so go through all slots */
 	spin_lock(&macvtap_lock);
-	q = rcu_dereference(vlan->tap);
-	if (!q) {
-		spin_unlock(&macvtap_lock);
-		return;
+	for (i = 0; i < MAX_MACVTAP_QUEUES && vlan->numvtaps; i++) {
+		q = rcu_dereference(vlan->taps[i]);
+		if (q) {
+			qlist[j++] = q;
+			rcu_assign_pointer(vlan->taps[i], NULL);
+			rcu_assign_pointer(q->vlan, NULL);
+			vlan->numvtaps--;
+		}
 	}
-
-	rcu_assign_pointer(vlan->tap, NULL);
-	rcu_assign_pointer(q->vlan, NULL);
+	BUG_ON(vlan->numvtaps != 0);
 	spin_unlock(&macvtap_lock);
 
 	synchronize_rcu();
-	sock_put(&q->sk);
+
+	for (--j; j >= 0; j--)
+		sock_put(&qlist[j]->sk);
 }
 
 /*

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
  2010-08-03  3:02 [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu Krishna Kumar
  2010-08-03  3:03 ` [PATCH v3 2/2] macvtap: Implement multiqueue macvtap driver Krishna Kumar
@ 2010-08-03  4:05 ` Changli Gao
  2010-08-03  5:57   ` Krishna Kumar2
  1 sibling, 1 reply; 8+ messages in thread
From: Changli Gao @ 2010-08-03  4:05 UTC (permalink / raw)
  To: Krishna Kumar; +Cc: davem, arnd, bhutchings, netdev, therbert, mst

On Tue, Aug 3, 2010 at 11:02 AM, Krishna Kumar <krkumar2@in.ibm.com> wrote:
> From: Krishna Kumar <krkumar2@in.ibm.com>
>
> Factor out flow calculation code from get_rps_cpu, since macvtap
> driver can use the same code.
>
> Revisions:
>
> v2 - Ben: Separate flow calculation out and use in select queue
> v3 - Arnd: Don't re-implement MIN
>
> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> ---
>  include/linux/netdevice.h |    1
>  net/core/dev.c            |   94 ++++++++++++++++++++++--------------
>  2 files changed, 59 insertions(+), 36 deletions(-)
>
> diff -ruNp org/include/linux/netdevice.h new/include/linux/netdevice.h
> --- org/include/linux/netdevice.h       2010-08-03 08:19:57.000000000 +0530
> +++ new/include/linux/netdevice.h       2010-08-03 08:19:57.000000000 +0530
> @@ -2253,6 +2253,7 @@ static inline const char *netdev_name(co
>        return dev->name;
>  }
>
> +extern int skb_calculate_flow(struct net_device *dev, struct sk_buff *skb);
>  extern int netdev_printk(const char *level, const struct net_device *dev,
>                         const char *format, ...)
>        __attribute__ ((format (printf, 3, 4)));
> diff -ruNp org/net/core/dev.c new/net/core/dev.c
> --- org/net/core/dev.c  2010-08-03 08:19:57.000000000 +0530
> +++ new/net/core/dev.c  2010-08-03 08:19:57.000000000 +0530
> @@ -2263,51 +2263,24 @@ static inline void ____napi_schedule(str
>        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>  }
>
> -#ifdef CONFIG_RPS
> -
> -/* One global table that all flow-based protocols share. */
> -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
> -EXPORT_SYMBOL(rps_sock_flow_table);
> -
>  /*
> - * get_rps_cpu is called from netif_receive_skb and returns the target
> - * CPU from the RPS map of the receiving queue for a given skb.
> - * rcu_read_lock must be held on entry.
> + * skb_calculate_flow: calculate a flow hash based on src/dst addresses
> + * and src/dst port numbers. On success, returns a hash number (> 0),
> + * otherwise -1.
>  */
> -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> -                      struct rps_dev_flow **rflowp)
> +int skb_calculate_flow(struct net_device *dev, struct sk_buff *skb)
>  {
> +       int hash = skb->rxhash;
>        struct ipv6hdr *ip6;
>        struct iphdr *ip;
> -       struct netdev_rx_queue *rxqueue;
> -       struct rps_map *map;
> -       struct rps_dev_flow_table *flow_table;
> -       struct rps_sock_flow_table *sock_flow_table;
> -       int cpu = -1;
>        u8 ip_proto;
> -       u16 tcpu;
>        u32 addr1, addr2, ihl;
>        union {
>                u32 v32;
>                u16 v16[2];
>        } ports;
>
> -       if (skb_rx_queue_recorded(skb)) {
> -               u16 index = skb_get_rx_queue(skb);
> -               if (unlikely(index >= dev->num_rx_queues)) {
> -                       WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
> -                               "on queue %u, but number of RX queues is %u\n",
> -                               dev->name, index, dev->num_rx_queues);
> -                       goto done;
> -               }
> -               rxqueue = dev->_rx + index;
> -       } else
> -               rxqueue = dev->_rx;
> -
> -       if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
> -               goto done;
> -
> -       if (skb->rxhash)
> +       if (hash)
>                goto got_hash; /* Skip hash computation on packet header */
>
>        switch (skb->protocol) {
> @@ -2334,6 +2307,7 @@ static int get_rps_cpu(struct net_device
>        default:
>                goto done;
>        }
> +
>        switch (ip_proto) {
>        case IPPROTO_TCP:
>        case IPPROTO_UDP:
> @@ -2356,11 +2330,59 @@ static int get_rps_cpu(struct net_device
>        /* get a consistent hash (same value on both flow directions) */
>        if (addr2 < addr1)
>                swap(addr1, addr2);
> -       skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
> -       if (!skb->rxhash)
> -               skb->rxhash = 1;
> +
> +       hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
> +       if (!hash)
> +               hash = 1;
>
>  got_hash:
> +       return hash;
> +
> +done:
> +       return -1;
> +}
> +EXPORT_SYMBOL(skb_calculate_flow);

I have noticed that you use skb_calculate_flow() in
macvtap_get_queue(), where skb->data doesn't point to the network
header but to the ethernet header. However, skb_calculate_flow() assumes
skb->data points to the network header. There are two choices:
 * update skb_calculate_flow to support being called in the ethernet layer.
 * pull skb before skb_calculate_flow, and push skb after
skb_calculate_flow() in macvtap_get_queue().

I prefer the former way.

BTW: the function name skb_calculate_flow isn't good. How about
skb_get_rxhash()? Maybe we can implement two versions: fast path and
slow path, and implement the fast-path version as an inline function in
skbuff.h.

static inline u32 skb_get_rxhash(struct sk_buff *skb)
{
        u32 rxhash;

        rxhash = skb->rxhash;
        if (!rxhash)
                return __skb_get_rxhash(skb);
        return rxhash;
}


> +
> +#ifdef CONFIG_RPS
> +
> +/* One global table that all flow-based protocols share. */
> +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
> +EXPORT_SYMBOL(rps_sock_flow_table);
> +
> +/*
> + * get_rps_cpu is called from netif_receive_skb and returns the target
> + * CPU from the RPS map of the receiving queue for a given skb.
> + * rcu_read_lock must be held on entry.
> + */
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> +                      struct rps_dev_flow **rflowp)
> +{
> +       struct netdev_rx_queue *rxqueue;
> +       struct rps_map *map;
> +       struct rps_dev_flow_table *flow_table;
> +       struct rps_sock_flow_table *sock_flow_table;
> +       int cpu = -1;
> +       u16 tcpu;
> +
> +       if (skb_rx_queue_recorded(skb)) {
> +               u16 index = skb_get_rx_queue(skb);
> +               if (unlikely(index >= dev->num_rx_queues)) {
> +                       WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
> +                               "on queue %u, but number of RX queues is %u\n",
> +                               dev->name, index, dev->num_rx_queues);
> +                       goto done;
> +               }
> +               rxqueue = dev->_rx + index;
> +       } else
> +               rxqueue = dev->_rx;
> +
> +       if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
> +               goto done;
> +
> +       skb->rxhash = skb_calculate_flow(dev, skb);
> +       if (skb->rxhash < 0)
> +               goto done;
> +
>        flow_table = rcu_dereference(rxqueue->rps_flow_table);
>        sock_flow_table = rcu_dereference(rps_sock_flow_table);
>        if (flow_table && sock_flow_table) {
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>



-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
  2010-08-03  4:05 ` [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu Changli Gao
@ 2010-08-03  5:57   ` Krishna Kumar2
  2010-08-03  6:11     ` Changli Gao
  0 siblings, 1 reply; 8+ messages in thread
From: Krishna Kumar2 @ 2010-08-03  5:57 UTC (permalink / raw)
  To: Changli Gao; +Cc: arnd, bhutchings, davem, mst, netdev, therbert

Hi Changli,

Good catch.

Instead of adding support for ethernet header or pull/push,
I could defer the skb_push(ETH_HLEN), something like:

static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
	struct macvtap_queue *q = macvtap_get_queue(dev, skb);
	if (!q)
		goto drop;

	if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
		goto drop;

+	skb_push(skb, ETH_HLEN);
	...
}

and remove the same in macvtap_receive. Will this be better?

Your other suggestions also look good.

Thanks,

- KK

Changli Gao <xiaosuo@gmail.com> wrote on 08/03/2010 09:35:34 AM:

> Changli Gao <xiaosuo@gmail.com>
> 08/03/2010 09:35 AM
>
> To
>
> Krishna Kumar2/India/IBM@IBMIN
>
> cc
>
> davem@davemloft.net, arnd@arndb.de, bhutchings@solarflare.com,
> netdev@vger.kernel.org, therbert@google.com, mst@redhat.com
>
> Subject
>
> Re: [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
>
> On Tue, Aug 3, 2010 at 11:02 AM, Krishna Kumar <krkumar2@in.ibm.com>
wrote:
> > From: Krishna Kumar <krkumar2@in.ibm.com>
> >
> > Factor out flow calculation code from get_rps_cpu, since macvtap
> > driver can use the same code.
> >
> > Revisions:
> >
> > v2 - Ben: Separate flow calculation out and use in select queue
> > v3 - Arnd: Don't re-implement MIN
> >
> > Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> > ---
> >  include/linux/netdevice.h |    1
> >  net/core/dev.c            |   94 ++++++++++++++++++++++--------------
> >  2 files changed, 59 insertions(+), 36 deletions(-)
> >
> > diff -ruNp org/include/linux/netdevice.h new/include/linux/netdevice.h
> > --- org/include/linux/netdevice.h       2010-08-03 08:19:57.000000000
+0530
> > +++ new/include/linux/netdevice.h       2010-08-03 08:19:57.000000000
+0530
> > @@ -2253,6 +2253,7 @@ static inline const char *netdev_name(co
> >        return dev->name;
> >  }
> >
> > +extern int skb_calculate_flow(struct net_device *dev, struct sk_buff
*skb);
> >  extern int netdev_printk(const char *level, const struct net_device
*dev,
> >                         const char *format, ...)
> >        __attribute__ ((format (printf, 3, 4)));
> > diff -ruNp org/net/core/dev.c new/net/core/dev.c
> > --- org/net/core/dev.c  2010-08-03 08:19:57.000000000 +0530
> > +++ new/net/core/dev.c  2010-08-03 08:19:57.000000000 +0530
> > @@ -2263,51 +2263,24 @@ static inline void ____napi_schedule(str
> >        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
> >  }
> >
> > -#ifdef CONFIG_RPS
> > -
> > -/* One global table that all flow-based protocols share. */
> > -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
> > -EXPORT_SYMBOL(rps_sock_flow_table);
> > -
> >  /*
> > - * get_rps_cpu is called from netif_receive_skb and returns the target
> > - * CPU from the RPS map of the receiving queue for a given skb.
> > - * rcu_read_lock must be held on entry.
> > + * skb_calculate_flow: calculate a flow hash based on src/dst
addresses
> > + * and src/dst port numbers. On success, returns a hash number (> 0),
> > + * otherwise -1.
> >  */
> > -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> > -                      struct rps_dev_flow **rflowp)
> > +int skb_calculate_flow(struct net_device *dev, struct sk_buff *skb)
> >  {
> > +       int hash = skb->rxhash;
> >        struct ipv6hdr *ip6;
> >        struct iphdr *ip;
> > -       struct netdev_rx_queue *rxqueue;
> > -       struct rps_map *map;
> > -       struct rps_dev_flow_table *flow_table;
> > -       struct rps_sock_flow_table *sock_flow_table;
> > -       int cpu = -1;
> >        u8 ip_proto;
> > -       u16 tcpu;
> >        u32 addr1, addr2, ihl;
> >        union {
> >                u32 v32;
> >                u16 v16[2];
> >        } ports;
> >
> > -       if (skb_rx_queue_recorded(skb)) {
> > -               u16 index = skb_get_rx_queue(skb);
> > -               if (unlikely(index >= dev->num_rx_queues)) {
> > -                       WARN_ONCE(dev->num_rx_queues > 1, "%s
> received packet "
> > -                               "on queue %u, but number of RX
> queues is %u\n",
> > -                               dev->name, index, dev->num_rx_queues);
> > -                       goto done;
> > -               }
> > -               rxqueue = dev->_rx + index;
> > -       } else
> > -               rxqueue = dev->_rx;
> > -
> > -       if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
> > -               goto done;
> > -
> > -       if (skb->rxhash)
> > +       if (hash)
> >                goto got_hash; /* Skip hash computation on packet header
*/
> >
> >        switch (skb->protocol) {
> > @@ -2334,6 +2307,7 @@ static int get_rps_cpu(struct net_device
> >        default:
> >                goto done;
> >        }
> > +
> >        switch (ip_proto) {
> >        case IPPROTO_TCP:
> >        case IPPROTO_UDP:
> > @@ -2356,11 +2330,59 @@ static int get_rps_cpu(struct net_device
> >        /* get a consistent hash (same value on both flow directions) */
> >        if (addr2 < addr1)
> >                swap(addr1, addr2);
> > -       skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
> > -       if (!skb->rxhash)
> > -               skb->rxhash = 1;
> > +
> > +       hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
> > +       if (!hash)
> > +               hash = 1;
> >
> >  got_hash:
> > +       return hash;
> > +
> > +done:
> > +       return -1;
> > +}
> > +EXPORT_SYMBOL(skb_calculate_flow);
>
> I have noticed that you use skb_calculate_flow() in
> macvtap_get_queue(), where skb->data doesn't point to the network
> header but to the ethernet header. However, skb_calculate_flow() assumes
> skb->data points to the network header. There are two choices:
>  * update skb_calculate_flow to support being called in the ethernet layer.
>  * pull skb before skb_calculate_flow, and push skb after
> skb_calculate_flow() in macvtap_get_queue().
>
> I prefer the former way.
>
> BTW: the function name skb_calculate_flow isn't good. How about
> skb_get_rxhash()? Maybe we can implement two versions: fast path and
> slow path, and implement the fast-path version as an inline function in
> skbuff.h.
>
> static inline u32 skb_get_rxhash(struct sk_buff *skb)
> {
>         u32 rxhash;
>
>         rxhash = skb->rxhash;
>         if (!rxhash)
>                 return __skb_get_rxhash(skb);
>         return rxhash;
> }
>
>
> > +
> > +#ifdef CONFIG_RPS
> > +
> > +/* One global table that all flow-based protocols share. */
> > +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
> > +EXPORT_SYMBOL(rps_sock_flow_table);
> > +
> > +/*
> > + * get_rps_cpu is called from netif_receive_skb and returns the target
> > + * CPU from the RPS map of the receiving queue for a given skb.
> > + * rcu_read_lock must be held on entry.
> > + */
> > +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> > +                      struct rps_dev_flow **rflowp)
> > +{
> > +       struct netdev_rx_queue *rxqueue;
> > +       struct rps_map *map;
> > +       struct rps_dev_flow_table *flow_table;
> > +       struct rps_sock_flow_table *sock_flow_table;
> > +       int cpu = -1;
> > +       u16 tcpu;
> > +
> > +       if (skb_rx_queue_recorded(skb)) {
> > +               u16 index = skb_get_rx_queue(skb);
> > +               if (unlikely(index >= dev->num_rx_queues)) {
> > +                       WARN_ONCE(dev->num_rx_queues > 1, "%s
> received packet "
> > +                               "on queue %u, but number of RX
> queues is %u\n",
> > +                               dev->name, index, dev->num_rx_queues);
> > +                       goto done;
> > +               }
> > +               rxqueue = dev->_rx + index;
> > +       } else
> > +               rxqueue = dev->_rx;
> > +
> > +       if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
> > +               goto done;
> > +
> > +       skb->rxhash = skb_calculate_flow(dev, skb);
> > +       if (skb->rxhash < 0)
> > +               goto done;
> > +
> >        flow_table = rcu_dereference(rxqueue->rps_flow_table);
> >        sock_flow_table = rcu_dereference(rps_sock_flow_table);
> >        if (flow_table && sock_flow_table) {
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
>
>
>
> --
> Regards,
> Changli Gao(xiaosuo@gmail.com)


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
  2010-08-03  5:57   ` Krishna Kumar2
@ 2010-08-03  6:11     ` Changli Gao
  2010-08-03  7:18       ` Changli Gao
  0 siblings, 1 reply; 8+ messages in thread
From: Changli Gao @ 2010-08-03  6:11 UTC (permalink / raw)
  To: Krishna Kumar2; +Cc: arnd, bhutchings, davem, mst, netdev, therbert

On Tue, Aug 3, 2010 at 1:57 PM, Krishna Kumar2 <krkumar2@in.ibm.com> wrote:
> Hi Changli,
>
> Good catch.
>
> Instead of adding support for ethernet header or pull/push,
> I could defer the skb_push(ETH_HLEN), something like:
>
> static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
> {
>        struct macvtap_queue *q = macvtap_get_queue(dev, skb);
>        if (!q)
>                goto drop;
>
>        if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
>                goto drop;
>
> +       skb_push(skb, ETH_HLEN);
>        ...
> }
>
> and remove the same in macvtap_receive. Will this be better?
>

I am confused by the call sites of macvlan_dev.receive and
macvlan_dev.forward. They both are possible to be called in both
RX(skb->data points to network header) and TX(skb->data points to
ethernet) paths. The current code in macvtap shows that
macvlan_dev.receive should be called in network layer, and
macvlan_dev.forward should be called in dev layer. Am I correct?


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
  2010-08-03  6:11     ` Changli Gao
@ 2010-08-03  7:18       ` Changli Gao
  2010-08-03  8:32         ` Arnd Bergmann
  0 siblings, 1 reply; 8+ messages in thread
From: Changli Gao @ 2010-08-03  7:18 UTC (permalink / raw)
  To: Krishna Kumar2; +Cc: arnd, bhutchings, davem, mst, netdev, therbert

On Tue, Aug 3, 2010 at 2:11 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> On Tue, Aug 3, 2010 at 1:57 PM, Krishna Kumar2 <krkumar2@in.ibm.com> wrote:
>> Hi Changli,
>>
>> Good catch.
>>
>> Instead of adding support for ethernet header or pull/push,
>> I could defer the skb_push(ETH_HLEN), something like:
>>
>> static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
>> {
>>        struct macvtap_queue *q = macvtap_get_queue(dev, skb);
>>        if (!q)
>>                goto drop;
>>
>>        if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
>>                goto drop;
>>
>> +       skb_push(skb, ETH_HLEN);
>>        ...
>> }
>>
>> and remove the same in macvtap_receive. Will this be better?
>>
>
> I am confused by the call sites of macvlan_dev.receive and
> macvlan_dev.forward. They both are possible to be called in both
> RX(skb->data points to network header) and TX(skb->data points to
> ethernet) paths. The current code in macvtap shows that
> macvlan_dev.receive should be called in network layer, and
> macvlan_dev.forward should be called in dev layer. Am I correct?
>

After checking the code carefully, I find macvlan_dev.receive is
called in the network layer (RX path), and macvlan_dev.forward is called
in the dev layer (TX path). The current code doesn't have any issue.
Your solution won't work, as macvtap_forward is called in the dev layer,
when skb->data already points to the ethernet header.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
  2010-08-03  7:18       ` Changli Gao
@ 2010-08-03  8:32         ` Arnd Bergmann
  2010-08-03 22:36           ` Sridhar Samudrala
  0 siblings, 1 reply; 8+ messages in thread
From: Arnd Bergmann @ 2010-08-03  8:32 UTC (permalink / raw)
  To: Changli Gao; +Cc: Krishna Kumar2, bhutchings, davem, mst, netdev, therbert

On Tuesday 03 August 2010, Changli Gao wrote:
> > I am confused by the call sites of macvlan_dev.receive and
> > macvlan_dev.forward. They both are possible to be called in both
> > RX(skb->data points to network header) and TX(skb->data points to
> > ethernet) paths. The current code in macvtap shows that
> > macvlan_dev.receive should be called in network layer, and
> > macvlan_dev.forward should be called in dev layer. Am I correct?
> >
> 
> After checking the code carefully, I find macvlan_dev.receive is
> called in network layer(RX path), and macvlan_dev.forward is called in
> dev layer(TX path). The current code hasn't any issue. Your solution
> won't work, as macvtap_forward is called in dev layer, when skb->data
> points to ethernet header already.

Yes, that is correct. Forward is used for "bridge" mode, where we
can send data between two macvlan/macvtap endpoints directly, as
opposed to the default "vepa" mode, where all data is always sent
out to the lowerdev and may be returned by an adjacent switch in
hairpin configuration.

	Arnd

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
  2010-08-03  8:32         ` Arnd Bergmann
@ 2010-08-03 22:36           ` Sridhar Samudrala
  0 siblings, 0 replies; 8+ messages in thread
From: Sridhar Samudrala @ 2010-08-03 22:36 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Changli Gao, Krishna Kumar2, bhutchings, davem, mst, netdev,
	therbert

On Tue, 2010-08-03 at 10:32 +0200, Arnd Bergmann wrote:
> On Tuesday 03 August 2010, Changli Gao wrote:
> > > I am confused by the call sites of macvlan_dev.receive and
> > > macvlan_dev.forward. They both are possible to be called in both
> > > RX(skb->data points to network header) and TX(skb->data points to
> > > ethernet) paths. The current code in macvtap shows that
> > > macvlan_dev.receive should be called in network layer, and
> > > macvlan_dev.forward should be called in dev layer. Am I correct?
> > >
> > 
> > After checking the code carefully, I find macvlan_dev.receive is
> > called in the network layer (RX path), and macvlan_dev.forward is called
> > in the dev layer (TX path). The current code doesn't have any issue.
> > Your solution won't work, as macvtap_forward is called in the dev layer,
> > when skb->data already points to the ethernet header.
> 
> Yes, that is correct. Forward is used for "bridge" mode, where we
> can send data between two macvlan/macvtap endpoints directly, as
> opposed to the default "vepa" mode, where all data is always sent
> out to the lowerdev and may be returned by an adjacent switch in
> hairpin configuration.

macvtap_forward() is also used in the normal receive path as
macvtap_receive() has a direct call to macvtap_forward() after 
updating skb->data to point to the ethernet header.

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2010-08-03 22:36 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-08-03  3:02 [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu Krishna Kumar
2010-08-03  3:03 ` [PATCH v3 2/2] macvtap: Implement multiqueue macvtap driver Krishna Kumar
2010-08-03  4:05 ` [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu Changli Gao
2010-08-03  5:57   ` Krishna Kumar2
2010-08-03  6:11     ` Changli Gao
2010-08-03  7:18       ` Changli Gao
2010-08-03  8:32         ` Arnd Bergmann
2010-08-03 22:36           ` Sridhar Samudrala

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).