* [net-next rfc V2 1/8] macvtap: do not add self to waitqueue if doing a nonblock read
2013-05-31 9:53 [net-next rfc V2 0/8] Multiqueue API for macvtap Jason Wang
@ 2013-05-31 9:53 ` Jason Wang
2013-05-31 9:53 ` [net-next rfc V2 2/8] macvlan: switch to use IS_ENABLED() Jason Wang
` (6 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Jason Wang @ 2013-05-31 9:53 UTC (permalink / raw)
To: davem, netdev, linux-kernel, mst; +Cc: Jason Wang
There's no need to add self to waitqueue if doing a nonblock read. This could
help to avoid the spinlock contention.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/macvtap.c | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 59e9605..2395062 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -845,7 +845,9 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
ssize_t ret = 0;
while (len) {
- prepare_to_wait(sk_sleep(&q->sk), &wait, TASK_INTERRUPTIBLE);
+ if (!noblock)
+ prepare_to_wait(sk_sleep(&q->sk), &wait,
+ TASK_INTERRUPTIBLE);
/* Read frames from the queue */
skb = skb_dequeue(&q->sk.sk_receive_queue);
@@ -867,7 +869,8 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
break;
}
- finish_wait(sk_sleep(&q->sk), &wait);
+ if (!noblock)
+ finish_wait(sk_sleep(&q->sk), &wait);
return ret;
}
--
1.7.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [net-next rfc V2 2/8] macvlan: switch to use IS_ENABLED()
2013-05-31 9:53 [net-next rfc V2 0/8] Multiqueue API for macvtap Jason Wang
2013-05-31 9:53 ` [net-next rfc V2 1/8] macvtap: do not add self to waitqueue if doing a nonblock read Jason Wang
@ 2013-05-31 9:53 ` Jason Wang
2013-05-31 9:53 ` [net-next rfc V2 3/8] macvtap: introduce macvtap_get_vlan() Jason Wang
` (5 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Jason Wang @ 2013-05-31 9:53 UTC (permalink / raw)
To: davem, netdev, linux-kernel, mst; +Cc: Jason Wang
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/linux/if_macvlan.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 84dde1d..e47ad46 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -8,7 +8,7 @@
#include <net/netlink.h>
#include <linux/u64_stats_sync.h>
-#if defined(CONFIG_MACVTAP) || defined(CONFIG_MACVTAP_MODULE)
+#if IS_ENABLED(CONFIG_MACVTAP)
struct socket *macvtap_get_socket(struct file *);
#else
#include <linux/err.h>
--
1.7.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [net-next rfc V2 3/8] macvtap: introduce macvtap_get_vlan()
2013-05-31 9:53 [net-next rfc V2 0/8] Multiqueue API for macvtap Jason Wang
2013-05-31 9:53 ` [net-next rfc V2 1/8] macvtap: do not add self to waitqueue if doing a nonblock read Jason Wang
2013-05-31 9:53 ` [net-next rfc V2 2/8] macvlan: switch to use IS_ENABLED() Jason Wang
@ 2013-05-31 9:53 ` Jason Wang
2013-05-31 9:53 ` [net-next rfc V2 4/8] macvlan: change the max number of queues to 16 Jason Wang
` (4 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Jason Wang @ 2013-05-31 9:53 UTC (permalink / raw)
To: davem, netdev, linux-kernel, mst; +Cc: Jason Wang
Factor out the device holding logic to a macvtap_get_vlan(), this will be also
used by multiqueue API.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/macvtap.c | 27 ++++++++++++++++++++-------
1 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 2395062..56b38e9 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -893,6 +893,24 @@ out:
return ret;
}
+static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
+{
+ struct macvlan_dev *vlan;
+
+ rcu_read_lock_bh();
+ vlan = rcu_dereference_bh(q->vlan);
+ if (vlan)
+ dev_hold(vlan->dev);
+ rcu_read_unlock_bh();
+
+ return vlan;
+}
+
+static void macvtap_put_vlan(struct macvlan_dev *vlan)
+{
+ dev_put(vlan->dev);
+}
+
/*
* provide compatibility with generic tun/tap interface
*/
@@ -924,12 +942,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
return ret;
case TUNGETIFF:
- rcu_read_lock_bh();
- vlan = rcu_dereference_bh(q->vlan);
- if (vlan)
- dev_hold(vlan->dev);
- rcu_read_unlock_bh();
-
+ vlan = macvtap_get_vlan(q);
if (!vlan)
return -ENOLINK;
@@ -937,7 +950,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
put_user(q->flags, &ifr->ifr_flags))
ret = -EFAULT;
- dev_put(vlan->dev);
+ macvtap_put_vlan(vlan);
return ret;
case TUNGETFEATURES:
--
1.7.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [net-next rfc V2 4/8] macvlan: change the max number of queues to 16
2013-05-31 9:53 [net-next rfc V2 0/8] Multiqueue API for macvtap Jason Wang
` (2 preceding siblings ...)
2013-05-31 9:53 ` [net-next rfc V2 3/8] macvtap: introduce macvtap_get_vlan() Jason Wang
@ 2013-05-31 9:53 ` Jason Wang
2013-05-31 9:53 ` [net-next rfc V2 5/8] macvtap: eliminate linear search Jason Wang
` (3 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Jason Wang @ 2013-05-31 9:53 UTC (permalink / raw)
To: davem, netdev, linux-kernel, mst; +Cc: Jason Wang
Macvtap should be at least compatible with tap, so change the max number to 16.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/linux/if_macvlan.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index e47ad46..62d8bda 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -50,7 +50,7 @@ struct macvlan_pcpu_stats {
* Maximum times a macvtap device can be opened. This can be used to
* configure the number of receive queue, e.g. for multiqueue virtio.
*/
-#define MAX_MACVTAP_QUEUES (NR_CPUS < 16 ? NR_CPUS : 16)
+#define MAX_MACVTAP_QUEUES 16
#define MACVLAN_MC_FILTER_BITS 8
#define MACVLAN_MC_FILTER_SZ (1 << MACVLAN_MC_FILTER_BITS)
--
1.7.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [net-next rfc V2 5/8] macvtap: eliminate linear search
2013-05-31 9:53 [net-next rfc V2 0/8] Multiqueue API for macvtap Jason Wang
` (3 preceding siblings ...)
2013-05-31 9:53 ` [net-next rfc V2 4/8] macvlan: change the max number of queues to 16 Jason Wang
@ 2013-05-31 9:53 ` Jason Wang
2013-05-31 9:53 ` [net-next rfc V2 6/8] macvtap: allow TUNSETIFF to create multiqueue device Jason Wang
` (2 subsequent siblings)
7 siblings, 0 replies; 17+ messages in thread
From: Jason Wang @ 2013-05-31 9:53 UTC (permalink / raw)
To: davem, netdev, linux-kernel, mst; +Cc: Jason Wang
Linear search were used in both get_slot() and macvtap_get_queue(), this is
because:
- macvtap didn't reshuffle the array of taps when create or destroy a queue, so
when adding a new queue, macvtap must do linear search to find a location for
the new queue. This will also complicate the TUNSETQUEUE implementation for
multiqueue API.
- the queue itself didn't track the queue index, so the we must do a linear
search in the array to find the location of a existed queue.
The solution is straightforward: reshuffle the array and introduce a queue_index
to macvtap_queue.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/macvtap.c | 63 +++++++++++++++---------------------------------
1 files changed, 20 insertions(+), 43 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 56b38e9..e76484c 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -44,6 +44,7 @@ struct macvtap_queue {
struct macvlan_dev __rcu *vlan;
struct file *file;
unsigned int flags;
+ u16 queue_index;
};
static struct proto macvtap_proto = {
@@ -84,30 +85,10 @@ static const struct proto_ops macvtap_socket_ops;
*/
static DEFINE_SPINLOCK(macvtap_lock);
-/*
- * get_slot: return a [unused/occupied] slot in vlan->taps[]:
- * - if 'q' is NULL, return the first empty slot;
- * - otherwise, return the slot this pointer occupies.
- */
-static int get_slot(struct macvlan_dev *vlan, struct macvtap_queue *q)
-{
- int i;
-
- for (i = 0; i < MAX_MACVTAP_QUEUES; i++) {
- if (rcu_dereference_protected(vlan->taps[i],
- lockdep_is_held(&macvtap_lock)) == q)
- return i;
- }
-
- /* Should never happen */
- BUG_ON(1);
-}
-
static int macvtap_set_queue(struct net_device *dev, struct file *file,
struct macvtap_queue *q)
{
struct macvlan_dev *vlan = netdev_priv(dev);
- int index;
int err = -EBUSY;
spin_lock(&macvtap_lock);
@@ -115,12 +96,12 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file,
goto out;
err = 0;
- index = get_slot(vlan, NULL);
rcu_assign_pointer(q->vlan, vlan);
- rcu_assign_pointer(vlan->taps[index], q);
+ rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
sock_hold(&q->sk);
q->file = file;
+ q->queue_index = vlan->numvtaps;
file->private_data = q;
vlan->numvtaps++;
@@ -140,16 +121,23 @@ out:
*/
static void macvtap_put_queue(struct macvtap_queue *q)
{
+ struct macvtap_queue *nq;
struct macvlan_dev *vlan;
spin_lock(&macvtap_lock);
vlan = rcu_dereference_protected(q->vlan,
lockdep_is_held(&macvtap_lock));
if (vlan) {
- int index = get_slot(vlan, q);
+ int index = q->queue_index;
+ BUG_ON(index >= vlan->numvtaps);
+
+ nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
+ lockdep_is_held(&macvtap_lock));
+ rcu_assign_pointer(vlan->taps[index], nq);
+ nq->queue_index = index;
- RCU_INIT_POINTER(vlan->taps[index], NULL);
RCU_INIT_POINTER(q->vlan, NULL);
+
sock_put(&q->sk);
--vlan->numvtaps;
}
@@ -182,8 +170,7 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
rxq = skb_get_rxhash(skb);
if (rxq) {
tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
- if (tap)
- goto out;
+ goto out;
}
if (likely(skb_rx_queue_recorded(skb))) {
@@ -193,17 +180,10 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
rxq -= numvtaps;
tap = rcu_dereference(vlan->taps[rxq]);
- if (tap)
- goto out;
- }
-
- /* Everything failed - find first available queue */
- for (rxq = 0; rxq < MAX_MACVTAP_QUEUES; rxq++) {
- tap = rcu_dereference(vlan->taps[rxq]);
- if (tap)
- break;
+ goto out;
}
+ tap = rcu_dereference(vlan->taps[0]);
out:
return tap;
}
@@ -221,17 +201,14 @@ static void macvtap_del_queues(struct net_device *dev)
/* macvtap_put_queue can free some slots, so go through all slots */
spin_lock(&macvtap_lock);
- for (i = 0; i < MAX_MACVTAP_QUEUES && vlan->numvtaps; i++) {
+ for (i = 0; i < vlan->numvtaps; i++) {
q = rcu_dereference_protected(vlan->taps[i],
lockdep_is_held(&macvtap_lock));
- if (q) {
- qlist[j++] = q;
- RCU_INIT_POINTER(vlan->taps[i], NULL);
- RCU_INIT_POINTER(q->vlan, NULL);
- vlan->numvtaps--;
- }
+ BUG_ON(q == NULL);
+ qlist[j++] = q;
+ RCU_INIT_POINTER(vlan->taps[i], NULL);
+ RCU_INIT_POINTER(q->vlan, NULL);
}
- BUG_ON(vlan->numvtaps != 0);
/* guarantee that any future macvtap_set_queue will fail */
vlan->numvtaps = MAX_MACVTAP_QUEUES;
spin_unlock(&macvtap_lock);
--
1.7.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [net-next rfc V2 6/8] macvtap: allow TUNSETIFF to create multiqueue device
2013-05-31 9:53 [net-next rfc V2 0/8] Multiqueue API for macvtap Jason Wang
` (4 preceding siblings ...)
2013-05-31 9:53 ` [net-next rfc V2 5/8] macvtap: eliminate linear search Jason Wang
@ 2013-05-31 9:53 ` Jason Wang
2013-05-31 14:48 ` Sergei Shtylyov
2013-05-31 9:53 ` [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl Jason Wang
2013-05-31 9:53 ` [net-next rfc V2 8/8] macvtap: enable multiqueue flag Jason Wang
7 siblings, 1 reply; 17+ messages in thread
From: Jason Wang @ 2013-05-31 9:53 UTC (permalink / raw)
To: davem, netdev, linux-kernel, mst; +Cc: Jason Wang
Though the queue were in fact created by open(), we still need to add this check
to be compatible with tuntap which can let mgmt software use a single API to
manage queues. This patch only validates the device name and moves the TUNSETIFF
to a helper.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/macvtap.c | 51 ++++++++++++++++++++++++++++++++++++++----------
1 files changed, 40 insertions(+), 11 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index e76484c..eac49cb 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -870,6 +870,7 @@ out:
return ret;
}
+
static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
{
struct macvlan_dev *vlan;
@@ -888,6 +889,44 @@ static void macvtap_put_vlan(struct macvlan_dev *vlan)
dev_put(vlan->dev);
}
+static int macvtap_set_iff(struct file *file, struct ifreq __user *ifr_u)
+{
+ struct macvtap_queue *q = file->private_data;
+ struct net *net = current->nsproxy->net_ns;
+ struct inode *inode = file_inode(file);
+ struct net_device *dev, *dev2;
+ struct ifreq ifr;
+
+ if (copy_from_user(&ifr, ifr_u, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ /* To keep the same behavior of tuntap, validate ifr_name */
+ if (ifr.ifr_flags & IFF_MULTI_QUEUE) {
+ dev = __dev_get_by_name(net, ifr.ifr_name);
+ if (!dev)
+ return -EINVAL;
+
+ dev2 = dev_get_by_macvtap_minor(iminor(inode));
+ if (!dev2)
+ return -EINVAL;
+
+ if (dev != dev2) {
+ dev_put(dev2);
+ return -EINVAL;
+ }
+
+ dev_put(dev2);
+ }
+
+ if ((ifr.ifr_flags & ~(IFF_VNET_HDR | IFF_MULTI_QUEUE)) !=
+ (IFF_NO_PI | IFF_TAP))
+ return -EINVAL;
+ else
+ q->flags = ifr.ifr_flags;
+
+ return 0;
+}
+
/*
* provide compatibility with generic tun/tap interface
*/
@@ -906,17 +945,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case TUNSETIFF:
- /* ignore the name, just look at flags */
- if (get_user(u, &ifr->ifr_flags))
- return -EFAULT;
-
- ret = 0;
- if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
- ret = -EINVAL;
- else
- q->flags = u;
-
- return ret;
+ return macvtap_set_iff(file, ifr);
case TUNGETIFF:
vlan = macvtap_get_vlan(q);
--
1.7.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [net-next rfc V2 6/8] macvtap: allow TUNSETIFF to create multiqueue device
2013-05-31 9:53 ` [net-next rfc V2 6/8] macvtap: allow TUNSETIFF to create multiqueue device Jason Wang
@ 2013-05-31 14:48 ` Sergei Shtylyov
2013-06-03 3:05 ` Jason Wang
0 siblings, 1 reply; 17+ messages in thread
From: Sergei Shtylyov @ 2013-05-31 14:48 UTC (permalink / raw)
To: Jason Wang; +Cc: davem, netdev, linux-kernel, mst
Hello.
On 31-05-2013 13:53, Jason Wang wrote:
> Though the queue were in fact created by open(), we still need to add this check
> to be compatible with tuntap which can let mgmt software use a single API to
> manage queues. This patch only validates the device name and moves the TUNSETIFF
> to a helper.
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/net/macvtap.c | 51 ++++++++++++++++++++++++++++++++++++++----------
> 1 files changed, 40 insertions(+), 11 deletions(-)
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> index e76484c..eac49cb 100644
> --- a/drivers/net/macvtap.c
> +++ b/drivers/net/macvtap.c
> @@ -870,6 +870,7 @@ out:
> return ret;
> }
>
> +
Seems like random whitespace change.
> static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
> {
> struct macvlan_dev *vlan;
[...]
WBR, Sergei
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [net-next rfc V2 6/8] macvtap: allow TUNSETIFF to create multiqueue device
2013-05-31 14:48 ` Sergei Shtylyov
@ 2013-06-03 3:05 ` Jason Wang
0 siblings, 0 replies; 17+ messages in thread
From: Jason Wang @ 2013-06-03 3:05 UTC (permalink / raw)
To: Sergei Shtylyov; +Cc: davem, netdev, linux-kernel, mst
On 05/31/2013 10:48 PM, Sergei Shtylyov wrote:
> Hello.
>
> On 31-05-2013 13:53, Jason Wang wrote:
>
>> Though the queue were in fact created by open(), we still need to add
>> this check
>> to be compatible with tuntap which can let mgmt software use a single
>> API to
>> manage queues. This patch only validates the device name and moves
>> the TUNSETIFF
>> to a helper.
>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> drivers/net/macvtap.c | 51
>> ++++++++++++++++++++++++++++++++++++++----------
>> 1 files changed, 40 insertions(+), 11 deletions(-)
>
>> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
>> index e76484c..eac49cb 100644
>> --- a/drivers/net/macvtap.c
>> +++ b/drivers/net/macvtap.c
>> @@ -870,6 +870,7 @@ out:
>> return ret;
>> }
>>
>> +
>
> Seems like random whitespace change.
Will remove it thanks.
>
>> static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
>> {
>> struct macvlan_dev *vlan;
> [...]
>
> WBR, Sergei
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl
2013-05-31 9:53 [net-next rfc V2 0/8] Multiqueue API for macvtap Jason Wang
` (5 preceding siblings ...)
2013-05-31 9:53 ` [net-next rfc V2 6/8] macvtap: allow TUNSETIFF to create multiqueue device Jason Wang
@ 2013-05-31 9:53 ` Jason Wang
2013-06-02 11:22 ` Michael S. Tsirkin
2013-05-31 9:53 ` [net-next rfc V2 8/8] macvtap: enable multiqueue flag Jason Wang
7 siblings, 1 reply; 17+ messages in thread
From: Jason Wang @ 2013-05-31 9:53 UTC (permalink / raw)
To: davem, netdev, linux-kernel, mst; +Cc: Jason Wang
This patch adds TUNSETQUEUE ioctl to let userspace can temporarily disable or
enable a queue of macvtap. This is used to be compatible at API layer of tuntap
to simplify the userspace to manage the queues.
This is done by split the taps array into three different areas:
- [0, numvtaps) : enabled taps
- [numvtaps, numvtaps + numdisabled) : disabled taps
- [numvtaps + numdisabled, MAX_MAXVTAP_QUEUES) : unused slots
When a tap were enabled and disabled, it was moved to another area.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/macvtap.c | 167 ++++++++++++++++++++++++++++++++++++++++----
include/linux/if_macvlan.h | 7 ++
2 files changed, 159 insertions(+), 15 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index eac49cb..03b781c 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -85,32 +85,126 @@ static const struct proto_ops macvtap_socket_ops;
*/
static DEFINE_SPINLOCK(macvtap_lock);
-static int macvtap_set_queue(struct net_device *dev, struct file *file,
+static void macvtap_swap_slot(struct macvlan_dev *vlan, int a, int b)
+{
+ struct macvtap_queue *q1, *q2;
+
+ if (a == b)
+ return;
+
+ q1 = rcu_dereference_protected(vlan->taps[a],
+ lockdep_is_held(&macvtap_lock));
+ q2 = rcu_dereference_protected(vlan->taps[b],
+ lockdep_is_held(&macvtap_lock));
+
+ BUG_ON(q1 == NULL || q2 == NULL);
+
+ rcu_assign_pointer(vlan->taps[a], q2);
+ rcu_assign_pointer(vlan->taps[b], q1);
+
+ q1->queue_index = b;
+ q2->queue_index = a;
+}
+
+static int macvtap_enable_queue(struct net_device *dev, struct file *file,
struct macvtap_queue *q)
{
struct macvlan_dev *vlan = netdev_priv(dev);
+ int err = -EINVAL;
+ int total;
+
+ spin_lock(&macvtap_lock);
+ total = vlan->numvtaps + vlan->numdisabled;
+
+ if (q->queue_index < vlan->numvtaps)
+ goto out;
+
+ err = 0;
+
+ BUG_ON(q->queue_index >= total);
+ macvtap_swap_slot(vlan, q->queue_index, vlan->numvtaps);
+
+ /* Make sure the pointers were seen before indices. */
+ wmb();
+
+ vlan->numdisabled--;
+ vlan->numvtaps++;
+out:
+ spin_unlock(&macvtap_lock);
+ return err;
+}
+
+static int macvtap_set_queue(struct net_device *dev, struct file *file,
+ struct macvtap_queue *q)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
int err = -EBUSY;
+ int total;
spin_lock(&macvtap_lock);
- if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
+
+ total = vlan->numvtaps + vlan->numdisabled;
+ if (total == MAX_MACVTAP_QUEUES)
goto out;
err = 0;
+
rcu_assign_pointer(q->vlan, vlan);
- rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
+ rcu_assign_pointer(vlan->taps[total], q);
sock_hold(&q->sk);
q->file = file;
- q->queue_index = vlan->numvtaps;
+ q->queue_index = total;
file->private_data = q;
+ if (vlan->numdisabled)
+ macvtap_swap_slot(vlan, vlan->numvtaps, total);
- vlan->numvtaps++;
+ /* Make sure the pointers were seen before indices. */
+ wmb();
+ vlan->numvtaps++;
out:
spin_unlock(&macvtap_lock);
return err;
}
+static int macvtap_disable_queue(struct macvtap_queue *q)
+{
+ struct macvlan_dev *vlan;
+ int err = -EINVAL;
+
+ spin_lock(&macvtap_lock);
+ vlan = rcu_dereference_protected(q->vlan,
+ lockdep_is_held(&macvtap_lock));
+
+ if (vlan) {
+ int total = vlan->numvtaps + vlan->numdisabled;
+ int index = q->queue_index;
+
+ BUG_ON(q->queue_index >= total);
+ if (q->queue_index >= vlan->numvtaps)
+ goto out;
+
+ err = 0;
+ macvtap_swap_slot(vlan, index, total - 1);
+ if (vlan->numdisabled)
+ /* If there's disabled taps, the above swap will cause
+ * a disabled tap to be moved to enabled area. So
+ * another swap is needed to keep the right order.
+ */
+ macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
+
+ /* make sure the pointers were seen before indices */
+ wmb();
+
+ vlan->numvtaps--;
+ vlan->numdisabled++;
+ }
+
+out:
+ spin_unlock(&macvtap_lock);
+ return err;
+}
/*
* The file owning the queue got closed, give up both
* the reference that the files holds as well as the
@@ -121,25 +215,38 @@ out:
*/
static void macvtap_put_queue(struct macvtap_queue *q)
{
- struct macvtap_queue *nq;
struct macvlan_dev *vlan;
spin_lock(&macvtap_lock);
vlan = rcu_dereference_protected(q->vlan,
lockdep_is_held(&macvtap_lock));
+
if (vlan) {
+ int total = vlan->numvtaps + vlan->numdisabled;
int index = q->queue_index;
- BUG_ON(index >= vlan->numvtaps);
+ bool disabled = q->queue_index >= vlan->numvtaps;
+
+ BUG_ON(q->queue_index >= total);
+ macvtap_swap_slot(vlan, index, total - 1);
+ if (!disabled && vlan->numdisabled)
+ /* If there's disabled taps, the above swap will cause
+ * a disabled tap to be moved to enabled area. So
+ * another swap is needed to keep the right order.
+ */
+ macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
+
+ RCU_INIT_POINTER(vlan->taps[total - 1], NULL);
+ RCU_INIT_POINTER(q->vlan, NULL);
+ sock_put(&q->sk);
- nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
- lockdep_is_held(&macvtap_lock));
- rcu_assign_pointer(vlan->taps[index], nq);
- nq->queue_index = index;
+ /* Make sure the pointers were seen before indices */
+ wmb();
- RCU_INIT_POINTER(q->vlan, NULL);
+ if (disabled)
+ vlan->numdisabled--;
+ else
+ vlan->numvtaps--;
- sock_put(&q->sk);
- --vlan->numvtaps;
}
spin_unlock(&macvtap_lock);
@@ -166,6 +273,9 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
if (!numvtaps)
goto out;
+ /* Check taps after numvtaps were exposed. */
+ rmb();
+
/* Check if we can use flow to select a queue */
rxq = skb_get_rxhash(skb);
if (rxq) {
@@ -201,7 +311,7 @@ static void macvtap_del_queues(struct net_device *dev)
/* macvtap_put_queue can free some slots, so go through all slots */
spin_lock(&macvtap_lock);
- for (i = 0; i < vlan->numvtaps; i++) {
+ for (i = 0; i < vlan->numvtaps + vlan->numdisabled; i++) {
q = rcu_dereference_protected(vlan->taps[i],
lockdep_is_held(&macvtap_lock));
BUG_ON(q == NULL);
@@ -211,6 +321,7 @@ static void macvtap_del_queues(struct net_device *dev)
}
/* guarantee that any future macvtap_set_queue will fail */
vlan->numvtaps = MAX_MACVTAP_QUEUES;
+ vlan->numdisabled = 0;
spin_unlock(&macvtap_lock);
synchronize_rcu();
@@ -927,6 +1038,27 @@ static int macvtap_set_iff(struct file *file, struct ifreq __user *ifr_u)
return 0;
}
+static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
+{
+ struct macvtap_queue *q = file->private_data;
+ struct macvlan_dev *vlan;
+ int ret = -EINVAL;
+
+ vlan = macvtap_get_vlan(q);
+ if (!vlan)
+ goto done;
+
+ if (flags & IFF_ATTACH_QUEUE)
+ ret = macvtap_enable_queue(vlan->dev, file, q);
+ else if (flags & IFF_DETACH_QUEUE)
+ ret = macvtap_disable_queue(q);
+
+ macvtap_put_vlan(vlan);
+
+done:
+ return ret;
+}
+
/*
* provide compatibility with generic tun/tap interface
*/
@@ -959,6 +1091,11 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
macvtap_put_vlan(vlan);
return ret;
+ case TUNSETQUEUE:
+ if (get_user(u, &ifr->ifr_flags))
+ return -EFAULT;
+ return macvtap_ioctl_set_queue(file, u);
+
case TUNGETFEATURES:
if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
return -EFAULT;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 62d8bda..d528f38 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -69,8 +69,15 @@ struct macvlan_dev {
u16 flags;
int (*receive)(struct sk_buff *skb);
int (*forward)(struct net_device *dev, struct sk_buff *skb);
+ /* This array tracks all taps (include disabled ones) and will be
+ * reshuffled to keep the following order:
+ * [0, numvtaps) : enabled taps,
+ * [numvtaps, numvtaps + numdisabled) : disabled taps,
+ * [numvtaps + numdisabled, MAX_MACVTAP_QUEUES) : unused slots
+ */
struct macvtap_queue *taps[MAX_MACVTAP_QUEUES];
int numvtaps;
+ int numdisabled;
int minor;
};
--
1.7.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl
2013-05-31 9:53 ` [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl Jason Wang
@ 2013-06-02 11:22 ` Michael S. Tsirkin
2013-06-03 5:20 ` Jason Wang
0 siblings, 1 reply; 17+ messages in thread
From: Michael S. Tsirkin @ 2013-06-02 11:22 UTC (permalink / raw)
To: Jason Wang; +Cc: davem, netdev, linux-kernel
On Fri, May 31, 2013 at 05:53:24PM +0800, Jason Wang wrote:
> This patch adds TUNSETQUEUE ioctl to let userspace can temporarily disable or
> enable a queue of macvtap. This is used to be compatible at API layer of tuntap
> to simplify the userspace to manage the queues.
>
> This is done by split the taps array into three different areas:
>
> - [0, numvtaps) : enabled taps
> - [numvtaps, numvtaps + numdisabled) : disabled taps
> - [numvtaps + numdisabled, MAX_MAXVTAP_QUEUES) : unused slots
>
> When a tap were enabled and disabled, it was moved to another area.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
This seems a bit tricky. Can we just move the tap out of the
array? the only reason we have the array is for fast
lookup on xmit.
What's the reason to keep disabled queues there?
To be able to track all queues for cleanups, all we need is
a linked list of all queues (enabled and disabled).
> ---
> drivers/net/macvtap.c | 167 ++++++++++++++++++++++++++++++++++++++++----
> include/linux/if_macvlan.h | 7 ++
> 2 files changed, 159 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> index eac49cb..03b781c 100644
> --- a/drivers/net/macvtap.c
> +++ b/drivers/net/macvtap.c
> @@ -85,32 +85,126 @@ static const struct proto_ops macvtap_socket_ops;
> */
> static DEFINE_SPINLOCK(macvtap_lock);
>
> -static int macvtap_set_queue(struct net_device *dev, struct file *file,
> +static void macvtap_swap_slot(struct macvlan_dev *vlan, int a, int b)
> +{
> + struct macvtap_queue *q1, *q2;
> +
> + if (a == b)
> + return;
> +
> + q1 = rcu_dereference_protected(vlan->taps[a],
> + lockdep_is_held(&macvtap_lock));
> + q2 = rcu_dereference_protected(vlan->taps[b],
> + lockdep_is_held(&macvtap_lock));
> +
> + BUG_ON(q1 == NULL || q2 == NULL);
> +
> + rcu_assign_pointer(vlan->taps[a], q2);
> + rcu_assign_pointer(vlan->taps[b], q1);
> +
> + q1->queue_index = b;
> + q2->queue_index = a;
> +}
> +
> +static int macvtap_enable_queue(struct net_device *dev, struct file *file,
> struct macvtap_queue *q)
> {
> struct macvlan_dev *vlan = netdev_priv(dev);
> + int err = -EINVAL;
> + int total;
> +
> + spin_lock(&macvtap_lock);
> + total = vlan->numvtaps + vlan->numdisabled;
> +
> + if (q->queue_index < vlan->numvtaps)
> + goto out;
> +
> + err = 0;
> +
> + BUG_ON(q->queue_index >= total);
> + macvtap_swap_slot(vlan, q->queue_index, vlan->numvtaps);
> +
> + /* Make sure the pointers were seen before indices. */
> + wmb();
Which indices? We only care about numvtaps right?
So let's just say so.
Why is this wmb and not smp_wmb()?
> +
> + vlan->numdisabled--;
> + vlan->numvtaps++;
> +out:
> + spin_unlock(&macvtap_lock);
> + return err;
> +}
> +
> +static int macvtap_set_queue(struct net_device *dev, struct file *file,
> + struct macvtap_queue *q)
> +{
> + struct macvlan_dev *vlan = netdev_priv(dev);
> int err = -EBUSY;
> + int total;
>
> spin_lock(&macvtap_lock);
> - if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
> +
> + total = vlan->numvtaps + vlan->numdisabled;
> + if (total == MAX_MACVTAP_QUEUES)
> goto out;
>
> err = 0;
> +
> rcu_assign_pointer(q->vlan, vlan);
> - rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
> + rcu_assign_pointer(vlan->taps[total], q);
> sock_hold(&q->sk);
>
> q->file = file;
> - q->queue_index = vlan->numvtaps;
> + q->queue_index = total;
> file->private_data = q;
> + if (vlan->numdisabled)
> + macvtap_swap_slot(vlan, vlan->numvtaps, total);
>
> - vlan->numvtaps++;
> + /* Make sure the pointers were seen before indices. */
> + wmb();
>
> + vlan->numvtaps++;
> out:
> spin_unlock(&macvtap_lock);
> return err;
> }
>
> +static int macvtap_disable_queue(struct macvtap_queue *q)
> +{
> + struct macvlan_dev *vlan;
> + int err = -EINVAL;
> +
> + spin_lock(&macvtap_lock);
> + vlan = rcu_dereference_protected(q->vlan,
> + lockdep_is_held(&macvtap_lock));
> +
> + if (vlan) {
> + int total = vlan->numvtaps + vlan->numdisabled;
> + int index = q->queue_index;
> +
> + BUG_ON(q->queue_index >= total);
> + if (q->queue_index >= vlan->numvtaps)
> + goto out;
> +
> + err = 0;
> + macvtap_swap_slot(vlan, index, total - 1);
> + if (vlan->numdisabled)
> + /* If there's disabled taps, the above swap will cause
> + * a disabled tap to be moved to enabled area. So
> + * another swap is needed to keep the right order.
> + */
> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
> +
> + /* make sure the pointers were seen before indices */
> + wmb();
Hmm this looks questionable. The code near rmb first
checks numvtaps then dereferences the queue.
So it might see a new queue but old value of numvtaps.
> +
> + vlan->numvtaps--;
> + vlan->numdisabled++;
> + }
> +
> +out:
> + spin_unlock(&macvtap_lock);
> + return err;
> +}
> /*
> * The file owning the queue got closed, give up both
> * the reference that the files holds as well as the
> @@ -121,25 +215,38 @@ out:
> */
> static void macvtap_put_queue(struct macvtap_queue *q)
> {
> - struct macvtap_queue *nq;
> struct macvlan_dev *vlan;
>
> spin_lock(&macvtap_lock);
> vlan = rcu_dereference_protected(q->vlan,
> lockdep_is_held(&macvtap_lock));
> +
> if (vlan) {
> + int total = vlan->numvtaps + vlan->numdisabled;
> int index = q->queue_index;
> - BUG_ON(index >= vlan->numvtaps);
> + bool disabled = q->queue_index >= vlan->numvtaps;
> +
> + BUG_ON(q->queue_index >= total);
> + macvtap_swap_slot(vlan, index, total - 1);
> + if (!disabled && vlan->numdisabled)
> + /* If there's disabled taps, the above swap will cause
> + * a disabled tap to be moved to enabled area. So
> + * another swap is needed to keep the right order.
> + */
> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
> +
> + RCU_INIT_POINTER(vlan->taps[total - 1], NULL);
> + RCU_INIT_POINTER(q->vlan, NULL);
> + sock_put(&q->sk);
>
> - nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
> - lockdep_is_held(&macvtap_lock));
> - rcu_assign_pointer(vlan->taps[index], nq);
> - nq->queue_index = index;
> + /* Make sure the pointers were seen before indices */
Here it's one pointer, right?
> + wmb();
Same issue as above, looks even more worrying
as queue is freed here.
>
> - RCU_INIT_POINTER(q->vlan, NULL);
> + if (disabled)
> + vlan->numdisabled--;
> + else
> + vlan->numvtaps--;
>
> - sock_put(&q->sk);
> - --vlan->numvtaps;
> }
>
> spin_unlock(&macvtap_lock);
> @@ -166,6 +273,9 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
> if (!numvtaps)
> goto out;
>
> + /* Check taps after numvtaps were exposed. */
> + rmb();
> +
Except this doesn't seem to handle case where taps are going away ...
> /* Check if we can use flow to select a queue */
> rxq = skb_get_rxhash(skb);
> if (rxq) {
> @@ -201,7 +311,7 @@ static void macvtap_del_queues(struct net_device *dev)
>
> /* macvtap_put_queue can free some slots, so go through all slots */
> spin_lock(&macvtap_lock);
> - for (i = 0; i < vlan->numvtaps; i++) {
> + for (i = 0; i < vlan->numvtaps + vlan->numdisabled; i++) {
> q = rcu_dereference_protected(vlan->taps[i],
> lockdep_is_held(&macvtap_lock));
> BUG_ON(q == NULL);
> @@ -211,6 +321,7 @@ static void macvtap_del_queues(struct net_device *dev)
> }
> /* guarantee that any future macvtap_set_queue will fail */
> vlan->numvtaps = MAX_MACVTAP_QUEUES;
> + vlan->numdisabled = 0;
> spin_unlock(&macvtap_lock);
>
> synchronize_rcu();
> @@ -927,6 +1038,27 @@ static int macvtap_set_iff(struct file *file, struct ifreq __user *ifr_u)
> return 0;
> }
>
> +static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
> +{
> + struct macvtap_queue *q = file->private_data;
> + struct macvlan_dev *vlan;
> + int ret = -EINVAL;
> +
> + vlan = macvtap_get_vlan(q);
> + if (!vlan)
> + goto done;
> +
> + if (flags & IFF_ATTACH_QUEUE)
> + ret = macvtap_enable_queue(vlan->dev, file, q);
> + else if (flags & IFF_DETACH_QUEUE)
> + ret = macvtap_disable_queue(q);
> +
> + macvtap_put_vlan(vlan);
> +
> +done:
> + return ret;
> +}
> +
> /*
> * provide compatibility with generic tun/tap interface
> */
> @@ -959,6 +1091,11 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
> macvtap_put_vlan(vlan);
> return ret;
>
> + case TUNSETQUEUE:
> + if (get_user(u, &ifr->ifr_flags))
> + return -EFAULT;
> + return macvtap_ioctl_set_queue(file, u);
> +
> case TUNGETFEATURES:
> if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
> return -EFAULT;
> diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
> index 62d8bda..d528f38 100644
> --- a/include/linux/if_macvlan.h
> +++ b/include/linux/if_macvlan.h
> @@ -69,8 +69,15 @@ struct macvlan_dev {
> u16 flags;
> int (*receive)(struct sk_buff *skb);
> int (*forward)(struct net_device *dev, struct sk_buff *skb);
> + /* This array tracks all taps (include disabled ones) and will be
> + * reshuffled to keep the following order:
> + * [0, numvtaps) : enabled taps,
> + * [numvtaps, numvtaps + numdisabled) : disabled taps,
> + * [numvtaps + numdisabled, MAX_MACVTAP_QUEUES) : unused slots
> + */
> struct macvtap_queue *taps[MAX_MACVTAP_QUEUES];
> int numvtaps;
> + int numdisabled;
> int minor;
> };
>
> --
> 1.7.1
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl
2013-06-02 11:22 ` Michael S. Tsirkin
@ 2013-06-03 5:20 ` Jason Wang
2013-06-03 11:09 ` Michael S. Tsirkin
0 siblings, 1 reply; 17+ messages in thread
From: Jason Wang @ 2013-06-03 5:20 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: davem, netdev, linux-kernel
On 06/02/2013 07:22 PM, Michael S. Tsirkin wrote:
> On Fri, May 31, 2013 at 05:53:24PM +0800, Jason Wang wrote:
>> This patch adds TUNSETQUEUE ioctl to let userspace can temporarily disable or
>> enable a queue of macvtap. This is used to be compatible at API layer of tuntap
>> to simplify the userspace to manage the queues.
>>
>> This is done by split the taps array into three different areas:
>>
>> - [0, numvtaps) : enabled taps
>> - [numvtaps, numvtaps + numdisabled) : disabled taps
>> - [numvtaps + numdisabled, MAX_MAXVTAP_QUEUES) : unused slots
>>
>> When a tap were enabled and disabled, it was moved to another area.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>
> This seems a bit tricky. Can we just move the tap out of the
> array?
Certainly yes.
> the only reason we have the array is for fast
> lookup on xmit.
> What's the reason to keep disabled queues there?
It saves us some space and make code simpler.
> To be able to track all queues for cleanups, all we need is
> a linked list of all queues (enabled and disabled).
Yes, but you need iterate both arrays and linked list which won't be
simpler than keeping them in place.
>> ---
>> drivers/net/macvtap.c | 167 ++++++++++++++++++++++++++++++++++++++++----
>> include/linux/if_macvlan.h | 7 ++
>> 2 files changed, 159 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
>> index eac49cb..03b781c 100644
>> --- a/drivers/net/macvtap.c
>> +++ b/drivers/net/macvtap.c
>> @@ -85,32 +85,126 @@ static const struct proto_ops macvtap_socket_ops;
>> */
>> static DEFINE_SPINLOCK(macvtap_lock);
>>
>> -static int macvtap_set_queue(struct net_device *dev, struct file *file,
>> +static void macvtap_swap_slot(struct macvlan_dev *vlan, int a, int b)
>> +{
>> + struct macvtap_queue *q1, *q2;
>> +
>> + if (a == b)
>> + return;
>> +
>> + q1 = rcu_dereference_protected(vlan->taps[a],
>> + lockdep_is_held(&macvtap_lock));
>> + q2 = rcu_dereference_protected(vlan->taps[b],
>> + lockdep_is_held(&macvtap_lock));
>> +
>> + BUG_ON(q1 == NULL || q2 == NULL);
>> +
>> + rcu_assign_pointer(vlan->taps[a], q2);
>> + rcu_assign_pointer(vlan->taps[b], q1);
>> +
>> + q1->queue_index = b;
>> + q2->queue_index = a;
>> +}
>> +
>> +static int macvtap_enable_queue(struct net_device *dev, struct file *file,
>> struct macvtap_queue *q)
>> {
>> struct macvlan_dev *vlan = netdev_priv(dev);
>> + int err = -EINVAL;
>> + int total;
>> +
>> + spin_lock(&macvtap_lock);
>> + total = vlan->numvtaps + vlan->numdisabled;
>> +
>> + if (q->queue_index < vlan->numvtaps)
>> + goto out;
>> +
>> + err = 0;
>> +
>> + BUG_ON(q->queue_index >= total);
>> + macvtap_swap_slot(vlan, q->queue_index, vlan->numvtaps);
>> +
>> + /* Make sure the pointers were seen before indices. */
>> + wmb();
> Which indices? We only care about numvtaps right?
> So let's just say so.
ok
>
> Why is this wmb and not smp_wmb()?
will correct it.
>
>> +
>> + vlan->numdisabled--;
>> + vlan->numvtaps++;
>> +out:
>> + spin_unlock(&macvtap_lock);
>> + return err;
>> +}
>> +
>> +static int macvtap_set_queue(struct net_device *dev, struct file *file,
>> + struct macvtap_queue *q)
>> +{
>> + struct macvlan_dev *vlan = netdev_priv(dev);
>> int err = -EBUSY;
>> + int total;
>>
>> spin_lock(&macvtap_lock);
>> - if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
>> +
>> + total = vlan->numvtaps + vlan->numdisabled;
>> + if (total == MAX_MACVTAP_QUEUES)
>> goto out;
>>
>> err = 0;
>> +
>> rcu_assign_pointer(q->vlan, vlan);
>> - rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
>> + rcu_assign_pointer(vlan->taps[total], q);
>> sock_hold(&q->sk);
>>
>> q->file = file;
>> - q->queue_index = vlan->numvtaps;
>> + q->queue_index = total;
>> file->private_data = q;
>> + if (vlan->numdisabled)
>> + macvtap_swap_slot(vlan, vlan->numvtaps, total);
>>
>> - vlan->numvtaps++;
>> + /* Make sure the pointers were seen before indices. */
>> + wmb();
>>
>> + vlan->numvtaps++;
>> out:
>> spin_unlock(&macvtap_lock);
>> return err;
>> }
>>
>> +static int macvtap_disable_queue(struct macvtap_queue *q)
>> +{
>> + struct macvlan_dev *vlan;
>> + int err = -EINVAL;
>> +
>> + spin_lock(&macvtap_lock);
>> + vlan = rcu_dereference_protected(q->vlan,
>> + lockdep_is_held(&macvtap_lock));
>> +
>> + if (vlan) {
>> + int total = vlan->numvtaps + vlan->numdisabled;
>> + int index = q->queue_index;
>> +
>> + BUG_ON(q->queue_index >= total);
>> + if (q->queue_index >= vlan->numvtaps)
>> + goto out;
>> +
>> + err = 0;
>> + macvtap_swap_slot(vlan, index, total - 1);
>> + if (vlan->numdisabled)
>> + /* If there's disabled taps, the above swap will cause
>> + * a disabled tap to be moved to enabled area. So
>> + * another swap is needed to keep the right order.
>> + */
>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
>> +
>> + /* make sure the pointers were seen before indices */
>> + wmb();
> Hmm this looks questionable. The code near rmb first
> checks numvtaps then dereferences the queue.
> So it might see a new queue but old value of numvtaps.
Right, barriers here were just best effort to reduce the possibility of
wrong queue selection when changing the number of queues.
>
>> +
>> + vlan->numvtaps--;
>> + vlan->numdisabled++;
>> + }
>> +
>> +out:
>> + spin_unlock(&macvtap_lock);
>> + return err;
>> +}
>> /*
>> * The file owning the queue got closed, give up both
>> * the reference that the files holds as well as the
>> @@ -121,25 +215,38 @@ out:
>> */
>> static void macvtap_put_queue(struct macvtap_queue *q)
>> {
>> - struct macvtap_queue *nq;
>> struct macvlan_dev *vlan;
>>
>> spin_lock(&macvtap_lock);
>> vlan = rcu_dereference_protected(q->vlan,
>> lockdep_is_held(&macvtap_lock));
>> +
>> if (vlan) {
>> + int total = vlan->numvtaps + vlan->numdisabled;
>> int index = q->queue_index;
>> - BUG_ON(index >= vlan->numvtaps);
>> + bool disabled = q->queue_index >= vlan->numvtaps;
>> +
>> + BUG_ON(q->queue_index >= total);
>> + macvtap_swap_slot(vlan, index, total - 1);
>> + if (!disabled && vlan->numdisabled)
>> + /* If there's disabled taps, the above swap will cause
>> + * a disabled tap to be moved to enabled area. So
>> + * another swap is needed to keep the right order.
>> + */
>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
>> +
>> + RCU_INIT_POINTER(vlan->taps[total - 1], NULL);
>> + RCU_INIT_POINTER(q->vlan, NULL);
>> + sock_put(&q->sk);
>>
>> - nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
>> - lockdep_is_held(&macvtap_lock));
>> - rcu_assign_pointer(vlan->taps[index], nq);
>> - nq->queue_index = index;
>> + /* Make sure the pointers were seen before indices */
> Here it's one pointer, right?
Right.
>
>> + wmb();
> Same issue as above, looks even more worrying
> as queue is freed here.
The read side were protected by rcu_read_lock(), so no worries here.
>
>>
>> - RCU_INIT_POINTER(q->vlan, NULL);
>> + if (disabled)
>> + vlan->numdisabled--;
>> + else
>> + vlan->numvtaps--;
>>
>> - sock_put(&q->sk);
>> - --vlan->numvtaps;
>> }
>>
>> spin_unlock(&macvtap_lock);
>> @@ -166,6 +273,9 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
>> if (!numvtaps)
>> goto out;
>>
>> + /* Check taps after numvtaps were exposed. */
>> + rmb();
>> +
> Except this doesn't seem to handle case where taps are going away ...
We're protected by rcu read lock here so even though it choose the queue
which is going to be destroyed temporarily, the socket won't be freed
before the packets were queued in the socket.
>> /* Check if we can use flow to select a queue */
>> rxq = skb_get_rxhash(skb);
>> if (rxq) {
>> @@ -201,7 +311,7 @@ static void macvtap_del_queues(struct net_device *dev)
>>
>> /* macvtap_put_queue can free some slots, so go through all slots */
>> spin_lock(&macvtap_lock);
>> - for (i = 0; i < vlan->numvtaps; i++) {
>> + for (i = 0; i < vlan->numvtaps + vlan->numdisabled; i++) {
>> q = rcu_dereference_protected(vlan->taps[i],
>> lockdep_is_held(&macvtap_lock));
>> BUG_ON(q == NULL);
>> @@ -211,6 +321,7 @@ static void macvtap_del_queues(struct net_device *dev)
>> }
>> /* guarantee that any future macvtap_set_queue will fail */
>> vlan->numvtaps = MAX_MACVTAP_QUEUES;
>> + vlan->numdisabled = 0;
>> spin_unlock(&macvtap_lock);
>>
>> synchronize_rcu();
>> @@ -927,6 +1038,27 @@ static int macvtap_set_iff(struct file *file, struct ifreq __user *ifr_u)
>> return 0;
>> }
>>
>> +static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
>> +{
>> + struct macvtap_queue *q = file->private_data;
>> + struct macvlan_dev *vlan;
>> + int ret = -EINVAL;
>> +
>> + vlan = macvtap_get_vlan(q);
>> + if (!vlan)
>> + goto done;
>> +
>> + if (flags & IFF_ATTACH_QUEUE)
>> + ret = macvtap_enable_queue(vlan->dev, file, q);
>> + else if (flags & IFF_DETACH_QUEUE)
>> + ret = macvtap_disable_queue(q);
>> +
>> + macvtap_put_vlan(vlan);
>> +
>> +done:
>> + return ret;
>> +}
>> +
>> /*
>> * provide compatibility with generic tun/tap interface
>> */
>> @@ -959,6 +1091,11 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
>> macvtap_put_vlan(vlan);
>> return ret;
>>
>> + case TUNSETQUEUE:
>> + if (get_user(u, &ifr->ifr_flags))
>> + return -EFAULT;
>> + return macvtap_ioctl_set_queue(file, u);
>> +
>> case TUNGETFEATURES:
>> if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
>> return -EFAULT;
>> diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
>> index 62d8bda..d528f38 100644
>> --- a/include/linux/if_macvlan.h
>> +++ b/include/linux/if_macvlan.h
>> @@ -69,8 +69,15 @@ struct macvlan_dev {
>> u16 flags;
>> int (*receive)(struct sk_buff *skb);
>> int (*forward)(struct net_device *dev, struct sk_buff *skb);
>> + /* This array tracks all taps (include disabled ones) and will be
>> + * reshuffled to keep the following order:
>> + * [0, numvtaps) : enabled taps,
>> + * [numvtaps, numvtaps + numdisabled) : disabled taps,
>> + * [numvtaps + numdisabled, MAX_MACVTAP_QUEUES) : unused slots
>> + */
>> struct macvtap_queue *taps[MAX_MACVTAP_QUEUES];
>> int numvtaps;
>> + int numdisabled;
>> int minor;
>> };
>>
>> --
>> 1.7.1
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl
2013-06-03 5:20 ` Jason Wang
@ 2013-06-03 11:09 ` Michael S. Tsirkin
2013-06-04 5:54 ` Jason Wang
0 siblings, 1 reply; 17+ messages in thread
From: Michael S. Tsirkin @ 2013-06-03 11:09 UTC (permalink / raw)
To: Jason Wang; +Cc: davem, netdev, linux-kernel
On Mon, Jun 03, 2013 at 01:20:58PM +0800, Jason Wang wrote:
> On 06/02/2013 07:22 PM, Michael S. Tsirkin wrote:
> > On Fri, May 31, 2013 at 05:53:24PM +0800, Jason Wang wrote:
> >> This patch adds TUNSETQUEUE ioctl to let userspace can temporarily disable or
> >> enable a queue of macvtap. This is used to be compatible at API layer of tuntap
> >> to simplify the userspace to manage the queues.
> >>
> >> This is done by split the taps array into three different areas:
> >>
> >> - [0, numvtaps) : enabled taps
> >> - [numvtaps, numvtaps + numdisabled) : disabled taps
> >> - [numvtaps + numdisabled, MAX_MAXVTAP_QUEUES) : unused slots
> >>
> >> When a tap were enabled and disabled, it was moved to another area.
> >>
> >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> >
> > This seems a bit tricky. Can we just move the tap out of the
> > array?
>
> Certainly yes.
> > the only reason we have the array is for fast
> > lookup on xmit.
> > What's the reason to keep disabled queues there?
>
> It saves us some space and make code simpler.
> > To be able to track all queues for cleanups, all we need is
> > a linked list of all queues (enabled and disabled).
>
> Yes, but you need iterate both arrays and linked list which won't be
> simpler than keeping them in place.
No, my idea is to keep all taps in the list.
If you need all taps, walks the list.
If you need active taps, look them up in the array.
Reasonable?
> >> ---
> >> drivers/net/macvtap.c | 167 ++++++++++++++++++++++++++++++++++++++++----
> >> include/linux/if_macvlan.h | 7 ++
> >> 2 files changed, 159 insertions(+), 15 deletions(-)
> >>
> >> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> >> index eac49cb..03b781c 100644
> >> --- a/drivers/net/macvtap.c
> >> +++ b/drivers/net/macvtap.c
> >> @@ -85,32 +85,126 @@ static const struct proto_ops macvtap_socket_ops;
> >> */
> >> static DEFINE_SPINLOCK(macvtap_lock);
> >>
> >> -static int macvtap_set_queue(struct net_device *dev, struct file *file,
> >> +static void macvtap_swap_slot(struct macvlan_dev *vlan, int a, int b)
> >> +{
> >> + struct macvtap_queue *q1, *q2;
> >> +
> >> + if (a == b)
> >> + return;
> >> +
> >> + q1 = rcu_dereference_protected(vlan->taps[a],
> >> + lockdep_is_held(&macvtap_lock));
> >> + q2 = rcu_dereference_protected(vlan->taps[b],
> >> + lockdep_is_held(&macvtap_lock));
> >> +
> >> + BUG_ON(q1 == NULL || q2 == NULL);
> >> +
> >> + rcu_assign_pointer(vlan->taps[a], q2);
> >> + rcu_assign_pointer(vlan->taps[b], q1);
> >> +
> >> + q1->queue_index = b;
> >> + q2->queue_index = a;
> >> +}
> >> +
> >> +static int macvtap_enable_queue(struct net_device *dev, struct file *file,
> >> struct macvtap_queue *q)
> >> {
> >> struct macvlan_dev *vlan = netdev_priv(dev);
> >> + int err = -EINVAL;
> >> + int total;
> >> +
> >> + spin_lock(&macvtap_lock);
> >> + total = vlan->numvtaps + vlan->numdisabled;
> >> +
> >> + if (q->queue_index < vlan->numvtaps)
> >> + goto out;
> >> +
> >> + err = 0;
> >> +
> >> + BUG_ON(q->queue_index >= total);
> >> + macvtap_swap_slot(vlan, q->queue_index, vlan->numvtaps);
> >> +
> >> + /* Make sure the pointers were seen before indices. */
> >> + wmb();
> > Which indices? We only care about numvtaps right?
> > So let's just say so.
>
> ok
> >
> > Why is this wmb and not smp_wmb()?
>
> will correct it.
> >
> >> +
> >> + vlan->numdisabled--;
> >> + vlan->numvtaps++;
> >> +out:
> >> + spin_unlock(&macvtap_lock);
> >> + return err;
> >> +}
> >> +
> >> +static int macvtap_set_queue(struct net_device *dev, struct file *file,
> >> + struct macvtap_queue *q)
> >> +{
> >> + struct macvlan_dev *vlan = netdev_priv(dev);
> >> int err = -EBUSY;
> >> + int total;
> >>
> >> spin_lock(&macvtap_lock);
> >> - if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
> >> +
> >> + total = vlan->numvtaps + vlan->numdisabled;
> >> + if (total == MAX_MACVTAP_QUEUES)
> >> goto out;
> >>
> >> err = 0;
> >> +
> >> rcu_assign_pointer(q->vlan, vlan);
> >> - rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
> >> + rcu_assign_pointer(vlan->taps[total], q);
> >> sock_hold(&q->sk);
> >>
> >> q->file = file;
> >> - q->queue_index = vlan->numvtaps;
> >> + q->queue_index = total;
> >> file->private_data = q;
> >> + if (vlan->numdisabled)
> >> + macvtap_swap_slot(vlan, vlan->numvtaps, total);
> >>
> >> - vlan->numvtaps++;
> >> + /* Make sure the pointers were seen before indices. */
> >> + wmb();
> >>
> >> + vlan->numvtaps++;
> >> out:
> >> spin_unlock(&macvtap_lock);
> >> return err;
> >> }
> >>
> >> +static int macvtap_disable_queue(struct macvtap_queue *q)
> >> +{
> >> + struct macvlan_dev *vlan;
> >> + int err = -EINVAL;
> >> +
> >> + spin_lock(&macvtap_lock);
> >> + vlan = rcu_dereference_protected(q->vlan,
> >> + lockdep_is_held(&macvtap_lock));
> >> +
> >> + if (vlan) {
> >> + int total = vlan->numvtaps + vlan->numdisabled;
> >> + int index = q->queue_index;
> >> +
> >> + BUG_ON(q->queue_index >= total);
> >> + if (q->queue_index >= vlan->numvtaps)
> >> + goto out;
> >> +
> >> + err = 0;
> >> + macvtap_swap_slot(vlan, index, total - 1);
> >> + if (vlan->numdisabled)
> >> + /* If there's disabled taps, the above swap will cause
> >> + * a disabled tap to be moved to enabled area. So
> >> + * another swap is needed to keep the right order.
> >> + */
> >> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
> >> +
> >> + /* make sure the pointers were seen before indices */
> >> + wmb();
> > Hmm this looks questionable. The code near rmb first
> > checks numvtaps then dereferences the queue.
> > So it might see a new queue but old value of numvtaps.
>
> Right, barriers here were just best effort to reduce the possibility of
> wrong queue selection when changing the number of queues.
If this is an optimization, I'd say benchmark it and
see if it helps performance.
I don't expect it to have any effect really.
In fact, the re-ordering of queues that this patch does
is likely to cause packet reorering and hurt performance
more.
I'm guessing the only thing we need for correctness
is ACCESS_ONCE on numvtaps?
> >
> >> +
> >> + vlan->numvtaps--;
> >> + vlan->numdisabled++;
> >> + }
> >> +
> >> +out:
> >> + spin_unlock(&macvtap_lock);
> >> + return err;
> >> +}
> >> /*
> >> * The file owning the queue got closed, give up both
> >> * the reference that the files holds as well as the
> >> @@ -121,25 +215,38 @@ out:
> >> */
> >> static void macvtap_put_queue(struct macvtap_queue *q)
> >> {
> >> - struct macvtap_queue *nq;
> >> struct macvlan_dev *vlan;
> >>
> >> spin_lock(&macvtap_lock);
> >> vlan = rcu_dereference_protected(q->vlan,
> >> lockdep_is_held(&macvtap_lock));
> >> +
> >> if (vlan) {
> >> + int total = vlan->numvtaps + vlan->numdisabled;
> >> int index = q->queue_index;
> >> - BUG_ON(index >= vlan->numvtaps);
> >> + bool disabled = q->queue_index >= vlan->numvtaps;
> >> +
> >> + BUG_ON(q->queue_index >= total);
> >> + macvtap_swap_slot(vlan, index, total - 1);
> >> + if (!disabled && vlan->numdisabled)
> >> + /* If there's disabled taps, the above swap will cause
> >> + * a disabled tap to be moved to enabled area. So
> >> + * another swap is needed to keep the right order.
> >> + */
> >> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
> >> +
> >> + RCU_INIT_POINTER(vlan->taps[total - 1], NULL);
> >> + RCU_INIT_POINTER(q->vlan, NULL);
> >> + sock_put(&q->sk);
> >>
> >> - nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
> >> - lockdep_is_held(&macvtap_lock));
> >> - rcu_assign_pointer(vlan->taps[index], nq);
> >> - nq->queue_index = index;
> >> + /* Make sure the pointers were seen before indices */
> > Here it's one pointer, right?
>
> Right.
> >
> >> + wmb();
> > Same issue as above, looks even more worrying
> > as queue is freed here.
>
> The read side were protected by rcu_read_lock(), so no worries here.
Okay so basically numvtaps is just a hint,
it can be wrong and nothing too bad happens?
OK, but let's document this.
> >
> >>
> >> - RCU_INIT_POINTER(q->vlan, NULL);
> >> + if (disabled)
> >> + vlan->numdisabled--;
> >> + else
> >> + vlan->numvtaps--;
> >>
> >> - sock_put(&q->sk);
> >> - --vlan->numvtaps;
> >> }
> >>
> >> spin_unlock(&macvtap_lock);
> >> @@ -166,6 +273,9 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
> >> if (!numvtaps)
> >> goto out;
> >>
> >> + /* Check taps after numvtaps were exposed. */
> >> + rmb();
> >> +
> > Except this doesn't seem to handle case where taps are going away ...
>
> We're protected by rcu read lock here so even though it choose the queue
> which is going to be destroyed temporarily, the socket won't be freed
> before the packets were queued in the socket.
> >> /* Check if we can use flow to select a queue */
> >> rxq = skb_get_rxhash(skb);
> >> if (rxq) {
> >> @@ -201,7 +311,7 @@ static void macvtap_del_queues(struct net_device *dev)
> >>
> >> /* macvtap_put_queue can free some slots, so go through all slots */
> >> spin_lock(&macvtap_lock);
> >> - for (i = 0; i < vlan->numvtaps; i++) {
> >> + for (i = 0; i < vlan->numvtaps + vlan->numdisabled; i++) {
> >> q = rcu_dereference_protected(vlan->taps[i],
> >> lockdep_is_held(&macvtap_lock));
> >> BUG_ON(q == NULL);
> >> @@ -211,6 +321,7 @@ static void macvtap_del_queues(struct net_device *dev)
> >> }
> >> /* guarantee that any future macvtap_set_queue will fail */
> >> vlan->numvtaps = MAX_MACVTAP_QUEUES;
> >> + vlan->numdisabled = 0;
> >> spin_unlock(&macvtap_lock);
> >>
> >> synchronize_rcu();
> >> @@ -927,6 +1038,27 @@ static int macvtap_set_iff(struct file *file, struct ifreq __user *ifr_u)
> >> return 0;
> >> }
> >>
> >> +static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
> >> +{
> >> + struct macvtap_queue *q = file->private_data;
> >> + struct macvlan_dev *vlan;
> >> + int ret = -EINVAL;
> >> +
> >> + vlan = macvtap_get_vlan(q);
> >> + if (!vlan)
> >> + goto done;
> >> +
> >> + if (flags & IFF_ATTACH_QUEUE)
> >> + ret = macvtap_enable_queue(vlan->dev, file, q);
> >> + else if (flags & IFF_DETACH_QUEUE)
> >> + ret = macvtap_disable_queue(q);
> >> +
> >> + macvtap_put_vlan(vlan);
> >> +
> >> +done:
> >> + return ret;
> >> +}
> >> +
> >> /*
> >> * provide compatibility with generic tun/tap interface
> >> */
> >> @@ -959,6 +1091,11 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
> >> macvtap_put_vlan(vlan);
> >> return ret;
> >>
> >> + case TUNSETQUEUE:
> >> + if (get_user(u, &ifr->ifr_flags))
> >> + return -EFAULT;
> >> + return macvtap_ioctl_set_queue(file, u);
> >> +
> >> case TUNGETFEATURES:
> >> if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
> >> return -EFAULT;
> >> diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
> >> index 62d8bda..d528f38 100644
> >> --- a/include/linux/if_macvlan.h
> >> +++ b/include/linux/if_macvlan.h
> >> @@ -69,8 +69,15 @@ struct macvlan_dev {
> >> u16 flags;
> >> int (*receive)(struct sk_buff *skb);
> >> int (*forward)(struct net_device *dev, struct sk_buff *skb);
> >> + /* This array tracks all taps (include disabled ones) and will be
> >> + * reshuffled to keep the following order:
> >> + * [0, numvtaps) : enabled taps,
> >> + * [numvtaps, numvtaps + numdisabled) : disabled taps,
> >> + * [numvtaps + numdisabled, MAX_MACVTAP_QUEUES) : unused slots
> >> + */
> >> struct macvtap_queue *taps[MAX_MACVTAP_QUEUES];
> >> int numvtaps;
> >> + int numdisabled;
> >> int minor;
> >> };
> >>
> >> --
> >> 1.7.1
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl
2013-06-03 11:09 ` Michael S. Tsirkin
@ 2013-06-04 5:54 ` Jason Wang
2013-06-04 7:05 ` Michael S. Tsirkin
0 siblings, 1 reply; 17+ messages in thread
From: Jason Wang @ 2013-06-04 5:54 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: davem, netdev, linux-kernel
On 06/03/2013 07:09 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 03, 2013 at 01:20:58PM +0800, Jason Wang wrote:
>> On 06/02/2013 07:22 PM, Michael S. Tsirkin wrote:
>>> On Fri, May 31, 2013 at 05:53:24PM +0800, Jason Wang wrote:
>>>> This patch adds TUNSETQUEUE ioctl to let userspace can temporarily disable or
>>>> enable a queue of macvtap. This is used to be compatible at API layer of tuntap
>>>> to simplify the userspace to manage the queues.
>>>>
>>>> This is done by split the taps array into three different areas:
>>>>
>>>> - [0, numvtaps) : enabled taps
>>>> - [numvtaps, numvtaps + numdisabled) : disabled taps
>>>> - [numvtaps + numdisabled, MAX_MAXVTAP_QUEUES) : unused slots
>>>>
>>>> When a tap were enabled and disabled, it was moved to another area.
>>>>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> This seems a bit tricky. Can we just move the tap out of the
>>> array?
>> Certainly yes.
>>> the only reason we have the array is for fast
>>> lookup on xmit.
>>> What's the reason to keep disabled queues there?
>> It saves us some space and make code simpler.
>>> To be able to track all queues for cleanups, all we need is
>>> a linked list of all queues (enabled and disabled).
>> Yes, but you need iterate both arrays and linked list which won't be
>> simpler than keeping them in place.
> No, my idea is to keep all taps in the list.
>
> If you need all taps, walks the list.
> If you need active taps, look them up in the array.
>
> Reasonable?
Looks so, will change in next version.
>
>>>> ---
>>>> drivers/net/macvtap.c | 167 ++++++++++++++++++++++++++++++++++++++++----
>>>> include/linux/if_macvlan.h | 7 ++
>>>> 2 files changed, 159 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
>>>> index eac49cb..03b781c 100644
>>>> --- a/drivers/net/macvtap.c
>>>> +++ b/drivers/net/macvtap.c
>>>> @@ -85,32 +85,126 @@ static const struct proto_ops macvtap_socket_ops;
>>>> */
>>>> static DEFINE_SPINLOCK(macvtap_lock);
>>>>
>>>> -static int macvtap_set_queue(struct net_device *dev, struct file *file,
>>>> +static void macvtap_swap_slot(struct macvlan_dev *vlan, int a, int b)
>>>> +{
>>>> + struct macvtap_queue *q1, *q2;
>>>> +
>>>> + if (a == b)
>>>> + return;
>>>> +
>>>> + q1 = rcu_dereference_protected(vlan->taps[a],
>>>> + lockdep_is_held(&macvtap_lock));
>>>> + q2 = rcu_dereference_protected(vlan->taps[b],
>>>> + lockdep_is_held(&macvtap_lock));
>>>> +
>>>> + BUG_ON(q1 == NULL || q2 == NULL);
>>>> +
>>>> + rcu_assign_pointer(vlan->taps[a], q2);
>>>> + rcu_assign_pointer(vlan->taps[b], q1);
>>>> +
>>>> + q1->queue_index = b;
>>>> + q2->queue_index = a;
>>>> +}
>>>> +
>>>> +static int macvtap_enable_queue(struct net_device *dev, struct file *file,
>>>> struct macvtap_queue *q)
>>>> {
>>>> struct macvlan_dev *vlan = netdev_priv(dev);
>>>> + int err = -EINVAL;
>>>> + int total;
>>>> +
>>>> + spin_lock(&macvtap_lock);
>>>> + total = vlan->numvtaps + vlan->numdisabled;
>>>> +
>>>> + if (q->queue_index < vlan->numvtaps)
>>>> + goto out;
>>>> +
>>>> + err = 0;
>>>> +
>>>> + BUG_ON(q->queue_index >= total);
>>>> + macvtap_swap_slot(vlan, q->queue_index, vlan->numvtaps);
>>>> +
>>>> + /* Make sure the pointers were seen before indices. */
>>>> + wmb();
>>> Which indices? We only care about numvtaps right?
>>> So let's just say so.
>> ok
>>> Why is this wmb and not smp_wmb()?
>> will correct it.
>>>> +
>>>> + vlan->numdisabled--;
>>>> + vlan->numvtaps++;
>>>> +out:
>>>> + spin_unlock(&macvtap_lock);
>>>> + return err;
>>>> +}
>>>> +
>>>> +static int macvtap_set_queue(struct net_device *dev, struct file *file,
>>>> + struct macvtap_queue *q)
>>>> +{
>>>> + struct macvlan_dev *vlan = netdev_priv(dev);
>>>> int err = -EBUSY;
>>>> + int total;
>>>>
>>>> spin_lock(&macvtap_lock);
>>>> - if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
>>>> +
>>>> + total = vlan->numvtaps + vlan->numdisabled;
>>>> + if (total == MAX_MACVTAP_QUEUES)
>>>> goto out;
>>>>
>>>> err = 0;
>>>> +
>>>> rcu_assign_pointer(q->vlan, vlan);
>>>> - rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
>>>> + rcu_assign_pointer(vlan->taps[total], q);
>>>> sock_hold(&q->sk);
>>>>
>>>> q->file = file;
>>>> - q->queue_index = vlan->numvtaps;
>>>> + q->queue_index = total;
>>>> file->private_data = q;
>>>> + if (vlan->numdisabled)
>>>> + macvtap_swap_slot(vlan, vlan->numvtaps, total);
>>>>
>>>> - vlan->numvtaps++;
>>>> + /* Make sure the pointers were seen before indices. */
>>>> + wmb();
>>>>
>>>> + vlan->numvtaps++;
>>>> out:
>>>> spin_unlock(&macvtap_lock);
>>>> return err;
>>>> }
>>>>
>>>> +static int macvtap_disable_queue(struct macvtap_queue *q)
>>>> +{
>>>> + struct macvlan_dev *vlan;
>>>> + int err = -EINVAL;
>>>> +
>>>> + spin_lock(&macvtap_lock);
>>>> + vlan = rcu_dereference_protected(q->vlan,
>>>> + lockdep_is_held(&macvtap_lock));
>>>> +
>>>> + if (vlan) {
>>>> + int total = vlan->numvtaps + vlan->numdisabled;
>>>> + int index = q->queue_index;
>>>> +
>>>> + BUG_ON(q->queue_index >= total);
>>>> + if (q->queue_index >= vlan->numvtaps)
>>>> + goto out;
>>>> +
>>>> + err = 0;
>>>> + macvtap_swap_slot(vlan, index, total - 1);
>>>> + if (vlan->numdisabled)
>>>> + /* If there's disabled taps, the above swap will cause
>>>> + * a disabled tap to be moved to enabled area. So
>>>> + * another swap is needed to keep the right order.
>>>> + */
>>>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
>>>> +
>>>> + /* make sure the pointers were seen before indices */
>>>> + wmb();
>>> Hmm this looks questionable. The code near rmb first
>>> checks numvtaps then dereferences the queue.
>>> So it might see a new queue but old value of numvtaps.
>> Right, barriers here were just best effort to reduce the possibility of
>> wrong queue selection when changing the number of queues.
> If this is an optimization, I'd say benchmark it and
> see if it helps performance.
>
> I don't expect it to have any effect really.
> In fact, the re-ordering of queues that this patch does
> is likely to cause packet reorering and hurt performance
> more.
Yes, so I will remove the barriers.
The re-ordering seems to be the easiest way to do fast lookup of active
queues. We could use indirection to avoid the re-ordering of queues,
it's hard to eliminate OOO packets. If we don't depends on changing the
number of queues frequently, we're ok.
>
> I'm guessing the only thing we need for correctness
> is ACCESS_ONCE on numvtaps?
Did't see how it help.
>
>>>> +
>>>> + vlan->numvtaps--;
>>>> + vlan->numdisabled++;
>>>> + }
>>>> +
>>>> +out:
>>>> + spin_unlock(&macvtap_lock);
>>>> + return err;
>>>> +}
>>>> /*
>>>> * The file owning the queue got closed, give up both
>>>> * the reference that the files holds as well as the
>>>> @@ -121,25 +215,38 @@ out:
>>>> */
>>>> static void macvtap_put_queue(struct macvtap_queue *q)
>>>> {
>>>> - struct macvtap_queue *nq;
>>>> struct macvlan_dev *vlan;
>>>>
>>>> spin_lock(&macvtap_lock);
>>>> vlan = rcu_dereference_protected(q->vlan,
>>>> lockdep_is_held(&macvtap_lock));
>>>> +
>>>> if (vlan) {
>>>> + int total = vlan->numvtaps + vlan->numdisabled;
>>>> int index = q->queue_index;
>>>> - BUG_ON(index >= vlan->numvtaps);
>>>> + bool disabled = q->queue_index >= vlan->numvtaps;
>>>> +
>>>> + BUG_ON(q->queue_index >= total);
>>>> + macvtap_swap_slot(vlan, index, total - 1);
>>>> + if (!disabled && vlan->numdisabled)
>>>> + /* If there's disabled taps, the above swap will cause
>>>> + * a disabled tap to be moved to enabled area. So
>>>> + * another swap is needed to keep the right order.
>>>> + */
>>>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
>>>> +
>>>> + RCU_INIT_POINTER(vlan->taps[total - 1], NULL);
>>>> + RCU_INIT_POINTER(q->vlan, NULL);
>>>> + sock_put(&q->sk);
>>>>
>>>> - nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
>>>> - lockdep_is_held(&macvtap_lock));
>>>> - rcu_assign_pointer(vlan->taps[index], nq);
>>>> - nq->queue_index = index;
>>>> + /* Make sure the pointers were seen before indices */
>>> Here it's one pointer, right?
>> Right.
>>>> + wmb();
>>> Same issue as above, looks even more worrying
>>> as queue is freed here.
>> The read side were protected by rcu_read_lock(), so no worries here.
> Okay so basically numvtaps is just a hint,
> it can be wrong and nothing too bad happens?
>
> OK, but let's document this.
Sure.
>
>>>>
>>>> - RCU_INIT_POINTER(q->vlan, NULL);
>>>> + if (disabled)
>>>> + vlan->numdisabled--;
>>>> + else
>>>> + vlan->numvtaps--;
>>>>
>>>> - sock_put(&q->sk);
>>>> - --vlan->numvtaps;
>>>> }
>>>>
>>>> spin_unlock(&macvtap_lock);
>>>> @@ -166,6 +273,9 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
>>>> if (!numvtaps)
>>>> goto out;
>>>>
>>>> + /* Check taps after numvtaps were exposed. */
>>>> + rmb();
>>>> +
>>> Except this doesn't seem to handle case where taps are going away ...
>> We're protected by rcu read lock here so even though it choose the queue
>> which is going to be destroyed temporarily, the socket won't be freed
>> before the packets were queued in the socket.
>>>> /* Check if we can use flow to select a queue */
>>>> rxq = skb_get_rxhash(skb);
>>>> if (rxq) {
>>>> @@ -201,7 +311,7 @@ static void macvtap_del_queues(struct net_device *dev)
>>>>
>>>> /* macvtap_put_queue can free some slots, so go through all slots */
>>>> spin_lock(&macvtap_lock);
>>>> - for (i = 0; i < vlan->numvtaps; i++) {
>>>> + for (i = 0; i < vlan->numvtaps + vlan->numdisabled; i++) {
>>>> q = rcu_dereference_protected(vlan->taps[i],
>>>> lockdep_is_held(&macvtap_lock));
>>>> BUG_ON(q == NULL);
>>>> @@ -211,6 +321,7 @@ static void macvtap_del_queues(struct net_device *dev)
>>>> }
>>>> /* guarantee that any future macvtap_set_queue will fail */
>>>> vlan->numvtaps = MAX_MACVTAP_QUEUES;
>>>> + vlan->numdisabled = 0;
>>>> spin_unlock(&macvtap_lock);
>>>>
>>>> synchronize_rcu();
>>>> @@ -927,6 +1038,27 @@ static int macvtap_set_iff(struct file *file, struct ifreq __user *ifr_u)
>>>> return 0;
>>>> }
>>>>
>>>> +static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
>>>> +{
>>>> + struct macvtap_queue *q = file->private_data;
>>>> + struct macvlan_dev *vlan;
>>>> + int ret = -EINVAL;
>>>> +
>>>> + vlan = macvtap_get_vlan(q);
>>>> + if (!vlan)
>>>> + goto done;
>>>> +
>>>> + if (flags & IFF_ATTACH_QUEUE)
>>>> + ret = macvtap_enable_queue(vlan->dev, file, q);
>>>> + else if (flags & IFF_DETACH_QUEUE)
>>>> + ret = macvtap_disable_queue(q);
>>>> +
>>>> + macvtap_put_vlan(vlan);
>>>> +
>>>> +done:
>>>> + return ret;
>>>> +}
>>>> +
>>>> /*
>>>> * provide compatibility with generic tun/tap interface
>>>> */
>>>> @@ -959,6 +1091,11 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
>>>> macvtap_put_vlan(vlan);
>>>> return ret;
>>>>
>>>> + case TUNSETQUEUE:
>>>> + if (get_user(u, &ifr->ifr_flags))
>>>> + return -EFAULT;
>>>> + return macvtap_ioctl_set_queue(file, u);
>>>> +
>>>> case TUNGETFEATURES:
>>>> if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
>>>> return -EFAULT;
>>>> diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
>>>> index 62d8bda..d528f38 100644
>>>> --- a/include/linux/if_macvlan.h
>>>> +++ b/include/linux/if_macvlan.h
>>>> @@ -69,8 +69,15 @@ struct macvlan_dev {
>>>> u16 flags;
>>>> int (*receive)(struct sk_buff *skb);
>>>> int (*forward)(struct net_device *dev, struct sk_buff *skb);
>>>> + /* This array tracks all taps (include disabled ones) and will be
>>>> + * reshuffled to keep the following order:
>>>> + * [0, numvtaps) : enabled taps,
>>>> + * [numvtaps, numvtaps + numdisabled) : disabled taps,
>>>> + * [numvtaps + numdisabled, MAX_MACVTAP_QUEUES) : unused slots
>>>> + */
>>>> struct macvtap_queue *taps[MAX_MACVTAP_QUEUES];
>>>> int numvtaps;
>>>> + int numdisabled;
>>>> int minor;
>>>> };
>>>>
>>>> --
>>>> 1.7.1
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at http://www.tux.org/lkml/
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl
2013-06-04 5:54 ` Jason Wang
@ 2013-06-04 7:05 ` Michael S. Tsirkin
2013-06-05 3:01 ` Jason Wang
0 siblings, 1 reply; 17+ messages in thread
From: Michael S. Tsirkin @ 2013-06-04 7:05 UTC (permalink / raw)
To: Jason Wang; +Cc: davem, netdev, linux-kernel
On Tue, Jun 04, 2013 at 01:54:56PM +0800, Jason Wang wrote:
> On 06/03/2013 07:09 PM, Michael S. Tsirkin wrote:
> > On Mon, Jun 03, 2013 at 01:20:58PM +0800, Jason Wang wrote:
> >> On 06/02/2013 07:22 PM, Michael S. Tsirkin wrote:
> >>> On Fri, May 31, 2013 at 05:53:24PM +0800, Jason Wang wrote:
> >>>> This patch adds TUNSETQUEUE ioctl to let userspace can temporarily disable or
> >>>> enable a queue of macvtap. This is used to be compatible at API layer of tuntap
> >>>> to simplify the userspace to manage the queues.
> >>>>
> >>>> This is done by split the taps array into three different areas:
> >>>>
> >>>> - [0, numvtaps) : enabled taps
> >>>> - [numvtaps, numvtaps + numdisabled) : disabled taps
> >>>> - [numvtaps + numdisabled, MAX_MAXVTAP_QUEUES) : unused slots
> >>>>
> >>>> When a tap were enabled and disabled, it was moved to another area.
> >>>>
> >>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> >>> This seems a bit tricky. Can we just move the tap out of the
> >>> array?
> >> Certainly yes.
> >>> the only reason we have the array is for fast
> >>> lookup on xmit.
> >>> What's the reason to keep disabled queues there?
> >> It saves us some space and make code simpler.
> >>> To be able to track all queues for cleanups, all we need is
> >>> a linked list of all queues (enabled and disabled).
> >> Yes, but you need iterate both arrays and linked list which won't be
> >> simpler than keeping them in place.
> > No, my idea is to keep all taps in the list.
> >
> > If you need all taps, walks the list.
> > If you need active taps, look them up in the array.
> >
> > Reasonable?
>
> Looks so, will change in next version.
> >
> >>>> ---
> >>>> drivers/net/macvtap.c | 167 ++++++++++++++++++++++++++++++++++++++++----
> >>>> include/linux/if_macvlan.h | 7 ++
> >>>> 2 files changed, 159 insertions(+), 15 deletions(-)
> >>>>
> >>>> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> >>>> index eac49cb..03b781c 100644
> >>>> --- a/drivers/net/macvtap.c
> >>>> +++ b/drivers/net/macvtap.c
> >>>> @@ -85,32 +85,126 @@ static const struct proto_ops macvtap_socket_ops;
> >>>> */
> >>>> static DEFINE_SPINLOCK(macvtap_lock);
> >>>>
> >>>> -static int macvtap_set_queue(struct net_device *dev, struct file *file,
> >>>> +static void macvtap_swap_slot(struct macvlan_dev *vlan, int a, int b)
> >>>> +{
> >>>> + struct macvtap_queue *q1, *q2;
> >>>> +
> >>>> + if (a == b)
> >>>> + return;
> >>>> +
> >>>> + q1 = rcu_dereference_protected(vlan->taps[a],
> >>>> + lockdep_is_held(&macvtap_lock));
> >>>> + q2 = rcu_dereference_protected(vlan->taps[b],
> >>>> + lockdep_is_held(&macvtap_lock));
> >>>> +
> >>>> + BUG_ON(q1 == NULL || q2 == NULL);
> >>>> +
> >>>> + rcu_assign_pointer(vlan->taps[a], q2);
> >>>> + rcu_assign_pointer(vlan->taps[b], q1);
> >>>> +
> >>>> + q1->queue_index = b;
> >>>> + q2->queue_index = a;
> >>>> +}
> >>>> +
> >>>> +static int macvtap_enable_queue(struct net_device *dev, struct file *file,
> >>>> struct macvtap_queue *q)
> >>>> {
> >>>> struct macvlan_dev *vlan = netdev_priv(dev);
> >>>> + int err = -EINVAL;
> >>>> + int total;
> >>>> +
> >>>> + spin_lock(&macvtap_lock);
> >>>> + total = vlan->numvtaps + vlan->numdisabled;
> >>>> +
> >>>> + if (q->queue_index < vlan->numvtaps)
> >>>> + goto out;
> >>>> +
> >>>> + err = 0;
> >>>> +
> >>>> + BUG_ON(q->queue_index >= total);
> >>>> + macvtap_swap_slot(vlan, q->queue_index, vlan->numvtaps);
> >>>> +
> >>>> + /* Make sure the pointers were seen before indices. */
> >>>> + wmb();
> >>> Which indices? We only care about numvtaps right?
> >>> So let's just say so.
> >> ok
> >>> Why is this wmb and not smp_wmb()?
> >> will correct it.
> >>>> +
> >>>> + vlan->numdisabled--;
> >>>> + vlan->numvtaps++;
> >>>> +out:
> >>>> + spin_unlock(&macvtap_lock);
> >>>> + return err;
> >>>> +}
> >>>> +
> >>>> +static int macvtap_set_queue(struct net_device *dev, struct file *file,
> >>>> + struct macvtap_queue *q)
> >>>> +{
> >>>> + struct macvlan_dev *vlan = netdev_priv(dev);
> >>>> int err = -EBUSY;
> >>>> + int total;
> >>>>
> >>>> spin_lock(&macvtap_lock);
> >>>> - if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
> >>>> +
> >>>> + total = vlan->numvtaps + vlan->numdisabled;
> >>>> + if (total == MAX_MACVTAP_QUEUES)
> >>>> goto out;
> >>>>
> >>>> err = 0;
> >>>> +
> >>>> rcu_assign_pointer(q->vlan, vlan);
> >>>> - rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
> >>>> + rcu_assign_pointer(vlan->taps[total], q);
> >>>> sock_hold(&q->sk);
> >>>>
> >>>> q->file = file;
> >>>> - q->queue_index = vlan->numvtaps;
> >>>> + q->queue_index = total;
> >>>> file->private_data = q;
> >>>> + if (vlan->numdisabled)
> >>>> + macvtap_swap_slot(vlan, vlan->numvtaps, total);
> >>>>
> >>>> - vlan->numvtaps++;
> >>>> + /* Make sure the pointers were seen before indices. */
> >>>> + wmb();
> >>>>
> >>>> + vlan->numvtaps++;
> >>>> out:
> >>>> spin_unlock(&macvtap_lock);
> >>>> return err;
> >>>> }
> >>>>
> >>>> +static int macvtap_disable_queue(struct macvtap_queue *q)
> >>>> +{
> >>>> + struct macvlan_dev *vlan;
> >>>> + int err = -EINVAL;
> >>>> +
> >>>> + spin_lock(&macvtap_lock);
> >>>> + vlan = rcu_dereference_protected(q->vlan,
> >>>> + lockdep_is_held(&macvtap_lock));
> >>>> +
> >>>> + if (vlan) {
> >>>> + int total = vlan->numvtaps + vlan->numdisabled;
> >>>> + int index = q->queue_index;
> >>>> +
> >>>> + BUG_ON(q->queue_index >= total);
> >>>> + if (q->queue_index >= vlan->numvtaps)
> >>>> + goto out;
> >>>> +
> >>>> + err = 0;
> >>>> + macvtap_swap_slot(vlan, index, total - 1);
> >>>> + if (vlan->numdisabled)
> >>>> + /* If there's disabled taps, the above swap will cause
> >>>> + * a disabled tap to be moved to enabled area. So
> >>>> + * another swap is needed to keep the right order.
> >>>> + */
> >>>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
> >>>> +
> >>>> + /* make sure the pointers were seen before indices */
> >>>> + wmb();
> >>> Hmm this looks questionable. The code near rmb first
> >>> checks numvtaps then dereferences the queue.
> >>> So it might see a new queue but old value of numvtaps.
> >> Right, barriers here were just best effort to reduce the possibility of
> >> wrong queue selection when changing the number of queues.
> > If this is an optimization, I'd say benchmark it and
> > see if it helps performance.
> >
> > I don't expect it to have any effect really.
> > In fact, the re-ordering of queues that this patch does
> > is likely to cause packet reorering and hurt performance
> > more.
>
> Yes, so I will remove the barriers.
>
> The re-ordering seems to be the easiest way to do fast lookup of active
> queues. We could use indirection to avoid the re-ordering of queues,
> it's hard to eliminate OOO packets. If we don't depends on changing the
> number of queues frequently, we're ok.
> >
> > I'm guessing the only thing we need for correctness
> > is ACCESS_ONCE on numvtaps?
>
> Did't see how it help.
For this loop:
while (unlikely(rxq >= numvtaps))
rxq -= numvtaps;
you better read numvtaps with ACCESS_ONCE otherwise
compiler can re-read numvtaps and it would
change during loop execution.
rxq can then become negative.
> >
> >>>> +
> >>>> + vlan->numvtaps--;
> >>>> + vlan->numdisabled++;
> >>>> + }
> >>>> +
> >>>> +out:
> >>>> + spin_unlock(&macvtap_lock);
> >>>> + return err;
> >>>> +}
> >>>> /*
> >>>> * The file owning the queue got closed, give up both
> >>>> * the reference that the files holds as well as the
> >>>> @@ -121,25 +215,38 @@ out:
> >>>> */
> >>>> static void macvtap_put_queue(struct macvtap_queue *q)
> >>>> {
> >>>> - struct macvtap_queue *nq;
> >>>> struct macvlan_dev *vlan;
> >>>>
> >>>> spin_lock(&macvtap_lock);
> >>>> vlan = rcu_dereference_protected(q->vlan,
> >>>> lockdep_is_held(&macvtap_lock));
> >>>> +
> >>>> if (vlan) {
> >>>> + int total = vlan->numvtaps + vlan->numdisabled;
> >>>> int index = q->queue_index;
> >>>> - BUG_ON(index >= vlan->numvtaps);
> >>>> + bool disabled = q->queue_index >= vlan->numvtaps;
> >>>> +
> >>>> + BUG_ON(q->queue_index >= total);
> >>>> + macvtap_swap_slot(vlan, index, total - 1);
> >>>> + if (!disabled && vlan->numdisabled)
> >>>> + /* If there's disabled taps, the above swap will cause
> >>>> + * a disabled tap to be moved to enabled area. So
> >>>> + * another swap is needed to keep the right order.
> >>>> + */
> >>>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
> >>>> +
> >>>> + RCU_INIT_POINTER(vlan->taps[total - 1], NULL);
> >>>> + RCU_INIT_POINTER(q->vlan, NULL);
> >>>> + sock_put(&q->sk);
> >>>>
> >>>> - nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
> >>>> - lockdep_is_held(&macvtap_lock));
> >>>> - rcu_assign_pointer(vlan->taps[index], nq);
> >>>> - nq->queue_index = index;
> >>>> + /* Make sure the pointers were seen before indices */
> >>> Here it's one pointer, right?
> >> Right.
> >>>> + wmb();
> >>> Same issue as above, looks even more worrying
> >>> as queue is freed here.
> >> The read side were protected by rcu_read_lock(), so no worries here.
> > Okay so basically numvtaps is just a hint,
> > it can be wrong and nothing too bad happens?
> >
> > OK, but let's document this.
>
> Sure.
> >
> >>>>
> >>>> - RCU_INIT_POINTER(q->vlan, NULL);
> >>>> + if (disabled)
> >>>> + vlan->numdisabled--;
> >>>> + else
> >>>> + vlan->numvtaps--;
> >>>>
> >>>> - sock_put(&q->sk);
> >>>> - --vlan->numvtaps;
> >>>> }
> >>>>
> >>>> spin_unlock(&macvtap_lock);
> >>>> @@ -166,6 +273,9 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
> >>>> if (!numvtaps)
> >>>> goto out;
> >>>>
> >>>> + /* Check taps after numvtaps were exposed. */
> >>>> + rmb();
> >>>> +
> >>> Except this doesn't seem to handle case where taps are going away ...
> >> We're protected by rcu read lock here so even though it choose the queue
> >> which is going to be destroyed temporarily, the socket won't be freed
> >> before the packets were queued in the socket.
> >>>> /* Check if we can use flow to select a queue */
> >>>> rxq = skb_get_rxhash(skb);
> >>>> if (rxq) {
> >>>> @@ -201,7 +311,7 @@ static void macvtap_del_queues(struct net_device *dev)
> >>>>
> >>>> /* macvtap_put_queue can free some slots, so go through all slots */
> >>>> spin_lock(&macvtap_lock);
> >>>> - for (i = 0; i < vlan->numvtaps; i++) {
> >>>> + for (i = 0; i < vlan->numvtaps + vlan->numdisabled; i++) {
> >>>> q = rcu_dereference_protected(vlan->taps[i],
> >>>> lockdep_is_held(&macvtap_lock));
> >>>> BUG_ON(q == NULL);
> >>>> @@ -211,6 +321,7 @@ static void macvtap_del_queues(struct net_device *dev)
> >>>> }
> >>>> /* guarantee that any future macvtap_set_queue will fail */
> >>>> vlan->numvtaps = MAX_MACVTAP_QUEUES;
> >>>> + vlan->numdisabled = 0;
> >>>> spin_unlock(&macvtap_lock);
> >>>>
> >>>> synchronize_rcu();
> >>>> @@ -927,6 +1038,27 @@ static int macvtap_set_iff(struct file *file, struct ifreq __user *ifr_u)
> >>>> return 0;
> >>>> }
> >>>>
> >>>> +static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
> >>>> +{
> >>>> + struct macvtap_queue *q = file->private_data;
> >>>> + struct macvlan_dev *vlan;
> >>>> + int ret = -EINVAL;
> >>>> +
> >>>> + vlan = macvtap_get_vlan(q);
> >>>> + if (!vlan)
> >>>> + goto done;
> >>>> +
> >>>> + if (flags & IFF_ATTACH_QUEUE)
> >>>> + ret = macvtap_enable_queue(vlan->dev, file, q);
> >>>> + else if (flags & IFF_DETACH_QUEUE)
> >>>> + ret = macvtap_disable_queue(q);
> >>>> +
> >>>> + macvtap_put_vlan(vlan);
> >>>> +
> >>>> +done:
> >>>> + return ret;
> >>>> +}
> >>>> +
> >>>> /*
> >>>> * provide compatibility with generic tun/tap interface
> >>>> */
> >>>> @@ -959,6 +1091,11 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
> >>>> macvtap_put_vlan(vlan);
> >>>> return ret;
> >>>>
> >>>> + case TUNSETQUEUE:
> >>>> + if (get_user(u, &ifr->ifr_flags))
> >>>> + return -EFAULT;
> >>>> + return macvtap_ioctl_set_queue(file, u);
> >>>> +
> >>>> case TUNGETFEATURES:
> >>>> if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
> >>>> return -EFAULT;
> >>>> diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
> >>>> index 62d8bda..d528f38 100644
> >>>> --- a/include/linux/if_macvlan.h
> >>>> +++ b/include/linux/if_macvlan.h
> >>>> @@ -69,8 +69,15 @@ struct macvlan_dev {
> >>>> u16 flags;
> >>>> int (*receive)(struct sk_buff *skb);
> >>>> int (*forward)(struct net_device *dev, struct sk_buff *skb);
> >>>> + /* This array tracks all taps (include disabled ones) and will be
> >>>> + * reshuffled to keep the following order:
> >>>> + * [0, numvtaps) : enabled taps,
> >>>> + * [numvtaps, numvtaps + numdisabled) : disabled taps,
> >>>> + * [numvtaps + numdisabled, MAX_MACVTAP_QUEUES) : unused slots
> >>>> + */
> >>>> struct macvtap_queue *taps[MAX_MACVTAP_QUEUES];
> >>>> int numvtaps;
> >>>> + int numdisabled;
> >>>> int minor;
> >>>> };
> >>>>
> >>>> --
> >>>> 1.7.1
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >>> the body of a message to majordomo@vger.kernel.org
> >>> More majordomo info at http://vger.kernel.org/majordomo-info.html
> >>> Please read the FAQ at http://www.tux.org/lkml/
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl
2013-06-04 7:05 ` Michael S. Tsirkin
@ 2013-06-05 3:01 ` Jason Wang
0 siblings, 0 replies; 17+ messages in thread
From: Jason Wang @ 2013-06-05 3:01 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: davem, netdev, linux-kernel
On 06/04/2013 03:05 PM, Michael S. Tsirkin wrote:
> On Tue, Jun 04, 2013 at 01:54:56PM +0800, Jason Wang wrote:
>> On 06/03/2013 07:09 PM, Michael S. Tsirkin wrote:
>>> On Mon, Jun 03, 2013 at 01:20:58PM +0800, Jason Wang wrote:
>>>> On 06/02/2013 07:22 PM, Michael S. Tsirkin wrote:
>>>>> On Fri, May 31, 2013 at 05:53:24PM +0800, Jason Wang wrote:
>>>>>> This patch adds TUNSETQUEUE ioctl to let userspace can temporarily disable or
>>>>>> enable a queue of macvtap. This is used to be compatible at API layer of tuntap
>>>>>> to simplify the userspace to manage the queues.
>>>>>>
>>>>>> This is done by split the taps array into three different areas:
>>>>>>
>>>>>> - [0, numvtaps) : enabled taps
>>>>>> - [numvtaps, numvtaps + numdisabled) : disabled taps
>>>>>> - [numvtaps + numdisabled, MAX_MAXVTAP_QUEUES) : unused slots
>>>>>>
>>>>>> When a tap were enabled and disabled, it was moved to another area.
>>>>>>
>>>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
[...]
>>>>> +static int macvtap_disable_queue(struct macvtap_queue *q)
>>>>> +{
>>>>> + struct macvlan_dev *vlan;
>>>>> + int err = -EINVAL;
>>>>> +
>>>>> + spin_lock(&macvtap_lock);
>>>>> + vlan = rcu_dereference_protected(q->vlan,
>>>>> + lockdep_is_held(&macvtap_lock));
>>>>> +
>>>>> + if (vlan) {
>>>>> + int total = vlan->numvtaps + vlan->numdisabled;
>>>>> + int index = q->queue_index;
>>>>> +
>>>>> + BUG_ON(q->queue_index >= total);
>>>>> + if (q->queue_index >= vlan->numvtaps)
>>>>> + goto out;
>>>>> +
>>>>> + err = 0;
>>>>> + macvtap_swap_slot(vlan, index, total - 1);
>>>>> + if (vlan->numdisabled)
>>>>> + /* If there's disabled taps, the above swap will cause
>>>>> + * a disabled tap to be moved to enabled area. So
>>>>> + * another swap is needed to keep the right order.
>>>>> + */
>>>>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
>>>>> +
>>>>> + /* make sure the pointers were seen before indices */
>>>>> + wmb();
>>>>> Hmm this looks questionable. The code near rmb first
>>>>> checks numvtaps then dereferences the queue.
>>>>> So it might see a new queue but old value of numvtaps.
>>>> Right, barriers here were just best effort to reduce the possibility of
>>>> wrong queue selection when changing the number of queues.
>>> If this is an optimization, I'd say benchmark it and
>>> see if it helps performance.
>>>
>>> I don't expect it to have any effect really.
>>> In fact, the re-ordering of queues that this patch does
>>> is likely to cause packet reorering and hurt performance
>>> more.
>> Yes, so I will remove the barriers.
>>
>> The re-ordering seems to be the easiest way to do fast lookup of active
>> queues. We could use indirection to avoid the re-ordering of queues,
>> it's hard to eliminate OOO packets. If we don't depends on changing the
>> number of queues frequently, we're ok.
>>> I'm guessing the only thing we need for correctness
>>> is ACCESS_ONCE on numvtaps?
>> Did't see how it help.
> For this loop:
> while (unlikely(rxq >= numvtaps))
> rxq -= numvtaps;
>
> you better read numvtaps with ACCESS_ONCE otherwise
> compiler can re-read numvtaps and it would
> change during loop execution.
> rxq can then become negative.
>
I get your meaning, looks like tun should be fixed as well.
Thanks
^ permalink raw reply [flat|nested] 17+ messages in thread
* [net-next rfc V2 8/8] macvtap: enable multiqueue flag
2013-05-31 9:53 [net-next rfc V2 0/8] Multiqueue API for macvtap Jason Wang
` (6 preceding siblings ...)
2013-05-31 9:53 ` [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl Jason Wang
@ 2013-05-31 9:53 ` Jason Wang
7 siblings, 0 replies; 17+ messages in thread
From: Jason Wang @ 2013-05-31 9:53 UTC (permalink / raw)
To: davem, netdev, linux-kernel, mst; +Cc: Jason Wang
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/macvtap.c | 7 ++-----
1 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 03b781c..fac9dbf 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -31,10 +31,6 @@
* macvtap_proto is used to allocate queues through the sock allocation
* mechanism.
*
- * TODO: multiqueue support is currently not implemented, even though
- * macvtap is basically prepared for that. We will need to add this
- * here as well as in virtio-net and qemu to get line rate on 10gbit
- * adapters from a guest.
*/
struct macvtap_queue {
struct sock sk;
@@ -1097,7 +1093,8 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
return macvtap_ioctl_set_queue(file, u);
case TUNGETFEATURES:
- if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
+ if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR |
+ IFF_MULTI_QUEUE, up))
return -EFAULT;
return 0;
--
1.7.1
^ permalink raw reply related [flat|nested] 17+ messages in thread