* [PATCH net-next V3 5/6] net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
From: Jason Wang @ 2016-06-30 3:52 UTC (permalink / raw)
To: mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467258779-3539-1-git-send-email-jasowang@redhat.com>
This patch introduces a new event - NETDEV_CHANGE_TX_QUEUE_LEN, which
will be triggered when tx_queue_len is changed. It can be used by net
devices that want to do some processing at that time. An example is tun,
which may want to resize its tx array when tx_queue_len is changed.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/linux/netdevice.h | 1 +
net/core/net-sysfs.c | 15 ++++++++++++++-
2 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e84d9d2..7dc2ec7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info {
#define NETDEV_PRECHANGEUPPER 0x001A
#define NETDEV_CHANGELOWERSTATE 0x001B
#define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C
+#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E
int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7a0b616..6e4f347 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
- dev->tx_queue_len = new_len;
+ int res, orig_len = dev->tx_queue_len;
+
+ if (new_len != orig_len) {
+ dev->tx_queue_len = new_len;
+ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+ res = notifier_to_errno(res);
+ if (res) {
+ netdev_err(dev,
+ "refused to change device tx_queue_len\n");
+ dev->tx_queue_len = orig_len;
+ return -EFAULT;
+ }
+ }
+
return 0;
}
--
2.7.4
^ permalink raw reply related
* [PATCH net-next V3 6/6] tun: switch to use skb array for tx
From: Jason Wang @ 2016-06-30 3:52 UTC (permalink / raw)
To: mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467258779-3539-1-git-send-email-jasowang@redhat.com>
We used to queue tx packets in sk_receive_queue. This is less
efficient since it requires spinlocks to synchronize between producer
and consumer.
This patch tries to address this by:
- switching from sk_receive_queue to a skb_array, and resizing it when
tx_queue_len is changed.
- introducing a new proto_ops method, peek_len, which is used for peeking
the skb length.
- implementing a tun version of peek_len for vhost_net to use, and
converting vhost_net to use peek_len if possible.
Pktgen test shows about 15.3% improvement on guest receiving pps for small
buffers:
Before: ~1300000pps
After : ~1500000pps
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++---
drivers/vhost/net.c | 16 +++++-
include/linux/net.h | 1 +
3 files changed, 146 insertions(+), 9 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 4884802..3be69ea 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
#include <net/sock.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
+#include <linux/skb_array.h>
#include <asm/uaccess.h>
@@ -167,6 +168,7 @@ struct tun_file {
};
struct list_head next;
struct tun_struct *detached;
+ struct skb_array tx_array;
};
struct tun_flow_entry {
@@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
static void tun_queue_purge(struct tun_file *tfile)
{
- skb_queue_purge(&tfile->sk.sk_receive_queue);
+ struct sk_buff *skb;
+
+ while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
+ kfree_skb(skb);
+
skb_queue_purge(&tfile->sk.sk_error_queue);
}
@@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun->dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(tun->dev);
}
+ if (tun)
+ skb_array_cleanup(&tfile->tx_array);
sock_put(&tfile->sk);
}
}
@@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev)
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
{
struct tun_file *tfile = file->private_data;
+ struct net_device *dev = tun->dev;
int err;
err = security_tun_dev_attach(tfile->socket.sk, tun->security);
@@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
if (!err)
goto out;
}
+
+ if (!tfile->detached &&
+ skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
tfile->queue_index = tun->numqueues;
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
rcu_assign_pointer(tfile->tun, tun);
@@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
nf_reset(skb);
- /* Enqueue packet */
- skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
+ if (skb_array_produce(&tfile->tx_array, skb))
+ goto drop;
/* Notify and wake up reader process */
if (tfile->flags & TUN_FASYNC)
@@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
poll_wait(file, sk_sleep(sk), wait);
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_array_empty(&tfile->tx_array))
mask |= POLLIN | POLLRDNORM;
if (sock_writeable(sk) ||
@@ -1426,22 +1442,61 @@ done:
return total;
}
+static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
+ int *err)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb = NULL;
+
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ goto out;
+ if (noblock) {
+ *err = -EAGAIN;
+ goto out;
+ }
+
+ add_wait_queue(&tfile->wq.wait, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+
+ while (1) {
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ break;
+ if (signal_pending(current)) {
+ *err = -ERESTARTSYS;
+ break;
+ }
+ if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
+ *err = -EFAULT;
+ break;
+ }
+
+ schedule();
+ };
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&tfile->wq.wait, &wait);
+
+out:
+ return skb;
+}
+
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct iov_iter *to,
int noblock)
{
struct sk_buff *skb;
ssize_t ret;
- int peeked, err, off = 0;
+ int err;
tun_debug(KERN_INFO, tun, "tun_do_read\n");
if (!iov_iter_count(to))
return 0;
- /* Read frames from queue */
- skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
- &peeked, &off, &err);
+ /* Read frames from ring */
+ skb = tun_ring_recv(tfile, noblock, &err);
if (!skb)
return err;
@@ -1574,8 +1629,25 @@ out:
return ret;
}
+static int tun_peek_len(struct socket *sock)
+{
+ struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+ struct tun_struct *tun;
+ int ret = 0;
+
+ tun = __tun_get(tfile);
+ if (!tun)
+ return 0;
+
+ ret = skb_array_peek_len(&tfile->tx_array);
+ tun_put(tun);
+
+ return ret;
+}
+
/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tun_socket_ops = {
+ .peek_len = tun_peek_len,
.sendmsg = tun_sendmsg,
.recvmsg = tun_recvmsg,
};
@@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info,
};
+static int tun_queue_resize(struct tun_struct *tun)
+{
+ struct net_device *dev = tun->dev;
+ struct tun_file *tfile;
+ struct skb_array **arrays;
+ int n = tun->numqueues + tun->numdisabled;
+ int ret, i;
+
+ arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
+ if (!arrays)
+ return -ENOMEM;
+
+ for (i = 0; i < tun->numqueues; i++) {
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ arrays[i] = &tfile->tx_array;
+ }
+ list_for_each_entry(tfile, &tun->disabled, next)
+ arrays[i++] = &tfile->tx_array;
+
+ ret = skb_array_resize_multiple(arrays, n,
+ dev->tx_queue_len, GFP_KERNEL);
+
+ kfree(arrays);
+ return ret;
+}
+
+static int tun_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct tun_struct *tun = netdev_priv(dev);
+
+ switch (event) {
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ if (tun_queue_resize(tun))
+ return NOTIFY_BAD;
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tun_notifier_block __read_mostly = {
+ .notifier_call = tun_device_event,
+};
static int __init tun_init(void)
{
@@ -2416,6 +2535,8 @@ static int __init tun_init(void)
pr_err("Can't register misc device %d\n", TUN_MINOR);
goto err_misc;
}
+
+ register_netdevice_notifier(&tun_notifier_block);
return 0;
err_misc:
rtnl_link_unregister(&tun_link_ops);
@@ -2427,6 +2548,7 @@ static void tun_cleanup(void)
{
misc_deregister(&tun_miscdev);
rtnl_link_unregister(&tun_link_ops);
+ unregister_netdevice_notifier(&tun_notifier_block);
}
/* Get an underlying socket object from tun file. Returns error unless file is
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1d3e45f..e032ca3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -481,10 +481,14 @@ out:
static int peek_head_len(struct sock *sk)
{
+ struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
if (likely(head)) {
@@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk)
return len;
}
+static int sk_has_rx_data(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sk->sk_receive_queue);
+}
+
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
endtime = busy_clock() + vq->busyloop_timeout;
while (vhost_can_busy_poll(&net->dev, endtime) &&
- skb_queue_empty(&sk->sk_receive_queue) &&
+ !sk_has_rx_data(sk) &&
vhost_vq_avail_empty(&net->dev, vq))
cpu_relax_lowlatency();
diff --git a/include/linux/net.h b/include/linux/net.h
index 9aa49a0..b6b3843 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -185,6 +185,7 @@ struct proto_ops {
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
int (*set_peek_off)(struct sock *sk, int val);
+ int (*peek_len)(struct socket *sock);
};
#define DECLARE_SOCKADDR(type, dst, src) \
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net-next V3 5/6] net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
From: John Fastabend @ 2016-06-30 4:56 UTC (permalink / raw)
To: Jason Wang, mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467258779-3539-6-git-send-email-jasowang@redhat.com>
On 16-06-29 08:52 PM, Jason Wang wrote:
> This patch introduces a new event - NETDEV_CHANGE_TX_QUEUE_LEN, this
> will be triggered when tx_queue_len. It could be used by net device
> who want to do some processing at that time. An example is tun who may
> want to resize tx array when tx_queue_len is changed.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> include/linux/netdevice.h | 1 +
> net/core/net-sysfs.c | 15 ++++++++++++++-
> 2 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index e84d9d2..7dc2ec7 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info {
> #define NETDEV_PRECHANGEUPPER 0x001A
> #define NETDEV_CHANGELOWERSTATE 0x001B
> #define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C
> +#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E
>
> int register_netdevice_notifier(struct notifier_block *nb);
> int unregister_netdevice_notifier(struct notifier_block *nb);
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 7a0b616..6e4f347 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
>
> static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
> {
> - dev->tx_queue_len = new_len;
> + int res, orig_len = dev->tx_queue_len;
> +
> + if (new_len != orig_len) {
> + dev->tx_queue_len = new_len;
> + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
> + res = notifier_to_errno(res);
> + if (res) {
> + netdev_err(dev,
> + "refused to change device tx_queue_len\n");
> + dev->tx_queue_len = orig_len;
> + return -EFAULT;
> + }
> + }
> +
> return 0;
> }
>
>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Great timing, I was just looking into this because I need it for the
qdisc side.
It looks like this covers the sysfs change, but tx_queue_len can
also be changed via rtnetlink. So we need another patch for
that path, right?
if (tb[IFLA_TXQLEN]) {
unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
if (dev->tx_queue_len ^ value)
status |= DO_SETLINK_NOTIFY;
dev->tx_queue_len = value;
}
Thanks,
John
^ permalink raw reply
* Re: [PATCH net-next V3 5/6] net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
From: Jason Wang @ 2016-06-30 5:12 UTC (permalink / raw)
To: John Fastabend, mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <5774A67B.1060802@gmail.com>
On 2016年06月30日 12:56, John Fastabend wrote:
> On 16-06-29 08:52 PM, Jason Wang wrote:
>> This patch introduces a new event - NETDEV_CHANGE_TX_QUEUE_LEN, this
>> will be triggered when tx_queue_len. It could be used by net device
>> who want to do some processing at that time. An example is tun who may
>> want to resize tx array when tx_queue_len is changed.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> include/linux/netdevice.h | 1 +
>> net/core/net-sysfs.c | 15 ++++++++++++++-
>> 2 files changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index e84d9d2..7dc2ec7 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info {
>> #define NETDEV_PRECHANGEUPPER 0x001A
>> #define NETDEV_CHANGELOWERSTATE 0x001B
>> #define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C
>> +#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E
>>
>> int register_netdevice_notifier(struct notifier_block *nb);
>> int unregister_netdevice_notifier(struct notifier_block *nb);
>> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
>> index 7a0b616..6e4f347 100644
>> --- a/net/core/net-sysfs.c
>> +++ b/net/core/net-sysfs.c
>> @@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
>>
>> static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
>> {
>> - dev->tx_queue_len = new_len;
>> + int res, orig_len = dev->tx_queue_len;
>> +
>> + if (new_len != orig_len) {
>> + dev->tx_queue_len = new_len;
>> + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
>> + res = notifier_to_errno(res);
>> + if (res) {
>> + netdev_err(dev,
>> + "refused to change device tx_queue_len\n");
>> + dev->tx_queue_len = orig_len;
>> + return -EFAULT;
>> + }
>> + }
>> +
>> return 0;
>> }
>>
>>
> Acked-by: John Fastabend <john.r.fastabend@intel.com>
>
> Great timing I was just looking into this because I need it for the
> qdisc side.
>
> It looks like this covers the sysfs change but the tx_queue_len can
> also be changed via rtnetlink as well. So we need another patch for
> that path right?
>
> if (tb[IFLA_TXQLEN]) {
> unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
>
> if (dev->tx_queue_len ^ value)
> status |= DO_SETLINK_NOTIFY;
>
> dev->tx_queue_len = value;
> }
>
> Thanks,
> John
>
Right, will do this in next version.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next V3 0/6] switch to use tx skb array in tun
From: Michael S. Tsirkin @ 2016-06-30 5:37 UTC (permalink / raw)
To: Jason Wang
Cc: kvm, eric.dumazet, netdev, linux-kernel, virtualization, brouer,
davem
In-Reply-To: <1467258779-3539-1-git-send-email-jasowang@redhat.com>
On Thu, Jun 30, 2016 at 11:52:53AM +0800, Jason Wang wrote:
> Hi all:
>
> This series tries to switch to use skb array in tun. This is used to
> eliminate the spinlock contention between producer and consumer. The
> conversion was straightforward: just introduce a tx skb array and use
> it instead of sk_receive_queue.
>
> A minor issue is to keep the tx_queue_len behaviour, since tun used to
> use it for the length of sk_receive_queue. This is done through:
>
> - add the ability to resize multiple rings at once to avoid handling
> partial resize failure for multiple rings.
> - add the support for zero length ring.
> - introduce a notifier which was triggered when tx_queue_len was
> changed for a netdev.
> - resize all queues during the tx_queue_len changing.
>
> Tests show about 15% improvement on guest rx pps:
>
> Before: ~1300000pps
> After : ~1500000pps
Series:
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> Changes from V2:
> - add multiple rings resizing support for ptr_ring/skb_array
> - add zero length ring support
> - introduce NETDEV_CHANGE_TX_QUEUE_LEN
> - drop new flags
>
> Changes from V1:
> - switch to use skb array instead of a customized circular buffer
> - add non-blocking support
> - rename .peek to .peek_len
> - drop lockless peeking since test show very minor improvement
>
> Jason Wang (5):
> ptr_ring: support zero length ring
> skb_array: minor tweak
> skb_array: add wrappers for resizing
> net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
> tun: switch to use skb array for tx
>
> Michael S. Tsirkin (1):
> ptr_ring: support resizing multiple queues
>
> drivers/net/tun.c | 138 ++++++++++++++++++++++++++++++++++++---
> drivers/vhost/net.c | 16 ++++-
> include/linux/net.h | 1 +
> include/linux/netdevice.h | 1 +
> include/linux/ptr_ring.h | 77 ++++++++++++++++++----
> include/linux/skb_array.h | 13 +++-
> net/core/net-sysfs.c | 15 ++++-
> tools/virtio/ringtest/ptr_ring.c | 5 ++
> 8 files changed, 243 insertions(+), 23 deletions(-)
>
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH v6v3 02/12] mm: migrate: support non-lru movable page migration
From: Anshuman Khandual @ 2016-06-30 5:56 UTC (permalink / raw)
To: Minchan Kim
Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, Jonathan Corbet,
Hugh Dickins, linux-kernel, dri-devel, virtualization,
John Einar Reitan, linux-mm, Gioh Kim, Mel Gorman, Andrew Morton,
Joonsoo Kim, Vlastimil Babka
In-Reply-To: <20160628063912.GA25560@bbox>
On 06/28/2016 12:09 PM, Minchan Kim wrote:
> On Mon, Jun 27, 2016 at 11:21:01AM +0530, Anshuman Khandual wrote:
>> On 06/16/2016 11:07 AM, Minchan Kim wrote:
>>> On Thu, Jun 16, 2016 at 09:12:07AM +0530, Anshuman Khandual wrote:
>>>> On 06/16/2016 05:56 AM, Minchan Kim wrote:
>>>>> On Wed, Jun 15, 2016 at 12:15:04PM +0530, Anshuman Khandual wrote:
>>>>>> On 06/15/2016 08:02 AM, Minchan Kim wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On Mon, Jun 13, 2016 at 03:08:19PM +0530, Anshuman Khandual wrote:
>>>>>>>>> On 05/31/2016 05:31 AM, Minchan Kim wrote:
>>>>>>>>>>> @@ -791,6 +921,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>>>>>>>>>>> int rc = -EAGAIN;
>>>>>>>>>>> int page_was_mapped = 0;
>>>>>>>>>>> struct anon_vma *anon_vma = NULL;
>>>>>>>>>>> + bool is_lru = !__PageMovable(page);
>>>>>>>>>>>
>>>>>>>>>>> if (!trylock_page(page)) {
>>>>>>>>>>> if (!force || mode == MIGRATE_ASYNC)
>>>>>>>>>>> @@ -871,6 +1002,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>>>>>>>>>>> goto out_unlock_both;
>>>>>>>>>>> }
>>>>>>>>>>>
>>>>>>>>>>> + if (unlikely(!is_lru)) {
>>>>>>>>>>> + rc = move_to_new_page(newpage, page, mode);
>>>>>>>>>>> + goto out_unlock_both;
>>>>>>>>>>> + }
>>>>>>>>>>> +
>>>>>>>>>
>>>>>>>>> Hello Minchan,
>>>>>>>>>
>>>>>>>>> I might be missing something here but does this implementation support the
>>>>>>>>> scenario where these non LRU pages owned by the driver mapped as PTE into
>>>>>>>>> process page table ? Because the "goto out_unlock_both" statement above
>>>>>>>>> skips all the PTE unmap, putting a migration PTE and removing the migration
>>>>>>>>> PTE steps.
>>>>>>> You're right. Unfortunately, it doesn't support right now but surely,
>>>>>>> it's my TODO after landing this work.
>>>>>>>
>>>>>>> Could you share your usecase?
>>>>>>
>>>>>> Sure.
>>>>>
>>>>> Thanks a lot!
>>>>>
>>>>>>
>>>>>> My driver has privately managed non LRU pages which gets mapped into user space
>>>>>> process page table through f_ops->mmap() and vmops->fault() which then updates
>>>>>> the file RMAP (page->mapping->i_mmap) through page_add_file_rmap(page). One thing
>>>>>
>>>>> Hmm, page_add_file_rmap is not exported function. How does your driver can use it?
>>>>
>>>> Its not using the function directly, I just re-iterated the sequence of functions
>>>> above. (do_set_pte -> page_add_file_rmap) gets called after we grab the page from
>>>> driver through (__do_fault->vma->vm_ops->fault()).
>>>>
>>>>> Do you use vm_insert_pfn?
>>>>> What type your vma is? VM_PFNMMAP or VM_MIXEDMAP?
>>>>
>>>> I dont use vm_insert_pfn(). Here is the sequence of events how the user space
>>>> VMA gets the non LRU pages from the driver.
>>>>
>>>> - Driver registers a character device with 'struct file_operations' binding
>>>> - Then the 'fops->mmap()' just binds the incoming 'struct vma' with a 'struct
>>>> vm_operations_struct' which provides the 'vmops->fault()' routine which
>>>> basically traps all page faults on the VMA and provides one page at a time
>>>> through a driver specific allocation routine which hands over non LRU pages
>>>>
>>>> The VMA is not anything special as such. Its what we get when we try to do a
>>>> simple mmap() on a file descriptor pointing to a character device. I can
>>>> figure out all the VM_* flags it holds after creation.
>>>>
>>>>>
>>>>> I want to make dummy driver to simulate your case.
>>>>
>>>> Sure. I hope the above mentioned steps will help you but in case you need more
>>>> information, please do let me know.
>>>
>>> I got understood now. :)
>>> I will test it with dummy driver and will Cc'ed when I send a patch.
>>
>> Hello Minchan,
>>
>> Do you have any updates on this ? The V7 of the series still has this limitation.
>> Did you get a chance to test the driver out ? I am still concerned about how to
>> handle the struct address_space override problem within the struct page.
>
> Hi Anshuman,
>
> Slow but I am working on that. :) However, as I said, I want to do it
I really appreciate it. I was just curious about the problem and any potential
solution we can look into.
> after soft landing of current non-lru-no-mapped page migration to solve
> current real field issues.
yeah it makes sense.
>
> About the overriding problem of non-lru-mapped-page, I implemented dummy
> driver as miscellaneous device and in test_mmap(file_operations.mmap),
> I changed a_ops with my address_space_operations.
>
> int test_mmap(struct file *filp, struct vm_area_struct *vma)
> {
> filp->f_mapping->a_ops = &test_aops;
> vma->vm_ops = &test_vm_ops;
> vma->vm_private_data = filp->private_data;
> return 0;
> }
>
Okay.
> test_aops should have *set_page_dirty* overriding.
>
> static int test_set_pag_dirty(struct page *page)
> {
> if (!PageDirty(page))
> SetPageDirty*page);
> return 0;
> }
>
> Otherwise, it goes BUG_ON during radix tree operation because
> currently try_to_unmap is designed for file-lru pages which lives
> in page cache so it propagates page table dirty bit to PG_dirty flag
> of struct page by set_page_dirty. And set_page_dirty want to mark
> dirty tag in radix tree node but it's character driver so the page
> cache doesn't have it. That's why we encounter BUG_ON in radix tree
> operation. Anyway, to test, I implemented set_page_dirty in my dummy
> driver.
Okay and the above test_set_page_dirty() example is sufficient ?
>
> With only that, it doesn't work because I need to modify migrate.c to
> work non-lru-mapped-page and changing PG_isolated flag which is
> override of PG_reclaim which is cleared in set_page_dirty.
Got it, so what changes did you make? Did you implement PG_isolated
differently, without overriding PG_reclaim, or something else? Yes,
set_page_dirty does indeed clear the PG_reclaim flag.
>
> With that, it seems to work. But I'm not saying it's right model now
So the mapped pages migration was successful? Even after overloading
filp->f_mapping->a_ops = &test_aops, we still have the RMAP information
intact in the interval tree that filp->f_mapping points to. But I would
really like to see the code changes.
> for device drivers. In runtime, replacing filp->f_mapping->a_ops with
> custom a_ops of own driver seems to be hacky to me.
Yeah I thought so.
> So, I'm considering now new pseudo fs "movable_inode" which will
> support
>
> struct file *movable_inode_getfile(const char *name,
> const struct file_operations *fop,
> const struct address_space_operations *a_ops)
> {
> struct path path;
> struct qstr this;
> struct inode *inode;
> struct super_block *sb;
>
> this.name = name;
> this.len = strlen(name);
> this.hash = 0;
> sb = movable_mnt.mnt_sb;
> patch.denty = d_alloc_pseudo(movable_inode_mnt->mnt_sb, &this);
> patch.mnt = mntget(movable_inode_mnt);
>
> inode = new_inode(sb);
> ..
> ..
> inode->i_mapping->a_ops = a_ops;
> d_instantiate(path.dentry, inode);
>
> return alloc_file(&path, FMODE_WRITE | FMODE_READ, f_op);
> }
>
> And in our driver, we can change vma->vm_file with new one.
>
> int test_mmap(struct file *filp, struct vm_area_structd *vma)
> {
> struct file *newfile = movable_inode_getfile("[test"],
> filep->f_op, &test_aops);
> vma->vm_file = newfile;
> ..
> ..
> }
>
> When I read mmap_region in mm/mmap.c, it's reasonable usecase
> which dirver's mmap changes vma->vm_file with own file.
I will look into these details.
> Anyway, it needs many subtle changes in mm/vfs/driver side so
> need to review from each maintainers related subsystem so I
> want to not be hurry.
Sure, makes sense. Meanwhile, it would be really great if you could share
your code changes as described above, so that I can try them out.
^ permalink raw reply
* Re: [PATCH net-next V3 5/6] net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
From: Jason Wang @ 2016-06-30 5:59 UTC (permalink / raw)
To: John Fastabend, mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <5774AA46.3090604@redhat.com>
On 2016年06月30日 13:12, Jason Wang wrote:
>
>
> On 2016年06月30日 12:56, John Fastabend wrote:
>> On 16-06-29 08:52 PM, Jason Wang wrote:
>>> This patch introduces a new event - NETDEV_CHANGE_TX_QUEUE_LEN, this
>>> will be triggered when tx_queue_len. It could be used by net device
>>> who want to do some processing at that time. An example is tun who may
>>> want to resize tx array when tx_queue_len is changed.
>>>
>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> ---
>>> include/linux/netdevice.h | 1 +
>>> net/core/net-sysfs.c | 15 ++++++++++++++-
>>> 2 files changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>> index e84d9d2..7dc2ec7 100644
>>> --- a/include/linux/netdevice.h
>>> +++ b/include/linux/netdevice.h
>>> @@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info {
>>> #define NETDEV_PRECHANGEUPPER 0x001A
>>> #define NETDEV_CHANGELOWERSTATE 0x001B
>>> #define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C
>>> +#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E
>>> int register_netdevice_notifier(struct notifier_block *nb);
>>> int unregister_netdevice_notifier(struct notifier_block *nb);
>>> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
>>> index 7a0b616..6e4f347 100644
>>> --- a/net/core/net-sysfs.c
>>> +++ b/net/core/net-sysfs.c
>>> @@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
>>> static int change_tx_queue_len(struct net_device *dev, unsigned
>>> long new_len)
>>> {
>>> - dev->tx_queue_len = new_len;
>>> + int res, orig_len = dev->tx_queue_len;
>>> +
>>> + if (new_len != orig_len) {
>>> + dev->tx_queue_len = new_len;
>>> + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN,
>>> dev);
>>> + res = notifier_to_errno(res);
>>> + if (res) {
>>> + netdev_err(dev,
>>> + "refused to change device tx_queue_len\n");
>>> + dev->tx_queue_len = orig_len;
>>> + return -EFAULT;
>>> + }
>>> + }
>>> +
>>> return 0;
>>> }
>>>
>> Acked-by: John Fastabend <john.r.fastabend@intel.com>
>>
>> Great timing I was just looking into this because I need it for the
>> qdisc side.
>>
>> It looks like this covers the sysfs change but the tx_queue_len can
>> also be changed via rtnetlink as well. So we need another patch for
>> that path right?
>>
>> if (tb[IFLA_TXQLEN]) {
>> unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
>>
>> if (dev->tx_queue_len ^ value)
>> status |= DO_SETLINK_NOTIFY;
>>
>> dev->tx_queue_len = value;
>> }
>>
>> Thanks,
>> John
>>
>
> Right, will do this in next version.
>
> Thanks
Ok, since Michael has acked on the series, will prepare a patch on top.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH v6v3 02/12] mm: migrate: support non-lru movable page migration
From: Minchan Kim @ 2016-06-30 6:18 UTC (permalink / raw)
To: Anshuman Khandual
Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, Jonathan Corbet,
Hugh Dickins, linux-kernel, dri-devel, virtualization,
John Einar Reitan, linux-mm, Gioh Kim, Mel Gorman, Andrew Morton,
Joonsoo Kim, Vlastimil Babka
In-Reply-To: <5774B49D.6080000@linux.vnet.ibm.com>
On Thu, Jun 30, 2016 at 11:26:45AM +0530, Anshuman Khandual wrote:
<snip>
> >> Did you get a chance to test the driver out ? I am still concerned about how to
> >> handle the struct address_space override problem within the struct page.
> >
> > Hi Anshuman,
> >
> > Slow but I am working on that. :) However, as I said, I want to do it
>
> I really appreciate. Was just curious about the problem and any potential
> solution we can look into.
>
> > after soft landing of current non-lru-no-mapped page migration to solve
> > current real field issues.
>
> yeah it makes sense.
>
> >
> > About the overriding problem of non-lru-mapped-page, I implemented dummy
> > driver as miscellaneous device and in test_mmap(file_operations.mmap),
> > I changed a_ops with my address_space_operations.
> >
> > int test_mmap(struct file *filp, struct vm_area_struct *vma)
> > {
> > filp->f_mapping->a_ops = &test_aops;
> > vma->vm_ops = &test_vm_ops;
> > vma->vm_private_data = filp->private_data;
> > return 0;
> > }
> >
>
> Okay.
>
> > test_aops should have *set_page_dirty* overriding.
> >
> > static int test_set_pag_dirty(struct page *page)
> > {
> > if (!PageDirty(page))
> > SetPageDirty*page);
> > return 0;
> > }
> >
> > Otherwise, it goes BUG_ON during radix tree operation because
> > currently try_to_unmap is designed for file-lru pages which lives
> > in page cache so it propagates page table dirty bit to PG_dirty flag
> > of struct page by set_page_dirty. And set_page_dirty want to mark
> > dirty tag in radix tree node but it's character driver so the page
> > cache doesn't have it. That's why we encounter BUG_ON in radix tree
> > operation. Anyway, to test, I implemented set_page_dirty in my dummy
> > driver.
>
> Okay and the above test_set_page_dirty() example is sufficient ?
I guess just returning 0 is sufficient, without dirtying the page at all.
>
> >
> > With only that, it doesn't work because I need to modify migrate.c to
> > work non-lru-mapped-page and changing PG_isolated flag which is
> > override of PG_reclaim which is cleared in set_page_dirty.
>
> Got it, so what changes you did ? Implemented PG_isolated differently
> not by overriding PG_reclaim or something else ? Yes set_page_dirty
> indeed clears the PG_reclaim flag.
>
> >
> > With that, it seems to work. But I'm not saying it's right model now
>
> So the mapped pages migration was successful ? Even after overloading
> filp->f_mapping->a_ops = &test_aops, we still have the RMAP information
> intact with filp->f_mappinp pointed interval tree. But would really like
> to see the code changes.
>
> > for device drivers. In runtime, replacing filp->f_mapping->a_ops with
> > custom a_ops of own driver seems to be hacky to me.
>
> Yeah I thought so.
>
> > So, I'm considering now new pseudo fs "movable_inode" which will
> > support
> >
> > struct file *movable_inode_getfile(const char *name,
> > const struct file_operations *fop,
> > const struct address_space_operations *a_ops)
> > {
> > struct path path;
> > struct qstr this;
> > struct inode *inode;
> > struct super_block *sb;
> >
> > this.name = name;
> > this.len = strlen(name);
> > this.hash = 0;
> > sb = movable_mnt.mnt_sb;
> > patch.denty = d_alloc_pseudo(movable_inode_mnt->mnt_sb, &this);
> > patch.mnt = mntget(movable_inode_mnt);
> >
> > inode = new_inode(sb);
> > ..
> > ..
> > inode->i_mapping->a_ops = a_ops;
> > d_instantiate(path.dentry, inode);
> >
> > return alloc_file(&path, FMODE_WRITE | FMODE_READ, f_op);
> > }
> >
> > And in our driver, we can change vma->vm_file with new one.
> >
> > int test_mmap(struct file *filp, struct vm_area_structd *vma)
> > {
> > struct file *newfile = movable_inode_getfile("[test"],
> > filep->f_op, &test_aops);
> > vma->vm_file = newfile;
> > ..
> > ..
> > }
> >
> > When I read mmap_region in mm/mmap.c, it's reasonable usecase
> > which dirver's mmap changes vma->vm_file with own file.
>
> I will look into these details.
>
> > Anyway, it needs many subtle changes in mm/vfs/driver side so
> > need to review from each maintainers related subsystem so I
> > want to not be hurry.
>
> Sure, makes sense. Mean while it will be really great if you could share
> your code changes as described above, so that I can try them out.
>
The draft version is almost done. I'm running stress tests now and,
fortunately, haven't seen any problems so far.
I will send it to you when I'm ready.
Thanks.
^ permalink raw reply
* [PATCH] tun: fix semicolon.cocci warnings
From: kbuild test robot @ 2016-06-30 6:20 UTC (permalink / raw)
To: Jason Wang
Cc: eric.dumazet, kvm, mst, netdev, linux-kernel, virtualization,
kbuild-all, brouer, davem
In-Reply-To: <1467258779-3539-7-git-send-email-jasowang@redhat.com>
drivers/net/tun.c:1476:2-3: Unneeded semicolon
Remove unneeded semicolon.
Generated by: scripts/coccinelle/misc/semicolon.cocci
CC: Jason Wang <jasowang@redhat.com>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
---
tun.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1473,7 +1473,7 @@ static struct sk_buff *tun_ring_recv(str
}
schedule();
- };
+ }
current->state = TASK_RUNNING;
remove_wait_queue(&tfile->wq.wait, &wait);
^ permalink raw reply
* Re: [PATCH net-next V3 6/6] tun: switch to use skb array for tx
From: kbuild test robot @ 2016-06-30 6:20 UTC (permalink / raw)
To: Jason Wang
Cc: eric.dumazet, kvm, mst, netdev, linux-kernel, virtualization,
kbuild-all, brouer, davem
In-Reply-To: <1467258779-3539-7-git-send-email-jasowang@redhat.com>
Hi,
[auto build test WARNING on net-next/master]
url: https://github.com/0day-ci/linux/commits/Jason-Wang/switch-to-use-tx-skb-array-in-tun/20160630-120656
coccinelle warnings: (new ones prefixed by >>)
>> drivers/net/tun.c:1476:2-3: Unneeded semicolon
Please review and possibly fold the followup patch.
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
^ permalink raw reply
* Re: [PATCH v2 00/12] gendisk: Generate uevent after attribute available
From: Christoph Hellwig @ 2016-06-30 6:24 UTC (permalink / raw)
To: Fam Zheng
Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
linux-nvme, virtualization, Keith Busch, Paul Mackerras,
Michael Ellerman, Shaohua Li, Nitin Gupta, Jiri Kosina,
linux-block, dan.j.williams, Ed L. Cashin, Jens Axboe, linux-raid,
David Woodhouse, linux-mmc, linux-kernel, Minchan Kim, linux-mtd,
Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-1-famz@redhat.com>
On Thu, Jun 30, 2016 at 09:59:41AM +0800, Fam Zheng wrote:
> Documentation/kobject.txt:
> > Use the KOBJ_ADD action for when the kobject is first added to the kernel.
> > This should be done only after any attributes or children of the kobject
> > have been initialized properly, as userspace will instantly start to look
> > for them when this call happens.
>
> Unfortunately it seems impossible to fix this generally without touching the
> offending callers. The approach I'm proposing here is adding a flag to
> suppress uevent in add_disk(), which is patch 1, then in later patches, convert
> any caller to only trigger the uevent when attributes are added.
We (or rather Dan) are touching most add_disk callers anyway for the
driverfs_dev removal. Let's just pass the array of attributes to
a disk_add variant and solve the issue for real.
^ permalink raw reply
* Re: [PATCH v2 00/12] gendisk: Generate uevent after attribute available
From: Fam Zheng @ 2016-06-30 6:35 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
linux-nvme, virtualization, Keith Busch, Paul Mackerras,
Michael Ellerman, Shaohua Li, Nitin Gupta, Jiri Kosina,
linux-block, dan.j.williams, Ed L. Cashin, Jens Axboe, linux-raid,
David Woodhouse, linux-mmc, linux-kernel, Minchan Kim, linux-mtd,
Brian Norris, linuxppc-dev
In-Reply-To: <20160630062442.GA19761@infradead.org>
On Wed, 06/29 23:24, Christoph Hellwig wrote:
> On Thu, Jun 30, 2016 at 09:59:41AM +0800, Fam Zheng wrote:
> > Documentation/kobject.txt:
> > > Use the KOBJ_ADD action for when the kobject is first added to the kernel.
> > > This should be done only after any attributes or children of the kobject
> > > have been initialized properly, as userspace will instantly start to look
> > > for them when this call happens.
> >
> > Unfortunately it seems impossible to fix this generally without touching the
> > offending callers. The approach I'm proposing here is adding a flag to
> > suppress uevent in add_disk(), which is patch 1, then in later patches, convert
> > any caller to only trigger the uevent when attributes are added.
>
> We (or rather Dan) is touching most add_disk callers anyway for the
> driverfs_dev removal. Let's just pass the array of attributes to
> a disk_add variant and solve the issue for real.
I thought about that. Its usage is more compact compared to this series, but is
also more code and less flexible IMO. For example, we need at least two
variants, for attribute_group and device_attribute separately, right?
Fam
^ permalink raw reply
* Re: [PATCH v2 00/12] gendisk: Generate uevent after attribute available
From: Christoph Hellwig @ 2016-06-30 6:38 UTC (permalink / raw)
To: Fam Zheng
Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
linux-nvme, virtualization, Keith Busch, Paul Mackerras,
Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
Jiri Kosina, linux-raid, dan.j.williams, Ed L. Cashin, Jens Axboe,
linux-block, David Woodhouse, linux-mmc, linux-kernel,
Minchan Kim, linux-mtd, Brian Norris, linuxppc-dev
In-Reply-To: <20160630063554.GE23296@ad.usersys.redhat.com>
On Thu, Jun 30, 2016 at 02:35:54PM +0800, Fam Zheng wrote:
> also more code and less flexible IMO. For example, we need at least two
> variants, for attribute_group and device_attribute separately, right?
Yes, or maybe just a calling convention that just passes both.
^ permalink raw reply
* Re: [PATCH net-next V3 5/6] net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
From: Jason Wang @ 2016-06-30 6:43 UTC (permalink / raw)
To: John Fastabend, mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <5774B553.5070701@redhat.com>
On 2016年06月30日 13:59, Jason Wang wrote:
>
>
> On 2016年06月30日 13:12, Jason Wang wrote:
>>
>>
>> On 2016年06月30日 12:56, John Fastabend wrote:
>>> On 16-06-29 08:52 PM, Jason Wang wrote:
>>>> This patch introduces a new event - NETDEV_CHANGE_TX_QUEUE_LEN, this
>>>> will be triggered when tx_queue_len. It could be used by net device
>>>> who want to do some processing at that time. An example is tun who may
>>>> want to resize tx array when tx_queue_len is changed.
>>>>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>> ---
>>>> include/linux/netdevice.h | 1 +
>>>> net/core/net-sysfs.c | 15 ++++++++++++++-
>>>> 2 files changed, 15 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>> index e84d9d2..7dc2ec7 100644
>>>> --- a/include/linux/netdevice.h
>>>> +++ b/include/linux/netdevice.h
>>>> @@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info {
>>>> #define NETDEV_PRECHANGEUPPER 0x001A
>>>> #define NETDEV_CHANGELOWERSTATE 0x001B
>>>> #define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C
>>>> +#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E
>>>> int register_netdevice_notifier(struct notifier_block *nb);
>>>> int unregister_netdevice_notifier(struct notifier_block *nb);
>>>> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
>>>> index 7a0b616..6e4f347 100644
>>>> --- a/net/core/net-sysfs.c
>>>> +++ b/net/core/net-sysfs.c
>>>> @@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
>>>> static int change_tx_queue_len(struct net_device *dev, unsigned
>>>> long new_len)
>>>> {
>>>> - dev->tx_queue_len = new_len;
>>>> + int res, orig_len = dev->tx_queue_len;
>>>> +
>>>> + if (new_len != orig_len) {
>>>> + dev->tx_queue_len = new_len;
>>>> + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN,
>>>> dev);
>>>> + res = notifier_to_errno(res);
>>>> + if (res) {
>>>> + netdev_err(dev,
>>>> + "refused to change device tx_queue_len\n");
>>>> + dev->tx_queue_len = orig_len;
>>>> + return -EFAULT;
>>>> + }
>>>> + }
>>>> +
>>>> return 0;
>>>> }
>>>>
>>> Acked-by: John Fastabend <john.r.fastabend@intel.com>
>>>
>>> Great timing I was just looking into this because I need it for the
>>> qdisc side.
>>>
>>> It looks like this covers the sysfs change but the tx_queue_len can
>>> also be changed via rtnetlink as well. So we need another patch for
>>> that path right?
>>>
>>> if (tb[IFLA_TXQLEN]) {
>>> unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
>>>
>>> if (dev->tx_queue_len ^ value)
>>> status |= DO_SETLINK_NOTIFY;
>>>
>>> dev->tx_queue_len = value;
>>> }
>>>
>>> Thanks,
>>> John
>>>
>>
>> Right, will do this in next version.
>>
>> Thanks
>
> Ok, since Michael has acked on the series, will prepare a patch on top.
>
> Thanks
Since kbuild test robot has found a minor issue on this series, I will
post v4 with this fixed.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net-next V4 0/6] switch to use tx skb array in tun
From: Jason Wang @ 2016-06-30 6:45 UTC (permalink / raw)
To: mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
Hi all:
This series tries to switch to use skb array in tun. This is used to
eliminate the spinlock contention between producer and consumer. The
conversion was straightforward: just introduce a tx skb array and use
it instead of sk_receive_queue.
A minor issue is to keep the tx_queue_len behaviour, since tun used to
use it for the length of sk_receive_queue. This is done through:
- add the ability to resize multiple rings at once to avoid handling
partial resize failure for multiple rings.
- add the support for zero length ring.
- introduce a notifier which was triggered when tx_queue_len was
changed for a netdev.
- resize all queues during the tx_queue_len changing.
Tests show about 15% improvement on guest rx pps:
Before: ~1300000pps
After : ~1500000pps
Changes from V3:
- fix kbuild warnings
- call NETDEV_CHANGE_TX_QUEUE_LEN on IFLA_TXQLEN
Changes from V2:
- add multiple rings resizing support for ptr_ring/skb_array
- add zero length ring support
- introduce a NETDEV_CHANGE_TX_QUEUE_LEN event
- drop new flags
Changes from V1:
- switch to use skb array instead of a customized circular buffer
- add non-blocking support
- rename .peek to .peek_len
- drop lockless peeking since test show very minor improvement
Jason Wang (5):
ptr_ring: support zero length ring
skb_array: minor tweak
skb_array: add wrappers for resizing
net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
tun: switch to use skb array for tx
Michael S. Tsirkin (1):
ptr_ring: support resizing multiple queues
drivers/net/tun.c | 138 ++++++++++++++++++++++++++++++++++++---
drivers/vhost/net.c | 16 ++++-
include/linux/net.h | 1 +
include/linux/netdevice.h | 1 +
include/linux/ptr_ring.h | 77 ++++++++++++++++++----
include/linux/skb_array.h | 13 +++-
net/core/net-sysfs.c | 15 ++++-
net/core/rtnetlink.c | 16 +++--
tools/virtio/ringtest/ptr_ring.c | 5 ++
9 files changed, 255 insertions(+), 27 deletions(-)
--
2.7.4
^ permalink raw reply
* [PATCH net-next V4 1/6] ptr_ring: support zero length ring
From: Jason Wang @ 2016-06-30 6:45 UTC (permalink / raw)
To: mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467269136-8082-1-git-send-email-jasowang@redhat.com>
Sometimes we need a zero-length ring, but the current code will crash since
we don't do any check before accessing the ring. This patch fixes this.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/linux/ptr_ring.h | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 562a65e..d78b8b8 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -102,7 +102,7 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
*/
static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
{
- if (r->queue[r->producer])
+ if (unlikely(!r->size) || r->queue[r->producer])
return -ENOSPC;
r->queue[r->producer++] = ptr;
@@ -164,7 +164,9 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr)
*/
static inline void *__ptr_ring_peek(struct ptr_ring *r)
{
- return r->queue[r->consumer];
+ if (likely(r->size))
+ return r->queue[r->consumer];
+ return NULL;
}
/* Note: callers invoking this in a loop must use a compiler barrier,
--
2.7.4
^ permalink raw reply related
* [PATCH net-next V4 2/6] skb_array: minor tweak
From: Jason Wang @ 2016-06-30 6:45 UTC (permalink / raw)
To: mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467269136-8082-1-git-send-email-jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/linux/skb_array.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index 678bfbf..2dd0d1e 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -151,12 +151,12 @@ static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp)
return ptr_ring_init(&a->ring, size, gfp);
}
-void __skb_array_destroy_skb(void *ptr)
+static void __skb_array_destroy_skb(void *ptr)
{
kfree_skb(ptr);
}
-int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
+static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
{
return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
}
--
2.7.4
^ permalink raw reply related
* [PATCH net-next V4 3/6] ptr_ring: support resizing multiple queues
From: Jason Wang @ 2016-06-30 6:45 UTC (permalink / raw)
To: mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467269136-8082-1-git-send-email-jasowang@redhat.com>
From: "Michael S. Tsirkin" <mst@redhat.com>
Sometimes, we need to support resizing multiple queues at once. This is
because it is not easy to recover from a partial failure
of multiple queues resizing.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/linux/ptr_ring.h | 71 +++++++++++++++++++++++++++++++++++-----
tools/virtio/ringtest/ptr_ring.c | 5 +++
2 files changed, 67 insertions(+), 9 deletions(-)
diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index d78b8b8..2052011 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -349,20 +349,14 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
return 0;
}
-static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
- void (*destroy)(void *))
+static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
+ int size, gfp_t gfp,
+ void (*destroy)(void *))
{
- unsigned long flags;
int producer = 0;
- void **queue = __ptr_ring_init_queue_alloc(size, gfp);
void **old;
void *ptr;
- if (!queue)
- return -ENOMEM;
-
- spin_lock_irqsave(&(r)->producer_lock, flags);
-
while ((ptr = ptr_ring_consume(r)))
if (producer < size)
queue[producer++] = ptr;
@@ -375,6 +369,23 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
old = r->queue;
r->queue = queue;
+ return old;
+}
+
+static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
+ void (*destroy)(void *))
+{
+ unsigned long flags;
+ void **queue = __ptr_ring_init_queue_alloc(size, gfp);
+ void **old;
+
+ if (!queue)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&(r)->producer_lock, flags);
+
+ old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy);
+
spin_unlock_irqrestore(&(r)->producer_lock, flags);
kfree(old);
@@ -382,6 +393,48 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
return 0;
}
+static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
+ int size,
+ gfp_t gfp, void (*destroy)(void *))
+{
+ unsigned long flags;
+ void ***queues;
+ int i;
+
+ queues = kmalloc(nrings * sizeof *queues, gfp);
+ if (!queues)
+ goto noqueues;
+
+ for (i = 0; i < nrings; ++i) {
+ queues[i] = __ptr_ring_init_queue_alloc(size, gfp);
+ if (!queues[i])
+ goto nomem;
+ }
+
+ for (i = 0; i < nrings; ++i) {
+ spin_lock_irqsave(&(rings[i])->producer_lock, flags);
+ queues[i] = __ptr_ring_swap_queue(rings[i], queues[i],
+ size, gfp, destroy);
+ spin_unlock_irqrestore(&(rings[i])->producer_lock, flags);
+ }
+
+ for (i = 0; i < nrings; ++i)
+ kfree(queues[i]);
+
+ kfree(queues);
+
+ return 0;
+
+nomem:
+ while (--i >= 0)
+ kfree(queues[i]);
+
+ kfree(queues);
+
+noqueues:
+ return -ENOMEM;
+}
+
static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
{
void *ptr;
diff --git a/tools/virtio/ringtest/ptr_ring.c b/tools/virtio/ringtest/ptr_ring.c
index 74abd74..68e4f9f 100644
--- a/tools/virtio/ringtest/ptr_ring.c
+++ b/tools/virtio/ringtest/ptr_ring.c
@@ -17,6 +17,11 @@
typedef pthread_spinlock_t spinlock_t;
typedef int gfp_t;
+static void *kmalloc(unsigned size, gfp_t gfp)
+{
+ return memalign(64, size);
+}
+
static void *kzalloc(unsigned size, gfp_t gfp)
{
void *p = memalign(64, size);
--
2.7.4
^ permalink raw reply related
* [PATCH net-next V4 4/6] skb_array: add wrappers for resizing
From: Jason Wang @ 2016-06-30 6:45 UTC (permalink / raw)
To: mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467269136-8082-1-git-send-email-jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/linux/skb_array.h | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index 2dd0d1e..f4dfade 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -161,6 +161,15 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
}
+static inline int skb_array_resize_multiple(struct skb_array **rings,
+ int nrings, int size, gfp_t gfp)
+{
+ BUILD_BUG_ON(offsetof(struct skb_array, ring));
+ return ptr_ring_resize_multiple((struct ptr_ring **)rings,
+ nrings, size, gfp,
+ __skb_array_destroy_skb);
+}
+
static inline void skb_array_cleanup(struct skb_array *a)
{
ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb);
--
2.7.4
^ permalink raw reply related
* [PATCH net-next V4 5/6] net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
From: Jason Wang @ 2016-06-30 6:45 UTC (permalink / raw)
To: mst, netdev, linux-kernel, davem
Cc: kvm, eric.dumazet, virtualization, John Fastabend, brouer
In-Reply-To: <1467269136-8082-1-git-send-email-jasowang@redhat.com>
This patch introduces a new event - NETDEV_CHANGE_TX_QUEUE_LEN, this
will be triggered when tx_queue_len is changed. It could be used by net devices
who want to do some processing at that time. An example is tun who may
want to resize tx array when tx_queue_len is changed.
Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
include/linux/netdevice.h | 1 +
net/core/net-sysfs.c | 15 ++++++++++++++-
net/core/rtnetlink.c | 16 ++++++++++++----
3 files changed, 27 insertions(+), 5 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e84d9d2..7dc2ec7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info {
#define NETDEV_PRECHANGEUPPER 0x001A
#define NETDEV_CHANGELOWERSTATE 0x001B
#define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C
+#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E
int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7a0b616..6e4f347 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -322,7 +322,20 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
- dev->tx_queue_len = new_len;
+ int res, orig_len = dev->tx_queue_len;
+
+ if (new_len != orig_len) {
+ dev->tx_queue_len = new_len;
+ res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+ res = notifier_to_errno(res);
+ if (res) {
+ netdev_err(dev,
+ "refused to change device tx_queue_len\n");
+ dev->tx_queue_len = orig_len;
+ return -EFAULT;
+ }
+ }
+
return 0;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index eb49ca2..b16b779 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1927,11 +1927,19 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_TXQLEN]) {
unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
-
- if (dev->tx_queue_len ^ value)
+ unsigned long orig_len = dev->tx_queue_len;
+
+ if (dev->tx_queue_len ^ value) {
+ dev->tx_queue_len = value;
+ err = call_netdevice_notifiers(
+ NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+ err = notifier_to_errno(err);
+ if (err) {
+ dev->tx_queue_len = orig_len;
+ goto errout;
+ }
status |= DO_SETLINK_NOTIFY;
-
- dev->tx_queue_len = value;
+ }
}
if (tb[IFLA_OPERSTATE])
--
2.7.4
^ permalink raw reply related
* [PATCH net-next V4 6/6] tun: switch to use skb array for tx
From: Jason Wang @ 2016-06-30 6:45 UTC (permalink / raw)
To: mst, netdev, linux-kernel, davem
Cc: brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467269136-8082-1-git-send-email-jasowang@redhat.com>
We used to queue tx packets in sk_receive_queue, this is less
efficient since it requires spinlocks to synchronize between producer
and consumer.
This patch tries to address this by:
- switch from sk_receive_queue to a skb_array, and resize it when
tx_queue_len was changed.
- introduce a new proto_ops peek_len which was used for peeking the
skb length.
- implement a tun version of peek_len for vhost_net to use and convert
vhost_net to use peek_len if possible.
Pktgen test shows about 15.3% improvement on guest receiving pps for small
buffers:
Before: ~1300000pps
After : ~1500000pps
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++---
drivers/vhost/net.c | 16 +++++-
include/linux/net.h | 1 +
3 files changed, 146 insertions(+), 9 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 4884802..7475215 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
#include <net/sock.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
+#include <linux/skb_array.h>
#include <asm/uaccess.h>
@@ -167,6 +168,7 @@ struct tun_file {
};
struct list_head next;
struct tun_struct *detached;
+ struct skb_array tx_array;
};
struct tun_flow_entry {
@@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
static void tun_queue_purge(struct tun_file *tfile)
{
- skb_queue_purge(&tfile->sk.sk_receive_queue);
+ struct sk_buff *skb;
+
+ while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
+ kfree_skb(skb);
+
skb_queue_purge(&tfile->sk.sk_error_queue);
}
@@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun->dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(tun->dev);
}
+ if (tun)
+ skb_array_cleanup(&tfile->tx_array);
sock_put(&tfile->sk);
}
}
@@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev)
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
{
struct tun_file *tfile = file->private_data;
+ struct net_device *dev = tun->dev;
int err;
err = security_tun_dev_attach(tfile->socket.sk, tun->security);
@@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
if (!err)
goto out;
}
+
+ if (!tfile->detached &&
+ skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
tfile->queue_index = tun->numqueues;
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
rcu_assign_pointer(tfile->tun, tun);
@@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
nf_reset(skb);
- /* Enqueue packet */
- skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
+ if (skb_array_produce(&tfile->tx_array, skb))
+ goto drop;
/* Notify and wake up reader process */
if (tfile->flags & TUN_FASYNC)
@@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
poll_wait(file, sk_sleep(sk), wait);
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_array_empty(&tfile->tx_array))
mask |= POLLIN | POLLRDNORM;
if (sock_writeable(sk) ||
@@ -1426,22 +1442,61 @@ done:
return total;
}
+static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
+ int *err)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb = NULL;
+
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ goto out;
+ if (noblock) {
+ *err = -EAGAIN;
+ goto out;
+ }
+
+ add_wait_queue(&tfile->wq.wait, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+
+ while (1) {
+ skb = skb_array_consume(&tfile->tx_array);
+ if (skb)
+ break;
+ if (signal_pending(current)) {
+ *err = -ERESTARTSYS;
+ break;
+ }
+ if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
+ *err = -EFAULT;
+ break;
+ }
+
+ schedule();
+ }
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&tfile->wq.wait, &wait);
+
+out:
+ return skb;
+}
+
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct iov_iter *to,
int noblock)
{
struct sk_buff *skb;
ssize_t ret;
- int peeked, err, off = 0;
+ int err;
tun_debug(KERN_INFO, tun, "tun_do_read\n");
if (!iov_iter_count(to))
return 0;
- /* Read frames from queue */
- skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
- &peeked, &off, &err);
+ /* Read frames from ring */
+ skb = tun_ring_recv(tfile, noblock, &err);
if (!skb)
return err;
@@ -1574,8 +1629,25 @@ out:
return ret;
}
+static int tun_peek_len(struct socket *sock)
+{
+ struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+ struct tun_struct *tun;
+ int ret = 0;
+
+ tun = __tun_get(tfile);
+ if (!tun)
+ return 0;
+
+ ret = skb_array_peek_len(&tfile->tx_array);
+ tun_put(tun);
+
+ return ret;
+}
+
/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tun_socket_ops = {
+ .peek_len = tun_peek_len,
.sendmsg = tun_sendmsg,
.recvmsg = tun_recvmsg,
};
@@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info,
};
+static int tun_queue_resize(struct tun_struct *tun)
+{
+ struct net_device *dev = tun->dev;
+ struct tun_file *tfile;
+ struct skb_array **arrays;
+ int n = tun->numqueues + tun->numdisabled;
+ int ret, i;
+
+ arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
+ if (!arrays)
+ return -ENOMEM;
+
+ for (i = 0; i < tun->numqueues; i++) {
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ arrays[i] = &tfile->tx_array;
+ }
+ list_for_each_entry(tfile, &tun->disabled, next)
+ arrays[i++] = &tfile->tx_array;
+
+ ret = skb_array_resize_multiple(arrays, n,
+ dev->tx_queue_len, GFP_KERNEL);
+
+ kfree(arrays);
+ return ret;
+}
+
+static int tun_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct tun_struct *tun = netdev_priv(dev);
+
+ switch (event) {
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ if (tun_queue_resize(tun))
+ return NOTIFY_BAD;
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tun_notifier_block __read_mostly = {
+ .notifier_call = tun_device_event,
+};
static int __init tun_init(void)
{
@@ -2416,6 +2535,8 @@ static int __init tun_init(void)
pr_err("Can't register misc device %d\n", TUN_MINOR);
goto err_misc;
}
+
+ register_netdevice_notifier(&tun_notifier_block);
return 0;
err_misc:
rtnl_link_unregister(&tun_link_ops);
@@ -2427,6 +2548,7 @@ static void tun_cleanup(void)
{
misc_deregister(&tun_miscdev);
rtnl_link_unregister(&tun_link_ops);
+ unregister_netdevice_notifier(&tun_notifier_block);
}
/* Get an underlying socket object from tun file. Returns error unless file is
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1d3e45f..e032ca3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -481,10 +481,14 @@ out:
static int peek_head_len(struct sock *sk)
{
+ struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
if (likely(head)) {
@@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk)
return len;
}
+static int sk_has_rx_data(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sk->sk_receive_queue);
+}
+
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
endtime = busy_clock() + vq->busyloop_timeout;
while (vhost_can_busy_poll(&net->dev, endtime) &&
- skb_queue_empty(&sk->sk_receive_queue) &&
+ !sk_has_rx_data(sk) &&
vhost_vq_avail_empty(&net->dev, vq))
cpu_relax_lowlatency();
diff --git a/include/linux/net.h b/include/linux/net.h
index 9aa49a0..b6b3843 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -185,6 +185,7 @@ struct proto_ops {
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
int (*set_peek_off)(struct sock *sk, int val);
+ int (*peek_len)(struct socket *sock);
};
#define DECLARE_SOCKADDR(type, dst, src) \
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net-next V4 0/6] switch to use tx skb array in tun
From: Michael S. Tsirkin @ 2016-06-30 15:45 UTC (permalink / raw)
To: Jason Wang
Cc: kvm, eric.dumazet, netdev, linux-kernel, virtualization, brouer,
davem
In-Reply-To: <1467269136-8082-1-git-send-email-jasowang@redhat.com>
On Thu, Jun 30, 2016 at 02:45:30PM +0800, Jason Wang wrote:
> Hi all:
>
> This series tries to switch to use skb array in tun. This is used to
> eliminate the spinlock contention between producer and consumer. The
> conversion was straightforward: just introduce a tx skb array and use
> it instead of sk_receive_queue.
>
> A minor issue is to keep the tx_queue_len behaviour, since tun used to
> use it for the length of sk_receive_queue. This is done through:
>
> - add the ability to resize multiple rings at once to avoid handling
> partial resize failure for multiple rings.
> - add the support for zero length ring.
> - introduce a notifier which was triggered when tx_queue_len was
> changed for a netdev.
> - resize all queues during the tx_queue_len changing.
>
> Tests show about 15% improvement on guest rx pps:
>
> Before: ~1300000pps
> After : ~1500000pps
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-from-altitude: 34697 feet.
> Changes from V3:
> - fix kbuild warnings
> - call NETDEV_CHANGE_TX_QUEUE_LEN on IFLA_TXQLEN
>
> Changes from V2:
> - add multiple rings resizing support for ptr_ring/skb_array
> - add zero length ring support
> - introduce a NETDEV_CHANGE_TX_QUEUE_LEN
> - drop new flags
>
> Changes from V1:
> - switch to use skb array instead of a customized circular buffer
> - add non-blocking support
> - rename .peek to .peek_len
> - drop lockless peeking since test show very minor improvement
>
> Jason Wang (5):
> ptr_ring: support zero length ring
> skb_array: minor tweak
> skb_array: add wrappers for resizing
> net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
> tun: switch to use skb array for tx
>
> Michael S. Tsirkin (1):
> ptr_ring: support resizing multiple queues
>
> drivers/net/tun.c | 138 ++++++++++++++++++++++++++++++++++++---
> drivers/vhost/net.c | 16 ++++-
> include/linux/net.h | 1 +
> include/linux/netdevice.h | 1 +
> include/linux/ptr_ring.h | 77 ++++++++++++++++++----
> include/linux/skb_array.h | 13 +++-
> net/core/net-sysfs.c | 15 ++++-
> net/core/rtnetlink.c | 16 +++--
> tools/virtio/ringtest/ptr_ring.c | 5 ++
> 9 files changed, 255 insertions(+), 27 deletions(-)
>
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net-next V4 5/6] net: introduce NETDEV_CHANGE_TX_QUEUE_LEN
From: John Fastabend @ 2016-06-30 16:33 UTC (permalink / raw)
To: Jason Wang, mst, netdev, linux-kernel, davem
Cc: John Fastabend, brouer, eric.dumazet, kvm, virtualization
In-Reply-To: <1467269136-8082-6-git-send-email-jasowang@redhat.com>
On 16-06-29 11:45 PM, Jason Wang wrote:
> This patch introduces a new event, NETDEV_CHANGE_TX_QUEUE_LEN, which is
> triggered when tx_queue_len is changed. It can be used by net devices
> that want to do some processing at that time. An example is tun, which may
> want to resize its tx array when tx_queue_len is changed.
>
> Cc: John Fastabend <john.r.fastabend@intel.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
Thanks for adding the setlink case.
Acked-by: John Fastabend <john.r.fastabend@intel.com>
^ permalink raw reply
* Re: [PATCH v2 04/12] axonrom: Generate uevent after attribute available
From: Dan Williams @ 2016-06-30 22:10 UTC (permalink / raw)
To: Fam Zheng
Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
linux-nvme, virtualization, Keith Busch, Paul Mackerras,
Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
Jiri Kosina, linux-block, Ed L. Cashin, Jens Axboe, linux-raid,
David Woodhouse, linux-mmc@vger.kernel.org,
Linux Kernel Mailing List, Minchan Kim, linux-mtd, Brian Norris
In-Reply-To: <20160630015953.6888-5-famz@redhat.com>
On Wed, Jun 29, 2016 at 6:59 PM, Fam Zheng <famz@redhat.com> wrote:
> It is documented that KOBJ_ADD should be generated after the object's
> attributes and children are ready. We can achieve this with the new
> disk_gen_uevents interface.
>
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
> arch/powerpc/sysdev/axonram.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
> index 4efd69b..27e7175 100644
> --- a/arch/powerpc/sysdev/axonram.c
> +++ b/arch/powerpc/sysdev/axonram.c
> @@ -238,7 +238,7 @@ static int axon_ram_probe(struct platform_device *device)
> set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
> blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
> blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
> - add_disk(bank->disk, true);
> + add_disk(bank->disk, false);
>
> bank->irq_id = irq_of_parse_and_map(device->dev.of_node, 0);
> if (bank->irq_id == NO_IRQ) {
> @@ -262,6 +262,7 @@ static int axon_ram_probe(struct platform_device *device)
> rc = -EFAULT;
> goto failed;
> }
> + disk_gen_uevents(bank->disk);
I assume you are doing this after:
rc = device_create_file(&device->dev, &dev_attr_ecc);
...so that userspace gets notified of the new attribute, but this
attribute is on the parent device, not the disk itself. Instead I
think this attribute should simply be registered before the call to
add_disk(). Then the KOBJ_ADD event for the disk comes after the
attribute is available. It's still not a clean fit, because userspace
should not be expecting a child device uevent to signal new attributes
available on the parent.
^ permalink raw reply
* Re: [PATCH v2 05/12] aoeblk: Generate uevent after attribute available
From: Ed Cashin @ 2016-07-01 0:57 UTC (permalink / raw)
To: Fam Zheng, linux-kernel
Cc: Sergey Senozhatsky, Michael S. Tsirkin, Benjamin Herrenschmidt,
linux-nvme, virtualization, Keith Busch, Paul Mackerras,
Michael Ellerman, Christoph Hellwig, Shaohua Li, Nitin Gupta,
Jiri Kosina, linux-block, Jens Axboe, linux-raid, David Woodhouse,
linux-mmc, Minchan Kim, linux-mtd, Brian Norris, linuxppc-dev
In-Reply-To: <20160630015953.6888-6-famz@redhat.com>
On 06/29/2016 09:59 PM, Fam Zheng wrote:
> It is documented that KOBJ_ADD should be generated after the object's
> attributes and children are ready. We can achieve this with the new
> disk_gen_uevents interface.
Looks like an improvement, thanks!
--
Ed
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox