Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 2/4] vhost-vsock: add pkt cancel capability
From: Peng Tao @ 2016-12-08 17:12 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: David Miller, kvm, virtualization, netdev, Jorgen Hansen
In-Reply-To: <1481217156-7160-2-git-send-email-bergwolf@gmail.com>

Add the ability to cancel all queued packets of a connection.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Peng Tao <bergwolf@gmail.com>
---
 drivers/vhost/vsock.c  | 41 +++++++++++++++++++++++++++++++++++++++++
 include/net/af_vsock.h |  3 +++
 2 files changed, 44 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index a504e2e0..db64d51 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -218,6 +218,46 @@ vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt)
 	return len;
 }
 
+static int
+vhost_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+	struct vhost_vsock *vsock;
+	struct virtio_vsock_pkt *pkt, *n;
+	int cnt = 0;
+	LIST_HEAD(freeme);
+
+	/* Find the vhost_vsock according to guest context id  */
+	vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
+	if (!vsock)
+		return -ENODEV;
+
+	spin_lock_bh(&vsock->send_pkt_list_lock);
+	list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+		if (pkt->cancel_token != (void *)vsk)
+			continue;
+		list_move(&pkt->list, &freeme);
+	}
+	spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+	list_for_each_entry_safe(pkt, n, &freeme, list) {
+		if (pkt->reply)
+			cnt++;
+		list_del(&pkt->list);
+		virtio_transport_free_pkt(pkt);
+	}
+
+	if (cnt) {
+		struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+		int new_cnt;
+
+		new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
+		if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
+			vhost_poll_queue(&tx_vq->poll);
+	}
+
+	return 0;
+}
+
 static struct virtio_vsock_pkt *
 vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
 		      unsigned int out, unsigned int in)
@@ -664,6 +704,7 @@ static struct virtio_transport vhost_transport = {
 		.release                  = virtio_transport_release,
 		.connect                  = virtio_transport_connect,
 		.shutdown                 = virtio_transport_shutdown,
+		.cancel_pkt               = vhost_transport_cancel_pkt,
 
 		.dgram_enqueue            = virtio_transport_dgram_enqueue,
 		.dgram_dequeue            = virtio_transport_dgram_dequeue,
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index f275896..ce5f100 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -100,6 +100,9 @@ struct vsock_transport {
 	void (*destruct)(struct vsock_sock *);
 	void (*release)(struct vsock_sock *);
 
+	/* Cancel packets belonging to the same vsock */
+	int (*cancel_pkt)(struct vsock_sock *vsk);
+
 	/* Connections. */
 	int (*connect)(struct vsock_sock *);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 3/4] vsock: add pkt cancel capability
From: Peng Tao @ 2016-12-08 17:12 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: David Miller, kvm, virtualization, netdev, Jorgen Hansen
In-Reply-To: <1481217156-7160-3-git-send-email-bergwolf@gmail.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Peng Tao <bergwolf@gmail.com>
---
 net/vmw_vsock/virtio_transport.c | 42 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 936d7ee..95c1162 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -170,6 +170,47 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
 	return len;
 }
 
+static int
+virtio_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+	struct virtio_vsock *vsock;
+	struct virtio_vsock_pkt *pkt, *n;
+	int cnt = 0;
+	LIST_HEAD(freeme);
+
+	vsock = virtio_vsock_get();
+	if (!vsock) {
+		return -ENODEV;
+	}
+
+	spin_lock_bh(&vsock->send_pkt_list_lock);
+	list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+		if (pkt->cancel_token != (void *)vsk)
+			continue;
+		list_move(&pkt->list, &freeme);
+	}
+	spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+	list_for_each_entry_safe(pkt, n, &freeme, list) {
+		if (pkt->reply)
+			cnt++;
+		list_del(&pkt->list);
+		virtio_transport_free_pkt(pkt);
+	}
+
+	if (cnt) {
+		struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+		int new_cnt;
+
+		new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
+		if (new_cnt + cnt >= virtqueue_get_vring_size(rx_vq) &&
+		    new_cnt < virtqueue_get_vring_size(rx_vq))
+			queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+	}
+
+	return 0;
+}
+
 static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
 {
 	int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
@@ -419,6 +460,7 @@ static struct virtio_transport virtio_transport = {
 		.release                  = virtio_transport_release,
 		.connect                  = virtio_transport_connect,
 		.shutdown                 = virtio_transport_shutdown,
+		.cancel_pkt               = virtio_transport_cancel_pkt,
 
 		.dgram_bind               = virtio_transport_dgram_bind,
 		.dgram_dequeue            = virtio_transport_dgram_dequeue,
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 4/4] vsock: cancel packets when failing to connect
From: Peng Tao @ 2016-12-08 17:12 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: David Miller, kvm, virtualization, netdev, Jorgen Hansen
In-Reply-To: <1481217156-7160-4-git-send-email-bergwolf@gmail.com>

Otherwise we'll leave the packets queued until the vsock device is released.
E.g., if the guest is slow to start up, resulting in ETIMEDOUT on connect, the
guest will still get the connect requests from the failed host sockets.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Peng Tao <bergwolf@gmail.com>
---
 net/vmw_vsock/af_vsock.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 8a398b3..c73b03a 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1101,10 +1101,19 @@ static const struct proto_ops vsock_dgram_ops = {
 	.sendpage = sock_no_sendpage,
 };
 
+static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+	if (!transport->cancel_pkt)
+		return -EOPNOTSUPP;
+
+	return transport->cancel_pkt(vsk);
+}
+
 static void vsock_connect_timeout(struct work_struct *work)
 {
 	struct sock *sk;
 	struct vsock_sock *vsk;
+	int cancel = 0;
 
 	vsk = container_of(work, struct vsock_sock, dwork.work);
 	sk = sk_vsock(vsk);
@@ -1115,8 +1124,11 @@ static void vsock_connect_timeout(struct work_struct *work)
 		sk->sk_state = SS_UNCONNECTED;
 		sk->sk_err = ETIMEDOUT;
 		sk->sk_error_report(sk);
+		cancel = 1;
 	}
 	release_sock(sk);
+	if (cancel)
+		vsock_transport_cancel_pkt(vsk);
 
 	sock_put(sk);
 }
@@ -1223,11 +1235,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
 			err = sock_intr_errno(timeout);
 			sk->sk_state = SS_UNCONNECTED;
 			sock->state = SS_UNCONNECTED;
+			vsock_transport_cancel_pkt(vsk);
 			goto out_wait;
 		} else if (timeout == 0) {
 			err = -ETIMEDOUT;
 			sk->sk_state = SS_UNCONNECTED;
 			sock->state = SS_UNCONNECTED;
+			vsock_transport_cancel_pkt(vsk);
 			goto out_wait;
 		}
 
-- 
2.7.4

^ permalink raw reply related

* Re: net: deadlock on genl_mutex
From: Dmitry Vyukov @ 2016-12-08 17:16 UTC (permalink / raw)
  To: syzkaller
  Cc: Eric Dumazet, David Miller, Matti Vaittinen, Tycho Andersen,
	Cong Wang, Florian Westphal, stephen hemminger, Tom Herbert,
	netdev, LKML, Richard Guy Briggs, netdev-owner
In-Reply-To: <CACT4Y+auHN0Rdu2Hepk6DNFmz1K2XSA6s18PC=KZiHdoMG845Q@mail.gmail.com>

On Thu, Dec 8, 2016 at 5:16 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
> On Tue, Nov 29, 2016 at 6:59 AM,  <subashab@codeaurora.org> wrote:
>>>
>>> Issue was reported yesterday and is under investigation.
>>>
>>>
>>> http://marc.info/?l=linux-netdev&m=148014004331663&w=2
>>>
>>>
>>> Thanks !
>>
>>
>> Hi Dmitry
>>
>> Can you try the patch below with your reproducer? I haven't seen similar
>> crashes reported after this (or even with Eric's patch).
>
> I've synced to 318c8932ddec5c1c26a4af0f3c053784841c598e (Dec 7) and do
> _not_ see this report happening anymore.
> Thanks.


But now I am seeing "possible deadlock" warnings involving genl_lock:

[ INFO: possible circular locking dependency detected ]
4.9.0-rc8+ #77 Not tainted
-------------------------------------------------------
syz-executor7/18794 is trying to acquire lock:
 (rtnl_mutex){+.+.+.}, at: [<ffffffff86b4682c>] rtnl_lock+0x1c/0x20
net/core/rtnetlink.c:70
but task is already holding lock:
 (genl_mutex){+.+.+.}, at: [<     inline     >] genl_lock
net/netlink/genetlink.c:31
 (genl_mutex){+.+.+.}, at: [<ffffffff86cc27c9>]
genl_rcv_msg+0x209/0x260 net/netlink/genetlink.c:658
which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

       [  315.403815] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  315.403815] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  315.403815] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  315.403815] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  315.403815] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
       [  315.403815] [<     inline     >] genl_lock net/netlink/genetlink.c:31
       [  315.403815] [<ffffffff86cc0c26>] genl_lock_dumpit+0x46/0xa0
net/netlink/genetlink.c:518
       [  315.403815] [<ffffffff86cb33ac>] netlink_dump+0x57c/0xd70
net/netlink/af_netlink.c:2127
       [  315.403815] [<ffffffff86cb7b6a>]
__netlink_dump_start+0x4ea/0x760 net/netlink/af_netlink.c:2217
       [  315.403815] [<ffffffff86cc2319>]
genl_family_rcv_msg+0xdc9/0x1070 net/netlink/genetlink.c:586
       [  315.403815] [<ffffffff86cc2770>] genl_rcv_msg+0x1b0/0x260
net/netlink/genetlink.c:660
       [  315.403815] [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0
net/netlink/af_netlink.c:2298
       [  315.403815] [<ffffffff86cc153d>] genl_rcv+0x2d/0x40
net/netlink/genetlink.c:671
       [  315.403815] [<     inline     >] netlink_unicast_kernel
net/netlink/af_netlink.c:1231
       [  315.403815] [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740
net/netlink/af_netlink.c:1257
       [  315.403815] [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50
net/netlink/af_netlink.c:1803
       [  315.403815] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
       [  315.403815] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
net/socket.c:631
       [  315.403815] [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620
net/socket.c:829
       [  315.403815] [<     inline     >] new_sync_write fs/read_write.c:499
       [  315.403815] [<ffffffff81a701ae>] __vfs_write+0x4fe/0x830
fs/read_write.c:512
       [  315.403815] [<ffffffff81a71c55>] vfs_write+0x175/0x4e0
fs/read_write.c:560
       [  315.403815] [<     inline     >] SYSC_write fs/read_write.c:607
       [  315.403815] [<ffffffff81a760e0>] SyS_write+0x100/0x240
fs/read_write.c:599
       [  315.403815] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

       [  315.403815] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  315.403815] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  315.403815] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  315.403815] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  315.403815] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
       [  315.403815] [<ffffffff86cb7779>]
__netlink_dump_start+0xf9/0x760 net/netlink/af_netlink.c:2187
       [  315.403815] [<     inline     >] netlink_dump_start
include/linux/netlink.h:165
       [  315.403815] [<ffffffff86d14d48>]
ctnetlink_stat_ct_cpu+0x198/0x1e0
net/netfilter/nf_conntrack_netlink.c:2045
       [  315.403815] [<ffffffff86cd313e>]
nfnetlink_rcv_msg+0x9be/0xd60 net/netfilter/nfnetlink.c:212
       [  315.403815] [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0
net/netlink/af_netlink.c:2298
       [  315.403815] [<ffffffff86cd1b71>] nfnetlink_rcv+0x7e1/0x10d0
net/netfilter/nfnetlink.c:474
       [  315.403815] [<     inline     >] netlink_unicast_kernel
net/netlink/af_netlink.c:1231
       [  315.403815] [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740
net/netlink/af_netlink.c:1257
       [  315.403815] [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50
net/netlink/af_netlink.c:1803
       [  315.403815] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
       [  315.403815] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
net/socket.c:631
       [  315.403815] [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620
net/socket.c:829
       [  315.403815] [<     inline     >] new_sync_write fs/read_write.c:499
       [  315.403815] [<ffffffff81a701ae>] __vfs_write+0x4fe/0x830
fs/read_write.c:512
       [  315.403815] [<ffffffff81a71c55>] vfs_write+0x175/0x4e0
fs/read_write.c:560
       [  315.403815] [<     inline     >] SYSC_write fs/read_write.c:607
       [  315.403815] [<ffffffff81a760e0>] SyS_write+0x100/0x240
fs/read_write.c:599
       [  315.403815] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

       [  315.403815] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  315.403815] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  315.403815] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  315.403815] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  315.403815] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
       [  315.403815] [<ffffffff86cd083d>] nfnl_lock+0x2d/0x30
net/netfilter/nfnetlink.c:61
       [  315.403815] [<ffffffff86d7c5b1>]
nf_tables_netdev_event+0x1f1/0x720
net/netfilter/nf_tables_netdev.c:122
       [  315.403815] [<ffffffff8149095a>]
notifier_call_chain+0x14a/0x2f0 kernel/notifier.c:93
       [  315.403815] [<     inline     >] __raw_notifier_call_chain
kernel/notifier.c:394
       [  315.403815] [<ffffffff81490b82>]
raw_notifier_call_chain+0x32/0x40 kernel/notifier.c:401
       [  315.403815] [<ffffffff86ae4af6>]
call_netdevice_notifiers_info+0x56/0x90 net/core/dev.c:1645
       [  315.403815] [<     inline     >] call_netdevice_notifiers
net/core/dev.c:1661
       [  315.403815] [<ffffffff86af898d>]
rollback_registered_many+0x73d/0xba0 net/core/dev.c:6759
       [  315.403815] [<ffffffff86af8e9e>]
rollback_registered+0xae/0x100 net/core/dev.c:6800
       [  315.403815] [<ffffffff86af8f76>]
unregister_netdevice_queue+0x86/0x140 net/core/dev.c:7787
       [  315.403815] [<     inline     >] unregister_netdevice
include/linux/netdevice.h:2455
       [  315.403815] [<ffffffff84912be6>] __tun_detach+0xc66/0xea0
drivers/net/tun.c:567
       [  315.808015] [<     inline     >] tun_detach drivers/net/tun.c:578
       [  315.808015] [<ffffffff84912e69>] tun_chr_close+0x49/0x60
drivers/net/tun.c:2350
       [  315.808015] [<ffffffff81a77f7e>] __fput+0x34e/0x910
fs/file_table.c:208
       [  315.808015] [<ffffffff81a785ca>] ____fput+0x1a/0x20
fs/file_table.c:244
       [  315.808015] [<ffffffff81483c20>] task_work_run+0x1a0/0x280
kernel/task_work.c:116
       [  315.808015] [<     inline     >] exit_task_work
include/linux/task_work.h:21
       [  315.808015] [<ffffffff814129e2>] do_exit+0x1842/0x2650
kernel/exit.c:828
       [  315.808015] [<ffffffff814139ae>] do_group_exit+0x14e/0x420
kernel/exit.c:932
       [  315.808015] [<ffffffff81442b43>] get_signal+0x663/0x1880
kernel/signal.c:2307
       [  315.808015] [<ffffffff81239b45>] do_signal+0xc5/0x2190
arch/x86/kernel/signal.c:807
       [  315.808015] [<ffffffff8100666a>]
exit_to_usermode_loop+0x1ea/0x2d0 arch/x86/entry/common.c:156
       [  315.808015] [<     inline     >] prepare_exit_to_usermode
arch/x86/entry/common.c:190
       [  315.808015] [<ffffffff81009693>]
syscall_return_slowpath+0x4d3/0x570 arch/x86/entry/common.c:259
       [  315.808015] [<ffffffff881a6026>] entry_SYSCALL_64_fastpath+0xc4/0xc6

       [  315.808015] [<     inline     >] check_prev_add
kernel/locking/lockdep.c:1828
       [  315.808015] [<ffffffff8156309b>]
check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
       [  315.808015] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  315.808015] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  315.808015] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  315.808015] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  315.808015] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
       [  315.808015] [<ffffffff86b4682c>] rtnl_lock+0x1c/0x20
net/core/rtnetlink.c:70
       [  315.808015] [<ffffffff87b5cdf9>]
nl80211_pre_doit+0x309/0x5b0 net/wireless/nl80211.c:11750
       [  315.808015] [<ffffffff86cc1cd0>]
genl_family_rcv_msg+0x780/0x1070 net/netlink/genetlink.c:631
       [  315.808015] [<ffffffff86cc2770>] genl_rcv_msg+0x1b0/0x260
net/netlink/genetlink.c:660
       [  315.808015] [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0
net/netlink/af_netlink.c:2298
       [  315.808015] [<ffffffff86cc153d>] genl_rcv+0x2d/0x40
net/netlink/genetlink.c:671
       [  315.808015] [<     inline     >] netlink_unicast_kernel
net/netlink/af_netlink.c:1231
       [  315.808015] [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740
net/netlink/af_netlink.c:1257
       [  315.808015] [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50
net/netlink/af_netlink.c:1803
       [  315.808015] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
       [  315.808015] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
net/socket.c:631
       [  315.808015] [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620
net/socket.c:829
       [  315.808015] [<ffffffff81a6f9a3>]
do_iter_readv_writev+0x363/0x670 fs/read_write.c:695
       [  315.808015] [<ffffffff81a723f1>] do_readv_writev+0x431/0x9b0
fs/read_write.c:872
       [  315.808015] [<ffffffff81a72f2c>] vfs_writev+0x8c/0xc0
fs/read_write.c:911
       [  315.808015] [<ffffffff81a73075>] do_writev+0x115/0x2d0
fs/read_write.c:944
       [  315.808015] [<     inline     >] SYSC_writev fs/read_write.c:1017
       [  315.808015] [<ffffffff81a7682c>] SyS_writev+0x2c/0x40
fs/read_write.c:1014
       [  315.808015] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

other info that might help us debug this:

Chain exists of:
 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(genl_mutex);
                               lock(nlk->cb_mutex);
                               lock(genl_mutex);
  lock(rtnl_mutex);

 *** DEADLOCK ***

2 locks held by syz-executor7/18794:
 #0:  (cb_lock){++++++}, at: [<ffffffff86cc152e>] genl_rcv+0x1e/0x40
net/netlink/genetlink.c:670
 #1:  (genl_mutex){+.+.+.}, at: [<     inline     >] genl_lock
net/netlink/genetlink.c:31
 #1:  (genl_mutex){+.+.+.}, at: [<ffffffff86cc27c9>]
genl_rcv_msg+0x209/0x260 net/netlink/genetlink.c:658

stack backtrace:
CPU: 0 PID: 18794 Comm: syz-executor7 Not tainted 4.9.0-rc8+ #77
Hardware name: Google Google/Google, BIOS Google 01/01/2011
 ffff88004add6468 ffffffff834c44f9 ffffffff00000000 1ffff100095bac20
 ffffed00095bac18 0000000041b58ab3 ffffffff895816f0 ffffffff834c420b
 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
 [<     inline     >] __dump_stack lib/dump_stack.c:15
 [<ffffffff834c44f9>] dump_stack+0x2ee/0x3f5 lib/dump_stack.c:51
 [<ffffffff81560cb0>] print_circular_bug+0x310/0x3c0
kernel/locking/lockdep.c:1202
 [<     inline     >] check_prev_add kernel/locking/lockdep.c:1828
 [<ffffffff8156309b>] check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
 [<     inline     >] validate_chain kernel/locking/lockdep.c:2265
 [<ffffffff81569576>] __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
 [<ffffffff8156b672>] lock_acquire+0x2a2/0x790 kernel/locking/lockdep.c:3749
 [<     inline     >] __mutex_lock_common kernel/locking/mutex.c:521
 [<ffffffff88195bcf>] mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
 [<ffffffff86b4682c>] rtnl_lock+0x1c/0x20 net/core/rtnetlink.c:70
 [<ffffffff87b5cdf9>] nl80211_pre_doit+0x309/0x5b0 net/wireless/nl80211.c:11750
 [<ffffffff86cc1cd0>] genl_family_rcv_msg+0x780/0x1070
net/netlink/genetlink.c:631
 [<ffffffff86cc2770>] genl_rcv_msg+0x1b0/0x260 net/netlink/genetlink.c:660
 [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0 net/netlink/af_netlink.c:2298
 [<ffffffff86cc153d>] genl_rcv+0x2d/0x40 net/netlink/genetlink.c:671
 [<     inline     >] netlink_unicast_kernel net/netlink/af_netlink.c:1231
 [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740 net/netlink/af_netlink.c:1257
 [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50 net/netlink/af_netlink.c:1803
 [<     inline     >] sock_sendmsg_nosec net/socket.c:621
 [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110 net/socket.c:631
 [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620 net/socket.c:829
 [<ffffffff81a6f9a3>] do_iter_readv_writev+0x363/0x670 fs/read_write.c:695
 [<ffffffff81a723f1>] do_readv_writev+0x431/0x9b0 fs/read_write.c:872
 [<ffffffff81a72f2c>] vfs_writev+0x8c/0xc0 fs/read_write.c:911
 [<ffffffff81a73075>] do_writev+0x115/0x2d0 fs/read_write.c:944
 [<     inline     >] SYSC_writev fs/read_write.c:1017
 [<ffffffff81a7682c>] SyS_writev+0x2c/0x40 fs/read_write.c:1014
 [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

^ permalink raw reply

* (unknown), 
From: marketing @ 2016-12-08 17:22 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: MESAGE_3403929235556_netdev.zip --]
[-- Type: application/zip, Size: 7120 bytes --]

^ permalink raw reply

* netconsole: sleeping function called from invalid context
From: Dave Jones @ 2016-12-08 17:36 UTC (permalink / raw)
  To: netdev

I think this has been around for a while, but for some reason I'm running into
it a lot today.


BUG: sleeping function called from invalid context at kernel/irq/manage.c:110
in_atomic(): 1, irqs_disabled(): 1, pid: 1839, name: modprobe
no locks held by modprobe/1839.
Preemption disabled at:
[<ffffffff81b17163>] write_ext_msg+0x73/0x2d0
CPU: 0 PID: 1839 Comm: modprobe Not tainted 4.9.0-rc8-think+ #5 
 ffff880442287300
 ffffffff81651e19
 0000000080000001
 0000000000000000
 ffff88044221d380
 000000000000006e
 ffff880442287338
 ffffffff811117c3
 ffff88044221d388
 ffffffff8207b940
 000000000000006e
 0000000000000000
Call Trace:
 [<ffffffff81651e19>] dump_stack+0x6c/0x93
 [<ffffffff811117c3>] ___might_sleep+0x193/0x210
 [<ffffffff811118b1>] __might_sleep+0x71/0xe0
 [<ffffffff811673d4>] ? __synchronize_hardirq+0x94/0xa0
 [<ffffffff81167598>] synchronize_irq+0xa8/0x170
 [<ffffffff811674f0>] ? set_irq_wake_real+0x90/0x90
 [<ffffffff811674f5>] ? synchronize_irq+0x5/0x170
 [<ffffffff81167b95>] ? disable_irq+0x5/0x30
 [<ffffffff81167bb8>] disable_irq+0x28/0x30
 [<ffffffff81b78654>] e1000_netpoll+0x1c4/0x200
 [<ffffffff81b78490>] ? e1000_intr_msix_tx+0x190/0x190
 [<ffffffff81d4fd40>] netpoll_poll_dev+0xa0/0x3b0
 [<ffffffff811113c8>] ? preempt_count_sub+0x18/0xd0
 [<ffffffff81d5025d>] netpoll_send_skb_on_dev+0x20d/0x3d0
 [<ffffffff81d50955>] netpoll_send_udp+0x535/0x8c0
 [<ffffffff81b17376>] write_ext_msg+0x286/0x2d0
 [<ffffffff8168c71b>] ? check_preemption_disabled+0x3b/0x160
 [<ffffffff81161d85>] call_console_drivers.isra.20.constprop.26+0x165/0x310
 [<ffffffff811631b6>] console_unlock+0x3b6/0x840
 [<ffffffff81163af5>] vprintk_emit+0x4b5/0x6e0
 [<ffffffff81164058>] vprintk_default+0x48/0x80
 [<ffffffff812b6e11>] printk+0xbc/0xe7
 [<ffffffff812b6d55>] ? printk_lock.constprop.1+0x102/0x102
 [<ffffffff812b6d5a>] ? printk+0x5/0xe7
 [<ffffffffa0990001>] ? bt_init+0x1/0xfa [bluetooth]
 [<ffffffffa090fddd>] bt_info+0xdd/0x110 [bluetooth]
 [<ffffffffa090fd00>] ? bt_to_errno+0x50/0x50 [bluetooth]
 [<ffffffffa090fd05>] ? bt_info+0x5/0x110 [bluetooth]
 [<ffffffffa0990470>] sco_init+0xb0/0xc40 [bluetooth]
 [<ffffffffa0990000>] ? 0xffffffffa0990000
 [<ffffffffa099009d>] bt_init+0x9d/0xfa [bluetooth]
 [<ffffffff81000639>] do_one_initcall+0x199/0x220
 [<ffffffff810004a0>] ? initcall_blacklisted+0x170/0x170
 [<ffffffff812b759f>] ? do_init_module+0xe3/0x2fd
 [<ffffffffa0990000>] ? 0xffffffffa0990000
 [<ffffffff810004a5>] ? do_one_initcall+0x5/0x220
 [<ffffffff8137063c>] ? __asan_register_globals+0x7c/0xa0
 [<ffffffff812b75b0>] do_init_module+0xf4/0x2fd
 [<ffffffff811cae09>] load_module+0x3a79/0x4670
 [<ffffffff811c4f00>] ? disable_ro_nx+0x80/0x80
 [<ffffffff811c7390>] ? module_frob_arch_sections+0x20/0x20
 [<ffffffff8123874a>] ? __buffer_unlock_commit+0x4a/0x90
 [<ffffffff81239a9c>] ? trace_function+0x9c/0xc0
 [<ffffffff81246dda>] ? function_trace_call+0xea/0x290
 [<ffffffff811cbda1>] ? SYSC_finit_module+0x181/0x1c0
 [<ffffffff811c7390>] ? module_frob_arch_sections+0x20/0x20
 [<ffffffff813b4400>] ? get_user_arg_ptr.isra.26+0xa0/0xa0
 [<ffffffff811c7395>] ? load_module+0x5/0x4670
 [<ffffffff811cbda1>] SYSC_finit_module+0x181/0x1c0
 [<ffffffff811cbc20>] ? SYSC_init_module+0x220/0x220
 [<ffffffff81246dda>] ? function_trace_call+0xea/0x290
 [<ffffffff811cbdf0>] ? SyS_init_module+0x10/0x10
 [<ffffffff811cbdf0>] ? SyS_init_module+0x10/0x10
 [<ffffffff811cbdf5>] ? SyS_finit_module+0x5/0x10
 [<ffffffff8168c87c>] ? __this_cpu_preempt_check+0x1c/0x20
 [<ffffffff811cbdf0>] ? SyS_init_module+0x10/0x10
 [<ffffffff811cbdfe>] SyS_finit_module+0xe/0x10
 [<ffffffff81003bc0>] do_syscall_64+0x100/0x2b0
 [<ffffffff81f317cb>] entry_SYSCALL64_slow_path+0x25/0x25

^ permalink raw reply

* [PATCH v2 net-next 0/4] udp: receive path optimizations
From: Eric Dumazet @ 2016-12-08 17:38 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Paolo Abeni, Eric Dumazet

This patch series provides about 100 % performance increase under flood. 

David, please scratch it if you prefer to wait for linux-4.11,
thanks !

Eric Dumazet (4):
  udp: under rx pressure, try to condense skbs
  udp: add busylocks in RX path
  udp: copy skb->truesize in the first cache line
  udp: add batching to udp_rmem_release()

 include/linux/skbuff.h | 11 +++++++-
 include/linux/udp.h    |  3 ++
 net/core/skbuff.c      | 28 ++++++++++++++++++
 net/ipv4/udp.c         | 77 +++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 114 insertions(+), 5 deletions(-)

-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply

* [PATCH v2 net-next 1/4] udp: under rx pressure, try to condense skbs
From: Eric Dumazet @ 2016-12-08 17:38 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Paolo Abeni, Eric Dumazet
In-Reply-To: <1481218739-27089-1-git-send-email-edumazet@google.com>

Under UDP flood, many softirq producers try to add packets to
UDP receive queue, and one user thread is burning one cpu trying
to dequeue packets as fast as possible.

Two parts of the per packet cost are :
- copying payload from kernel space to user space,
- freeing memory pieces associated with skb.

If the socket is under pressure, softirq handler(s) can try to pull the
payload of the packet into skb->head if it fits.

Meaning the softirq handler(s) can free/reuse the page fragment
immediately, instead of letting udp_recvmsg() do this hundreds of usec
later, possibly from another node.

Additional gains :
- We reduce skb->truesize and thus can store more packets per SO_RCVBUF
- We avoid cache line misses at copyout() time and consume_skb() time,
and avoid one put_page() with potential alien freeing on NUMA hosts.

This comes at the cost of a copy, bounded to available tail room, which
is usually small. (We might have to fix GRO_MAX_HEAD which looks bigger
than necessary)

This patch gave me about 5 % increase in throughput in my tests.

skb_condense() helper could probably be used in other contexts.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h |  2 ++
 net/core/skbuff.c      | 28 ++++++++++++++++++++++++++++
 net/ipv4/udp.c         | 12 +++++++++++-
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9c535fbccf2c..0cd92b0f2af5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1966,6 +1966,8 @@ static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
 	return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
 }
 
+void skb_condense(struct sk_buff *skb);
+
 /**
  *	skb_headroom - bytes at buffer head
  *	@skb: buffer to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b45cd1494243..d27e0352ae2a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4931,3 +4931,31 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
 	return clone;
 }
 EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If the packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ *	We do not reallocate skb->head thus can not fail.
+ *	Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+	if (!skb->data_len ||
+	    skb->data_len > skb->end - skb->tail ||
+	    skb_cloned(skb))
+		return;
+
+	/* Nice, we can free page frag(s) right now */
+	__pskb_pull_tail(skb, skb->data_len);
+
+	/* Now adjust skb->truesize, since __pskb_pull_tail() does
+	 * not do this.
+	 */
+	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 16d88ba9ff1c..110414903f9e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1199,7 +1199,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 	int rmem, delta, amt, err = -ENOMEM;
-	int size = skb->truesize;
+	int size;
 
 	/* try to avoid the costly atomic add/sub pair when the receive
 	 * queue is full; always allow at least a packet
@@ -1208,6 +1208,16 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	if (rmem > sk->sk_rcvbuf)
 		goto drop;
 
+	/* Under mem pressure, it might be helpful to give udp_recvmsg()
+	 * linear skbs :
+	 * - Reduce memory overhead and thus increase receive queue capacity
+	 * - Less cache line misses at copyout() time
+	 * - Less work at consume_skb() (less alien page frag freeing)
+	 */
+	if (rmem > (sk->sk_rcvbuf >> 1))
+		skb_condense(skb);
+	size = skb->truesize;
+
 	/* we drop only if the receive buf is full and the receive
 	 * queue contains some other skb
 	 */
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 2/4] udp: add busylocks in RX path
From: Eric Dumazet @ 2016-12-08 17:38 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Paolo Abeni, Eric Dumazet
In-Reply-To: <1481218739-27089-1-git-send-email-edumazet@google.com>

Idea of busylocks is to let producers grab an extra spinlock
to relieve pressure on the receive_queue spinlock shared by consumer.

This behavior is requested only once the socket receive queue is above
half occupancy.

Under flood, this means that only one producer can be in line
trying to acquire the receive_queue spinlock.

These busylocks can be allocated on a per-cpu basis, instead of a
per-socket one (which would consume a cache line per socket).

This patch considerably improves UDP behavior under stress,
depending on number of NIC RX queues and/or RPS spread.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/udp.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 110414903f9e..77875712405f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1195,10 +1195,36 @@ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(udp_skb_destructor);
 
+/* Idea of busylocks is to let producers grab an extra spinlock
+ * to relieve pressure on the receive_queue spinlock shared by consumer.
+ * Under flood, this means that only one producer can be in line
+ * trying to acquire the receive_queue spinlock.
+ * These busylock can be allocated on a per cpu manner, instead of a
+ * per socket one (that would consume a cache line per socket)
+ */
+static int udp_busylocks_log __read_mostly;
+static spinlock_t *udp_busylocks __read_mostly;
+
+static spinlock_t *busylock_acquire(void *ptr)
+{
+	spinlock_t *busy;
+
+	busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
+	spin_lock(busy);
+	return busy;
+}
+
+static void busylock_release(spinlock_t *busy)
+{
+	if (busy)
+		spin_unlock(busy);
+}
+
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 	int rmem, delta, amt, err = -ENOMEM;
+	spinlock_t *busy = NULL;
 	int size;
 
 	/* try to avoid the costly atomic add/sub pair when the receive
@@ -1214,8 +1240,11 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	 * - Less cache line misses at copyout() time
 	 * - Less work at consume_skb() (less alien page frag freeing)
 	 */
-	if (rmem > (sk->sk_rcvbuf >> 1))
+	if (rmem > (sk->sk_rcvbuf >> 1)) {
 		skb_condense(skb);
+
+		busy = busylock_acquire(sk);
+	}
 	size = skb->truesize;
 
 	/* we drop only if the receive buf is full and the receive
@@ -1252,6 +1281,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk);
 
+	busylock_release(busy);
 	return 0;
 
 uncharge_drop:
@@ -1259,6 +1289,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 
 drop:
 	atomic_inc(&sk->sk_drops);
+	busylock_release(busy);
 	return err;
 }
 EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
@@ -2613,6 +2644,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd);
 void __init udp_init(void)
 {
 	unsigned long limit;
+	unsigned int i;
 
 	udp_table_init(&udp_table, "UDP");
 	limit = nr_free_buffer_pages() / 8;
@@ -2623,4 +2655,13 @@ void __init udp_init(void)
 
 	sysctl_udp_rmem_min = SK_MEM_QUANTUM;
 	sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+
+	/* 16 spinlocks per cpu */
+	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
+	udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
+				GFP_KERNEL);
+	if (!udp_busylocks)
+		panic("UDP: failed to alloc udp_busylocks\n");
+	for (i = 0; i < (1U << udp_busylocks_log); i++)
+		spin_lock_init(udp_busylocks + i);
 }
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 3/4] udp: copy skb->truesize in the first cache line
From: Eric Dumazet @ 2016-12-08 17:38 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Paolo Abeni, Eric Dumazet
In-Reply-To: <1481218739-27089-1-git-send-email-edumazet@google.com>

In UDP RX handler, we currently clear skb->dev before skb
is added to receive queue, because device pointer is no longer
available once we exit from RCU section.

Since this first cache line is always hot, let's reuse this space
to store skb->truesize and thus avoid a cache line miss at
udp_recvmsg()/udp_skb_destructor time while receive queue
spinlock is held.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/skbuff.h |  9 ++++++++-
 net/ipv4/udp.c         | 13 ++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0cd92b0f2af5..332e76756f54 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -645,8 +645,15 @@ struct sk_buff {
 		struct rb_node	rbnode; /* used in netem & tcp stack */
 	};
 	struct sock		*sk;
-	struct net_device	*dev;
 
+	union {
+		struct net_device	*dev;
+		/* Some protocols might use this space to store information,
+		 * while device pointer would be NULL.
+		 * UDP receive path is one user.
+		 */
+		unsigned long		dev_scratch;
+	};
 	/*
 	 * This is the control buffer. It is free to use for every
 	 * layer. Please put your private variables there. If you
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 77875712405f..880cd3d84abf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1188,10 +1188,14 @@ static void udp_rmem_release(struct sock *sk, int size, int partial)
 		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
 }
 
-/* Note: called with sk_receive_queue.lock held */
+/* Note: called with sk_receive_queue.lock held.
+ * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
+ * This avoids a cache line miss while receive_queue lock is held.
+ * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
+ */
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
 {
-	udp_rmem_release(sk, skb->truesize, 1);
+	udp_rmem_release(sk, skb->dev_scratch, 1);
 }
 EXPORT_SYMBOL(udp_skb_destructor);
 
@@ -1246,6 +1250,10 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 		busy = busylock_acquire(sk);
 	}
 	size = skb->truesize;
+	/* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
+	 * in udp_skb_destructor()
+	 */
+	skb->dev_scratch = size;
 
 	/* we drop only if the receive buf is full and the receive
 	 * queue contains some other skb
@@ -1272,7 +1280,6 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	/* no need to setup a destructor, we will explicitly release the
 	 * forward allocated memory on dequeue
 	 */
-	skb->dev = NULL;
 	sock_skb_set_dropcount(sk, skb);
 
 	__skb_queue_tail(list, skb);
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 4/4] udp: add batching to udp_rmem_release()
From: Eric Dumazet @ 2016-12-08 17:38 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Paolo Abeni, Eric Dumazet
In-Reply-To: <1481218739-27089-1-git-send-email-edumazet@google.com>

If udp_recvmsg() constantly releases sk_rmem_alloc
for every read packet, it gives opportunity for
producers to immediately grab spinlocks and desperately
try adding another packet, causing false sharing.

We can add a simple heuristic to give the signal
by batches of ~25 % of the queue capacity.

This patch considerably increases performance under
flood by about 50 %, since the thread draining the queue
is no longer slowed by false sharing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/udp.h |  3 +++
 net/ipv4/udp.c      | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index d1fd8cd39478..c0f530809d1f 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -79,6 +79,9 @@ struct udp_sock {
 	int			(*gro_complete)(struct sock *sk,
 						struct sk_buff *skb,
 						int nhoff);
+
+	/* This field is dirtied by udp_recvmsg() */
+	int		forward_deficit;
 };
 
 static inline struct udp_sock *udp_sk(const struct sock *sk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 880cd3d84abf..f0096d088104 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1177,8 +1177,19 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
 /* fully reclaim rmem/fwd memory allocated for skb */
 static void udp_rmem_release(struct sock *sk, int size, int partial)
 {
+	struct udp_sock *up = udp_sk(sk);
 	int amt;
 
+	if (likely(partial)) {
+		up->forward_deficit += size;
+		size = up->forward_deficit;
+		if (size < (sk->sk_rcvbuf >> 2))
+			return;
+	} else {
+		size += up->forward_deficit;
+	}
+	up->forward_deficit = 0;
+
 	atomic_sub(size, &sk->sk_rmem_alloc);
 	sk->sk_forward_alloc += size;
 	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: [PATCH net-next] net: sock_rps_record_flow() is for connected sockets
From: Tom Herbert @ 2016-12-08 17:49 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Paolo Abeni, David Miller, netdev, Willem de Bruijn
In-Reply-To: <1481093247.18162.637.camel@edumazet-glaptop3.roam.corp.google.com>

On Tue, Dec 6, 2016 at 10:47 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2016-12-06 at 19:32 -0800, Eric Dumazet wrote:
>> A follow up patch will provide a static_key (Jump Label) since most
>> hosts do not even use RFS.
>
> Speaking of static_key, it appears we now have GRO on UDP, and this
> consumes a considerable amount of cpu cycles.
>
> Turning off GRO allows me to get +20 % more packets on my single UDP
> socket. (1.2 Mpps instead of 1.0 Mpps)
>
> Surely udp_gro_receive() should be bypassed if no UDP socket has
> registered a udp_sk(sk)->gro_receive handler
>
> And/or delay the inet_add_offload(&udpv{4|6}_offload, IPPROTO_UDP); to
> the first UDP sockets setting udp_sk(sk)->gro_receive handler,
> ie udp_encap_enable() and udpv6_encap_enable()
>
Of course that would only help on systems where no one enables encap,
i.e. it looks good in the simple benchmarks, but in real life if just
one socket enables encap everyone else takes the hit. Alternatively,
maybe we could do early demux when we do the lookup in GRO to
eliminate the extra lookup?

Tom

>
> :(
>
>
>

^ permalink raw reply

* Re: [PATCH net-next] net: sock_rps_record_flow() is for connected sockets
From: Paolo Abeni @ 2016-12-08 17:49 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Willem de Bruijn, Tom Herbert
In-Reply-To: <1481120791.4930.4.camel@edumazet-glaptop3.roam.corp.google.com>

On Wed, 2016-12-07 at 06:26 -0800, Eric Dumazet wrote:
> On Wed, 2016-12-07 at 08:57 +0100, Paolo Abeni wrote:
> > On Tue, 2016-12-06 at 22:47 -0800, Eric Dumazet wrote:
> > > On Tue, 2016-12-06 at 19:32 -0800, Eric Dumazet wrote:
> > > > A follow up patch will provide a static_key (Jump Label) since most
> > > > hosts do not even use RFS.
> > > 
> > > Speaking of static_key, it appears we now have GRO on UDP, and this
> > > consumes a considerable amount of cpu cycles.
> > > 
> > > Turning off GRO allows me to get +20 % more packets on my single UDP
> > > socket. (1.2 Mpps instead of 1.0 Mpps)
> > 
> > I see also an improvement for single flow tests disabling GRO, but on a
> > smaller scale (~5% if I recall correctly).
> 
> Was it on a NUMA host ?

I'm using a single socket host, with 12 cores/24 threads and 16 RX
queues. 
But my data is old. I'll re-run the test on top of current net-next.

Paolo

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH 0/3] i40e: Support for XDP
From: John Fastabend @ 2016-12-08 17:52 UTC (permalink / raw)
  To: Björn Töpel, jeffrey.t.kirsher, intel-wired-lan
  Cc: netdev, Björn Töpel, magnus.karlsson,
	Alexei Starovoitov
In-Reply-To: <20161208170022.11555-1-bjorn.topel@gmail.com>

On 16-12-08 09:00 AM, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> This series adds XDP support for i40e-based NICs.
> 
> The first patch adds XDP_RX support, the second XDP_TX support and the
> last patch makes it possible to change an XDP program without
> rebuilding the rings.
> 
> 
> Björn
> 
> 
> Björn Töpel (3):
>   i40e: Initial support for XDP
>   i40e: Add XDP_TX support
>   i40e: Don't reset/rebuild rings on XDP program swap
> 
>  drivers/net/ethernet/intel/i40e/i40e.h         |  18 +
>  drivers/net/ethernet/intel/i40e/i40e_ethtool.c |   3 +
>  drivers/net/ethernet/intel/i40e/i40e_main.c    | 358 +++++++++++++++++---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c    | 445 +++++++++++++++++++++----
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h    |   7 +
>  5 files changed, 715 insertions(+), 116 deletions(-)
> 

Hi Jeff,

These are for the Intel driver net-next tree per our offlist email.

Thanks!
John

^ permalink raw reply

* Re: [PATCH net] phy: Don't increment MDIO bus refcount unless it's a different owner
From: Florian Fainelli @ 2016-12-08 17:54 UTC (permalink / raw)
  To: Johan Hovold; +Cc: netdev, rmk+kernel, andrew
In-Reply-To: <20161208170127.GJ31573@localhost>

On 12/08/2016 09:01 AM, Johan Hovold wrote:
> On Thu, Dec 08, 2016 at 08:47:54AM -0800, Florian Fainelli wrote:
>> On 12/08/2016 08:27 AM, Johan Hovold wrote:
>>> On Tue, Dec 06, 2016 at 08:54:43PM -0800, Florian Fainelli wrote:
>>>> Commit 3e3aaf649416 ("phy: fix mdiobus module safety") fixed the way we
>>>> dealt with MDIO bus module reference count, but sort of introduced a
>>>> regression in that, if an Ethernet driver registers its own MDIO bus
>>>> driver, as is common, we will end up with the Ethernet driver's
>>>> module->refnct set to 1, thus preventing this driver from any removal.
>>>>
>>>> Fix this by comparing the network device's device driver owner against
>>>> the MDIO bus driver owner, and only if they are different, increment the
>>>> MDIO bus module refcount.
>>>>
>>>> Fixes: 3e3aaf649416 ("phy: fix mdiobus module safety")
>>>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
>>>> ---
>>>> Russell,
>>>>
>>>> I verified this against the ethoc driver primarily (on a TS7300 board)
>>>> and bcmgenet.
>>>>
>>>> Thanks!
>>>>
>>>>  drivers/net/phy/phy_device.c | 16 +++++++++++++---
>>>>  1 file changed, 13 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>>>> index 1a4bf8acad78..c4ceb082e970 100644
>>>> --- a/drivers/net/phy/phy_device.c
>>>> +++ b/drivers/net/phy/phy_device.c
>>>> @@ -857,11 +857,17 @@ EXPORT_SYMBOL(phy_attached_print);
>>>>  int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>>>>  		      u32 flags, phy_interface_t interface)
>>>>  {
>>>> +	struct module *ndev_owner = dev->dev.parent->driver->owner;
>>>
>>> Is this really safe? A driver does not need to set a parent device, and
>>> in that case you get a NULL-deref here (I tried using cpsw).
>>
>> Humm, cpsw does call SET_NETDEV_DEV() which should take care of that, is
>> the call made too late? Do you have an example oops?
> 
> Sorry if I was being unclear, cpsw does set a parent device, but there
> are network drivers that do not. Perhaps such drivers will never hit this
> code path, but I can't say for sure and everything appears to work for
> cpsw if you comment out that SET_NETDEV_DEV (well, at least before this
> patch).

You were clear, I did not understand that you exercised this with cpsw
to see whether this was safe in all conditions.

> 
>> I don't mind safeguarding this with a check against dev->dev.parent, but
>> I would like to fix the drivers where relevant too, since
>> SET_NETDEV_DEV() should really be called, otherwise a number of things
>> just don't work
> 
> I grepped for register_netdev and think I saw a number of drivers
> which do not call SET_NETDEV_DEV.
> 
> Again, perhaps they will never hit this path, but thought I should ask.

You are absolutely right, this is a potential problem, so far I found
two legitimate drivers that do not call SET_NETDEV_DEV (lantiq_etop.c
and cpmac.c, both fixed), and Freescale's FMAN driver, which I have a
hard time understanding what it does with mac_dev->net_dev...

Thanks!
-- 
Florian

^ permalink raw reply

* A
From: richard @ 2016-12-08  7:37 UTC (permalink / raw)


Please confirm receipt of my previous mail? What time and when can i call you?

^ permalink raw reply

* Re: [PATCH] net: ethernet: slicoss: use module_pci_driver()
From: David Miller @ 2016-12-08 18:01 UTC (permalink / raw)
  To: tklauser; +Cc: LinoSanfilippo, netdev
In-Reply-To: <20161207134330.8829-1-tklauser@distanz.ch>

From: Tobias Klauser <tklauser@distanz.ch>
Date: Wed,  7 Dec 2016 14:43:30 +0100

> Use module_pci_driver() to get rid of some boilerplate code.
> 
> Signed-off-by: Tobias Klauser <tklauser@distanz.ch>

Applied.

^ permalink raw reply

* Re: net: deadlock on genl_mutex
From: Dmitry Vyukov @ 2016-12-08 18:02 UTC (permalink / raw)
  To: syzkaller
  Cc: Eric Dumazet, David Miller, Matti Vaittinen, Tycho Andersen,
	Cong Wang, Florian Westphal, stephen hemminger, Tom Herbert,
	netdev, LKML, Richard Guy Briggs, netdev-owner
In-Reply-To: <CACT4Y+Zy82UAJ55VbPbVadUM92ZSx1VJCFPdhhcmj53uxZ5PXQ@mail.gmail.com>

On Thu, Dec 8, 2016 at 6:16 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
> On Thu, Dec 8, 2016 at 5:16 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
>> On Tue, Nov 29, 2016 at 6:59 AM,  <subashab@codeaurora.org> wrote:
>>>>
>>>> Issue was reported yesterday and is under investigation.
>>>>
>>>>
>>>> http://marc.info/?l=linux-netdev&m=148014004331663&w=2
>>>>
>>>>
>>>> Thanks !
>>>
>>>
>>> Hi Dmitry
>>>
>>> Can you try the patch below with your reproducer? I haven't seen similar
>>> crashes reported after this (or even with Eric's patch).
>>
>> I've synced to 318c8932ddec5c1c26a4af0f3c053784841c598e (Dec 7) and do
>> _not_ see this report happening anymore.
>> Thanks.
>
>
> But now I am seeing "possible deadlock" warnings involving genl_lock:
>
> [ INFO: possible circular locking dependency detected ]
> 4.9.0-rc8+ #77 Not tainted
> -------------------------------------------------------
> syz-executor7/18794 is trying to acquire lock:
>  (rtnl_mutex){+.+.+.}, at: [<ffffffff86b4682c>] rtnl_lock+0x1c/0x20
> net/core/rtnetlink.c:70
> but task is already holding lock:
>  (genl_mutex){+.+.+.}, at: [<     inline     >] genl_lock
> net/netlink/genetlink.c:31
>  (genl_mutex){+.+.+.}, at: [<ffffffff86cc27c9>]
> genl_rcv_msg+0x209/0x260 net/netlink/genetlink.c:658
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
>        [  315.403815] [<     inline     >] validate_chain
> kernel/locking/lockdep.c:2265
>        [  315.403815] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>        [  315.403815] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
>        [  315.403815] [<     inline     >] __mutex_lock_common
> kernel/locking/mutex.c:521
>        [  315.403815] [<ffffffff88195bcf>]
> mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
>        [  315.403815] [<     inline     >] genl_lock net/netlink/genetlink.c:31
>        [  315.403815] [<ffffffff86cc0c26>] genl_lock_dumpit+0x46/0xa0
> net/netlink/genetlink.c:518
>        [  315.403815] [<ffffffff86cb33ac>] netlink_dump+0x57c/0xd70
> net/netlink/af_netlink.c:2127
>        [  315.403815] [<ffffffff86cb7b6a>]
> __netlink_dump_start+0x4ea/0x760 net/netlink/af_netlink.c:2217
>        [  315.403815] [<ffffffff86cc2319>]
> genl_family_rcv_msg+0xdc9/0x1070 net/netlink/genetlink.c:586
>        [  315.403815] [<ffffffff86cc2770>] genl_rcv_msg+0x1b0/0x260
> net/netlink/genetlink.c:660
>        [  315.403815] [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0
> net/netlink/af_netlink.c:2298
>        [  315.403815] [<ffffffff86cc153d>] genl_rcv+0x2d/0x40
> net/netlink/genetlink.c:671
>        [  315.403815] [<     inline     >] netlink_unicast_kernel
> net/netlink/af_netlink.c:1231
>        [  315.403815] [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740
> net/netlink/af_netlink.c:1257
>        [  315.403815] [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50
> net/netlink/af_netlink.c:1803
>        [  315.403815] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
>        [  315.403815] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
> net/socket.c:631
>        [  315.403815] [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620
> net/socket.c:829
>        [  315.403815] [<     inline     >] new_sync_write fs/read_write.c:499
>        [  315.403815] [<ffffffff81a701ae>] __vfs_write+0x4fe/0x830
> fs/read_write.c:512
>        [  315.403815] [<ffffffff81a71c55>] vfs_write+0x175/0x4e0
> fs/read_write.c:560
>        [  315.403815] [<     inline     >] SYSC_write fs/read_write.c:607
>        [  315.403815] [<ffffffff81a760e0>] SyS_write+0x100/0x240
> fs/read_write.c:599
>        [  315.403815] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
>
>        [  315.403815] [<     inline     >] validate_chain
> kernel/locking/lockdep.c:2265
>        [  315.403815] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>        [  315.403815] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
>        [  315.403815] [<     inline     >] __mutex_lock_common
> kernel/locking/mutex.c:521
>        [  315.403815] [<ffffffff88195bcf>]
> mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
>        [  315.403815] [<ffffffff86cb7779>]
> __netlink_dump_start+0xf9/0x760 net/netlink/af_netlink.c:2187
>        [  315.403815] [<     inline     >] netlink_dump_start
> include/linux/netlink.h:165
>        [  315.403815] [<ffffffff86d14d48>]
> ctnetlink_stat_ct_cpu+0x198/0x1e0
> net/netfilter/nf_conntrack_netlink.c:2045
>        [  315.403815] [<ffffffff86cd313e>]
> nfnetlink_rcv_msg+0x9be/0xd60 net/netfilter/nfnetlink.c:212
>        [  315.403815] [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0
> net/netlink/af_netlink.c:2298
>        [  315.403815] [<ffffffff86cd1b71>] nfnetlink_rcv+0x7e1/0x10d0
> net/netfilter/nfnetlink.c:474
>        [  315.403815] [<     inline     >] netlink_unicast_kernel
> net/netlink/af_netlink.c:1231
>        [  315.403815] [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740
> net/netlink/af_netlink.c:1257
>        [  315.403815] [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50
> net/netlink/af_netlink.c:1803
>        [  315.403815] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
>        [  315.403815] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
> net/socket.c:631
>        [  315.403815] [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620
> net/socket.c:829
>        [  315.403815] [<     inline     >] new_sync_write fs/read_write.c:499
>        [  315.403815] [<ffffffff81a701ae>] __vfs_write+0x4fe/0x830
> fs/read_write.c:512
>        [  315.403815] [<ffffffff81a71c55>] vfs_write+0x175/0x4e0
> fs/read_write.c:560
>        [  315.403815] [<     inline     >] SYSC_write fs/read_write.c:607
>        [  315.403815] [<ffffffff81a760e0>] SyS_write+0x100/0x240
> fs/read_write.c:599
>        [  315.403815] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
>
>        [  315.403815] [<     inline     >] validate_chain
> kernel/locking/lockdep.c:2265
>        [  315.403815] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>        [  315.403815] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
>        [  315.403815] [<     inline     >] __mutex_lock_common
> kernel/locking/mutex.c:521
>        [  315.403815] [<ffffffff88195bcf>]
> mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
>        [  315.403815] [<ffffffff86cd083d>] nfnl_lock+0x2d/0x30
> net/netfilter/nfnetlink.c:61
>        [  315.403815] [<ffffffff86d7c5b1>]
> nf_tables_netdev_event+0x1f1/0x720
> net/netfilter/nf_tables_netdev.c:122
>        [  315.403815] [<ffffffff8149095a>]
> notifier_call_chain+0x14a/0x2f0 kernel/notifier.c:93
>        [  315.403815] [<     inline     >] __raw_notifier_call_chain
> kernel/notifier.c:394
>        [  315.403815] [<ffffffff81490b82>]
> raw_notifier_call_chain+0x32/0x40 kernel/notifier.c:401
>        [  315.403815] [<ffffffff86ae4af6>]
> call_netdevice_notifiers_info+0x56/0x90 net/core/dev.c:1645
>        [  315.403815] [<     inline     >] call_netdevice_notifiers
> net/core/dev.c:1661
>        [  315.403815] [<ffffffff86af898d>]
> rollback_registered_many+0x73d/0xba0 net/core/dev.c:6759
>        [  315.403815] [<ffffffff86af8e9e>]
> rollback_registered+0xae/0x100 net/core/dev.c:6800
>        [  315.403815] [<ffffffff86af8f76>]
> unregister_netdevice_queue+0x86/0x140 net/core/dev.c:7787
>        [  315.403815] [<     inline     >] unregister_netdevice
> include/linux/netdevice.h:2455
>        [  315.403815] [<ffffffff84912be6>] __tun_detach+0xc66/0xea0
> drivers/net/tun.c:567
>        [  315.808015] [<     inline     >] tun_detach drivers/net/tun.c:578
>        [  315.808015] [<ffffffff84912e69>] tun_chr_close+0x49/0x60
> drivers/net/tun.c:2350
>        [  315.808015] [<ffffffff81a77f7e>] __fput+0x34e/0x910
> fs/file_table.c:208
>        [  315.808015] [<ffffffff81a785ca>] ____fput+0x1a/0x20
> fs/file_table.c:244
>        [  315.808015] [<ffffffff81483c20>] task_work_run+0x1a0/0x280
> kernel/task_work.c:116
>        [  315.808015] [<     inline     >] exit_task_work
> include/linux/task_work.h:21
>        [  315.808015] [<ffffffff814129e2>] do_exit+0x1842/0x2650
> kernel/exit.c:828
>        [  315.808015] [<ffffffff814139ae>] do_group_exit+0x14e/0x420
> kernel/exit.c:932
>        [  315.808015] [<ffffffff81442b43>] get_signal+0x663/0x1880
> kernel/signal.c:2307
>        [  315.808015] [<ffffffff81239b45>] do_signal+0xc5/0x2190
> arch/x86/kernel/signal.c:807
>        [  315.808015] [<ffffffff8100666a>]
> exit_to_usermode_loop+0x1ea/0x2d0 arch/x86/entry/common.c:156
>        [  315.808015] [<     inline     >] prepare_exit_to_usermode
> arch/x86/entry/common.c:190
>        [  315.808015] [<ffffffff81009693>]
> syscall_return_slowpath+0x4d3/0x570 arch/x86/entry/common.c:259
>        [  315.808015] [<ffffffff881a6026>] entry_SYSCALL_64_fastpath+0xc4/0xc6
>
>        [  315.808015] [<     inline     >] check_prev_add
> kernel/locking/lockdep.c:1828
>        [  315.808015] [<ffffffff8156309b>]
> check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
>        [  315.808015] [<     inline     >] validate_chain
> kernel/locking/lockdep.c:2265
>        [  315.808015] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>        [  315.808015] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
>        [  315.808015] [<     inline     >] __mutex_lock_common
> kernel/locking/mutex.c:521
>        [  315.808015] [<ffffffff88195bcf>]
> mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
>        [  315.808015] [<ffffffff86b4682c>] rtnl_lock+0x1c/0x20
> net/core/rtnetlink.c:70
>        [  315.808015] [<ffffffff87b5cdf9>]
> nl80211_pre_doit+0x309/0x5b0 net/wireless/nl80211.c:11750
>        [  315.808015] [<ffffffff86cc1cd0>]
> genl_family_rcv_msg+0x780/0x1070 net/netlink/genetlink.c:631
>        [  315.808015] [<ffffffff86cc2770>] genl_rcv_msg+0x1b0/0x260
> net/netlink/genetlink.c:660
>        [  315.808015] [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0
> net/netlink/af_netlink.c:2298
>        [  315.808015] [<ffffffff86cc153d>] genl_rcv+0x2d/0x40
> net/netlink/genetlink.c:671
>        [  315.808015] [<     inline     >] netlink_unicast_kernel
> net/netlink/af_netlink.c:1231
>        [  315.808015] [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740
> net/netlink/af_netlink.c:1257
>        [  315.808015] [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50
> net/netlink/af_netlink.c:1803
>        [  315.808015] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
>        [  315.808015] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
> net/socket.c:631
>        [  315.808015] [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620
> net/socket.c:829
>        [  315.808015] [<ffffffff81a6f9a3>]
> do_iter_readv_writev+0x363/0x670 fs/read_write.c:695
>        [  315.808015] [<ffffffff81a723f1>] do_readv_writev+0x431/0x9b0
> fs/read_write.c:872
>        [  315.808015] [<ffffffff81a72f2c>] vfs_writev+0x8c/0xc0
> fs/read_write.c:911
>        [  315.808015] [<ffffffff81a73075>] do_writev+0x115/0x2d0
> fs/read_write.c:944
>        [  315.808015] [<     inline     >] SYSC_writev fs/read_write.c:1017
>        [  315.808015] [<ffffffff81a7682c>] SyS_writev+0x2c/0x40
> fs/read_write.c:1014
>        [  315.808015] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
>
> other info that might help us debug this:
>
> Chain exists of:
>  Possible unsafe locking scenario:
>
>        CPU0                    CPU1
>        ----                    ----
>   lock(genl_mutex);
>                                lock(nlk->cb_mutex);
>                                lock(genl_mutex);
>   lock(rtnl_mutex);
>
>  *** DEADLOCK ***
>
> 2 locks held by syz-executor7/18794:
>  #0:  (cb_lock){++++++}, at: [<ffffffff86cc152e>] genl_rcv+0x1e/0x40
> net/netlink/genetlink.c:670
>  #1:  (genl_mutex){+.+.+.}, at: [<     inline     >] genl_lock
> net/netlink/genetlink.c:31
>  #1:  (genl_mutex){+.+.+.}, at: [<ffffffff86cc27c9>]
> genl_rcv_msg+0x209/0x260 net/netlink/genetlink.c:658
>
> stack backtrace:
> CPU: 0 PID: 18794 Comm: syz-executor7 Not tainted 4.9.0-rc8+ #77
> Hardware name: Google Google/Google, BIOS Google 01/01/2011
>  ffff88004add6468 ffffffff834c44f9 ffffffff00000000 1ffff100095bac20
>  ffffed00095bac18 0000000041b58ab3 ffffffff895816f0 ffffffff834c420b
>  0000000000000000 0000000000000000 0000000000000000 0000000000000000
> Call Trace:
>  [<     inline     >] __dump_stack lib/dump_stack.c:15
>  [<ffffffff834c44f9>] dump_stack+0x2ee/0x3f5 lib/dump_stack.c:51
>  [<ffffffff81560cb0>] print_circular_bug+0x310/0x3c0
> kernel/locking/lockdep.c:1202
>  [<     inline     >] check_prev_add kernel/locking/lockdep.c:1828
>  [<ffffffff8156309b>] check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
>  [<     inline     >] validate_chain kernel/locking/lockdep.c:2265
>  [<ffffffff81569576>] __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
>  [<ffffffff8156b672>] lock_acquire+0x2a2/0x790 kernel/locking/lockdep.c:3749
>  [<     inline     >] __mutex_lock_common kernel/locking/mutex.c:521
>  [<ffffffff88195bcf>] mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
>  [<ffffffff86b4682c>] rtnl_lock+0x1c/0x20 net/core/rtnetlink.c:70
>  [<ffffffff87b5cdf9>] nl80211_pre_doit+0x309/0x5b0 net/wireless/nl80211.c:11750
>  [<ffffffff86cc1cd0>] genl_family_rcv_msg+0x780/0x1070
> net/netlink/genetlink.c:631
>  [<ffffffff86cc2770>] genl_rcv_msg+0x1b0/0x260 net/netlink/genetlink.c:660
>  [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0 net/netlink/af_netlink.c:2298
>  [<ffffffff86cc153d>] genl_rcv+0x2d/0x40 net/netlink/genetlink.c:671
>  [<     inline     >] netlink_unicast_kernel net/netlink/af_netlink.c:1231
>  [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740 net/netlink/af_netlink.c:1257
>  [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50 net/netlink/af_netlink.c:1803
>  [<     inline     >] sock_sendmsg_nosec net/socket.c:621
>  [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110 net/socket.c:631
>  [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620 net/socket.c:829
>  [<ffffffff81a6f9a3>] do_iter_readv_writev+0x363/0x670 fs/read_write.c:695
>  [<ffffffff81a723f1>] do_readv_writev+0x431/0x9b0 fs/read_write.c:872
>  [<ffffffff81a72f2c>] vfs_writev+0x8c/0xc0 fs/read_write.c:911
>  [<ffffffff81a73075>] do_writev+0x115/0x2d0 fs/read_write.c:944
>  [<     inline     >] SYSC_writev fs/read_write.c:1017
>  [<ffffffff81a7682c>] SyS_writev+0x2c/0x40 fs/read_write.c:1014
>  [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6



Probably a related one:

[ INFO: possible circular locking dependency detected ]
4.9.0-rc8+ #77 Not tainted
-------------------------------------------------------
syz-executor5/5777 is trying to acquire lock:
 (genl_mutex){+.+.+.}, at: [<     inline     >] genl_lock
net/netlink/genetlink.c:31
 (genl_mutex){+.+.+.}, at: [<ffffffff86cc0c26>]
genl_lock_dumpit+0x46/0xa0 net/netlink/genetlink.c:518
but task is already holding lock:
 (nlk->cb_mutex){+.+.+.}, at: [<ffffffff86cb2f08>]
netlink_dump+0xd8/0xd70 net/netlink/af_netlink.c:2084
which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

       [  158.966653] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  158.966653] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  158.966653] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  158.966653] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  158.966653] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
       [  158.966653] [<ffffffff86cb7779>]
__netlink_dump_start+0xf9/0x760 net/netlink/af_netlink.c:2187
       [  158.966653] [<     inline     >] netlink_dump_start
include/linux/netlink.h:165
       [  158.966653] [<ffffffff86d1395f>]
ctnetlink_get_ct_unconfirmed+0x17f/0x220
net/netfilter/nf_conntrack_netlink.c:1369
       [  158.966653] [<ffffffff86cd313e>]
nfnetlink_rcv_msg+0x9be/0xd60 net/netfilter/nfnetlink.c:212
       [  158.966653] [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0
net/netlink/af_netlink.c:2298
       [  158.966653] [<ffffffff86cd1b71>] nfnetlink_rcv+0x7e1/0x10d0
net/netfilter/nfnetlink.c:474
       [  158.966653] [<     inline     >] netlink_unicast_kernel
net/netlink/af_netlink.c:1231
       [  158.966653] [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740
net/netlink/af_netlink.c:1257
       [  158.966653] [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50
net/netlink/af_netlink.c:1803
       [  158.966653] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
       [  158.966653] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
net/socket.c:631
       [  158.966653] [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620
net/socket.c:829
       [  158.966653] [<     inline     >] new_sync_write fs/read_write.c:499
       [  158.966653] [<ffffffff81a701ae>] __vfs_write+0x4fe/0x830
fs/read_write.c:512
       [  158.966653] [<ffffffff81a71c55>] vfs_write+0x175/0x4e0
fs/read_write.c:560
       [  158.966653] [<     inline     >] SYSC_write fs/read_write.c:607
       [  158.966653] [<ffffffff81a760e0>] SyS_write+0x100/0x240
fs/read_write.c:599
       [  158.966653] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

       [  158.966653] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  158.966653] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  158.966653] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  158.966653] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  158.966653] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
       [  158.966653] [<ffffffff86cd083d>] nfnl_lock+0x2d/0x30
net/netfilter/nfnetlink.c:61
       [  158.966653] [<ffffffff86d7c5b1>]
nf_tables_netdev_event+0x1f1/0x720
net/netfilter/nf_tables_netdev.c:122
       [  158.966653] [<ffffffff8149095a>]
notifier_call_chain+0x14a/0x2f0 kernel/notifier.c:93
       [  158.966653] [<     inline     >] __raw_notifier_call_chain
kernel/notifier.c:394
       [  158.966653] [<ffffffff81490b82>]
raw_notifier_call_chain+0x32/0x40 kernel/notifier.c:401
       [  158.966653] [<ffffffff86ae4af6>]
call_netdevice_notifiers_info+0x56/0x90 net/core/dev.c:1645
       [  158.966653] [<     inline     >] call_netdevice_notifiers
net/core/dev.c:1661
       [  158.966653] [<ffffffff86af898d>]
rollback_registered_many+0x73d/0xba0 net/core/dev.c:6759
       [  158.966653] [<ffffffff86af8e9e>]
rollback_registered+0xae/0x100 net/core/dev.c:6800
       [  158.966653] [<ffffffff86af8f76>]
unregister_netdevice_queue+0x86/0x140 net/core/dev.c:7787
       [  158.966653] [<     inline     >] unregister_netdevice
include/linux/netdevice.h:2455
       [  158.966653] [<ffffffff84912be6>] __tun_detach+0xc66/0xea0
drivers/net/tun.c:567
       [  158.966653] [<     inline     >] tun_detach drivers/net/tun.c:578
       [  158.966653] [<ffffffff84912e69>] tun_chr_close+0x49/0x60
drivers/net/tun.c:2350
       [  158.966653] [<ffffffff81a77f7e>] __fput+0x34e/0x910
fs/file_table.c:208
       [  158.966653] [<ffffffff81a785ca>] ____fput+0x1a/0x20
fs/file_table.c:244
       [  158.966653] [<ffffffff81483c20>] task_work_run+0x1a0/0x280
kernel/task_work.c:116
       [  158.966653] [<     inline     >] exit_task_work
include/linux/task_work.h:21
       [  158.966653] [<ffffffff814129e2>] do_exit+0x1842/0x2650
kernel/exit.c:828
       [  158.966653] [<ffffffff814139ae>] do_group_exit+0x14e/0x420
kernel/exit.c:932
       [  159.308048] [<ffffffff81442b43>] get_signal+0x663/0x1880
kernel/signal.c:2307
       [  159.308048] [<ffffffff81239b45>] do_signal+0xc5/0x2190
arch/x86/kernel/signal.c:807
       [  159.308048] [<ffffffff8100666a>]
exit_to_usermode_loop+0x1ea/0x2d0 arch/x86/entry/common.c:156
       [  159.308048] [<     inline     >] prepare_exit_to_usermode
arch/x86/entry/common.c:190
       [  159.308048] [<ffffffff81009693>]
syscall_return_slowpath+0x4d3/0x570 arch/x86/entry/common.c:259
       [  159.308048] [<ffffffff881a6026>] entry_SYSCALL_64_fastpath+0xc4/0xc6

       [  159.308048] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  159.308048] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  159.308048] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  159.308048] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  159.308048] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
       [  159.308048] [<ffffffff86b4682c>] rtnl_lock+0x1c/0x20
net/core/rtnetlink.c:70
       [  159.308048] [<ffffffff87b5cdf9>]
nl80211_pre_doit+0x309/0x5b0 net/wireless/nl80211.c:11750
       [  159.308048] [<ffffffff86cc1cd0>]
genl_family_rcv_msg+0x780/0x1070 net/netlink/genetlink.c:631
       [  159.308048] [<ffffffff86cc2770>] genl_rcv_msg+0x1b0/0x260
net/netlink/genetlink.c:660
       [  159.308048] [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0
net/netlink/af_netlink.c:2298
       [  159.308048] [<ffffffff86cc153d>] genl_rcv+0x2d/0x40
net/netlink/genetlink.c:671
       [  159.308048] [<     inline     >] netlink_unicast_kernel
net/netlink/af_netlink.c:1231
       [  159.308048] [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740
net/netlink/af_netlink.c:1257
       [  159.308048] [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50
net/netlink/af_netlink.c:1803
       [  159.308048] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
       [  159.308048] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
net/socket.c:631
       [  159.308048] [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620
net/socket.c:829
       [  159.308048] [<ffffffff81a6f9a3>]
do_iter_readv_writev+0x363/0x670 fs/read_write.c:695
       [  159.308048] [<ffffffff81a723f1>] do_readv_writev+0x431/0x9b0
fs/read_write.c:872
       [  159.308048] [<ffffffff81a72f2c>] vfs_writev+0x8c/0xc0
fs/read_write.c:911
       [  159.308048] [<ffffffff81a73075>] do_writev+0x115/0x2d0
fs/read_write.c:944
       [  159.308048] [<     inline     >] SYSC_writev fs/read_write.c:1017
       [  159.308048] [<ffffffff81a7682c>] SyS_writev+0x2c/0x40
fs/read_write.c:1014
       [  159.308048] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

       [  159.308048] [<     inline     >] check_prev_add
kernel/locking/lockdep.c:1828
       [  159.308048] [<ffffffff8156309b>]
check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
       [  159.308048] [<     inline     >] validate_chain
kernel/locking/lockdep.c:2265
       [  159.308048] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
       [  159.308048] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
       [  159.308048] [<     inline     >] __mutex_lock_common
kernel/locking/mutex.c:521
       [  159.308048] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
       [  159.308048] [<     inline     >] genl_lock net/netlink/genetlink.c:31
       [  159.308048] [<ffffffff86cc0c26>] genl_lock_dumpit+0x46/0xa0
net/netlink/genetlink.c:518
       [  159.308048] [<ffffffff86cb33ac>] netlink_dump+0x57c/0xd70
net/netlink/af_netlink.c:2127
       [  159.308048] [<ffffffff86cb7b6a>]
__netlink_dump_start+0x4ea/0x760 net/netlink/af_netlink.c:2217
       [  159.308048] [<ffffffff86cc2319>]
genl_family_rcv_msg+0xdc9/0x1070 net/netlink/genetlink.c:586
       [  159.308048] [<ffffffff86cc2770>] genl_rcv_msg+0x1b0/0x260
net/netlink/genetlink.c:660
       [  159.308048] [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0
net/netlink/af_netlink.c:2298
       [  159.308048] [<ffffffff86cc153d>] genl_rcv+0x2d/0x40
net/netlink/genetlink.c:671
       [  159.308048] [<     inline     >] netlink_unicast_kernel
net/netlink/af_netlink.c:1231
       [  159.308048] [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740
net/netlink/af_netlink.c:1257
       [  159.308048] [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50
net/netlink/af_netlink.c:1803
       [  159.308048] [<     inline     >] sock_sendmsg_nosec net/socket.c:621
       [  159.308048] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
net/socket.c:631
       [  159.308048] [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620
net/socket.c:829
       [  159.308048] [<     inline     >] new_sync_write fs/read_write.c:499
       [  159.308048] [<ffffffff81a701ae>] __vfs_write+0x4fe/0x830
fs/read_write.c:512
       [  159.308048] [<ffffffff81a71c55>] vfs_write+0x175/0x4e0
fs/read_write.c:560
       [  159.308048] [<     inline     >] SYSC_write fs/read_write.c:607
       [  159.308048] [<ffffffff81a760e0>] SyS_write+0x100/0x240
fs/read_write.c:599
       [  159.308048] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

other info that might help us debug this:

Chain exists of:
 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(nlk->cb_mutex);
                               lock(&table[i].mutex);
                               lock(nlk->cb_mutex);
  lock(genl_mutex);

 *** DEADLOCK ***

2 locks held by syz-executor5/5777:
 #0:  (cb_lock){++++++}, at: [<ffffffff86cc152e>] genl_rcv+0x1e/0x40
net/netlink/genetlink.c:670
 #1:  (nlk->cb_mutex){+.+.+.}, at: [<ffffffff86cb2f08>]
netlink_dump+0xd8/0xd70 net/netlink/af_netlink.c:2084

stack backtrace:
CPU: 1 PID: 5777 Comm: syz-executor5 Not tainted 4.9.0-rc8+ #77
Hardware name: Google Google/Google, BIOS Google 01/01/2011
 ffff88005fe363e8 ffffffff834c44f9 ffffffff00000001 1ffff1000bfc6c10
 ffffed000bfc6c08 0000000041b58ab3 ffffffff895816f0 ffffffff834c420b
 0000000000000000 0000000000000000 0000000000000000 dffffc0000000000
Call Trace:
 [<     inline     >] __dump_stack lib/dump_stack.c:15
 [<ffffffff834c44f9>] dump_stack+0x2ee/0x3f5 lib/dump_stack.c:51
 [<ffffffff81560cb0>] print_circular_bug+0x310/0x3c0
kernel/locking/lockdep.c:1202
 [<     inline     >] check_prev_add kernel/locking/lockdep.c:1828
 [<ffffffff8156309b>] check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
 [<     inline     >] validate_chain kernel/locking/lockdep.c:2265
 [<ffffffff81569576>] __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
 [<ffffffff8156b672>] lock_acquire+0x2a2/0x790 kernel/locking/lockdep.c:3749
 [<     inline     >] __mutex_lock_common kernel/locking/mutex.c:521
 [<ffffffff88195bcf>] mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
 [<     inline     >] genl_lock net/netlink/genetlink.c:31
 [<ffffffff86cc0c26>] genl_lock_dumpit+0x46/0xa0 net/netlink/genetlink.c:518
 [<ffffffff86cb33ac>] netlink_dump+0x57c/0xd70 net/netlink/af_netlink.c:2127
 [<ffffffff86cb7b6a>] __netlink_dump_start+0x4ea/0x760
net/netlink/af_netlink.c:2217
 [<ffffffff86cc2319>] genl_family_rcv_msg+0xdc9/0x1070
net/netlink/genetlink.c:586
 [<ffffffff86cc2770>] genl_rcv_msg+0x1b0/0x260 net/netlink/genetlink.c:660
 [<ffffffff86cc034c>] netlink_rcv_skb+0x2bc/0x3a0 net/netlink/af_netlink.c:2298
 [<ffffffff86cc153d>] genl_rcv+0x2d/0x40 net/netlink/genetlink.c:671
 [<     inline     >] netlink_unicast_kernel net/netlink/af_netlink.c:1231
 [<ffffffff86cbeb6a>] netlink_unicast+0x51a/0x740 net/netlink/af_netlink.c:1257
 [<ffffffff86cbf834>] netlink_sendmsg+0xaa4/0xe50 net/netlink/af_netlink.c:1803
 [<     inline     >] sock_sendmsg_nosec net/socket.c:621
 [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110 net/socket.c:631
 [<ffffffff86a764fb>] sock_write_iter+0x32b/0x620 net/socket.c:829
 [<     inline     >] new_sync_write fs/read_write.c:499
 [<ffffffff81a701ae>] __vfs_write+0x4fe/0x830 fs/read_write.c:512
 [<ffffffff81a71c55>] vfs_write+0x175/0x4e0 fs/read_write.c:560
 [<     inline     >] SYSC_write fs/read_write.c:607
 [<ffffffff81a760e0>] SyS_write+0x100/0x240 fs/read_write.c:599
 [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6

^ permalink raw reply

* Re: [PATCH 0/1] NET: usb: cdc_mbim: add quirk for supporting Telit LE922A
From: David Miller @ 2016-12-08 18:02 UTC (permalink / raw)
  To: dnlplm; +Cc: bjorn, oliver, netdev, linux-usb
In-Reply-To: <1481116068-32691-1-git-send-email-dnlplm@gmail.com>

From: Daniele Palmas <dnlplm@gmail.com>
Date: Wed,  7 Dec 2016 14:07:47 +0100

> Telit LE922A MBIM based composition does not work properly
> with altsetting toggle done in cdc_ncm_bind_common.
> 
> This patch adds CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE quirk
> to avoid this procedure that, instead, is mandatory for
> other modems.
> 
> References:
> https://www.spinics.net/lists/linux-usb/msg149249.html
> https://www.spinics.net/lists/linux-usb/msg149819.html
> 
> Thanks to Bjørn for the productive discussion and feedback!

Patch applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next] net: sock_rps_record_flow() is for connected sockets
From: Eric Dumazet @ 2016-12-08 18:02 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Paolo Abeni, David Miller, netdev, Willem de Bruijn
In-Reply-To: <CALx6S34e_5KW3cdxS_yNXwhYuK2FQe=6+9=yTCVDsg6f2vx87g@mail.gmail.com>

On Thu, 2016-12-08 at 09:49 -0800, Tom Herbert wrote:

> Of course that would only help on systems where no one enable encaps,
> ie. looks good in the simple benchmarks but in real life if just
> one socket enables encap everyone else takes the hit. Alternatively,
> maybe we could do early demux when we do the lookup in GRO to
> eliminate the extra lookup?

Well, if you do the lookup in GRO, won't it be done for every incoming
MSS, instead of once per GRO packet?

Anyway, the flooded UDP sockets out there are not normally connected
ones.

^ permalink raw reply

* Re: [PATCH v3 0/6] net: stmmac: make DMA programmable burst length more configurable
From: David Miller @ 2016-12-08 18:07 UTC (permalink / raw)
  To: niklas.cassel; +Cc: netdev, niklass, devicetree, linux-kernel, linux-doc
In-Reply-To: <1481120409-18103-1-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>
Date: Wed, 7 Dec 2016 15:20:02 +0100

> Make DMA programmable burst length more configurable in the stmmac driver.
> 
> This is done by adding support for independent pbl for tx/rx through DT.
> More fine grained tuning of pbl is possible thanks to a DT property saying
> that we should NOT multiply pbl values by x8/x4 in hardware.
> 
> All new DT properties are optional, and created in a way that it will not
> affect any existing DT configurations.

Series applied to net-next, thanks.

^ permalink raw reply

* Re: [PATCH net-next] net: sock_rps_record_flow() is for connected sockets
From: Eric Dumazet @ 2016-12-08 18:07 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Paolo Abeni, David Miller, netdev, Willem de Bruijn
In-Reply-To: <CALx6S34e_5KW3cdxS_yNXwhYuK2FQe=6+9=yTCVDsg6f2vx87g@mail.gmail.com>

On Thu, 2016-12-08 at 09:49 -0800, Tom Herbert wrote:

> Of course that would only help on systems where no one enable encaps,
> ie. looks good in the simple benchmarks but in real life if just
> one socket enables encap everyone else takes the hit.

Well, in real life most linux hosts do not use any UDP encapsulation.

Or if they do, maybe they still have to handle a lot of UDP traffic
which does not hit a tunnel in the kernel.

Anyway, my differences between GRO on/off were caused by copybreak in the
mlx4 driver.

GRO off --> mlx4 uses copybreak for small messages (all protocols)
GRO on  --> no copybreak for native protocols (IP+TCP IP+UDP)

The lookup being done twice is not that expensive, if the first two
cache lines of the socket stay shared (mostly read)

^ permalink raw reply

* Re: [net-next] macsec: remove first zero and add attribute name in comments
From: David Miller @ 2016-12-08 18:08 UTC (permalink / raw)
  To: zhangshengju; +Cc: netdev
In-Reply-To: <1481122929-19147-1-git-send-email-zhangshengju@cmss.chinamobile.com>

From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Wed,  7 Dec 2016 23:02:09 +0800

> Remove first zero for add, and use full attribute name in comments.
> 
> Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>

Applied.

^ permalink raw reply

* Re: [net-next PATCH v5 5/6] virtio_net: add XDP_TX support
From: John Fastabend @ 2016-12-08 18:18 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, brouer
In-Reply-To: <20161208080647-mutt-send-email-mst@kernel.org>

On 16-12-07 10:11 PM, Michael S. Tsirkin wrote:
> On Wed, Dec 07, 2016 at 12:12:45PM -0800, John Fastabend wrote:
>> This adds support for the XDP_TX action to virtio_net. When an XDP
>> program is run and returns the XDP_TX action the virtio_net XDP
>> implementation will transmit the packet on a TX queue that aligns
>> with the current CPU that the XDP packet was processed on.
>>
>> Before sending the packet the header is zeroed.  Also XDP is expected
>> to handle checksum correctly so no checksum offload  support is
>> provided.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>  drivers/net/virtio_net.c |   99 +++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 92 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 28b1196..8e5b13c 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -330,12 +330,57 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>>  	return skb;
>>  }
>>  
>> +static void virtnet_xdp_xmit(struct virtnet_info *vi,
>> +			     struct receive_queue *rq,
>> +			     struct send_queue *sq,
>> +			     struct xdp_buff *xdp)
>> +{
>> +	struct page *page = virt_to_head_page(xdp->data);
>> +	struct virtio_net_hdr_mrg_rxbuf *hdr;
>> +	unsigned int num_sg, len;
>> +	void *xdp_sent;
>> +	int err;
>> +
>> +	/* Free up any pending old buffers before queueing new ones. */
>> +	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
>> +		struct page *sent_page = virt_to_head_page(xdp_sent);
>> +
>> +		if (vi->mergeable_rx_bufs)
>> +			put_page(sent_page);
>> +		else
>> +			give_pages(rq, sent_page);
>> +	}
> 
> Looks like this is the only place where you do virtqueue_get_buf.
> No interrupt handler?
> This means that if you fill up the queue, nothing will clean it
> and things will get stuck.

hmm OK so the callbacks should be implemented to do this and a pair
of virtqueue_enable_cb_prepare()/virtqueue_disable_cb() used to enable
and disable callbacks if packets are enqueued.

Also, will the same condition happen in the normal xmit path via
start_xmit()? It looks like free_old_xmit_skbs, for example, is only
called if a packet is sent. Could we end up holding on to skbs in this
case? I don't see free_old_xmit_skbs being called from any callbacks?

> Can this be the issue you saw?

Nope, see below: I was mishandling the big_packets page cleanup path in
the error case.

> 
> 
>> +
>> +	/* Zero header and leave csum up to XDP layers */
>> +	hdr = xdp->data;
>> +	memset(hdr, 0, vi->hdr_len);
>> +
>> +	num_sg = 1;
>> +	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
>> +	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
>> +				   xdp->data, GFP_ATOMIC);
>> +	if (unlikely(err)) {
>> +		if (vi->mergeable_rx_bufs)
>> +			put_page(page);
>> +		else
>> +			give_pages(rq, page);
>> +	} else if (!vi->mergeable_rx_bufs) {
>> +		/* If not mergeable bufs must be big packets so cleanup pages */
>> +		give_pages(rq, (struct page *)page->private);
>> +		page->private = 0;
>> +	}
>> +
>> +	virtqueue_kick(sq->vq);
> 
> Is this unconditional kick a work-around for hang
> we could not figure out yet?

I tracked the original issue down to how I handled the big_packet page
cleanups.

> I guess this helps because it just slows down the guest.
> I don't much like it ...

I left it like this, copying the pattern in the balloon and input drivers. I
can change it back to the previous pattern, where it is only called if
there are no errors. It has been running fine with the old pattern now
for an hour or so.

.John

^ permalink raw reply

* Re: [PATCH net-next] net: rfs: add a jump label
From: David Miller @ 2016-12-08 18:19 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, pabeni
In-Reply-To: <1481128150.4930.25.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 07 Dec 2016 08:29:10 -0800

> From: Eric Dumazet <edumazet@google.com>
> 
> RFS is not commonly used, so add a jump label to avoid some conditionals
> in fast path.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied, but I wonder how effective this will really be in the long run.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox