Netdev List
 help / color / mirror / Atom feed
* [PATCH V2 net-next 5/7] net: hns3: remove explicit conversion to bool
From: Huazhong Tan @ 2019-09-05 13:31 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm,
	jakub.kicinski, Guojia Liao, Huazhong Tan
In-Reply-To: <1567690302-16648-1-git-send-email-tanhuazhong@huawei.com>

From: Guojia Liao <liaoguojia@huawei.com>

Relational and logical operators evaluate to bool,
explicit conversion is overly verbose and unnecessary.

Signed-off-by: Guojia Liao <liaoguojia@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 76e1c84..dde752f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -6174,7 +6174,7 @@ static void hclge_enable_fd(struct hnae3_handle *handle, bool enable)
 	bool clear;
 
 	hdev->fd_en = enable;
-	clear = hdev->fd_active_type == HCLGE_FD_ARFS_ACTIVE ? true : false;
+	clear = hdev->fd_active_type == HCLGE_FD_ARFS_ACTIVE;
 	if (!enable)
 		hclge_del_all_fd_entries(handle, clear);
 	else
-- 
2.7.4


^ permalink raw reply related

* [PATCH V2 net-next 7/7] net: hns3: make hclge_dbg_get_m7_stats_info static
From: Huazhong Tan @ 2019-09-05 13:31 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm,
	jakub.kicinski, Guojia Liao, Huazhong Tan
In-Reply-To: <1567690302-16648-1-git-send-email-tanhuazhong@huawei.com>

From: Guojia Liao <liaoguojia@huawei.com>

hclge_dbg_get_m7_info is used only in the hclge_debugfs.c,
so it should be declared with static.

Signed-off-by: Guojia Liao <liaoguojia@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
index 1c6b501..6dcce48 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
@@ -949,7 +949,7 @@ static void hclge_dbg_dump_rst_info(struct hclge_dev *hdev)
 		 hdev->rst_stats.reset_cnt);
 }
 
-void hclge_dbg_get_m7_stats_info(struct hclge_dev *hdev)
+static void hclge_dbg_get_m7_stats_info(struct hclge_dev *hdev)
 {
 	struct hclge_desc *desc_src, *desc_tmp;
 	struct hclge_get_m7_bd_cmd *req;
-- 
2.7.4


^ permalink raw reply related

* [PATCH V2 net-next 4/7] net: hns3: add client node validity judgment
From: Huazhong Tan @ 2019-09-05 13:31 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm,
	jakub.kicinski, Peng Li, Huazhong Tan
In-Reply-To: <1567690302-16648-1-git-send-email-tanhuazhong@huawei.com>

From: Peng Li <lipeng321@huawei.com>

HNS3 driver can only unregister client which included in hnae3_client_list.
This patch adds the client node validity judgment.

Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index 528f624..03ca7d9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -138,12 +138,28 @@ EXPORT_SYMBOL(hnae3_register_client);
 
 void hnae3_unregister_client(struct hnae3_client *client)
 {
+	struct hnae3_client *client_tmp;
 	struct hnae3_ae_dev *ae_dev;
+	bool existed = false;
 
 	if (!client)
 		return;
 
 	mutex_lock(&hnae3_common_lock);
+
+	list_for_each_entry(client_tmp, &hnae3_client_list, node) {
+		if (client_tmp->type == client->type) {
+			existed = true;
+			break;
+		}
+	}
+
+	if (!existed) {
+		mutex_unlock(&hnae3_common_lock);
+		pr_err("client %s does not exist!\n", client->name);
+		return;
+	}
+
 	/* un-initialize the client on every matched port */
 	list_for_each_entry(ae_dev, &hnae3_ae_dev_list, node) {
 		hnae3_uninit_client_instance(client, ae_dev);
-- 
2.7.4


^ permalink raw reply related

* [PATCH V2 net-next 6/7] net: hns3: disable loopback setting in hclge_mac_init
From: Huazhong Tan @ 2019-09-05 13:31 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm,
	jakub.kicinski, Yufeng Mo, Huazhong Tan
In-Reply-To: <1567690302-16648-1-git-send-email-tanhuazhong@huawei.com>

From: Yufeng Mo <moyufeng@huawei.com>

If the selftest and reset are performed at the same time, the loopback
setting may be still in the enable state after the reset. As a result,
packets cannot be sent out.

This patch fixes this issue by disabling loopback in hclge_mac_init.

Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 34 +++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index dde752f..8d4dc1b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -66,6 +66,7 @@ static void hclge_rfs_filter_expire(struct hclge_dev *hdev);
 static void hclge_clear_arfs_rules(struct hnae3_handle *handle);
 static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev,
 						   unsigned long *addr);
+static int hclge_set_default_loopback(struct hclge_dev *hdev);
 
 static struct hnae3_ae_algo ae_algo;
 
@@ -2599,6 +2600,10 @@ static int hclge_mac_init(struct hclge_dev *hdev)
 		return ret;
 	}
 
+	ret = hclge_set_default_loopback(hdev);
+	if (ret)
+		return ret;
+
 	ret = hclge_buffer_alloc(hdev);
 	if (ret)
 		dev_err(&hdev->pdev->dev,
@@ -6331,7 +6336,7 @@ static int hclge_set_app_loopback(struct hclge_dev *hdev, bool en)
 	return ret;
 }
 
-static int hclge_set_serdes_loopback(struct hclge_dev *hdev, bool en,
+static int hclge_cfg_serdes_loopback(struct hclge_dev *hdev, bool en,
 				     enum hnae3_loop loop_mode)
 {
 #define HCLGE_SERDES_RETRY_MS	10
@@ -6392,6 +6397,17 @@ static int hclge_set_serdes_loopback(struct hclge_dev *hdev, bool en,
 		dev_err(&hdev->pdev->dev, "serdes loopback set failed in fw\n");
 		return -EIO;
 	}
+	return ret;
+}
+
+static int hclge_set_serdes_loopback(struct hclge_dev *hdev, bool en,
+				     enum hnae3_loop loop_mode)
+{
+	int ret;
+
+	ret = hclge_cfg_serdes_loopback(hdev, en, loop_mode);
+	if (ret)
+		return ret;
 
 	hclge_cfg_mac_mode(hdev, en);
 
@@ -6535,6 +6551,22 @@ static int hclge_set_loopback(struct hnae3_handle *handle,
 	return 0;
 }
 
+static int hclge_set_default_loopback(struct hclge_dev *hdev)
+{
+	int ret;
+
+	ret = hclge_set_app_loopback(hdev, false);
+	if (ret)
+		return ret;
+
+	ret = hclge_cfg_serdes_loopback(hdev, false, HNAE3_LOOP_SERIAL_SERDES);
+	if (ret)
+		return ret;
+
+	return hclge_cfg_serdes_loopback(hdev, false,
+					 HNAE3_LOOP_PARALLEL_SERDES);
+}
+
 static void hclge_reset_tqp_stats(struct hnae3_handle *handle)
 {
 	struct hclge_vport *vport = hclge_get_vport(handle);
-- 
2.7.4


^ permalink raw reply related

* [PATCH V2 net-next 0/7] net: hns3: add some bugfixes and cleanups
From: Huazhong Tan @ 2019-09-05 13:31 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm,
	jakub.kicinski, Huazhong Tan

This patch-set includes bugfixes and cleanups for the HNS3
ethernet controller driver.

[patch 01/07] fixes an error when setting VLAN offload.

[patch 02/07] fixes an double free issue when setting ringparam.

[patch 03/07] fixes a mis-assignment of hdev->reset_level.

[patch 04/07] adds a checking for client's validity.

[patch 05/07] simplifies bool variable's assignment.

[patch 06/07] disables loopback when initializing.

[patch 07/07] makes internal function to static.

Change log:
V1->V2: fixes comment from Sergei Shtylyov.

Guojia Liao (2):
  net: hns3: remove explicit conversion to bool
  net: hns3: make hclge_dbg_get_m7_stats_info static

Huazhong Tan (2):
  net: hns3: fix double free bug when setting ringparam
  net: hns3: fix mis-assignment to hdev->reset_level in hclge_reset

Jian Shen (1):
  net: hns3: fix error VF index when setting VLAN offload

Peng Li (1):
  net: hns3: add client node validity judgment

Yufeng Mo (1):
  net: hns3: disable loopback setting in hclge_mac_init

 drivers/net/ethernet/hisilicon/hns3/hnae3.c        | 16 ++++++
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c |  4 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c |  2 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 59 ++++++++++++++++++----
 4 files changed, 69 insertions(+), 12 deletions(-)

-- 
2.7.4


^ permalink raw reply

* RE: [PATCH net] net: sched: fix reordering issues
From: John Fastabend @ 2019-09-05 13:27 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller
  Cc: netdev, Eric Dumazet, Eric Dumazet, John Fastabend
In-Reply-To: <20190905122022.106680-1-edumazet@google.com>

Eric Dumazet wrote:
> Whenever MQ is not used on a multiqueue device, we experience
> serious reordering problems. Bisection found the cited
> commit.
> 
> The issue can be described this way :
> 
> - A single qdisc hierarchy is shared by all transmit queues.
>   (eg : tc qdisc replace dev eth0 root fq_codel)
> 
> - When/if try_bulk_dequeue_skb_slow() dequeues a packet targetting
>   a different transmit queue than the one used to build a packet train,
>   we stop building the current list and save the 'bad' skb (P1) in a
>   special queue. (bad_txq)
> 
> - When dequeue_skb() calls qdisc_dequeue_skb_bad_txq() and finds this
>   skb (P1), it checks if the associated transmit queues is still in frozen
>   state. If the queue is still blocked (by BQL or NIC tx ring full),
>   we leave the skb in bad_txq and return NULL.
> 
> - dequeue_skb() calls q->dequeue() to get another packet (P2)
> 
>   The other packet can target the problematic queue (that we found
>   in frozen state for the bad_txq packet), but another cpu just ran
>   TX completion and made room in the txq that is now ready to accept
>   new packets.
> 
> - Packet P2 is sent while P1 is still held in bad_txq, P1 might be sent
>   at next round. In practice P2 is the lead of a big packet train
>   (P2,P3,P4 ...) filling the BQL budget and delaying P1 by many packets :/
> 
> To solve this problem, we have to block the dequeue process as long
> as the first packet in bad_txq can not be sent. Reordering issues
> disappear and no side effects have been seen.
> 
> Fixes: a53851e2c321 ("net: sched: explicit locking in gso_cpu fallback")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: John Fastabend <john.fastabend@gmail.com>
> ---
>  net/sched/sch_generic.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 

Dang, missed this case. Thanks!

Acked-by: John Fastabend <john.fastabend@gmail.com>

^ permalink raw reply

* Re: [PATCH REPOST 1/2] can: flexcan: fix deadlock when using self wakeup
From: Sean Nyekjaer @ 2019-09-05 13:17 UTC (permalink / raw)
  To: Joakim Zhang, mkl@pengutronix.de, linux-can@vger.kernel.org
  Cc: wg@grandegger.com, netdev@vger.kernel.org, dl-linux-imx,
	Martin Hundebøll
In-Reply-To: <DB7PR04MB461868320DA0B25CC8255213E6BB0@DB7PR04MB4618.eurprd04.prod.outlook.com>



On 05/09/2019 09.10, Joakim Zhang wrote:
> Hi Sean,
> 
> Could you update lastest flexcan driver using linux-can-next/flexcan and then merge below two patches from linux-can/testing?
> d0b53616716e (HEAD -> testing, origin/testing) can: flexcan: add LPSR mode support for i.MX7D
> 803eb6bad65b can: flexcan: fix deadlock when using self wakeup
> 
> Best Regards,
> Joakim Zhang

The testing branch have some UBI bugs, when suspending it crashes...
So will have to leave this, until they are resolved :-)

/Sean

^ permalink raw reply

* Re: [PATCH bpf] bpf: fix precision tracking of stack slots
From: Daniel Borkmann @ 2019-09-05 13:15 UTC (permalink / raw)
  To: Alexei Starovoitov, davem; +Cc: netdev, bpf, kernel-team
In-Reply-To: <20190903221617.635375-1-ast@kernel.org>

On 9/4/19 12:16 AM, Alexei Starovoitov wrote:
> The problem can be seen in the following two tests:
> 0: (bf) r3 = r10
> 1: (55) if r3 != 0x7b goto pc+0
> 2: (7a) *(u64 *)(r3 -8) = 0
> 3: (79) r4 = *(u64 *)(r10 -8)
> ..
> 0: (85) call bpf_get_prandom_u32#7
> 1: (bf) r3 = r10
> 2: (55) if r3 != 0x7b goto pc+0
> 3: (7b) *(u64 *)(r3 -8) = r0
> 4: (79) r4 = *(u64 *)(r10 -8)
> 
> When backtracking need to mark R4 it will mark slot fp-8.
> But ST or STX into fp-8 could belong to the same block of instructions.
> When backtracing is done the parent state may have fp-8 slot
> as "unallocated stack". Which will cause verifier to warn
> and incorrectly reject such programs.
> 
> Writes into stack via non-R10 register are rare. llvm always
> generates canonical stack spill/fill.
> For such pathological case fall back to conservative precision
> tracking instead of rejecting.
> 
> Reported-by: syzbot+c8d66267fd2b5955287e@syzkaller.appspotmail.com
> Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking")
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Applied, thanks!

^ permalink raw reply

* Re: [PATCH bpf-next v3 0/4] xsk: various CPU barrier and {READ, WRITE}_ONCE
From: Daniel Borkmann @ 2019-09-05 13:14 UTC (permalink / raw)
  To: Björn Töpel, ast, netdev
  Cc: magnus.karlsson, magnus.karlsson, bpf, bjorn.topel,
	jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190904114913.17217-1-bjorn.topel@gmail.com>

On 9/4/19 1:49 PM, Björn Töpel wrote:
> This is a four patch series of various barrier, {READ, WRITE}_ONCE
> cleanups in the AF_XDP socket code. More details can be found in the
> corresponding commit message. Previous revisions: v1 [4] and v2 [5].
> 
> For an AF_XDP socket, most control plane operations are done under the
> control mutex (struct xdp_sock, mutex), but there are some places
> where members of the struct is read outside the control mutex. The
> dev, queue_id members are set in bind() and cleared at cleanup. The
> umem, fq, cq, tx, rx, and state member are all assigned in various
> places, e.g. bind() and setsockopt(). When the members are assigned,
> they are protected by the control mutex, but since they are read
> outside the mutex, a WRITE_ONCE is required to avoid store-tearing on
> the read-side.
> 
> Prior the state variable was introduced by Ilya, the dev member was
> used to determine whether the socket was bound or not. However, when
> dev was read, proper SMP barriers and READ_ONCE were missing. In order
> to address the missing barriers and READ_ONCE, we start using the
> state variable as a point of synchronization. The state member
> read/write is paired with proper SMP barriers, and from this follows
> that the members described above does not need READ_ONCE statements if
> used in conjunction with state check.
> 
> To summarize: The members struct xdp_sock members dev, queue_id, umem,
> fq, cq, tx, rx, and state were read lock-less, with incorrect barriers
> and missing {READ, WRITE}_ONCE. After this series umem, fq, cq, tx,
> rx, and state are read lock-less. When these members are updated,
> WRITE_ONCE is used. When read, READ_ONCE are only used when read
> outside the control mutex (e.g. mmap) or, not synchronized with the
> state member (XSK_BOUND plus smp_rmb())
> 
> Thanks,
> Björn
> 
> [1] https://lore.kernel.org/bpf/beef16bb-a09b-40f1-7dd0-c323b4b89b17@iogearbox.net/
> [2] https://lwn.net/Articles/793253/
> [3] https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE
> [4] https://lore.kernel.org/bpf/20190822091306.20581-1-bjorn.topel@gmail.com/
> [5] https://lore.kernel.org/bpf/20190826061053.15996-1-bjorn.topel@gmail.com/
> 
> v2->v3:
>    Minor restructure of commits.
>    Improve cover and commit messages. (Daniel)
> v1->v2:
>    Removed redundant dev check. (Jonathan)
> 
> Björn Töpel (4):
>    xsk: avoid store-tearing when assigning queues
>    xsk: avoid store-tearing when assigning umem
>    xsk: use state member for socket synchronization
>    xsk: lock the control mutex in sock_diag interface
> 
>   net/xdp/xsk.c      | 60 ++++++++++++++++++++++++++++++++--------------
>   net/xdp/xsk_diag.c |  3 +++
>   2 files changed, 45 insertions(+), 18 deletions(-)
> 

Applied, thanks!

^ permalink raw reply

* Re: [PATCH bpf-next] selftests/bpf: precision tracking tests
From: Daniel Borkmann @ 2019-09-05 13:13 UTC (permalink / raw)
  To: Alexei Starovoitov, davem; +Cc: netdev, bpf, kernel-team
In-Reply-To: <20190903225133.821243-1-ast@kernel.org>

On 9/4/19 12:51 AM, Alexei Starovoitov wrote:
> Add two tests to check that stack slot marking during backtracking
> doesn't trigger 'spi > allocated_stack' warning.
> One test is using BPF_ST insn. Another is using BPF_STX.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Applied, thanks!

^ permalink raw reply

* Re: [PATCH bpf-next] ixgbe: fix xdp handle calculations
From: Daniel Borkmann @ 2019-09-05 13:13 UTC (permalink / raw)
  To: Kevin Laatz, netdev, ast, bjorn.topel, magnus.karlsson,
	jonathan.lemon
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan
In-Reply-To: <20190905011217.3567-1-kevin.laatz@intel.com>

On 9/5/19 3:12 AM, Kevin Laatz wrote:
> Currently, we don't add headroom to the handle in ixgbe_zca_free,
> ixgbe_alloc_buffer_slow_zc and ixgbe_alloc_buffer_zc. The addition of the
> headroom to the handle was removed in
> commit d8c3061e5edd ("ixgbe: modify driver for handling offsets"), which
> will break things when headroom isvnon-zero. This patch fixes this and uses
> xsk_umem_adjust_offset to add it appropritely based on the mode being run.
> 
> Fixes: d8c3061e5edd ("ixgbe: modify driver for handling offsets")
> Reported-by: Bjorn Topel <bjorn.topel@intel.com>
> Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>

Applied, thanks!

^ permalink raw reply

* Re: [PATCH bpf-next] i40e: fix xdp handle calculations
From: Daniel Borkmann @ 2019-09-05 13:13 UTC (permalink / raw)
  To: Kevin Laatz, netdev, ast, bjorn.topel, magnus.karlsson,
	jonathan.lemon
  Cc: bruce.richardson, ciara.loftus, bpf, intel-wired-lan
In-Reply-To: <20190905011144.3513-1-kevin.laatz@intel.com>

On 9/5/19 3:11 AM, Kevin Laatz wrote:
> Currently, we don't add headroom to the handle in i40e_zca_free,
> i40e_alloc_buffer_slow_zc and i40e_alloc_buffer_zc. The addition of the
> headroom to the handle was removed in
> commit 2f86c806a8a8 ("i40e: modify driver for handling offsets"), which
> will break things when headroom is non-zero. This patch fixes this and uses
> xsk_umem_adjust_offset to add it appropritely based on the mode being run.
> 
> Fixes: 2f86c806a8a8 ("i40e: modify driver for handling offsets")
> Reported-by: Bjorn Topel <bjorn.topel@intel.com>
> Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>

Applied, thanks!

^ permalink raw reply

* IPv6 temporary addresses (Fwd: New Version Notification for draft-ietf-6man-rfc4941bis-03.txt)
From: Fernando Gont @ 2019-09-05 12:56 UTC (permalink / raw)
  To: netdev
In-Reply-To: <156768734939.22666.4804883631217307240.idtracker@ietfa.amsl.com>

Folks,

We are working in a revision of RFC4941 (temporary addresses), to
address issues found in such spec -- for example, RFC4941 leads to the
same interface identifiers being employed for different prefixes (which
of course has privacy implications)

The current version of the revised spec is available at:
https://tools.ietf.org/html/draft-ietf-6man-rfc4941bis

We'd really like to hear your comments and double-check if we got
everything right, if there's something that we missed, etc.

If possible, please post your feedback on the 6man list
(https://www.ietf.org/mailman/listinfo/ipv6). But I'll be happy to
receive comments here or unicast and relay them as necessary.

Thanks!

Cheers,
Fernando




-------- Forwarded Message --------
Subject: New Version Notification for draft-ietf-6man-rfc4941bis-03.txt
Date: Thu, 05 Sep 2019 05:42:29 -0700
From: internet-drafts@ietf.org
To: Fernando Gont <fgont@si6networks.com>, Suresh Krishnan
<suresh.krishnan@ericsson.com>, Richard Draves <richdr@microsoft.com>,
Thomas Narten <narten@us.ibm.com>


A new version of I-D, draft-ietf-6man-rfc4941bis-03.txt
has been successfully submitted by Fernando Gont and posted to the
IETF repository.

Name:		draft-ietf-6man-rfc4941bis
Revision:	03
Title:		Privacy Extensions for Stateless Address Autoconfiguration in IPv6
Document date:	2019-09-05
Group:		6man
Pages:		21
URL:
https://www.ietf.org/internet-drafts/draft-ietf-6man-rfc4941bis-03.txt
Status:         https://datatracker.ietf.org/doc/draft-ietf-6man-rfc4941bis/
Htmlized:       https://tools.ietf.org/html/draft-ietf-6man-rfc4941bis-03
Htmlized:
https://datatracker.ietf.org/doc/html/draft-ietf-6man-rfc4941bis
Diff:
https://www.ietf.org/rfcdiff?url2=draft-ietf-6man-rfc4941bis-03

Abstract:
   Nodes use IPv6 stateless address autoconfiguration to generate
   addresses using a combination of locally available information and
   information advertised by routers.  Addresses are formed by combining
   network prefixes with an interface identifier.  This document
   describes an extension that causes nodes to generate global scope
   addresses with randomized interface identifiers that change over
   time.  Changing global scope addresses over time makes it more
   difficult for eavesdroppers and other information collectors to
   identify when different addresses used in different transactions
   actually correspond to the same node.  This document formally
   obsoletes RFC4941.




Please note that it may take a couple of minutes from the time of submission
until the htmlized version and diff are available at tools.ietf.org.

The IETF Secretariat



^ permalink raw reply

* [PATCH v1] stmmac: platform: adjust messages and move to dev level
From: Andy Shevchenko @ 2019-09-05 13:00 UTC (permalink / raw)
  To: Giuseppe Cavallaro, Alexandre Torgue, Jose Abreu, netdev,
	Maxime Coquelin, linux-stm32, David S . Miller 
  Cc: Andy Shevchenko

This patch amends the error and warning messages across the platform driver.
It includes the following changes:
 - append \n to the end of messages
 - change pr_* macros to dev_*

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 .../ethernet/stmicro/stmmac/stmmac_platform.c | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 154daf4d1072..1da461907114 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -23,6 +23,7 @@
 
 /**
  * dwmac1000_validate_mcast_bins - validates the number of Multicast filter bins
+ * @dev: struct device of the platform device
  * @mcast_bins: Multicast filtering bins
  * Description:
  * this function validates the number of Multicast filtering bins specified
@@ -33,7 +34,7 @@
  * invalid and will cause the filtering algorithm to use Multicast
  * promiscuous mode.
  */
-static int dwmac1000_validate_mcast_bins(int mcast_bins)
+static int dwmac1000_validate_mcast_bins(struct device *dev, int mcast_bins)
 {
 	int x = mcast_bins;
 
@@ -44,8 +45,8 @@ static int dwmac1000_validate_mcast_bins(int mcast_bins)
 		break;
 	default:
 		x = 0;
-		pr_info("Hash table entries set to unexpected value %d",
-			mcast_bins);
+		dev_info(dev, "Hash table entries set to unexpected value %d\n",
+			 mcast_bins);
 		break;
 	}
 	return x;
@@ -53,6 +54,7 @@ static int dwmac1000_validate_mcast_bins(int mcast_bins)
 
 /**
  * dwmac1000_validate_ucast_entries - validate the Unicast address entries
+ * @dev: struct device of the platform device
  * @ucast_entries: number of Unicast address entries
  * Description:
  * This function validates the number of Unicast address entries supported
@@ -62,7 +64,8 @@ static int dwmac1000_validate_mcast_bins(int mcast_bins)
  * selected, and defaults to 1 Unicast address if an unsupported
  * configuration is selected.
  */
-static int dwmac1000_validate_ucast_entries(int ucast_entries)
+static int dwmac1000_validate_ucast_entries(struct device *dev,
+					    int ucast_entries)
 {
 	int x = ucast_entries;
 
@@ -73,8 +76,8 @@ static int dwmac1000_validate_ucast_entries(int ucast_entries)
 		break;
 	default:
 		x = 1;
-		pr_info("Unicast table entries set to unexpected value %d\n",
-			ucast_entries);
+		dev_info(dev, "Unicast table entries set to unexpected value %d\n",
+			 ucast_entries);
 		break;
 	}
 	return x;
@@ -457,9 +460,9 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 		of_property_read_u32(np, "snps,perfect-filter-entries",
 				     &plat->unicast_filter_entries);
 		plat->unicast_filter_entries = dwmac1000_validate_ucast_entries(
-					       plat->unicast_filter_entries);
+				&pdev->dev, plat->unicast_filter_entries);
 		plat->multicast_filter_bins = dwmac1000_validate_mcast_bins(
-					      plat->multicast_filter_bins);
+				&pdev->dev, plat->multicast_filter_bins);
 		plat->has_gmac = 1;
 		plat->pmt = 1;
 	}
@@ -508,7 +511,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 	plat->force_thresh_dma_mode = of_property_read_bool(np, "snps,force_thresh_dma_mode");
 	if (plat->force_thresh_dma_mode) {
 		plat->force_sf_dma_mode = 0;
-		pr_warn("force_sf_dma_mode is ignored if force_thresh_dma_mode is set.");
+		dev_warn(&pdev->dev,
+			 "force_sf_dma_mode is ignored if force_thresh_dma_mode is set.\n");
 	}
 
 	of_property_read_u32(np, "snps,ps-speed", &plat->mac_port_sel_speed);
-- 
2.23.0.rc1


^ permalink raw reply related

* [PATCH net-next] MAINTAINERS: add myself as maintainer for xilinx axiethernet driver
From: Radhey Shyam Pandey @ 2019-09-05 12:56 UTC (permalink / raw)
  To: davem, netdev
  Cc: michal.simek, anirudha.sarangi, john.linn, mchehab+samsung,
	gregkh, nicolas.ferre, linux-arm-kernel, linux-kernel,
	Radhey Shyam Pandey

I am maintaining xilinx axiethernet driver in xilinx tree and would like
to maintain it in the mainline kernel as well. Hence adding myself as a
maintainer. Also Anirudha and John has moved to new roles, so based on
request removing them from the maintainer list.

Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Acked-by: John Linn <john.linn@xilinx.com>
Acked-by: Michal Simek <michal.simek@xilinx.com>
---
I am resending this patch as earlier version didn't go through mailing
list due to some config restriction on the external email addresses.
Also adding Michal's acked-by tag.
---
 MAINTAINERS | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 84bb347..16fc09d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17732,8 +17732,7 @@ F:	include/uapi/linux/dqblk_xfs.h
 F:	include/uapi/linux/fsmap.h
 
 XILINX AXI ETHERNET DRIVER
-M:	Anirudha Sarangi <anirudh@xilinx.com>
-M:	John Linn <John.Linn@xilinx.com>
+M:	Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
 S:	Maintained
 F:	drivers/net/ethernet/xilinx/xilinx_axienet*
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH iproute2 4/4] devlink: Fix devlink health set command
From: Tariq Toukan @ 2019-09-05 12:43 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: netdev, Moshe Shemesh, Aya Levin, Jiri Pirko, Tariq Toukan
In-Reply-To: <1567687387-12993-1-git-send-email-tariqt@mellanox.com>

From: Aya Levin <ayal@mellanox.com>

Prior to this patch both the reporter's name and the grace period
attributes shared the same bit. This caused zeroing grace period when
setting auto recovery. Let each parameter has its own bit.

Fixes: b18d89195b16 ("devlink: Add devlink health set command")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
---
 devlink/devlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 722b6a101673..306abfb5222b 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -231,11 +231,11 @@ static void ifname_map_free(struct ifname_map *ifname_map)
 #define DL_OPT_FLASH_FILE_NAME	BIT(25)
 #define DL_OPT_FLASH_COMPONENT	BIT(26)
 #define DL_OPT_HEALTH_REPORTER_NAME	BIT(27)
-#define DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD	BIT(27)
-#define DL_OPT_HEALTH_REPORTER_AUTO_RECOVER	BIT(28)
+#define DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD	BIT(28)
 #define DL_OPT_TRAP_NAME		BIT(29)
 #define DL_OPT_TRAP_ACTION		BIT(30)
 #define DL_OPT_TRAP_GROUP_NAME		BIT(31)
+#define DL_OPT_HEALTH_REPORTER_AUTO_RECOVER	BIT(32)
 
 struct dl_opts {
 	uint64_t present; /* flags of present items */
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH iproute2 2/4] devlink: Left justification on FMSG output
From: Tariq Toukan @ 2019-09-05 12:43 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: netdev, Moshe Shemesh, Aya Levin, Jiri Pirko, Tariq Toukan
In-Reply-To: <1567687387-12993-1-git-send-email-tariqt@mellanox.com>

From: Aya Levin <ayal@mellanox.com>

FMSG output is dynamic, space separator must be on the left hand side of
the value. Otherwise output has redundant left indentation regardless
the hierarchy.

Before the patch:
 Common config: SQ: stride size: 64 size: 1024
 CQ: stride size: 64 size: 1024
 SQs:
   channel ix: 0 tc: 0 txq ix: 0 sqn: 10 HW state: 1 stopped: false cc: 0 pc: 0 CQ: cqn: 6 HW status: 0
   channel ix: 1 tc: 0 txq ix: 1 sqn: 14 HW state: 1 stopped: false cc: 0 pc: 0 CQ: cqn: 10 HW status: 0
   channel ix: 2 tc: 0 txq ix: 2 sqn: 18 HW state: 1 stopped: false cc: 5 pc: 5 CQ: cqn: 14 HW status: 0
   channel ix: 3 tc: 0 txq ix: 3 sqn: 22 HW state: 1 stopped: false cc: 0 pc: 0 CQ: cqn: 18 HW status: 0

With the patch:
Common config: SQ: stride size: 64 size: 1024
CQ: stride size: 64 size: 1024
SQs:
  channel ix: 0 tc: 0 txq ix: 0 sqn: 10 HW state: 1 stopped: false cc: 0 pc: 0 CQ: cqn: 6 HW status: 0
  channel ix: 1 tc: 0 txq ix: 1 sqn: 14 HW state: 1 stopped: false cc: 0 pc: 0 CQ: cqn: 10 HW status: 0
  channel ix: 2 tc: 0 txq ix: 2 sqn: 18 HW state: 1 stopped: false cc: 5 pc: 5 CQ: cqn: 14 HW status: 0
  channel ix: 3 tc: 0 txq ix: 3 sqn: 22 HW state: 1 stopped: false cc: 0 pc: 0 CQ: cqn: 18 HW status: 0

Fixes: 844a61764c6f ("devlink: Add helper functions for name and value separately")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
---
 devlink/devlink.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index f1b9b2da39d7..1bfc3283a832 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -1843,26 +1843,29 @@ static void pr_out_u64(struct dl *dl, const char *name, uint64_t val)
 
 static void pr_out_bool_value(struct dl *dl, bool value)
 {
+	__pr_out_indent_newline(dl);
 	if (dl->json_output)
 		jsonw_bool(dl->jw, value);
 	else
-		pr_out(" %s", value ? "true" : "false");
+		pr_out("%s", value ? "true" : "false");
 }
 
 static void pr_out_uint_value(struct dl *dl, unsigned int value)
 {
+	__pr_out_indent_newline(dl);
 	if (dl->json_output)
 		jsonw_uint(dl->jw, value);
 	else
-		pr_out(" %u", value);
+		pr_out("%u", value);
 }
 
 static void pr_out_uint64_value(struct dl *dl, uint64_t value)
 {
+	__pr_out_indent_newline(dl);
 	if (dl->json_output)
 		jsonw_u64(dl->jw, value);
 	else
-		pr_out(" %"PRIu64, value);
+		pr_out("%"PRIu64, value);
 }
 
 static bool is_binary_eol(int i)
@@ -1889,18 +1892,20 @@ static void pr_out_binary_value(struct dl *dl, uint8_t *data, uint32_t len)
 
 static void pr_out_str_value(struct dl *dl, const char *value)
 {
+	__pr_out_indent_newline(dl);
 	if (dl->json_output)
 		jsonw_string(dl->jw, value);
 	else
-		pr_out(" %s", value);
+		pr_out("%s", value);
 }
 
 static void pr_out_name(struct dl *dl, const char *name)
 {
+	__pr_out_indent_newline(dl);
 	if (dl->json_output)
 		jsonw_name(dl->jw, name);
 	else
-		pr_out(" %s:", name);
+		pr_out("%s:", name);
 }
 
 static void pr_out_region_chunk_start(struct dl *dl, uint64_t addr)
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH iproute2 1/4] devlink: Add helper for left justification print
From: Tariq Toukan @ 2019-09-05 12:43 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: netdev, Moshe Shemesh, Aya Levin, Jiri Pirko, Tariq Toukan
In-Reply-To: <1567687387-12993-1-git-send-email-tariqt@mellanox.com>

From: Aya Levin <ayal@mellanox.com>

Introduce a helper function which wraps code that adds a left hand side
space separator unless it follows a newline.

Fixes: e3d0f0c0e3d8 ("devlink: add option to generate JSON output")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
---
 devlink/devlink.c | 45 ++++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 2f084c020765..f1b9b2da39d7 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -355,6 +355,12 @@ static bool dl_no_arg(struct dl *dl)
 	return dl_argc(dl) == 0;
 }
 
+static void __pr_out_indent_newline(struct dl *dl)
+{
+	if (!g_indent_newline && !dl->json_output)
+		pr_out(" ");
+}
+
 static const enum mnl_attr_data_type devlink_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = MNL_TYPE_NUL_STRING,
 	[DEVLINK_ATTR_DEV_NAME] = MNL_TYPE_NUL_STRING,
@@ -1799,14 +1805,11 @@ static void pr_out_port_handle_end(struct dl *dl)
 
 static void pr_out_str(struct dl *dl, const char *name, const char *val)
 {
-	if (dl->json_output) {
+	__pr_out_indent_newline(dl);
+	if (dl->json_output)
 		jsonw_string_field(dl->jw, name, val);
-	} else {
-		if (g_indent_newline)
-			pr_out("%s %s", name, val);
-		else
-			pr_out(" %s %s", name, val);
-	}
+	else
+		pr_out("%s %s", name, val);
 }
 
 static void pr_out_bool(struct dl *dl, const char *name, bool val)
@@ -1819,29 +1822,23 @@ static void pr_out_bool(struct dl *dl, const char *name, bool val)
 
 static void pr_out_uint(struct dl *dl, const char *name, unsigned int val)
 {
-	if (dl->json_output) {
+	__pr_out_indent_newline(dl);
+	if (dl->json_output)
 		jsonw_uint_field(dl->jw, name, val);
-	} else {
-		if (g_indent_newline)
-			pr_out("%s %u", name, val);
-		else
-			pr_out(" %s %u", name, val);
-	}
+	else
+		pr_out("%s %u", name, val);
 }
 
 static void pr_out_u64(struct dl *dl, const char *name, uint64_t val)
 {
+	__pr_out_indent_newline(dl);
 	if (val == (uint64_t) -1)
 		return pr_out_str(dl, name, "unlimited");
 
-	if (dl->json_output) {
+	if (dl->json_output)
 		jsonw_u64_field(dl->jw, name, val);
-	} else {
-		if (g_indent_newline)
-			pr_out("%s %"PRIu64, name, val);
-		else
-			pr_out(" %s %"PRIu64, name, val);
-	}
+	else
+		pr_out("%s %"PRIu64, name, val);
 }
 
 static void pr_out_bool_value(struct dl *dl, bool value)
@@ -5835,14 +5832,12 @@ static void pr_out_region_handle_end(struct dl *dl)
 
 static void pr_out_region_snapshots_start(struct dl *dl, bool array)
 {
+	__pr_out_indent_newline(dl);
 	if (dl->json_output) {
 		jsonw_name(dl->jw, "snapshot");
 		jsonw_start_array(dl->jw);
 	} else {
-		if (g_indent_newline)
-			pr_out("snapshot %s", array ? "[" : "");
-		else
-			pr_out(" snapshot %s", array ? "[" : "");
+		pr_out("snapshot %s", array ? "[" : "");
 	}
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH iproute2 3/4] devlink: Fix inconsistency between command input and output
From: Tariq Toukan @ 2019-09-05 12:43 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: netdev, Moshe Shemesh, Aya Levin, Jiri Pirko, Tariq Toukan
In-Reply-To: <1567687387-12993-1-git-send-email-tariqt@mellanox.com>

From: Aya Levin <ayal@mellanox.com>

In devlink health show command the reporter's name parameter is called
reporter, but in the output the reporter's name is referred to as name

Before this patch:
$ devlink health show pci/0000:04:00.0 reporter tx
pci/0000:04:00.0:
   name tx
     state healthy error 0 recover 0 grace_period 500 auto_recover true

After this patch:
$ devlink health show pci/0000:04:00.0 reporter tx
pci/0000:04:00.0:
   reporter tx
     state healthy error 0 recover 0 grace_period 500 auto_recover true

Fixes: 2f1242efe9d0 ("devlink: Add devlink health show command")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Reported-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
---
 devlink/devlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 1bfc3283a832..722b6a101673 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -6377,7 +6377,7 @@ static void pr_out_health(struct dl *dl, struct nlattr **tb_health)
 
 	pr_out_handle_start_arr(dl, tb_health);
 
-	pr_out_str(dl, "name",
+	pr_out_str(dl, "reporter",
 		   mnl_attr_get_str(tb[DEVLINK_ATTR_HEALTH_REPORTER_NAME]));
 	if (!dl->json_output) {
 		__pr_out_newline();
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH iproute2 0/4] Devlink health FMSG fixes and enhancements
From: Tariq Toukan @ 2019-09-05 12:43 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: netdev, Moshe Shemesh, Aya Levin, Jiri Pirko, Tariq Toukan

Hi,

This patchset by Aya enhances FMSG output and fixes bugs in devlink health.

Patch 1 adds a helper function wrapping repeating code which determines
  whether left-hand-side space separator in needed or not. It's not
  needed after a newline.
Patch 2 fixes left justification of FMSG output. Prior to this patch
  the FMSG output had an extra space on the left-hand-side. This patch
  avoids this by looking at a flag turned on by pr_out_new_line.
Patch 3 fixes inconsistency between input and output in devlink health
  show command.
Patch 4 fixes ill setting of auto_recovery, prior to this patch setting
  auto_recovery zeroed grace_period.

Series generated against master commit:
84b9168328bf ip nexthop: Allow flush|list operations to specify a specific protocol

Thanks,
Tariq

Aya Levin (4):
  devlink: Add helper for left justification print
  devlink: Left justification on FMSG output
  devlink: Fix inconsistency between command input and output
  devlink: Fix devlink health set command

 devlink/devlink.c | 66 +++++++++++++++++++++++++++----------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
From: Jason Wang @ 2019-09-05 12:27 UTC (permalink / raw)
  To: mst, jasowang, kvm, virtualization
  Cc: netdev, linux-kernel, jgg, aarcange, jglisse, linux-mm,
	James Bottomley, Christoph Hellwig, David Miller,
	linux-arm-kernel, linux-parisc
In-Reply-To: <20190905122736.19768-1-jasowang@redhat.com>

This is a rework on the commit 7f466032dc9e ("vhost: access vq
metadata through kernel virtual address").

It was noticed that the copy_to/from_user() friends that was used to
access virtqueue metdata tends to be very expensive for dataplane
implementation like vhost since it involves lots of software checks,
speculation barriers, hardware feature toggling (e.g SMAP). The
extra cost will be more obvious when transferring small packets since
the time spent on metadata accessing become more significant.

This patch tries to eliminate those overheads by accessing them
through direct mapping of those pages. Invalidation callbacks is
implemented for co-operation with general VM management (swap, KSM,
THP or NUMA balancing). We will try to get the direct mapping of vq
metadata before each round of packet processing if it doesn't
exist. If we fail, we will simplely fallback to copy_to/from_user()
friends.

This invalidation, direct mapping access and set are synchronized
through spinlock. This takes a step back from the original commit
7f466032dc9e ("vhost: access vq metadata through kernel virtual
address") which tries to RCU which is suspicious and hard to be
reviewed. This won't perform as well as RCU because of the atomic,
this could be addressed by the future optimization.

This method might does not work for high mem page which requires
temporary mapping so we just fallback to normal
copy_to/from_user() and may not for arch that has virtual tagged cache
since extra cache flushing is needed to eliminate the alias. This will
result complex logic and bad performance. For those archs, this patch
simply go for copy_to/from_user() friends. This is done by ruling out
kernel mapping codes through ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.

Note that this is only done when device IOTLB is not enabled. We
could use similar method to optimize IOTLB in the future.

Tests shows at most about 22% improvement on TX PPS when using
virtio-user + vhost_net + xdp1 + TAP on 4.0GHz Kaby Lake.

        SMAP on | SMAP off
Before: 4.9Mpps | 6.9Mpps
After:  6.0Mpps | 7.5Mpps

On a elder CPU Sandy Bridge without SMAP support. TX PPS doesn't see
any difference.

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: linux-mm@kvack.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-parisc@vger.kernel.org
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.h |  41 ++++
 2 files changed, 589 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 791562e03fe0..f98155f28f02 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 		__vhost_vq_meta_reset(d->vqs[i]);
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_map_unprefetch(struct vhost_map *map)
+{
+	kfree(map->pages);
+	kfree(map);
+}
+
+static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
+				struct vhost_map *map, int index)
+{
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	int i;
+
+	if (uaddr->write) {
+		for (i = 0; i < map->npages; i++)
+			set_page_dirty(map->pages[i]);
+	}
+}
+
+static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
+{
+	struct vhost_map *map[VHOST_NUM_ADDRS];
+	int i;
+
+	spin_lock(&vq->mmu_lock);
+	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+		map[i] = vq->maps[i];
+		if (map[i]) {
+			vhost_set_map_dirty(vq, map[i], i);
+			vq->maps[i] = NULL;
+		}
+	}
+	spin_unlock(&vq->mmu_lock);
+
+	/* No need for synchronization since we are serialized with
+	 * memory accessors (e.g vq mutex held).
+	 */
+
+	for (i = 0; i < VHOST_NUM_ADDRS; i++)
+		if (map[i])
+			vhost_map_unprefetch(map[i]);
+
+}
+
+static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
+{
+	int i;
+
+	vhost_uninit_vq_maps(vq);
+	for (i = 0; i < VHOST_NUM_ADDRS; i++)
+		vq->uaddrs[i].size = 0;
+}
+
+static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
+				     unsigned long start,
+				     unsigned long end)
+{
+	if (unlikely(!uaddr->size))
+		return false;
+
+	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
+}
+
+static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
+{
+	spin_lock(&vq->mmu_lock);
+}
+
+static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
+{
+	spin_unlock(&vq->mmu_lock);
+}
+
+static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
+				     int index,
+				     unsigned long start,
+				     unsigned long end,
+				     bool blockable)
+{
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	struct vhost_map *map;
+
+	if (!vhost_map_range_overlap(uaddr, start, end))
+		return 0;
+	else if (!blockable)
+		return -EAGAIN;
+
+	spin_lock(&vq->mmu_lock);
+	++vq->invalidate_count;
+
+	map = vq->maps[index];
+	if (map)
+		vq->maps[index] = NULL;
+	spin_unlock(&vq->mmu_lock);
+
+	if (map) {
+		vhost_set_map_dirty(vq, map, index);
+		vhost_map_unprefetch(map);
+	}
+
+	return 0;
+}
+
+static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
+				    int index,
+				    unsigned long start,
+				    unsigned long end)
+{
+	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
+		return;
+
+	spin_lock(&vq->mmu_lock);
+	--vq->invalidate_count;
+	spin_unlock(&vq->mmu_lock);
+}
+
+static int vhost_invalidate_range_start(struct mmu_notifier *mn,
+					const struct mmu_notifier_range *range)
+{
+	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+					     mmu_notifier);
+	bool blockable = mmu_notifier_range_blockable(range);
+	int i, j, ret;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		struct vhost_virtqueue *vq = dev->vqs[i];
+
+		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
+			ret = vhost_invalidate_vq_start(vq, j,
+							range->start,
+							range->end, blockable);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void vhost_invalidate_range_end(struct mmu_notifier *mn,
+				       const struct mmu_notifier_range *range)
+{
+	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+					     mmu_notifier);
+	int i, j;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		struct vhost_virtqueue *vq = dev->vqs[i];
+
+		for (j = 0; j < VHOST_NUM_ADDRS; j++)
+			vhost_invalidate_vq_end(vq, j,
+						range->start,
+						range->end);
+	}
+}
+
+static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
+	.invalidate_range_start = vhost_invalidate_range_start,
+	.invalidate_range_end = vhost_invalidate_range_end,
+};
+
+static void vhost_init_maps(struct vhost_dev *dev)
+{
+	struct vhost_virtqueue *vq;
+	int i, j;
+
+	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
+
+	for (i = 0; i < dev->nvqs; ++i) {
+		vq = dev->vqs[i];
+		for (j = 0; j < VHOST_NUM_ADDRS; j++)
+			vq->maps[j] = NULL;
+	}
+}
+#endif
+
 static void vhost_vq_reset(struct vhost_dev *dev,
 			   struct vhost_virtqueue *vq)
 {
@@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->busyloop_timeout = 0;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
+	vq->invalidate_count = 0;
 	__vhost_vq_meta_reset(vq);
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	vhost_reset_vq_maps(vq);
+#endif
 }
 
 static int vhost_worker(void *data)
@@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->iov_limit = iov_limit;
 	dev->weight = weight;
 	dev->byte_weight = byte_weight;
+	dev->has_notifier = false;
 	init_llist_head(&dev->work_list);
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
 	spin_lock_init(&dev->iotlb_lock);
-
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	vhost_init_maps(dev);
+#endif
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
@@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->heads = NULL;
 		vq->dev = dev;
 		mutex_init(&vq->mutex);
+		spin_lock_init(&vq->mmu_lock);
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
 			vhost_poll_init(&vq->poll, vq->handle_kick,
@@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	if (err)
 		goto err_cgroup;
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
+	if (err)
+		goto err_mmu_notifier;
+#endif
+	dev->has_notifier = true;
+
 	return 0;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+err_mmu_notifier:
+	vhost_dev_free_iovecs(dev);
+#endif
 err_cgroup:
 	kthread_stop(worker);
 	dev->worker = NULL;
@@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
 	spin_unlock(&dev->iotlb_lock);
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
+			      int index, unsigned long uaddr,
+			      size_t size, bool write)
+{
+	struct vhost_uaddr *addr = &vq->uaddrs[index];
+
+	addr->uaddr = uaddr;
+	addr->size = size;
+	addr->write = write;
+}
+
+static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
+{
+	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
+			  (unsigned long)vq->desc,
+			  vhost_get_desc_size(vq, vq->num),
+			  false);
+	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
+			  (unsigned long)vq->avail,
+			  vhost_get_avail_size(vq, vq->num),
+			  false);
+	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
+			  (unsigned long)vq->used,
+			  vhost_get_used_size(vq, vq->num),
+			  true);
+}
+
+static int vhost_map_prefetch(struct vhost_virtqueue *vq,
+			       int index)
+{
+	struct vhost_map *map;
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	struct page **pages;
+	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
+	int npinned;
+	void *vaddr, *v;
+	int err;
+	int i;
+
+	spin_lock(&vq->mmu_lock);
+
+	err = -EFAULT;
+	if (vq->invalidate_count)
+		goto err;
+
+	err = -ENOMEM;
+	map = kmalloc(sizeof(*map), GFP_ATOMIC);
+	if (!map)
+		goto err;
+
+	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
+	if (!pages)
+		goto err_pages;
+
+	err = EFAULT;
+	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
+					uaddr->write, pages);
+	if (npinned > 0)
+		release_pages(pages, npinned);
+	if (npinned != npages)
+		goto err_gup;
+
+	for (i = 0; i < npinned; i++)
+		if (PageHighMem(pages[i]))
+			goto err_gup;
+
+	vaddr = v = page_address(pages[0]);
+
+	/* For simplicity, fallback to userspace address if VA is not
+	 * contigious.
+	 */
+	for (i = 1; i < npinned; i++) {
+		v += PAGE_SIZE;
+		if (v != page_address(pages[i]))
+			goto err_gup;
+	}
+
+	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
+	map->npages = npages;
+	map->pages = pages;
+
+	vq->maps[index] = map;
+	/* No need for a synchronize_rcu(). This function should be
+	 * called by dev->worker so we are serialized with all
+	 * readers.
+	 */
+	spin_unlock(&vq->mmu_lock);
+
+	return 0;
+
+err_gup:
+	kfree(pages);
+err_pages:
+	kfree(map);
+err:
+	spin_unlock(&vq->mmu_lock);
+	return err;
+}
+#endif
+
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
@@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		kthread_stop(dev->worker);
 		dev->worker = NULL;
 	}
-	if (dev->mm)
+	if (dev->mm) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+		if (dev->has_notifier) {
+			mmu_notifier_unregister(&dev->mmu_notifier,
+						dev->mm);
+			dev->has_notifier = false;
+		}
+#endif
 		mmput(dev->mm);
+	}
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	for (i = 0; i < dev->nvqs; i++)
+		vhost_uninit_vq_maps(dev->vqs[i]);
+#endif
 	dev->mm = NULL;
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 
 static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			*((__virtio16 *)&used->ring[vq->num]) =
+				cpu_to_vhost16(vq, vq->avail_idx);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
 			      vhost_avail_event(vq));
 }
@@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 				 struct vring_used_elem *head, int idx,
 				 int count)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+	size_t size;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			size = count * sizeof(*head);
+			memcpy(used->ring + idx, head, size);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
 				  count * sizeof(*head));
 }
@@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			used->flags = cpu_to_vhost16(vq, vq->used_flags);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
 			      &vq->used->flags);
 }
@@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
 			      &vq->used->idx);
 }
@@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
 static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
 				      __virtio16 *idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*idx = avail->idx;
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *idx, &vq->avail->idx);
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 				       __virtio16 *head, int idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*head = avail->ring[idx & (vq->num - 1)];
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *head,
 			       &vq->avail->ring[idx & (vq->num - 1)]);
 }
@@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
 					__virtio16 *flags)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*flags = avail->flags;
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *flags, &vq->avail->flags);
 }
 
 static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
 				       __virtio16 *event)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_avail *avail;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+		map = vq->maps[VHOST_ADDR_AVAIL];
+		if (likely(map)) {
+			avail = map->addr;
+			*event = (__virtio16)avail->ring[vq->num];
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_avail(vq, *event, vhost_used_event(vq));
 }
 
 static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
 				     __virtio16 *idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_used *used;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_USED];
+		if (likely(map)) {
+			used = map->addr;
+			*idx = used->idx;
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_get_used(vq, *idx, &vq->used->idx);
 }
 
 static inline int vhost_get_desc(struct vhost_virtqueue *vq,
 				 struct vring_desc *desc, int idx)
 {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	struct vhost_map *map;
+	struct vring_desc *d;
+
+	if (!vq->iotlb) {
+		vhost_vq_access_map_begin(vq);
+
+		map = vq->maps[VHOST_ADDR_DESC];
+		if (likely(map)) {
+			d = map->addr;
+			*desc = *(d + idx);
+			vhost_vq_access_map_end(vq);
+			return 0;
+		}
+
+		vhost_vq_access_map_end(vq);
+	}
+#endif
+
 	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
 }
 
@@ -1352,12 +1852,30 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 	return true;
 }
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
+{
+	struct vhost_map *map;
+	int i;
+
+	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
+		map = vq->maps[i];
+		if (unlikely(!map))
+			vhost_map_prefetch(vq, i);
+	}
+}
+#endif
+
 int vq_meta_prefetch(struct vhost_virtqueue *vq)
 {
 	unsigned int num = vq->num;
 
-	if (!vq->iotlb)
+	if (!vq->iotlb) {
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+		vhost_vq_map_prefetch(vq);
+#endif
 		return 1;
+	}
 
 	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
 			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
@@ -1568,6 +2086,22 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 
 	mutex_lock(&vq->mutex);
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	/* Unregister MMU notifer to allow invalidation callback
+	 * can access vq->uaddrs[] without holding a lock.
+	 */
+	if (d->has_notifier) {
+		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
+		d->has_notifier = false;
+	}
+
+	/* reset invalidate_count in case we are in the middle of
+	 * invalidate_start() and invalidate_end().
+	 */
+	vq->invalidate_count = 0;
+	vhost_uninit_vq_maps(vq);
+#endif
+
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		r = vhost_vring_set_num(d, vq, argp);
@@ -1579,6 +2113,17 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 		BUG();
 	}
 
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	if (r == 0)
+		vhost_setup_vq_uaddr(vq);
+
+	if (d->mm) {
+		r = mmu_notifier_register(&d->mmu_notifier, d->mm);
+		if (!r)
+			d->has_notifier = true;
+	}
+#endif
+
 	mutex_unlock(&vq->mutex);
 
 	return r;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index e9ed2722b633..85e97e0f77f5 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -12,6 +12,9 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/atomic.h>
+#include <linux/pagemap.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
 
 struct vhost_work;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -80,6 +83,24 @@ enum vhost_uaddr_type {
 	VHOST_NUM_ADDRS = 3,
 };
 
+struct vhost_map {
+	int npages;
+	void *addr;
+	struct page **pages;
+};
+
+struct vhost_uaddr {
+	unsigned long uaddr;
+	size_t size;
+	bool write;
+};
+
+#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
+#else
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
+#endif
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -90,7 +111,21 @@ struct vhost_virtqueue {
 	struct vring_desc __user *desc;
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
+
+#if VHOST_ARCH_CAN_ACCEL_UACCESS
+	/* Read by memory accessors, modified by meta data
+	 * prefetching, MMU notifier and vring ioctl().
+	 * Synchonrized through mmu_lock.
+	 */
+	struct vhost_map *maps[VHOST_NUM_ADDRS];
+	/* Read by MMU notifier, modified by vring ioctl(),
+	 * synchronized through MMU notifier
+	 * registering/unregistering.
+	 */
+	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
+#endif
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
+
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;
 	struct eventfd_ctx *error_ctx;
@@ -145,6 +180,8 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
+	spinlock_t mmu_lock;
+	int invalidate_count;
 };
 
 struct vhost_msg_node {
@@ -158,6 +195,9 @@ struct vhost_msg_node {
 
 struct vhost_dev {
 	struct mm_struct *mm;
+#ifdef CONFIG_MMU_NOTIFIER
+	struct mmu_notifier mmu_notifier;
+#endif
 	struct mutex mutex;
 	struct vhost_virtqueue **vqs;
 	int nvqs;
@@ -173,6 +213,7 @@ struct vhost_dev {
 	int iov_limit;
 	int weight;
 	int byte_weight;
+	bool has_notifier;
 };
 
 bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
-- 
2.19.1


^ permalink raw reply related

* [PATCH 1/2] Revert "vhost: access vq metadata through kernel virtual address"
From: Jason Wang @ 2019-09-05 12:27 UTC (permalink / raw)
  To: mst, jasowang, kvm, virtualization
  Cc: netdev, linux-kernel, jgg, aarcange, jglisse, linux-mm
In-Reply-To: <20190905122736.19768-1-jasowang@redhat.com>

It was reported that metadata acceleration introduces several issues,
so this patch reverts commit ff466032dc9e5a61217f22ea34b2df932786bbfc,
73f628ec9e6bcc45b77c53fe6d0c0ec55eaf82af and
0b4a7092ffe568a55bf8f3cefdf79ff666586d91.

We will rework it on the next version.

Cc: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 515 +-----------------------------------------
 drivers/vhost/vhost.h |  41 ----
 2 files changed, 3 insertions(+), 553 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0536f8526359..791562e03fe0 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -298,160 +298,6 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 		__vhost_vq_meta_reset(d->vqs[i]);
 }
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-static void vhost_map_unprefetch(struct vhost_map *map)
-{
-	kfree(map->pages);
-	map->pages = NULL;
-	map->npages = 0;
-	map->addr = NULL;
-}
-
-static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
-{
-	struct vhost_map *map[VHOST_NUM_ADDRS];
-	int i;
-
-	spin_lock(&vq->mmu_lock);
-	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
-		map[i] = rcu_dereference_protected(vq->maps[i],
-				  lockdep_is_held(&vq->mmu_lock));
-		if (map[i])
-			rcu_assign_pointer(vq->maps[i], NULL);
-	}
-	spin_unlock(&vq->mmu_lock);
-
-	synchronize_rcu();
-
-	for (i = 0; i < VHOST_NUM_ADDRS; i++)
-		if (map[i])
-			vhost_map_unprefetch(map[i]);
-
-}
-
-static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
-{
-	int i;
-
-	vhost_uninit_vq_maps(vq);
-	for (i = 0; i < VHOST_NUM_ADDRS; i++)
-		vq->uaddrs[i].size = 0;
-}
-
-static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
-				     unsigned long start,
-				     unsigned long end)
-{
-	if (unlikely(!uaddr->size))
-		return false;
-
-	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
-}
-
-static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
-				      int index,
-				      unsigned long start,
-				      unsigned long end)
-{
-	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
-	struct vhost_map *map;
-	int i;
-
-	if (!vhost_map_range_overlap(uaddr, start, end))
-		return;
-
-	spin_lock(&vq->mmu_lock);
-	++vq->invalidate_count;
-
-	map = rcu_dereference_protected(vq->maps[index],
-					lockdep_is_held(&vq->mmu_lock));
-	if (map) {
-		if (uaddr->write) {
-			for (i = 0; i < map->npages; i++)
-				set_page_dirty(map->pages[i]);
-		}
-		rcu_assign_pointer(vq->maps[index], NULL);
-	}
-	spin_unlock(&vq->mmu_lock);
-
-	if (map) {
-		synchronize_rcu();
-		vhost_map_unprefetch(map);
-	}
-}
-
-static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
-				    int index,
-				    unsigned long start,
-				    unsigned long end)
-{
-	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
-		return;
-
-	spin_lock(&vq->mmu_lock);
-	--vq->invalidate_count;
-	spin_unlock(&vq->mmu_lock);
-}
-
-static int vhost_invalidate_range_start(struct mmu_notifier *mn,
-					const struct mmu_notifier_range *range)
-{
-	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
-					     mmu_notifier);
-	int i, j;
-
-	if (!mmu_notifier_range_blockable(range))
-		return -EAGAIN;
-
-	for (i = 0; i < dev->nvqs; i++) {
-		struct vhost_virtqueue *vq = dev->vqs[i];
-
-		for (j = 0; j < VHOST_NUM_ADDRS; j++)
-			vhost_invalidate_vq_start(vq, j,
-						  range->start,
-						  range->end);
-	}
-
-	return 0;
-}
-
-static void vhost_invalidate_range_end(struct mmu_notifier *mn,
-				       const struct mmu_notifier_range *range)
-{
-	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
-					     mmu_notifier);
-	int i, j;
-
-	for (i = 0; i < dev->nvqs; i++) {
-		struct vhost_virtqueue *vq = dev->vqs[i];
-
-		for (j = 0; j < VHOST_NUM_ADDRS; j++)
-			vhost_invalidate_vq_end(vq, j,
-						range->start,
-						range->end);
-	}
-}
-
-static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
-	.invalidate_range_start = vhost_invalidate_range_start,
-	.invalidate_range_end = vhost_invalidate_range_end,
-};
-
-static void vhost_init_maps(struct vhost_dev *dev)
-{
-	struct vhost_virtqueue *vq;
-	int i, j;
-
-	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
-
-	for (i = 0; i < dev->nvqs; ++i) {
-		vq = dev->vqs[i];
-		for (j = 0; j < VHOST_NUM_ADDRS; j++)
-			RCU_INIT_POINTER(vq->maps[j], NULL);
-	}
-}
-#endif
-
 static void vhost_vq_reset(struct vhost_dev *dev,
 			   struct vhost_virtqueue *vq)
 {
@@ -480,11 +326,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->busyloop_timeout = 0;
 	vq->umem = NULL;
 	vq->iotlb = NULL;
-	vq->invalidate_count = 0;
 	__vhost_vq_meta_reset(vq);
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	vhost_reset_vq_maps(vq);
-#endif
 }
 
 static int vhost_worker(void *data)
@@ -634,9 +476,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
 	spin_lock_init(&dev->iotlb_lock);
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	vhost_init_maps(dev);
-#endif
+
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
@@ -645,7 +485,6 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->heads = NULL;
 		vq->dev = dev;
 		mutex_init(&vq->mutex);
-		spin_lock_init(&vq->mmu_lock);
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
 			vhost_poll_init(&vq->poll, vq->handle_kick,
@@ -725,18 +564,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	if (err)
 		goto err_cgroup;
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
-	if (err)
-		goto err_mmu_notifier;
-#endif
-
 	return 0;
-
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-err_mmu_notifier:
-	vhost_dev_free_iovecs(dev);
-#endif
 err_cgroup:
 	kthread_stop(worker);
 	dev->worker = NULL;
@@ -827,107 +655,6 @@ static void vhost_clear_msg(struct vhost_dev *dev)
 	spin_unlock(&dev->iotlb_lock);
 }
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
-			      int index, unsigned long uaddr,
-			      size_t size, bool write)
-{
-	struct vhost_uaddr *addr = &vq->uaddrs[index];
-
-	addr->uaddr = uaddr;
-	addr->size = size;
-	addr->write = write;
-}
-
-static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
-{
-	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
-			  (unsigned long)vq->desc,
-			  vhost_get_desc_size(vq, vq->num),
-			  false);
-	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
-			  (unsigned long)vq->avail,
-			  vhost_get_avail_size(vq, vq->num),
-			  false);
-	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
-			  (unsigned long)vq->used,
-			  vhost_get_used_size(vq, vq->num),
-			  true);
-}
-
-static int vhost_map_prefetch(struct vhost_virtqueue *vq,
-			       int index)
-{
-	struct vhost_map *map;
-	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
-	struct page **pages;
-	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
-	int npinned;
-	void *vaddr, *v;
-	int err;
-	int i;
-
-	spin_lock(&vq->mmu_lock);
-
-	err = -EFAULT;
-	if (vq->invalidate_count)
-		goto err;
-
-	err = -ENOMEM;
-	map = kmalloc(sizeof(*map), GFP_ATOMIC);
-	if (!map)
-		goto err;
-
-	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
-	if (!pages)
-		goto err_pages;
-
-	err = EFAULT;
-	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
-					uaddr->write, pages);
-	if (npinned > 0)
-		release_pages(pages, npinned);
-	if (npinned != npages)
-		goto err_gup;
-
-	for (i = 0; i < npinned; i++)
-		if (PageHighMem(pages[i]))
-			goto err_gup;
-
-	vaddr = v = page_address(pages[0]);
-
-	/* For simplicity, fallback to userspace address if VA is not
-	 * contigious.
-	 */
-	for (i = 1; i < npinned; i++) {
-		v += PAGE_SIZE;
-		if (v != page_address(pages[i]))
-			goto err_gup;
-	}
-
-	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
-	map->npages = npages;
-	map->pages = pages;
-
-	rcu_assign_pointer(vq->maps[index], map);
-	/* No need for a synchronize_rcu(). This function should be
-	 * called by dev->worker so we are serialized with all
-	 * readers.
-	 */
-	spin_unlock(&vq->mmu_lock);
-
-	return 0;
-
-err_gup:
-	kfree(pages);
-err_pages:
-	kfree(map);
-err:
-	spin_unlock(&vq->mmu_lock);
-	return err;
-}
-#endif
-
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
@@ -957,16 +684,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		kthread_stop(dev->worker);
 		dev->worker = NULL;
 	}
-	if (dev->mm) {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-		mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
-#endif
+	if (dev->mm)
 		mmput(dev->mm);
-	}
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	for (i = 0; i < dev->nvqs; i++)
-		vhost_uninit_vq_maps(dev->vqs[i]);
-#endif
 	dev->mm = NULL;
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -1195,26 +914,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 
 static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			*((__virtio16 *)&used->ring[vq->num]) =
-				cpu_to_vhost16(vq, vq->avail_idx);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
 			      vhost_avail_event(vq));
 }
@@ -1223,27 +922,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 				 struct vring_used_elem *head, int idx,
 				 int count)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-	size_t size;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			size = count * sizeof(*head);
-			memcpy(used->ring + idx, head, size);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
 				  count * sizeof(*head));
 }
@@ -1251,25 +929,6 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			used->flags = cpu_to_vhost16(vq, vq->used_flags);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
 			      &vq->used->flags);
 }
@@ -1277,25 +936,6 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
 			      &vq->used->idx);
 }
@@ -1341,50 +981,12 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
 static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
 				      __virtio16 *idx)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_avail *avail;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
-		if (likely(map)) {
-			avail = map->addr;
-			*idx = avail->idx;
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_avail(vq, *idx, &vq->avail->idx);
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 				       __virtio16 *head, int idx)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_avail *avail;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
-		if (likely(map)) {
-			avail = map->addr;
-			*head = avail->ring[idx & (vq->num - 1)];
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_avail(vq, *head,
 			       &vq->avail->ring[idx & (vq->num - 1)]);
 }
@@ -1392,98 +994,24 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
 					__virtio16 *flags)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_avail *avail;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
-		if (likely(map)) {
-			avail = map->addr;
-			*flags = avail->flags;
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_avail(vq, *flags, &vq->avail->flags);
 }
 
 static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
 				       __virtio16 *event)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_avail *avail;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
-		if (likely(map)) {
-			avail = map->addr;
-			*event = (__virtio16)avail->ring[vq->num];
-			rcu_read_unlock();
-			return 0;
-		}
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_avail(vq, *event, vhost_used_event(vq));
 }
 
 static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
 				     __virtio16 *idx)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_used *used;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
-		if (likely(map)) {
-			used = map->addr;
-			*idx = used->idx;
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_get_used(vq, *idx, &vq->used->idx);
 }
 
 static inline int vhost_get_desc(struct vhost_virtqueue *vq,
 				 struct vring_desc *desc, int idx)
 {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	struct vhost_map *map;
-	struct vring_desc *d;
-
-	if (!vq->iotlb) {
-		rcu_read_lock();
-
-		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
-		if (likely(map)) {
-			d = map->addr;
-			*desc = *(d + idx);
-			rcu_read_unlock();
-			return 0;
-		}
-
-		rcu_read_unlock();
-	}
-#endif
-
 	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
 }
 
@@ -1824,32 +1352,12 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 	return true;
 }
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
-{
-	struct vhost_map __rcu *map;
-	int i;
-
-	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
-		rcu_read_lock();
-		map = rcu_dereference(vq->maps[i]);
-		rcu_read_unlock();
-		if (unlikely(!map))
-			vhost_map_prefetch(vq, i);
-	}
-}
-#endif
-
 int vq_meta_prefetch(struct vhost_virtqueue *vq)
 {
 	unsigned int num = vq->num;
 
-	if (!vq->iotlb) {
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-		vhost_vq_map_prefetch(vq);
-#endif
+	if (!vq->iotlb)
 		return 1;
-	}
 
 	return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
 			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
@@ -2060,16 +1568,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 
 	mutex_lock(&vq->mutex);
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	/* Unregister MMU notifer to allow invalidation callback
-	 * can access vq->uaddrs[] without holding a lock.
-	 */
-	if (d->mm)
-		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
-
-	vhost_uninit_vq_maps(vq);
-#endif
-
 	switch (ioctl) {
 	case VHOST_SET_VRING_NUM:
 		r = vhost_vring_set_num(d, vq, argp);
@@ -2081,13 +1579,6 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 		BUG();
 	}
 
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	vhost_setup_vq_uaddr(vq);
-
-	if (d->mm)
-		mmu_notifier_register(&d->mmu_notifier, d->mm);
-#endif
-
 	mutex_unlock(&vq->mutex);
 
 	return r;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 42a8c2a13ab1..e9ed2722b633 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -12,9 +12,6 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/atomic.h>
-#include <linux/pagemap.h>
-#include <linux/mmu_notifier.h>
-#include <asm/cacheflush.h>
 
 struct vhost_work;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -83,24 +80,6 @@ enum vhost_uaddr_type {
 	VHOST_NUM_ADDRS = 3,
 };
 
-struct vhost_map {
-	int npages;
-	void *addr;
-	struct page **pages;
-};
-
-struct vhost_uaddr {
-	unsigned long uaddr;
-	size_t size;
-	bool write;
-};
-
-#if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
-#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
-#else
-#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
-#endif
-
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -111,22 +90,7 @@ struct vhost_virtqueue {
 	struct vring_desc __user *desc;
 	struct vring_avail __user *avail;
 	struct vring_used __user *used;
-
-#if VHOST_ARCH_CAN_ACCEL_UACCESS
-	/* Read by memory accessors, modified by meta data
-	 * prefetching, MMU notifier and vring ioctl().
-	 * Synchonrized through mmu_lock (writers) and RCU (writers
-	 * and readers).
-	 */
-	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
-	/* Read by MMU notifier, modified by vring ioctl(),
-	 * synchronized through MMU notifier
-	 * registering/unregistering.
-	 */
-	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
-#endif
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
-
 	struct file *kick;
 	struct eventfd_ctx *call_ctx;
 	struct eventfd_ctx *error_ctx;
@@ -181,8 +145,6 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
-	spinlock_t mmu_lock;
-	int invalidate_count;
 };
 
 struct vhost_msg_node {
@@ -196,9 +158,6 @@ struct vhost_msg_node {
 
 struct vhost_dev {
 	struct mm_struct *mm;
-#ifdef CONFIG_MMU_NOTIFIER
-	struct mmu_notifier mmu_notifier;
-#endif
 	struct mutex mutex;
 	struct vhost_virtqueue **vqs;
 	int nvqs;
-- 
2.19.1


^ permalink raw reply related

* [PATCH 0/2] Revert and rework on the metadata accelreation
From: Jason Wang @ 2019-09-05 12:27 UTC (permalink / raw)
  To: mst, jasowang, kvm, virtualization
  Cc: netdev, linux-kernel, jgg, aarcange, jglisse, linux-mm

Hi:

Per request from Michael and Jason, the metadata accelreation is
reverted in this version and rework in next version.

Please review.

Thanks

Jason Wang (2):
  Revert "vhost: access vq metadata through kernel virtual address"
  vhost: re-introducing metadata acceleration through kernel virtual
    address

 drivers/vhost/vhost.c | 202 +++++++++++++++++++++++++-----------------
 drivers/vhost/vhost.h |   8 +-
 2 files changed, 123 insertions(+), 87 deletions(-)

-- 
2.19.1


^ permalink raw reply

* [PATCH net] net: sched: fix reordering issues
From: Eric Dumazet @ 2019-09-05 12:20 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, John Fastabend

Whenever MQ is not used on a multiqueue device, we experience
serious reordering problems. Bisection found the cited
commit.

The issue can be described this way :

- A single qdisc hierarchy is shared by all transmit queues.
  (eg : tc qdisc replace dev eth0 root fq_codel)

- When/if try_bulk_dequeue_skb_slow() dequeues a packet targetting
  a different transmit queue than the one used to build a packet train,
  we stop building the current list and save the 'bad' skb (P1) in a
  special queue. (bad_txq)

- When dequeue_skb() calls qdisc_dequeue_skb_bad_txq() and finds this
  skb (P1), it checks if the associated transmit queues is still in frozen
  state. If the queue is still blocked (by BQL or NIC tx ring full),
  we leave the skb in bad_txq and return NULL.

- dequeue_skb() calls q->dequeue() to get another packet (P2)

  The other packet can target the problematic queue (that we found
  in frozen state for the bad_txq packet), but another cpu just ran
  TX completion and made room in the txq that is now ready to accept
  new packets.

- Packet P2 is sent while P1 is still held in bad_txq, P1 might be sent
  at next round. In practice P2 is the lead of a big packet train
  (P2,P3,P4 ...) filling the BQL budget and delaying P1 by many packets :/

To solve this problem, we have to block the dequeue process as long
as the first packet in bad_txq can not be sent. Reordering issues
disappear and no side effects have been seen.

Fixes: a53851e2c321 ("net: sched: explicit locking in gso_cpu fallback")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: John Fastabend <john.fastabend@gmail.com>
---
 net/sched/sch_generic.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 137db1cbde8538e124133c6e148acc3e92e56a74..ac28f6a5d70e0b38ae8ce7858f08e9d15778c22f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -46,6 +46,8 @@ EXPORT_SYMBOL(default_qdisc_ops);
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
 
+#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)
+
 static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
 {
 	const struct netdev_queue *txq = q->dev_queue;
@@ -71,7 +73,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
 				q->q.qlen--;
 			}
 		} else {
-			skb = NULL;
+			skb = SKB_XOFF_MAGIC;
 		}
 	}
 
@@ -253,8 +255,11 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 		return skb;
 
 	skb = qdisc_dequeue_skb_bad_txq(q);
-	if (unlikely(skb))
+	if (unlikely(skb)) {
+		if (skb == SKB_XOFF_MAGIC)
+			return NULL;
 		goto bulk;
+	}
 	skb = q->dequeue(q);
 	if (skb) {
 bulk:
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply related

* [PATCH v3 net-next] net: stmmac: Add support for MDIO interrupts
From: Voon Weifeng @ 2019-09-05 12:05 UTC (permalink / raw)
  To: David S. Miller, Maxime Coquelin
  Cc: netdev, linux-kernel, Jose Abreu, Giuseppe Cavallaro, Andrew Lunn,
	Alexandre Torgue, Ong Boon Leong, Voon Weifeng

From: "Chuah, Kim Tatt" <kim.tatt.chuah@intel.com>

DW EQoS v5.xx controllers added capability for interrupt generation
when MDIO interface is done (GMII Busy bit is cleared).
This patch adds support for this interrupt on supported HW to avoid
polling on GMII Busy bit.

stmmac_mdio_read() & stmmac_mdio_write() will sleep until wake_up() is
called by the interrupt handler.

Reviewed-by: Voon Weifeng <weifeng.voon@intel.com>
Reviewed-by: Kweh, Hock Leong <hock.leong.kweh@intel.com>
Reviewed-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Chuah, Kim Tatt <kim.tatt.chuah@intel.com>
Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>

Changelog v3
*Move the changelog to before the --- marker line
Changelog v2
*mdio interrupt mode or polling mode will depends on mdio interrupt
 enable bit
*Disable the mdio interrupt enable bit in stmmac_release
*Remove the condition for initialize wait queues
*Applied reverse Christmas tree
---
 drivers/net/ethernet/stmicro/stmmac/common.h      |  2 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      |  1 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c |  7 +++
 drivers/net/ethernet/stmicro/stmmac/dwmac5.c      |  8 ++++
 drivers/net/ethernet/stmicro/stmmac/dwmac5.h      |  4 ++
 drivers/net/ethernet/stmicro/stmmac/hwif.c        |  9 ++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h        |  4 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  5 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 57 +++++++++++++++++++----
 9 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 49aa56ca09cc..775a1c114b1a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -448,6 +448,8 @@ struct mac_device_info {
 	unsigned int pcs;
 	unsigned int pmt;
 	unsigned int ps;
+	bool mdio_intr_en;
+	wait_queue_head_t mdio_busy_wait;
 };
 
 struct stmmac_rx_routing {
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 2ed11a581d80..1be6a8a88b8f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -106,6 +106,7 @@ enum dwmac4_irq_status {
 	mmc_irq = 0x00000100,
 	lpi_irq = 0x00000020,
 	pmt_irq = 0x00000010,
+	mdio_irq = 0x00040000,
 };
 
 /* MAC PMT bitmap */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index fc9954e4a772..97fca6d65141 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -59,6 +59,9 @@ static void dwmac4_core_init(struct mac_device_info *hw,
 	if (hw->pcs)
 		value |= GMAC_PCS_IRQ_DEFAULT;
 
+	if (hw->mdio_intr_en)
+		value |= GMAC_INT_MDIO_EN;
+
 	writel(value, ioaddr + GMAC_INT_EN);
 }
 
@@ -629,6 +632,9 @@ static int dwmac4_irq_status(struct mac_device_info *hw,
 			x->irq_rx_path_exit_lpi_mode_n++;
 	}
 
+	if (intr_status & mdio_irq)
+		wake_up(&hw->mdio_busy_wait);
+
 	dwmac_pcs_isr(ioaddr, GMAC_PCS_BASE, intr_status, x);
 	if (intr_status & PCS_RGSMIIIS_IRQ)
 		dwmac4_phystatus(ioaddr, x);
@@ -836,6 +842,7 @@ static void dwmac4_set_mac_loopback(void __iomem *ioaddr, bool enable)
 	.rxp_config = dwmac5_rxp_config,
 	.flex_pps_config = dwmac5_flex_pps_config,
 	.set_mac_loopback = dwmac4_set_mac_loopback,
+	.mdio_intr_dis = dwmac5_mdio_intr_dis,
 };
 
 int dwmac4_setup(struct stmmac_priv *priv)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
index 3f4f3132e16b..c58751e1dcb6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
@@ -549,3 +549,11 @@ int dwmac5_flex_pps_config(void __iomem *ioaddr, int index,
 	writel(val, ioaddr + MAC_PPS_CONTROL);
 	return 0;
 }
+
+void dwmac5_mdio_intr_dis(void __iomem *ioaddr)
+{
+	u32 val = readl(ioaddr + GMAC_INT_EN);
+
+	val &= ~GMAC_INT_MDIO_EN;
+	writel(val, ioaddr + GMAC_INT_EN);
+}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
index 775db776b3cc..a56511a4c97d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
@@ -72,6 +72,9 @@
 #define TCEIE				BIT(0)
 #define DMA_ECC_INT_STATUS		0x00001088
 
+/* MDIO interrupt enable in MAC_Interrupt_Enable register */
+#define GMAC_INT_MDIO_EN		BIT(18)
+
 int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp);
 int dwmac5_safety_feat_irq_status(struct net_device *ndev,
 		void __iomem *ioaddr, unsigned int asp,
@@ -83,5 +86,6 @@ int dwmac5_rxp_config(void __iomem *ioaddr, struct stmmac_tc_entry *entries,
 int dwmac5_flex_pps_config(void __iomem *ioaddr, int index,
 			   struct stmmac_pps_cfg *cfg, bool enable,
 			   u32 sub_second_inc, u32 systime_flags);
+void dwmac5_mdio_intr_dis(void __iomem *ioaddr);
 
 #endif /* __DWMAC5_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index 3af2e5015245..7127efe652db 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -73,6 +73,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 	bool gmac;
 	bool gmac4;
 	bool xgmac;
+	bool mdio_intr_en;
 	u32 min_id;
 	const struct stmmac_regs_off regs;
 	const void *desc;
@@ -90,6 +91,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.gmac = false,
 		.gmac4 = false,
 		.xgmac = false,
+		.mdio_intr_en = false,
 		.min_id = 0,
 		.regs = {
 			.ptp_off = PTP_GMAC3_X_OFFSET,
@@ -108,6 +110,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.gmac = true,
 		.gmac4 = false,
 		.xgmac = false,
+		.mdio_intr_en = false,
 		.min_id = 0,
 		.regs = {
 			.ptp_off = PTP_GMAC3_X_OFFSET,
@@ -126,6 +129,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.gmac = false,
 		.gmac4 = true,
 		.xgmac = false,
+		.mdio_intr_en = false,
 		.min_id = 0,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -144,6 +148,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.gmac = false,
 		.gmac4 = true,
 		.xgmac = false,
+		.mdio_intr_en = false,
 		.min_id = DWMAC_CORE_4_00,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -162,6 +167,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.gmac = false,
 		.gmac4 = true,
 		.xgmac = false,
+		.mdio_intr_en = false,
 		.min_id = DWMAC_CORE_4_10,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -180,6 +186,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.gmac = false,
 		.gmac4 = true,
 		.xgmac = false,
+		.mdio_intr_en = true,
 		.min_id = DWMAC_CORE_5_10,
 		.regs = {
 			.ptp_off = PTP_GMAC4_OFFSET,
@@ -198,6 +205,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
 		.gmac = false,
 		.gmac4 = false,
 		.xgmac = true,
+		.mdio_intr_en = false,
 		.min_id = DWXGMAC_CORE_2_10,
 		.regs = {
 			.ptp_off = PTP_XGMAC_OFFSET,
@@ -276,6 +284,7 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 		mac->mode = mac->mode ? : entry->mode;
 		mac->tc = mac->tc ? : entry->tc;
 		mac->mmc = mac->mmc ? : entry->mmc;
+		mac->mdio_intr_en = mac->mdio_intr_en ? : entry->mdio_intr_en;
 
 		priv->hw = mac;
 		priv->ptpaddr = priv->ioaddr + entry->regs.ptp_off;
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 9435b312495d..d42885426e78 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -363,6 +363,8 @@ struct stmmac_ops {
 	int (*get_mac_tx_timestamp)(struct mac_device_info *hw, u64 *ts);
 	/* Source Address Insertion / Replacement */
 	void (*sarc_configure)(void __iomem *ioaddr, int val);
+	/* Disable mdio interrupt */
+	void (*mdio_intr_dis)(void __iomem *ioaddr);
 };
 
 #define stmmac_core_init(__priv, __args...) \
@@ -443,6 +445,8 @@ struct stmmac_ops {
 	stmmac_do_callback(__priv, mac, get_mac_tx_timestamp, __args)
 #define stmmac_sarc_configure(__priv, __args...) \
 	stmmac_do_void_callback(__priv, mac, sarc_configure, __args)
+#define stmmac_mdio_intr_dis(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, mdio_intr_dis, __args)
 
 /* PTP and HW Timer helpers */
 struct stmmac_hwtimestamp {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 06ccd216ae90..2557a2beb03d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2768,6 +2768,8 @@ static int stmmac_release(struct net_device *dev)
 	phylink_stop(priv->phylink);
 	phylink_disconnect_phy(priv->phylink);
 
+	stmmac_mdio_intr_dis(priv, priv->ioaddr);
+
 	stmmac_stop_all_queues(priv);
 
 	stmmac_disable_all_queues(priv);
@@ -4463,6 +4465,9 @@ int stmmac_dvr_probe(struct device *device,
 	if (ret)
 		goto error_hw_init;
 
+	/* mdio intr wait queue */
+	init_waitqueue_head(&priv->hw->mdio_busy_wait);
+
 	stmmac_check_ether_addr(priv);
 
 	/* Configure real RX and TX queues */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 40c42637ad75..625e1fde0c86 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -19,6 +19,8 @@
 #include <linux/property.h>
 #include <linux/slab.h>
 
+#include "dwmac4.h"
+#include "dwmac5.h"
 #include "dwxgmac2.h"
 #include "stmmac.h"
 
@@ -142,6 +144,18 @@ static int stmmac_xgmac2_mdio_write(struct mii_bus *bus, int phyaddr,
 				  !(tmp & MII_XGMAC_BUSY), 100, 10000);
 }
 
+static bool stmmac_mdio_intr_done(struct mii_bus *bus)
+{
+	struct net_device *ndev = bus->priv;
+	struct stmmac_priv *priv;
+	unsigned int mii_address;
+
+	priv = netdev_priv(ndev);
+	mii_address = priv->hw->mii.addr;
+
+	return !(readl(priv->ioaddr + mii_address) & MII_BUSY);
+}
+
 /**
  * stmmac_mdio_read
  * @bus: points to the mii_bus structure
@@ -159,9 +173,11 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg)
 	unsigned int mii_address = priv->hw->mii.addr;
 	unsigned int mii_data = priv->hw->mii.data;
 	u32 value = MII_BUSY;
+	u32 mdio_intr_en = 0;
 	int data = 0;
 	u32 v;
 
+	mdio_intr_en = readl(priv->ioaddr + GMAC_INT_EN) & GMAC_INT_MDIO_EN;
 	value |= (phyaddr << priv->hw->mii.addr_shift)
 		& priv->hw->mii.addr_mask;
 	value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
@@ -181,16 +197,26 @@ static int stmmac_mdio_read(struct mii_bus *bus, int phyaddr, int phyreg)
 		}
 	}
 
-	if (readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
-			       100, 10000))
+	if (mdio_intr_en) {
+		if (!wait_event_timeout(priv->hw->mdio_busy_wait,
+					stmmac_mdio_intr_done(bus), HZ / 100))
+			return -EBUSY;
+	} else if (readl_poll_timeout(priv->ioaddr + mii_address, v,
+				      !(v & MII_BUSY), 100, 10000)) {
 		return -EBUSY;
+	}
 
 	writel(data, priv->ioaddr + mii_data);
 	writel(value, priv->ioaddr + mii_address);
 
-	if (readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
-			       100, 10000))
+	if (mdio_intr_en) {
+		if (!wait_event_timeout(priv->hw->mdio_busy_wait,
+					stmmac_mdio_intr_done(bus), HZ / 100))
+			return -EBUSY;
+	} else if (readl_poll_timeout(priv->ioaddr + mii_address, v,
+				      !(v & MII_BUSY), 100, 10000)) {
 		return -EBUSY;
+	}
 
 	/* Read the data from the MII data register */
 	data = (int)readl(priv->ioaddr + mii_data) & MII_DATA_MASK;
@@ -214,9 +240,11 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 	unsigned int mii_address = priv->hw->mii.addr;
 	unsigned int mii_data = priv->hw->mii.data;
 	u32 value = MII_BUSY;
+	u32 mdio_intr_en = 0;
 	int data = phydata;
 	u32 v;
 
+	mdio_intr_en = readl(priv->ioaddr + GMAC_INT_EN) & GMAC_INT_MDIO_EN;
 	value |= (phyaddr << priv->hw->mii.addr_shift)
 		& priv->hw->mii.addr_mask;
 	value |= (phyreg << priv->hw->mii.reg_shift) & priv->hw->mii.reg_mask;
@@ -240,17 +268,30 @@ static int stmmac_mdio_write(struct mii_bus *bus, int phyaddr, int phyreg,
 	}
 
 	/* Wait until any existing MII operation is complete */
-	if (readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
-			       100, 10000))
+	if (mdio_intr_en) {
+		if (!wait_event_timeout(priv->hw->mdio_busy_wait,
+					stmmac_mdio_intr_done(bus), HZ / 100))
+			return -EBUSY;
+	} else if (readl_poll_timeout(priv->ioaddr + mii_address, v,
+				      !(v & MII_BUSY), 100, 10000)) {
 		return -EBUSY;
+	}
 
 	/* Set the MII address register to write */
 	writel(data, priv->ioaddr + mii_data);
 	writel(value, priv->ioaddr + mii_address);
 
 	/* Wait until any existing MII operation is complete */
-	return readl_poll_timeout(priv->ioaddr + mii_address, v, !(v & MII_BUSY),
-				  100, 10000);
+	if (mdio_intr_en) {
+		if (!wait_event_timeout(priv->hw->mdio_busy_wait,
+					stmmac_mdio_intr_done(bus), HZ / 100))
+			return -EBUSY;
+	} else if (readl_poll_timeout(priv->ioaddr + mii_address, v,
+				      !(v & MII_BUSY), 100, 10000)) {
+		return -EBUSY;
+	}
+
+	return 0;
 }
 
 /**
-- 
1.9.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox