Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH for-next V2 04/11] net/mlx5: Support new MR features
From: Saeed Mahameed @ 2017-01-02  9:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Leon Romanovsky, Ilya Lesokhin, Artemy Kovalyov, Leon Romanovsky,
	Saeed Mahameed
In-Reply-To: <1483349868-11716-1-git-send-email-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Artemy Kovalyov <artemyko-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

This patch adds the following items to IFC file.

1. MLX5_MKC_ACCESS_MODE_KSM enum value for creating KSM memory keys.
KSM access mode used when indirect MKey associated with fixed memory
size entries.

2. null_mkey field that is used to indicate non-present KLM/KSM
entries, where it causes the device to generate page fault event
when trying to access it.

3. struct mlx5_ifc_cmd_hca_cap_bits capability bits indicating
related value/field is supported:
* fixed_buffer_size - MLX5_MKC_ACCESS_MODE_KSM
* umr_extended_translation_offset - translation_offset_42_16
    in UMR ctrl segment
* null_mkey - null_mkey in QUERY_SPECIAL_CONTEXTS

Signed-off-by: Artemy Kovalyov <artemyko-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 include/linux/mlx5/mlx5_ifc.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4792c85..7c760e5 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -782,11 +782,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_eq[0x4];
 
 	u8         max_indirection[0x8];
-	u8         reserved_at_108[0x1];
+	u8         fixed_buffer_size[0x1];
 	u8         log_max_mrw_sz[0x7];
 	u8         reserved_at_110[0x2];
 	u8         log_max_bsf_list_size[0x6];
-	u8         reserved_at_118[0x2];
+	u8         umr_extended_translation_offset[0x1];
+	u8         null_mkey[0x1];
 	u8         log_max_klm_list_size[0x6];
 
 	u8         reserved_at_120[0xa];
@@ -2569,6 +2570,7 @@ enum {
 	MLX5_MKC_ACCESS_MODE_PA    = 0x0,
 	MLX5_MKC_ACCESS_MODE_MTT   = 0x1,
 	MLX5_MKC_ACCESS_MODE_KLMS  = 0x2,
+	MLX5_MKC_ACCESS_MODE_KSM   = 0x3,
 };
 
 struct mlx5_ifc_mkc_bits {
@@ -3677,6 +3679,10 @@ struct mlx5_ifc_query_special_contexts_out_bits {
 	u8         dump_fill_mkey[0x20];
 
 	u8         resd_lkey[0x20];
+
+	u8         null_mkey[0x20];
+
+	u8         reserved_at_a0[0x60];
 };
 
 struct mlx5_ifc_query_special_contexts_in_bits {
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH for-next V2 01/11] net/mlx5: Fix offset naming for reserved fields in hca_cap_bits
From: Saeed Mahameed @ 2017-01-02  9:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Leon Romanovsky, Ilya Lesokhin, Artemy Kovalyov, Max Gurtovoy,
	Leon Romanovsky, Saeed Mahameed
In-Reply-To: <1483349868-11716-1-git-send-email-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Max Gurtovoy <maxg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Fix offset for reserved fields.

Fixes: 7486216b3a0b ("{net,IB}/mlx5: mlx5_ifc updates")
Fixes: b4ff3a36d3e4 ("net/mlx5: Use offset based reserved field names in the IFC header file")
Fixes: 7d5e14237a55 ("net/mlx5: Update mlx5_ifc hardware features")
Signed-off-by: Max Gurtovoy <maxg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Reviewed-by: Artemy Kovalyov <artemyko-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 include/linux/mlx5/mlx5_ifc.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 57bec54..4792c85 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -826,9 +826,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
 	u8         port_module_event[0x1];
-	u8         reserved_at_1b0[0x1];
+	u8         reserved_at_1b1[0x1];
 	u8         ports_check[0x1];
-	u8         reserved_at_1b2[0x1];
+	u8         reserved_at_1b3[0x1];
 	u8         disable_link_up[0x1];
 	u8         beacon_led[0x1];
 	u8         port_type[0x2];
@@ -858,7 +858,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         compact_address_vector[0x1];
 	u8         striding_rq[0x1];
-	u8         reserved_at_201[0x2];
+	u8         reserved_at_202[0x2];
 	u8         ipoib_basic_offloads[0x1];
 	u8         reserved_at_205[0xa];
 	u8         drain_sigerr[0x1];
@@ -1009,10 +1009,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         rndv_offload_rc[0x1];
 	u8         rndv_offload_dc[0x1];
 	u8         log_tag_matching_list_sz[0x5];
-	u8         reserved_at_5e8[0x3];
+	u8         reserved_at_5f8[0x3];
 	u8         log_max_xrq[0x5];
 
-	u8         reserved_at_5f0[0x200];
+	u8         reserved_at_600[0x200];
 };
 
 enum mlx5_flow_destination_type {
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* RE: [RFC PATCH net-next v4 1/2] macb: Add 1588 support in Cadence GEM.
From: Rafal Ozieblo @ 2017-01-02  9:36 UTC (permalink / raw)
  To: Andrei Pistirica, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, davem@davemloft.net,
	nicolas.ferre@atmel.com, harinikatakamlinux@gmail.com,
	harini.katakam@xilinx.com
  Cc: punnaia@xilinx.com, michals@xilinx.com, anirudh@xilinx.com,
	boris.brezillon@free-electrons.com,
	alexandre.belloni@free-electrons.com, tbultel@pixelsurmer.com,
	richardcochran@gmail.com
In-Reply-To: <1481720175-12703-1-git-send-email-andrei.pistirica@microchip.com>

> -----Original Message-----
> From: Rafal Ozieblo 
> Sent: 28 grudnia 2016 14:23
> Subject: RE: [RFC PATCH net-next v4 1/2] macb: Add 1588 support in Cadence GEM.
> 
> > +static void gem_ptp_tx_hwtstamp(struct macb *bp, struct sk_buff *skb,
> > +				int peer_ev)
> > +{
> > +	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
> > +	struct timespec64 ts;
> > +	u64 ns;
> > +
> > +	/* PTP Peer Event Frame packets */
> > +	if (peer_ev) {
> > +		ts.tv_sec = gem_readl(bp, PEFTSL);
> > +		ts.tv_nsec = gem_readl(bp, PEFTN);
> > +
> > +	/* PTP Event Frame packets */
> > +	} else {
> > +		ts.tv_sec = gem_readl(bp, EFTSL);
> > +		ts.tv_nsec = gem_readl(bp, EFTN);
> > +	}
> I'm wondering what is a difference between timestamp in transmit buffer descriptor (Word 2 and 3) and PTP Event Frame Transmitted Seconds/Nanoseconds Register (0x1E0, 0x1E4).
> 
According Cadence Hardware team:
"It is just that some customers prefer to have the time in the descriptors as that is provided per frame.
The registers are simply overwritten when a new event frame is transmitted/received and so software could miss it."
The question is are you sure that you read timestamp for current frame? (not for the next frame).

^ permalink raw reply

* [PATCH] Update pptp handling to avoid null pointer deref. v2
From: Ian Kumlien @ 2017-01-02  8:18 UTC (permalink / raw)
  To: linux-kernel, netdev, stable; +Cc: Ian Kumlien
In-Reply-To: <20170101231936.5905-1-ian.kumlien@gmail.com>

__skb_flow_dissect can be called with a skb or a data packet, either
can be NULL. All calls seems to have been moved to __skb_header_pointer
except the pptp handling which is still calling skb_header_pointer.

skb_header_pointer will use skb->data and thus:
[  109.556866] BUG: unable to handle kernel NULL pointer dereference at 0000000000000080
[  109.557102] IP: [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
[  109.557263] PGD 0
[  109.557338]
[  109.557484] Oops: 0000 [#1] SMP
[  109.557562] Modules linked in: chaoskey
[  109.557783] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.0 #79
[  109.557867] Hardware name: Supermicro A1SRM-LN7F/LN5F/A1SRM-LN7F-2758, BIOS 1.0c 11/04/2015
[  109.557957] task: ffff94085c27bc00 task.stack: ffffb745c0068000
[  109.558041] RIP: 0010:[<ffffffff88dc02f8>]  [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
[  109.558203] RSP: 0018:ffff94087fc83d40  EFLAGS: 00010206
[  109.558286] RAX: 0000000000000130 RBX: ffffffff8975bf80 RCX: ffff94084fab6800
[  109.558373] RDX: 0000000000000010 RSI: 000000000000000c RDI: 0000000000000000
[  109.558460] RBP: 0000000000000b88 R08: 0000000000000000 R09: 0000000000000022
[  109.558547] R10: 0000000000000008 R11: ffff94087fc83e04 R12: 0000000000000000
[  109.558763] R13: ffff94084fab6800 R14: ffff94087fc83e04 R15: 000000000000002f
[  109.558979] FS:  0000000000000000(0000) GS:ffff94087fc80000(0000) knlGS:0000000000000000
[  109.559326] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  109.559539] CR2: 0000000000000080 CR3: 0000000281809000 CR4: 00000000001026e0
[  109.559753] Stack:
[  109.559957]  000000000000000c ffff94084fab6822 0000000000000001 ffff94085c2b5fc0
[  109.560578]  0000000000000001 0000000000002000 0000000000000000 0000000000000000
[  109.561200]  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  109.561820] Call Trace:
[  109.562027]  <IRQ>
[  109.562108]  [<ffffffff88dfb4fa>] ? eth_get_headlen+0x7a/0xf0
[  109.562522]  [<ffffffff88c5a35a>] ? igb_poll+0x96a/0xe80
[  109.562737]  [<ffffffff88dc912b>] ? net_rx_action+0x20b/0x350
[  109.562953]  [<ffffffff88546d68>] ? __do_softirq+0xe8/0x280
[  109.563169]  [<ffffffff8854704a>] ? irq_exit+0xaa/0xb0
[  109.563382]  [<ffffffff8847229b>] ? do_IRQ+0x4b/0xc0
[  109.563597]  [<ffffffff8902d4ff>] ? common_interrupt+0x7f/0x7f
[  109.563810]  <EOI>
[  109.563890]  [<ffffffff88d57530>] ? cpuidle_enter_state+0x130/0x2c0
[  109.564304]  [<ffffffff88d57520>] ? cpuidle_enter_state+0x120/0x2c0
[  109.564520]  [<ffffffff8857eacf>] ? cpu_startup_entry+0x19f/0x1f0
[  109.564737]  [<ffffffff8848d55a>] ? start_secondary+0x12a/0x140
[  109.564950] Code: 83 e2 20 a8 80 0f 84 60 01 00 00 c7 04 24 08 00
00 00 66 85 d2 0f 84 be fe ff ff e9 69 fe ff ff 8b 34 24 89 f2 83 c2
04 66 85 c0 <41> 8b 84 24 80 00 00 00 0f 49 d6 41 8d 31 01 d6 41 2b 84
24 84
[  109.569959] RIP  [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
[  109.570245]  RSP <ffff94087fc83d40>
[  109.570453] CR2: 0000000000000080

Fixes: ab10dccb1160 ("rps: Inspect PPTP encapsulated by GRE to get flow hash")
Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
---
 net/core/flow_dissector.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c6d8207ffa7e..32e4e0158846 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -445,8 +445,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			if (hdr->flags & GRE_ACK)
 				offset += sizeof(((struct pptp_gre_header *)0)->ack);
 
-			ppp_hdr = skb_header_pointer(skb, nhoff + offset,
-						     sizeof(_ppp_hdr), _ppp_hdr);
+			ppp_hdr = __skb_header_pointer(skb, nhoff + offset,
+						     sizeof(_ppp_hdr),
+						     data, hlen, _ppp_hdr);
 			if (!ppp_hdr)
 				goto out_bad;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH] Update pptp handling to avoid null pointer deref. v2
From: Ian Kumlien @ 2017-01-02  8:16 UTC (permalink / raw)
  To: linux-kernel, netdev, stable; +Cc: Ian Kumlien
In-Reply-To: <20170101231936.5905-1-ian.kumlien@gmail.com>

__skb_flow_dissect can be called with a skb or a data packet, either
can be NULL. All calls seems to have been moved to __skb_header_pointer
except the pptp handling which is still calling skb_header_pointer.

skb_header_pointer will use skb->data and thus:
[  109.556866] BUG: unable to handle kernel NULL pointer dereference at 0000000000000080
[  109.557102] IP: [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
[  109.557263] PGD 0
[  109.557338]
[  109.557484] Oops: 0000 [#1] SMP
[  109.557562] Modules linked in: chaoskey
[  109.557783] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.0 #79
[  109.557867] Hardware name: Supermicro A1SRM-LN7F/LN5F/A1SRM-LN7F-2758, BIOS 1.0c 11/04/2015
[  109.557957] task: ffff94085c27bc00 task.stack: ffffb745c0068000
[  109.558041] RIP: 0010:[<ffffffff88dc02f8>]  [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
[  109.558203] RSP: 0018:ffff94087fc83d40  EFLAGS: 00010206
[  109.558286] RAX: 0000000000000130 RBX: ffffffff8975bf80 RCX: ffff94084fab6800
[  109.558373] RDX: 0000000000000010 RSI: 000000000000000c RDI: 0000000000000000
[  109.558460] RBP: 0000000000000b88 R08: 0000000000000000 R09: 0000000000000022
[  109.558547] R10: 0000000000000008 R11: ffff94087fc83e04 R12: 0000000000000000
[  109.558763] R13: ffff94084fab6800 R14: ffff94087fc83e04 R15: 000000000000002f
[  109.558979] FS:  0000000000000000(0000) GS:ffff94087fc80000(0000) knlGS:0000000000000000
[  109.559326] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  109.559539] CR2: 0000000000000080 CR3: 0000000281809000 CR4: 00000000001026e0
[  109.559753] Stack:
[  109.559957]  000000000000000c ffff94084fab6822 0000000000000001 ffff94085c2b5fc0
[  109.560578]  0000000000000001 0000000000002000 0000000000000000 0000000000000000
[  109.561200]  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  109.561820] Call Trace:
[  109.562027]  <IRQ>
[  109.562108]  [<ffffffff88dfb4fa>] ? eth_get_headlen+0x7a/0xf0
[  109.562522]  [<ffffffff88c5a35a>] ? igb_poll+0x96a/0xe80
[  109.562737]  [<ffffffff88dc912b>] ? net_rx_action+0x20b/0x350
[  109.562953]  [<ffffffff88546d68>] ? __do_softirq+0xe8/0x280
[  109.563169]  [<ffffffff8854704a>] ? irq_exit+0xaa/0xb0
[  109.563382]  [<ffffffff8847229b>] ? do_IRQ+0x4b/0xc0
[  109.563597]  [<ffffffff8902d4ff>] ? common_interrupt+0x7f/0x7f
[  109.563810]  <EOI>
[  109.563890]  [<ffffffff88d57530>] ? cpuidle_enter_state+0x130/0x2c0
[  109.564304]  [<ffffffff88d57520>] ? cpuidle_enter_state+0x120/0x2c0
[  109.564520]  [<ffffffff8857eacf>] ? cpu_startup_entry+0x19f/0x1f0
[  109.564737]  [<ffffffff8848d55a>] ? start_secondary+0x12a/0x140
[  109.564950] Code: 83 e2 20 a8 80 0f 84 60 01 00 00 c7 04 24 08 00
00 00 66 85 d2 0f 84 be fe ff ff e9 69 fe ff ff 8b 34 24 89 f2 83 c2
04 66 85 c0 <41> 8b 84 24 80 00 00 00 0f 49 d6 41 8d 31 01 d6 41 2b 84
24 84
[  109.569959] RIP  [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
[  109.570245]  RSP <ffff94087fc83d40>
[  109.570453] CR2: 0000000000000080

Fixes: ab10dccb1160 ("rps: Inspect PPTP encapsulated by GRE to get flow hash")
Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
---
 net/core/flow_dissector.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c6d8207ffa7e..32e4e0158846 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -445,8 +445,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			if (hdr->flags & GRE_ACK)
 				offset += sizeof(((struct pptp_gre_header *)0)->ack);
 
-			ppp_hdr = skb_header_pointer(skb, nhoff + offset,
-						     sizeof(_ppp_hdr), _ppp_hdr);
+			ppp_hdr = __skb_header_pointer(skb, nhoff + offset,
+						     sizeof(_ppp_hdr),
+						     data, hlen, _ppp_hdr);
 			if (!ppp_hdr)
 				goto out_bad;
 
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH] Update pptp handling to avoid null pointer deref.
From: Ian Kumlien @ 2017-01-02  8:05 UTC (permalink / raw)
  To: David Miller; +Cc: linux-kernel@vger.kernel.org, netdev
In-Reply-To: <20170101.230753.2076894379858224939.davem@davemloft.net>

On Mon, Jan 2, 2017 at 5:07 AM, David Miller <davem@davemloft.net> wrote:
> From: Ian Kumlien <ian.kumlien@gmail.com>
> Date: Mon,  2 Jan 2017 00:19:36 +0100
>
>> __skb_flow_dissect can be called with a skb or a data packet, either
>> can be NULL. All calls seems to have been moved to __skb_header_pointer
>> except the pptp handling which is still calling skb_header_pointer.
>>
>> skb_header_pointer will use skb->data and thus:
>  ...
>> ---
>>
>> Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
>
> You need to fix some parts of your submission.
>
> Do not put the signoff after the "---", git will remove all text
> after that "---" from the commit message.

Sorry, I tend to do that automatically

> You must include a proper "Fixes: " tag which indicates which change
> introduced this regression.  This is critical for analyzing your fix
> and also for figuring out which -stable releases your fix should be
> backported to.

I it's supposed to be added after 4.7 but i can't find it in 4.8, will
send it as a stable patch for 4.9

> In this case the guilty commit is ab10dccb1160 ("rps: Inspect PPTP
> encapsulated by GRE to get flow hash")

Thanks, =)

Now patch is incomming

^ permalink raw reply

* tcp_bbr: Forcing set of BBR congestion control as default
From: Sedat Dilek @ 2017-01-02  5:05 UTC (permalink / raw)
  To: David S. Miller <davem@davemloft.net> Neal Cardwell; +Cc: netdev

Hi,

I am trying to force the set of BBR congestion control as default.
My old linux-config uses CUBIC as default.
I want both BBR and CUBIC to be built but BBR shall be my default.

I tried the below snippet.

I refresh my new linux-config like this...

$ MAKE="make V=1" ; COMPILER="mycompiler" ; MAKE_OPTS="CC=$COMPILER
HOSTCC=$COMPILER" ; yes "" | $MAKE $MAKE_OPTS oldconfig && $MAKE
$MAKE_OPTS silentoldconfig < /dev/null

What am I doing wrong?

Thanks in advance.

Regards,
- Sedat -

P.S.: Patch snippet

--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -642,7 +642,7 @@ config TCP_CONG_CDG

 config TCP_CONG_BBR
        tristate "BBR TCP"
-       default n
+       default y
        ---help---

        BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
@@ -657,7 +657,7 @@ config TCP_CONG_BBR

 choice
        prompt "Default TCP congestion control"
-       default DEFAULT_CUBIC
+       default DEFAULT_BBR
        help
          Select the TCP congestion control that will be used by default
          for all connections.
@@ -716,7 +716,7 @@ config DEFAULT_TCP_CONG
        default "dctcp" if DEFAULT_DCTCP
        default "cdg" if DEFAULT_CDG
        default "bbr" if DEFAULT_BBR
-       default "cubic"
+       default "bbr"

 config TCP_MD5SIG
        bool "TCP: MD5 Signature Option support (RFC2385)"

- EOT -

^ permalink raw reply

* Re: [PATCH] Update pptp handling to avoid null pointer deref.
From: David Miller @ 2017-01-02  4:07 UTC (permalink / raw)
  To: ian.kumlien; +Cc: linux-kernel, netdev
In-Reply-To: <20170101231936.5905-1-ian.kumlien@gmail.com>

From: Ian Kumlien <ian.kumlien@gmail.com>
Date: Mon,  2 Jan 2017 00:19:36 +0100

> __skb_flow_dissect can be called with a skb or a data packet, either
> can be NULL. All calls seems to have been moved to __skb_header_pointer
> except the pptp handling which is still calling skb_header_pointer.
> 
> skb_header_pointer will use skb->data and thus:
 ...
> ---
> 
> Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>

You need to fix some parts of your submission.

Do not put the signoff after the "---", git will remove all text
after that "---" from the commit message.

You must include a proper "Fixes: " tag which indicates which change
introduced this regression.  This is critical for analyzing your fix
and also for figuring out which -stable releases your fix should be
backported to.

In this case the guilty commit is ab10dccb1160 ("rps: Inspect PPTP
encapsulated by GRE to get flow hash")

^ permalink raw reply

* Re: [PATCH] Ipvlan should return an error when an address is already in use.
From: David Miller @ 2017-01-02  3:26 UTC (permalink / raw)
  To: kjlx; +Cc: maheshb, netdev
In-Reply-To: <20161231041058.GC2448@templeofstupid.com>

From: Krister Johansen <kjlx@templeofstupid.com>
Date: Fri, 30 Dec 2016 20:10:58 -0800

> The ipvlan code already knows how to detect when a duplicate address is
> about to be assigned to an ipvlan device.  However, that failure is not
> propogated outward and leads to a silent failure.  This teaches the ip
> address addition functions how to report this error to the user
> applications so that a notifier chain failure during ip address addition
> will not appear to succeed when it actually has not.
> 
> This can be especially useful if it is necessary to provision many
> ipvlans in containers.  The provisioning software (or operator) can use
> this to detect situations where an ip address is unexpectedly in use.
> 
> Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>

Your patch isn't handling the case of primary address promotions,
which also issue NETDEV_UP events on these notifier chains.

But on a more basic level, it's extremely important that once you
start using the notifier_{from,to}_errno() handling for a notifier,
you must start doing so for all such cases of that notifier.

^ permalink raw reply

* Re: [PATCH net 0/2] l2tp: socket lookup fixes for l2tp_ip and l2tp_ip6
From: David Miller @ 2017-01-02  3:07 UTC (permalink / raw)
  To: g.nault; +Cc: netdev, jchapman, celston
In-Reply-To: <cover.1483123297.git.g.nault@alphalink.fr>

From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 30 Dec 2016 19:48:17 +0100

> There are still some cases that aren't correctly handled in the socket
> lookup functions of l2tp_ip and l2tp_ip6. This series fixes lookups for
> connected sockets and for sockets bound to the IPv6 unspecified
> address.
> 
> bind() and connect() should now work as expected on IPPROTO_L2TP
> sockets. Extra features, like SO_REUSEADDR, remain unsupported.
> 
> The matching conditions in __l2tp_ip6_bind_lookup() and
> __l2tp_ip_bind_lookup() are getting hard to read. I've kept the single
> test approach to make the intend of the patches clear. I'll split the
> conditionals once these fixes reach net-next.

Series applied, thank you.

^ permalink raw reply

* Re: [PATCH] drop_monitor: add missing call to genlmsg_end
From: David Miller @ 2017-01-02  3:01 UTC (permalink / raw)
  To: wr0112358; +Cc: nhorman, netdev, linux-kernel
In-Reply-To: <20161231201157.10800-1-wr0112358@gmail.com>

From: Reiter Wolfgang <wr0112358@gmail.com>
Date: Sat, 31 Dec 2016 21:11:57 +0100

> Update nlmsg_len field with genlmsg_end to enable userspace processing
> using nlmsg_next helper. Also adds error handling.
> 
> Signed-off-by: Reiter Wolfgang <wr0112358@gmail.com>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH for-next 05/11] IB/mlx5: Refactor UMR post send format
From: David Miller @ 2017-01-02  2:11 UTC (permalink / raw)
  To: saeedm; +Cc: dledford, netdev, linux-rdma, leonro, ilyal, artemyko, leon
In-Reply-To: <1483305027-2573-6-git-send-email-saeedm@mellanox.com>

From: Saeed Mahameed <saeedm@mellanox.com>
Date: Sun,  1 Jan 2017 23:10:21 +0200

> diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
> index a1b3125..6d5f5d4 100644
> --- a/drivers/infiniband/hw/mlx5/qp.c
> +++ b/drivers/infiniband/hw/mlx5/qp.c
> @@ -3080,9 +3080,10 @@ static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
>  	dseg->addr       = cpu_to_be64(sg->addr);
>  }
>  
> -static __be16 get_klm_octo(int npages)
> +static inline u64 get_xlt_octo(u64 bytes)
>  {

Again, please don't use 'inline' in foo.c files.

^ permalink raw reply

* Re: [PATCH for-next 03/11] IB/mlx5: Add helper mlx5_ib_post_send_wait
From: David Miller @ 2017-01-02  2:10 UTC (permalink / raw)
  To: saeedm-VPRAkNaXOzVWk0Htik3J/w
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, leonro-VPRAkNaXOzVWk0Htik3J/w,
	ilyal-VPRAkNaXOzVWk0Htik3J/w, artemyko-VPRAkNaXOzVWk0Htik3J/w,
	binoy.jayan-QSEj5FYQhm4dnm+yROfE0A, leon-DgEjT+Ai2ygdnm+yROfE0A
In-Reply-To: <1483305027-2573-4-git-send-email-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Date: Sun,  1 Jan 2017 23:10:19 +0200

> diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
> index 8f608deb..afb6dc1 100644
> --- a/drivers/infiniband/hw/mlx5/mr.c
> +++ b/drivers/infiniband/hw/mlx5/mr.c
> @@ -891,16 +891,40 @@ static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
>  	init_completion(&context->done);
>  }
>  
> +static inline int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
> +					 struct mlx5_umr_wr *umrwr)

Please do not use 'inline' in foo.c files, let the compiler decide.

I know this is already happening in this file, but that should be
continued rather than replicated further.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] Update pptp handling to avoid null pointer deref.
From: Ian Kumlien @ 2017-01-01 23:19 UTC (permalink / raw)
  To: linux-kernel, netdev; +Cc: Ian Kumlien

__skb_flow_dissect can be called with a skb or a data packet, either
can be NULL. All calls seems to have been moved to __skb_header_pointer
except the pptp handling which is still calling skb_header_pointer.

skb_header_pointer will use skb->data and thus:
[  109.556866] BUG: unable to handle kernel NULL pointer dereference at 0000000000000080
[  109.557102] IP: [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
[  109.557263] PGD 0
[  109.557338]
[  109.557484] Oops: 0000 [#1] SMP
[  109.557562] Modules linked in: chaoskey
[  109.557783] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.0 #79
[  109.557867] Hardware name: Supermicro A1SRM-LN7F/LN5F/A1SRM-LN7F-2758, BIOS 1.0c 11/04/2015
[  109.557957] task: ffff94085c27bc00 task.stack: ffffb745c0068000
[  109.558041] RIP: 0010:[<ffffffff88dc02f8>]  [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
[  109.558203] RSP: 0018:ffff94087fc83d40  EFLAGS: 00010206
[  109.558286] RAX: 0000000000000130 RBX: ffffffff8975bf80 RCX: ffff94084fab6800
[  109.558373] RDX: 0000000000000010 RSI: 000000000000000c RDI: 0000000000000000
[  109.558460] RBP: 0000000000000b88 R08: 0000000000000000 R09: 0000000000000022
[  109.558547] R10: 0000000000000008 R11: ffff94087fc83e04 R12: 0000000000000000
[  109.558763] R13: ffff94084fab6800 R14: ffff94087fc83e04 R15: 000000000000002f
[  109.558979] FS:  0000000000000000(0000) GS:ffff94087fc80000(0000) knlGS:0000000000000000
[  109.559326] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  109.559539] CR2: 0000000000000080 CR3: 0000000281809000 CR4: 00000000001026e0
[  109.559753] Stack:
[  109.559957]  000000000000000c ffff94084fab6822 0000000000000001 ffff94085c2b5fc0
[  109.560578]  0000000000000001 0000000000002000 0000000000000000 0000000000000000
[  109.561200]  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  109.561820] Call Trace:
[  109.562027]  <IRQ>
[  109.562108]  [<ffffffff88dfb4fa>] ? eth_get_headlen+0x7a/0xf0
[  109.562522]  [<ffffffff88c5a35a>] ? igb_poll+0x96a/0xe80
[  109.562737]  [<ffffffff88dc912b>] ? net_rx_action+0x20b/0x350
[  109.562953]  [<ffffffff88546d68>] ? __do_softirq+0xe8/0x280
[  109.563169]  [<ffffffff8854704a>] ? irq_exit+0xaa/0xb0
[  109.563382]  [<ffffffff8847229b>] ? do_IRQ+0x4b/0xc0
[  109.563597]  [<ffffffff8902d4ff>] ? common_interrupt+0x7f/0x7f
[  109.563810]  <EOI>
[  109.563890]  [<ffffffff88d57530>] ? cpuidle_enter_state+0x130/0x2c0
[  109.564304]  [<ffffffff88d57520>] ? cpuidle_enter_state+0x120/0x2c0
[  109.564520]  [<ffffffff8857eacf>] ? cpu_startup_entry+0x19f/0x1f0
[  109.564737]  [<ffffffff8848d55a>] ? start_secondary+0x12a/0x140
[  109.564950] Code: 83 e2 20 a8 80 0f 84 60 01 00 00 c7 04 24 08 00
00 00 66 85 d2 0f 84 be fe ff ff e9 69 fe ff ff 8b 34 24 89 f2 83 c2
04 66 85 c0 <41> 8b 84 24 80 00 00 00 0f 49 d6 41 8d 31 01 d6 41 2b 84
24 84
[  109.569959] RIP  [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
[  109.570245]  RSP <ffff94087fc83d40>
[  109.570453] CR2: 0000000000000080
---

Signed-off-by: Ian Kumlien <ian.kumlien@gmail.com>
---
 net/core/flow_dissector.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c6d8207ffa7e..32e4e0158846 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -445,8 +445,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			if (hdr->flags & GRE_ACK)
 				offset += sizeof(((struct pptp_gre_header *)0)->ack);
 
-			ppp_hdr = skb_header_pointer(skb, nhoff + offset,
-						     sizeof(_ppp_hdr), _ppp_hdr);
+			ppp_hdr = __skb_header_pointer(skb, nhoff + offset,
+						     sizeof(_ppp_hdr),
+						     data, hlen, _ppp_hdr);
 			if (!ppp_hdr)
 				goto out_bad;
 
-- 
2.11.0

^ permalink raw reply related

* Re: [BUG] 4.9 - kernel oops when pptp connection is established and the kernel doesn't have pptp modules compiled
From: Ian Kumlien @ 2017-01-01 22:57 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org; +Cc: Ian Kumlien, netdev
In-Reply-To: <CAA85sZsFOcNMkN5tX-mL6OBgMsAKB=sU0Udt=fp=6WK0Onm6eA@mail.gmail.com>

Sorry, after further inspection, it seems like the choice of
skb_header_pointer is wrong and skb isn't always populated. It seems
like the code has suffered from bitrot since all the surrounding code
is actually fixed.

This fixes it as well:
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c6d8207ffa7e..32e4e0158846 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -445,8 +445,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
                        if (hdr->flags & GRE_ACK)
                                offset += sizeof(((struct
pptp_gre_header *)0)->ack);

-                       ppp_hdr = skb_header_pointer(skb, nhoff + offset,
-                                                    sizeof(_ppp_hdr),
_ppp_hdr);
+                       ppp_hdr = __skb_header_pointer(skb, nhoff + offset,
+                                                    sizeof(_ppp_hdr),
+                                                    data, hlen, _ppp_hdr);
                        if (!ppp_hdr)
                                goto out_bad;

Will send a patch with signed off by n' all.

On Sun, Jan 1, 2017 at 6:31 PM, Ian Kumlien <ian.kumlien@gmail.com> wrote:
> On Fri, Dec 30, 2016 at 11:48 PM, Ian Kumlien <ian.kumlien@gmail.com> wrote:
>> Hi,
>>
>> Been fighting with "crash" to get it to help me to analyze my crash
>> dumps... This is the output from vmcore-dmesg.
>>
>> This is 100% reproducible...
>>
>> Config that lets the connection trough but crashes the kernel:
>> # CONFIG_NF_CONNTRACK_PPTP is not set
>> # CONFIG_NF_NAT_PPTP is not set
>> CONFIG_PPTP=y
>>
>> If I enable the *_NF_* options, it doesn't crash but it also blocks
>> the PPTP packets.
>>
>> The crash is after the negotiation bit...
>
> So, some of the dumps pointed me, after some coaxing, to
> net/core/flow_dissector.c:448
> ---
>                         ppp_hdr = skb_header_pointer(skb, nhoff + offset,
>                                                      sizeof(_ppp_hdr),
> _ppp_hdr);
>                         if (!ppp_hdr)
>                                 goto out_bad;
> --
>
> Ie, copy or get the information from the skb to get more information
> on the pptp connection.
>
> However include/linux/skbuff.h:3109, with my test and debug code added
> static inline void * __must_check
> __skb_header_pointer(const struct sk_buff *skb, int offset,
>                      int len, void *data, int hlen, void *buffer)
> {
>         if (hlen - offset >= len)
>         {
>                 if (skb == NULL || data == NULL)
>                 {
>                         printk("WARNING: something is null skb:%p
> data:%p - offset: %i hlen: %i len: %i\n", skb, data, offset, hlen,
> len);
>                         return NULL;
>                 }
>                 else
>                         return data + offset;
>         }
>
>         if (!skb ||
>             skb_copy_bits(skb, offset, buffer, len) < 0)
>                 return NULL;
>
>         return buffer;
> }
>
> static inline void * __must_check
> skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer)
> {
>         return __skb_header_pointer(skb, offset, len, skb->data,
>                                     skb_headlen(skb), buffer);
> }
> ---
>
> so skb_header_pointer sends skb->data as data, but we never check if
> skb is *NULL*
>
> This does happen when we do a pptp connection:
> [   89.606712] WARNING: something is null skb:          (null)
> data:ffff88bccc0d4000 - offset: 14 hlen: 256 len: 20
> [   89.613264] WARNING: something is null skb:          (null)
> data:ffff88bccc00f800 - offset: 14 hlen: 256 len: 20
> [   89.621005] WARNING: something is null skb:          (null)
> data:ffff88bccc010800 - offset: 14 hlen: 256 len: 20
> [   89.650479] WARNING: something is null skb:          (null)
> data:ffff88bccc2cb000 - offset: 14 hlen: 256 len: 20
>
> So, the question is if the skb should always be there and always be
> valid? In that case something like this should fix it:
> static inline void * __must_check
> __skb_header_pointer(const struct sk_buff *skb, int offset,
>                      int len, void *data, int hlen, void *buffer)
> {
>         if (!skb)
>                 return NULL;
>
>         if (hlen - offset >= len)
>                 return data + offset;
>
>         if (skb_copy_bits(skb, offset, buffer, len) < 0)
>                 return NULL;
>
>         return buffer;
> }
> ---
>
> Else the actual check would have to be moved to skb_header_pointer in
> this case - comments?
>
>> [  109.556866] BUG: unable to handle kernel NULL pointer dereference
>> at 0000000000000080
>> [  109.557102] IP: [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
>> [  109.557263] PGD 0
>> [  109.557338]
>> [  109.557484] Oops: 0000 [#1] SMP
>> [  109.557562] Modules linked in: chaoskey
>> [  109.557783] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.0 #79
>> [  109.557867] Hardware name: Supermicro
>> A1SRM-LN7F/LN5F/A1SRM-LN7F-2758, BIOS 1.0c 11/04/2015
>> [  109.557957] task: ffff94085c27bc00 task.stack: ffffb745c0068000
>> [  109.558041] RIP: 0010:[<ffffffff88dc02f8>]  [<ffffffff88dc02f8>]
>> __skb_flow_dissect+0xa88/0xce0
>> [  109.558203] RSP: 0018:ffff94087fc83d40  EFLAGS: 00010206
>> [  109.558286] RAX: 0000000000000130 RBX: ffffffff8975bf80 RCX: ffff94084fab6800
>> [  109.558373] RDX: 0000000000000010 RSI: 000000000000000c RDI: 0000000000000000
>> [  109.558460] RBP: 0000000000000b88 R08: 0000000000000000 R09: 0000000000000022
>> [  109.558547] R10: 0000000000000008 R11: ffff94087fc83e04 R12: 0000000000000000
>> [  109.558763] R13: ffff94084fab6800 R14: ffff94087fc83e04 R15: 000000000000002f
>> [  109.558979] FS:  0000000000000000(0000) GS:ffff94087fc80000(0000)
>> knlGS:0000000000000000
>> [  109.559326] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [  109.559539] CR2: 0000000000000080 CR3: 0000000281809000 CR4: 00000000001026e0
>> [  109.559753] Stack:
>> [  109.559957]  000000000000000c ffff94084fab6822 0000000000000001
>> ffff94085c2b5fc0
>> [  109.560578]  0000000000000001 0000000000002000 0000000000000000
>> 0000000000000000
>> [  109.561200]  0000000000000000 0000000000000000 0000000000000000
>> 0000000000000000
>> [  109.561820] Call Trace:
>> [  109.562027]  <IRQ>
>> [  109.562108]  [<ffffffff88dfb4fa>] ? eth_get_headlen+0x7a/0xf0
>> [  109.562522]  [<ffffffff88c5a35a>] ? igb_poll+0x96a/0xe80
>> [  109.562737]  [<ffffffff88dc912b>] ? net_rx_action+0x20b/0x350
>> [  109.562953]  [<ffffffff88546d68>] ? __do_softirq+0xe8/0x280
>> [  109.563169]  [<ffffffff8854704a>] ? irq_exit+0xaa/0xb0
>> [  109.563382]  [<ffffffff8847229b>] ? do_IRQ+0x4b/0xc0
>> [  109.563597]  [<ffffffff8902d4ff>] ? common_interrupt+0x7f/0x7f
>> [  109.563810]  <EOI>
>> [  109.563890]  [<ffffffff88d57530>] ? cpuidle_enter_state+0x130/0x2c0
>> [  109.564304]  [<ffffffff88d57520>] ? cpuidle_enter_state+0x120/0x2c0
>> [  109.564520]  [<ffffffff8857eacf>] ? cpu_startup_entry+0x19f/0x1f0
>> [  109.564737]  [<ffffffff8848d55a>] ? start_secondary+0x12a/0x140
>> [  109.564950] Code: 83 e2 20 a8 80 0f 84 60 01 00 00 c7 04 24 08 00
>> 00 00 66 85 d2 0f 84 be fe ff ff e9 69 fe ff ff 8b 34 24 89 f2 83 c2
>> 04 66 85 c0 <41> 8b 84 24 80 00 00 00 0f 49 d6 41 8d 31 01 d6 41 2b 84
>> 24 84
>> [  109.569959] RIP  [<ffffffff88dc02f8>] __skb_flow_dissect+0xa88/0xce0
>> [  109.570245]  RSP <ffff94087fc83d40>
>> [  109.570453] CR2: 0000000000000080

^ permalink raw reply related

* [PATCH v2 net-next 1/2] af_packet: TX_RING support for TPACKET_V3
From: Sowmini Varadhan @ 2017-01-01 22:45 UTC (permalink / raw)
  To: netdev, sowmini.varadhan; +Cc: daniel, willemb, davem
In-Reply-To: <cover.1483309545.git.sowmini.varadhan@oracle.com>

Although TPACKET_V3 Rx has some benefits over TPACKET_V2 Rx, *_v3
does not currently have TX_RING support. As a result an application
that wants the best perf for Tx and Rx (e.g. to handle request/response
transacations) ends up needing 2 sockets, one with *_v2 for Tx and
another with *_v3 for Rx.

This patch enables TPACKET_V2 compatible Tx features in TPACKET_V3
so that an application can use a single descriptor to get the benefits
of _v3 RX_RING and _v2 TX_RING. An application may do a block-send by
first filling up multiple frames in the Tx ring and then triggering a
transmit. This patch only support fixed size Tx frames for TPACKET_V3,
and requires that tp_next_offset must be zero.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
v2: sanity checks on tp_next_offset and corresponding Doc updates
    as suggested by Willem de Bruijn

 Documentation/networking/packet_mmap.txt |    9 +++++++--
 net/packet/af_packet.c                   |   27 +++++++++++++++++++--------
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index daa015a..f3b9e50 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -565,7 +565,7 @@ where 'tpacket_version' can be TPACKET_V1 (default), TPACKET_V2, TPACKET_V3.
 		   (void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr))
 
 TPACKET_V2 --> TPACKET_V3:
-	- Flexible buffer implementation:
+	- Flexible buffer implementation for RX_RING:
 		1. Blocks can be configured with non-static frame-size
 		2. Read/poll is at a block-level (as opposed to packet-level)
 		3. Added poll timeout to avoid indefinite user-space wait
@@ -574,7 +574,12 @@ where 'tpacket_version' can be TPACKET_V1 (default), TPACKET_V2, TPACKET_V3.
 			4.1 block::timeout
 			4.2 tpkt_hdr::sk_rxhash
 	- RX Hash data available in user space
-	- Currently only RX_RING available
+	- TX_RING semantics are conceptually similar to TPACKET_V2;
+	  use tpacket3_hdr instead of tpacket2_hdr, and TPACKET3_HDRLEN
+	  instead of TPACKET2_HDRLEN. In the current implementation,
+	  the tp_next_offset field in the tpacket3_hdr MUST be set to
+	  zero, indicating that the ring does not hold variable sized frames.
+	  Packets with non-zero values of tp_next_offset will be dropped.
 
 -------------------------------------------------------------------------------
 + AF_PACKET fanout mode
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 89f2e8c..513f52b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
 	case TPACKET_V3:
+		h.h3->tp_status = status;
+		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+		break;
 	default:
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
@@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
 	case TPACKET_V3:
+		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+		return h.h3->tp_status;
 	default:
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
@@ -2500,6 +2505,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 	ph.raw = frame;
 
 	switch (po->tp_version) {
+	case TPACKET_V3:
+		if (ph.h3->tp_next_offset != 0) {
+			pr_warn_once("variable sized slot not supported");
+			return -EINVAL;
+		}
+		tp_len = ph.h3->tp_len;
+		break;
 	case TPACKET_V2:
 		tp_len = ph.h2->tp_len;
 		break;
@@ -2519,6 +2531,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 		off_max = po->tx_ring.frame_size - tp_len;
 		if (po->sk.sk_type == SOCK_DGRAM) {
 			switch (po->tp_version) {
+			case TPACKET_V3:
+				off = ph.h3->tp_net;
+				break;
 			case TPACKET_V2:
 				off = ph.h2->tp_net;
 				break;
@@ -2528,6 +2543,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 			}
 		} else {
 			switch (po->tp_version) {
+			case TPACKET_V3:
+				off = ph.h3->tp_mac;
+				break;
 			case TPACKET_V2:
 				off = ph.h2->tp_mac;
 				break;
@@ -4116,11 +4134,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	struct tpacket_req *req = &req_u->req;
 
 	lock_sock(sk);
-	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
-	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
-		net_warn_ratelimited("Tx-ring is not supported.\n");
-		goto out;
-	}
 
 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -4180,9 +4193,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			goto out;
 		switch (po->tp_version) {
 		case TPACKET_V3:
-		/* Transmit path is not supported. We checked
-		 * it above but just being paranoid
-		 */
+			/* Block transmit is not supported yet */
 			if (!tx_ring)
 				init_prb_bdqc(po, rb, pg_vec, req_u);
 			break;
-- 
1.7.1

^ permalink raw reply related

* [PATCH v2 net-next 2/2] tools: test case for TPACKET_V3/TX_RING support
From: Sowmini Varadhan @ 2017-01-01 22:45 UTC (permalink / raw)
  To: netdev, sowmini.varadhan; +Cc: daniel, willemb, davem
In-Reply-To: <cover.1483309545.git.sowmini.varadhan@oracle.com>

Add a test case and sample code for (TPACKET_V3, PACKET_TX_RING)

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
v2: Added test case.

 tools/testing/selftests/net/psock_tpacket.c |  110 ++++++++++++++++++++++++++-
 1 files changed, 109 insertions(+), 1 deletions(-)

diff --git a/tools/testing/selftests/net/psock_tpacket.c b/tools/testing/selftests/net/psock_tpacket.c
index 24adf70..f2012dc 100644
--- a/tools/testing/selftests/net/psock_tpacket.c
+++ b/tools/testing/selftests/net/psock_tpacket.c
@@ -311,6 +311,17 @@ static inline void __v2_tx_user_ready(struct tpacket2_hdr *hdr)
 	__sync_synchronize();
 }
 
+static inline int __v3_tx_kernel_ready(struct tpacket3_hdr *hdr)
+{
+	return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
+}
+
+static inline void __v3_tx_user_ready(struct tpacket3_hdr *hdr)
+{
+	hdr->tp_status = TP_STATUS_SEND_REQUEST;
+	__sync_synchronize();
+}
+
 static inline int __v1_v2_tx_kernel_ready(void *base, int version)
 {
 	switch (version) {
@@ -578,12 +589,108 @@ static void walk_v3_rx(int sock, struct ring *ring)
 	fprintf(stderr, " %u pkts (%u bytes)", NUM_PACKETS, total_bytes >> 1);
 }
 
+static inline void *
+get_v3_frame(struct ring *ring, int n)
+{
+	uint8_t *f0 = ring->rd[0].iov_base;
+
+	return f0 + (n * ring->req3.tp_frame_size);
+}
+
+static void walk_v3_tx(int sock, struct ring *ring)
+{
+	struct pollfd pfd;
+	int rcv_sock, ret;
+	size_t packet_len;
+	char packet[1024];
+	unsigned int frame_num = 0, got = 0;
+	struct sockaddr_ll ll = {
+		.sll_family = PF_PACKET,
+		.sll_halen = ETH_ALEN,
+	};
+
+	bug_on(ring->type != PACKET_TX_RING);
+	bug_on(ring->req3.tp_frame_nr < NUM_PACKETS);
+
+	rcv_sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+	if (rcv_sock == -1) {
+		perror("socket");
+		exit(1);
+	}
+
+	pair_udp_setfilter(rcv_sock);
+
+	ll.sll_ifindex = if_nametoindex("lo");
+	ret = bind(rcv_sock, (struct sockaddr *) &ll, sizeof(ll));
+	if (ret == -1) {
+		perror("bind");
+		exit(1);
+	}
+
+	memset(&pfd, 0, sizeof(pfd));
+	pfd.fd = sock;
+	pfd.events = POLLOUT | POLLERR;
+	pfd.revents = 0;
+
+	total_packets = NUM_PACKETS;
+	create_payload(packet, &packet_len);
+
+	while (total_packets > 0) {
+		struct tpacket3_hdr *tx = get_v3_frame(ring, frame_num);
+
+		while (__v3_tx_kernel_ready(tx) && total_packets > 0) {
+			tx->tp_snaplen = packet_len;
+			tx->tp_len = packet_len;
+			memcpy((uint8_t *) tx + TPACKET3_HDRLEN -
+			       sizeof(struct sockaddr_ll), packet, packet_len);
+			total_bytes += tx->tp_snaplen;
+
+			status_bar_update();
+			total_packets--;
+
+			__v3_tx_user_ready(tx);
+
+			frame_num = (frame_num + 1) % ring->req3.tp_frame_nr;
+		}
+
+		poll(&pfd, 1, 1);
+	}
+
+	bug_on(total_packets != 0);
+
+	ret = sendto(sock, NULL, 0, 0, NULL, 0);
+	if (ret == -1) {
+		perror("sendto");
+		exit(1);
+	}
+
+	while ((ret = recvfrom(rcv_sock, packet, sizeof(packet),
+			       0, NULL, NULL)) > 0 &&
+	       total_packets < NUM_PACKETS) {
+		got += ret;
+		test_payload(packet, ret);
+
+		status_bar_update();
+		total_packets++;
+	}
+
+	close(rcv_sock);
+
+	if (total_packets != NUM_PACKETS) {
+		fprintf(stderr, "walk_v%d_rx: received %u out of %u pkts\n",
+			ring->version, total_packets, NUM_PACKETS);
+		exit(1);
+	}
+
+	fprintf(stderr, " %u pkts (%u bytes)", NUM_PACKETS, got);
+}
+
 static void walk_v3(int sock, struct ring *ring)
 {
 	if (ring->type == PACKET_RX_RING)
 		walk_v3_rx(sock, ring);
 	else
-		bug_on(1);
+		walk_v3_tx(sock, ring);
 }
 
 static void __v1_v2_fill(struct ring *ring, unsigned int blocks)
@@ -796,6 +903,7 @@ int main(void)
 	ret |= test_tpacket(TPACKET_V2, PACKET_TX_RING);
 
 	ret |= test_tpacket(TPACKET_V3, PACKET_RX_RING);
+	ret |= test_tpacket(TPACKET_V3, PACKET_TX_RING);
 
 	if (ret)
 		return 1;
-- 
1.7.1

^ permalink raw reply related

* [PATCH v2 net-next 0/2] TPACKET_V3 TX_RING support
From: Sowmini Varadhan @ 2017-01-01 22:45 UTC (permalink / raw)
  To: netdev, sowmini.varadhan; +Cc: daniel, willemb, davem

This patch series allows an application to use a single PF_PACKET
descriptor and leverage the best implementations of TX_RING
and RX_RING that exist today.

Updates in v2 are listed below:

- patch 1 (which builds on the earlier patch discussed at
  http://patchwork.ozlabs.org/patch/709840/) verifies that tp_next_offset
  is set to 0 for TPACKET_V3 in the tpacket_snd path, indicating that
  variable-sized frames are not in use. Applications that wish to do
  block-sends must fill up multiple frames in the Tx ring and then
  trigger the transmit. At the current time, only fixed-size frames are
  supported on TX_RING for TPACKET_V3.

- patch 2 in this series adds a test case and sample code for
  (TPACKET_V3, PACKET_TX_RING) in testing/selftests

Sowmini Varadhan (2):
  af_packet: TX_RING support for TPACKET_V3
  tools: test case for TPACKET_V3/TX_RING support

 Documentation/networking/packet_mmap.txt    |    9 ++-
 net/packet/af_packet.c                      |   27 +++++--
 tools/testing/selftests/net/psock_tpacket.c |  110 ++++++++++++++++++++++++++-
 3 files changed, 135 insertions(+), 11 deletions(-)

^ permalink raw reply

* [PATCH for-next 09/11] {net,IB}/mlx5: Refactor page fault handling
From: Saeed Mahameed @ 2017-01-01 21:10 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Ilya Lesokhin,
	Artemy Kovalyov, Leon Romanovsky, Saeed Mahameed
In-Reply-To: <1483305027-2573-1-git-send-email-saeedm@mellanox.com>

From: Artemy Kovalyov <artemyko@mellanox.com>

* Update page fault event according to last specification.
* Separate code path for page fault EQ, completion EQ and async EQ.
* Move page fault handling work queue from mlx5_ib static variable
  into mlx5_core page fault EQ.
* Allocate memory to store ODP event dynamically as the
  events arrive, since in atomic context - use mempool.
* Make mlx5_ib page fault handler run in process context.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c                  |  14 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |  49 +---
 drivers/infiniband/hw/mlx5/odp.c                   | 300 ++++++++-------------
 drivers/infiniband/hw/mlx5/qp.c                    |  26 --
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      |  33 +++
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       | 290 +++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  21 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/qp.c       | 108 --------
 include/linux/mlx5/device.h                        |   6 +-
 include/linux/mlx5/driver.h                        |  97 ++++++-
 include/linux/mlx5/qp.h                            |  44 ---
 12 files changed, 522 insertions(+), 468 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b871272..86c61e7 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3319,6 +3319,9 @@ static struct mlx5_interface mlx5_ib_interface = {
 	.add            = mlx5_ib_add,
 	.remove         = mlx5_ib_remove,
 	.event          = mlx5_ib_event,
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	.pfault		= mlx5_ib_pfault,
+#endif
 	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
 };
 
@@ -3329,25 +3332,14 @@ static int __init mlx5_ib_init(void)
 	if (deprecated_prof_sel != 2)
 		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
 
-	err = mlx5_ib_odp_init();
-	if (err)
-		return err;
-
 	err = mlx5_register_interface(&mlx5_ib_interface);
-	if (err)
-		goto clean_odp;
-
-	return err;
 
-clean_odp:
-	mlx5_ib_odp_cleanup();
 	return err;
 }
 
 static void __exit mlx5_ib_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5_ib_interface);
-	mlx5_ib_odp_cleanup();
 }
 
 module_init(mlx5_ib_init);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 02d9255..a51c805 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -277,29 +277,6 @@ struct mlx5_ib_rwq_ind_table {
 	u32			rqtn;
 };
 
-/*
- * Connect-IB can trigger up to four concurrent pagefaults
- * per-QP.
- */
-enum mlx5_ib_pagefault_context {
-	MLX5_IB_PAGEFAULT_RESPONDER_READ,
-	MLX5_IB_PAGEFAULT_REQUESTOR_READ,
-	MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
-	MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
-	MLX5_IB_PAGEFAULT_CONTEXTS
-};
-
-static inline enum mlx5_ib_pagefault_context
-	mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
-{
-	return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
-}
-
-struct mlx5_ib_pfault {
-	struct work_struct	work;
-	struct mlx5_pagefault	mpfault;
-};
-
 struct mlx5_ib_ubuffer {
 	struct ib_umem	       *umem;
 	int			buf_size;
@@ -385,20 +362,6 @@ struct mlx5_ib_qp {
 	/* Store signature errors */
 	bool			signature_en;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	/*
-	 * A flag that is true for QP's that are in a state that doesn't
-	 * allow page faults, and shouldn't schedule any more faults.
-	 */
-	int                     disable_page_faults;
-	/*
-	 * The disable_page_faults_lock protects a QP's disable_page_faults
-	 * field, allowing for a thread to atomically check whether the QP
-	 * allows page faults, and if so schedule a page fault.
-	 */
-	spinlock_t              disable_page_faults_lock;
-	struct mlx5_ib_pfault	pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
-#endif
 	struct list_head	qps_list;
 	struct list_head	cq_recv_list;
 	struct list_head	cq_send_list;
@@ -869,18 +832,13 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
 int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-extern struct workqueue_struct *mlx5_ib_page_fault_wq;
-
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
-void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
-			       struct mlx5_ib_pfault *pfault);
-void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
+void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
+		    struct mlx5_pagefault *pfault);
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
 void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
-void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
@@ -889,13 +847,10 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 	return;
 }
 
-static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)		{}
 static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
 static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)	{}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				{}
-static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
-static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)  {}
 
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index cfd7ee5..26f96c7 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -41,8 +41,6 @@
  * a pagefault. */
 #define MMU_NOTIFIER_TIMEOUT 1000
 
-struct workqueue_struct *mlx5_ib_page_fault_wq;
-
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end)
 {
@@ -162,38 +160,38 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 	return container_of(mmkey, struct mlx5_ib_mr, mmkey);
 }
 
-static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
-				      struct mlx5_ib_pfault *pfault,
+static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
+				      struct mlx5_pagefault *pfault,
 				      int error)
 {
-	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
-	u32 qpn = qp->trans_qp.base.mqp.qpn;
+	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
+		     pfault->wqe.wq_num : pfault->token;
 	int ret = mlx5_core_page_fault_resume(dev->mdev,
-					      qpn,
-					      pfault->mpfault.flags,
+					      pfault->token,
+					      wq_num,
+					      pfault->type,
 					      error);
 	if (ret)
-		pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn);
+		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
+			    wq_num);
 }
 
 /*
- * Handle a single data segment in a page-fault WQE.
+ * Handle a single data segment in a page-fault WQE or RDMA region.
  *
- * Returns number of pages retrieved on success. The caller will continue to
+ * Returns number of pages retrieved on success. The caller may continue to
  * the next data segment.
  * Can return the following error codes:
  * -EAGAIN to designate a temporary error. The caller will abort handling the
  *  page fault and resolve it.
  * -EFAULT when there's an error mapping the requested pages. The caller will
- *  abort the page fault handling and possibly move the QP to an error state.
- * On other errors the QP should also be closed with an error.
+ *  abort the page fault handling.
  */
-static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
-					 struct mlx5_ib_pfault *pfault,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
 					 u32 key, u64 io_virt, size_t bcnt,
+					 u32 *bytes_committed,
 					 u32 *bytes_mapped)
 {
-	struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
 	int srcu_key;
 	unsigned int current_seq;
 	u64 start_idx;
@@ -219,12 +217,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 			 key);
 		if (bytes_mapped)
 			*bytes_mapped +=
-				(bcnt - pfault->mpfault.bytes_committed);
-		goto srcu_unlock;
-	}
-	if (mr->ibmr.pd != qp->ibqp.pd) {
-		pr_err("Page-fault with different PDs for QP and MR.\n");
-		ret = -EFAULT;
+				(bcnt - *bytes_committed);
 		goto srcu_unlock;
 	}
 
@@ -240,8 +233,8 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 	 * in all iterations (in iteration 2 and above,
 	 * bytes_committed == 0).
 	 */
-	io_virt += pfault->mpfault.bytes_committed;
-	bcnt -= pfault->mpfault.bytes_committed;
+	io_virt += *bytes_committed;
+	bcnt -= *bytes_committed;
 
 	start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;
 
@@ -300,7 +293,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 		}
 	}
 	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
-	pfault->mpfault.bytes_committed = 0;
+	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
 
@@ -322,8 +315,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
  * Returns the number of pages loaded if positive, zero for an empty WQE, or a
  * negative error code.
  */
-static int pagefault_data_segments(struct mlx5_ib_qp *qp,
-				   struct mlx5_ib_pfault *pfault, void *wqe,
+static int pagefault_data_segments(struct mlx5_ib_dev *dev,
+				   struct mlx5_pagefault *pfault,
+				   struct mlx5_ib_qp *qp, void *wqe,
 				   void *wqe_end, u32 *bytes_mapped,
 				   u32 *total_wqe_bytes, int receive_queue)
 {
@@ -367,22 +361,23 @@ static int pagefault_data_segments(struct mlx5_ib_qp *qp,
 
 		if (!inline_segment && total_wqe_bytes) {
 			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
-					pfault->mpfault.bytes_committed);
+					pfault->bytes_committed);
 		}
 
 		/* A zero length data segment designates a length of 2GB. */
 		if (bcnt == 0)
 			bcnt = 1U << 31;
 
-		if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
-			pfault->mpfault.bytes_committed -=
+		if (inline_segment || bcnt <= pfault->bytes_committed) {
+			pfault->bytes_committed -=
 				min_t(size_t, bcnt,
-				      pfault->mpfault.bytes_committed);
+				      pfault->bytes_committed);
 			continue;
 		}
 
-		ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
-						    bcnt, bytes_mapped);
+		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
+						    &pfault->bytes_committed,
+						    bytes_mapped);
 		if (ret < 0)
 			break;
 		npages += ret;
@@ -396,12 +391,11 @@ static int pagefault_data_segments(struct mlx5_ib_qp *qp,
  * scatter-gather list, and set wqe_end to the end of the WQE.
  */
 static int mlx5_ib_mr_initiator_pfault_handler(
-	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
-	void **wqe, void **wqe_end, int wqe_length)
+	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
+	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
 {
-	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
 	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
-	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
+	u16 wqe_index = pfault->wqe.wqe_index;
 	unsigned ds, opcode;
 #if defined(DEBUG)
 	u32 ctrl_wqe_index, ctrl_qpn;
@@ -502,10 +496,9 @@ static int mlx5_ib_mr_initiator_pfault_handler(
  * scatter-gather list, and set wqe_end to the end of the WQE.
  */
 static int mlx5_ib_mr_responder_pfault_handler(
-	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
-	void **wqe, void **wqe_end, int wqe_length)
+	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
+	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
 {
-	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
 	struct mlx5_ib_wq *wq = &qp->rq;
 	int wqe_size = 1 << wq->wqe_shift;
 
@@ -542,70 +535,83 @@ static int mlx5_ib_mr_responder_pfault_handler(
 	return 0;
 }
 
-static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
-					  struct mlx5_ib_pfault *pfault)
+static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
+					      u32 wq_num)
+{
+	struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);
+
+	if (!mqp) {
+		mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
+		return NULL;
+	}
+
+	return to_mibqp(mqp);
+}
+
+static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
+					  struct mlx5_pagefault *pfault)
 {
-	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
 	int ret;
 	void *wqe, *wqe_end;
 	u32 bytes_mapped, total_wqe_bytes;
 	char *buffer = NULL;
-	int resume_with_error = 0;
-	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
-	int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
-	u32 qpn = qp->trans_qp.base.mqp.qpn;
+	int resume_with_error = 1;
+	u16 wqe_index = pfault->wqe.wqe_index;
+	int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
+	struct mlx5_ib_qp *qp;
 
 	buffer = (char *)__get_free_page(GFP_KERNEL);
 	if (!buffer) {
 		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
-		resume_with_error = 1;
 		goto resolve_page_fault;
 	}
 
+	qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
+	if (!qp)
+		goto resolve_page_fault;
+
 	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
 				    PAGE_SIZE, &qp->trans_qp.base);
 	if (ret < 0) {
-		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
-			    -ret, wqe_index, qpn);
-		resume_with_error = 1;
+		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
+			    ret, wqe_index, pfault->token);
 		goto resolve_page_fault;
 	}
 
 	wqe = buffer;
 	if (requestor)
-		ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
+		ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
 							  &wqe_end, ret);
 	else
-		ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
+		ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
 							  &wqe_end, ret);
-	if (ret < 0) {
-		resume_with_error = 1;
+	if (ret < 0)
 		goto resolve_page_fault;
-	}
 
 	if (wqe >= wqe_end) {
 		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
-		resume_with_error = 1;
 		goto resolve_page_fault;
 	}
 
-	ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
-				      &total_wqe_bytes, !requestor);
+	ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
+				      &bytes_mapped, &total_wqe_bytes,
+				      !requestor);
 	if (ret == -EAGAIN) {
+		resume_with_error = 0;
 		goto resolve_page_fault;
 	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
-		mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
-			    -ret);
-		resume_with_error = 1;
+		if (ret != -ENOENT)
+			mlx5_ib_err(dev, "Error getting user pages for page fault. Error: %d\n",
+				    ret);
 		goto resolve_page_fault;
 	}
 
+	resume_with_error = 0;
 resolve_page_fault:
-	mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
-	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
-		    qpn, resume_with_error,
-		    pfault->mpfault.flags);
-
+	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
+	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
+		    pfault->token, resume_with_error,
+		    pfault->type);
 	free_page((unsigned long)buffer);
 }
 
@@ -615,15 +621,14 @@ static int pages_in_range(u64 address, u32 length)
 		(address & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
-static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
-					   struct mlx5_ib_pfault *pfault)
+static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
+					   struct mlx5_pagefault *pfault)
 {
-	struct mlx5_pagefault *mpfault = &pfault->mpfault;
 	u64 address;
 	u32 length;
-	u32 prefetch_len = mpfault->bytes_committed;
+	u32 prefetch_len = pfault->bytes_committed;
 	int prefetch_activated = 0;
-	u32 rkey = mpfault->rdma.r_key;
+	u32 rkey = pfault->rdma.r_key;
 	int ret;
 
 	/* The RDMA responder handler handles the page fault in two parts.
@@ -632,38 +637,40 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
 	 * prefetches more pages. The second operation cannot use the pfault
 	 * context and therefore uses the dummy_pfault context allocated on
 	 * the stack */
-	struct mlx5_ib_pfault dummy_pfault = {};
+	pfault->rdma.rdma_va += pfault->bytes_committed;
+	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
+					 pfault->rdma.rdma_op_len);
+	pfault->bytes_committed = 0;
 
-	dummy_pfault.mpfault.bytes_committed = 0;
-
-	mpfault->rdma.rdma_va += mpfault->bytes_committed;
-	mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
-					 mpfault->rdma.rdma_op_len);
-	mpfault->bytes_committed = 0;
-
-	address = mpfault->rdma.rdma_va;
-	length  = mpfault->rdma.rdma_op_len;
+	address = pfault->rdma.rdma_va;
+	length  = pfault->rdma.rdma_op_len;
 
 	/* For some operations, the hardware cannot tell the exact message
 	 * length, and in those cases it reports zero. Use prefetch
 	 * logic. */
 	if (length == 0) {
 		prefetch_activated = 1;
-		length = mpfault->rdma.packet_size;
+		length = pfault->rdma.packet_size;
 		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
 	}
 
-	ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
-					    NULL);
+	ret = pagefault_single_data_segment(dev, rkey, address, length,
+					    &pfault->bytes_committed, NULL);
 	if (ret == -EAGAIN) {
 		/* We're racing with an invalidation, don't prefetch */
 		prefetch_activated = 0;
 	} else if (ret < 0 || pages_in_range(address, length) > ret) {
-		mlx5_ib_page_fault_resume(qp, pfault, 1);
+		mlx5_ib_page_fault_resume(dev, pfault, 1);
+		if (ret != -ENOENT)
+			mlx5_ib_warn(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
+				     ret, pfault->token, pfault->type);
 		return;
 	}
 
-	mlx5_ib_page_fault_resume(qp, pfault, 0);
+	mlx5_ib_page_fault_resume(dev, pfault, 0);
+	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
+		    pfault->token, pfault->type,
+		    prefetch_activated);
 
 	/* At this point, there might be a new pagefault already arriving in
 	 * the eq, switch to the dummy pagefault for the rest of the
@@ -671,112 +678,39 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
 	 * work-queue is being fenced. */
 
 	if (prefetch_activated) {
-		ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
-						    address,
+		u32 bytes_committed = 0;
+
+		ret = pagefault_single_data_segment(dev, rkey, address,
 						    prefetch_len,
-						    NULL);
+						    &bytes_committed, NULL);
 		if (ret < 0) {
-			pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
-				ret, prefetch_activated,
-				qp->ibqp.qp_num, address, prefetch_len);
+			mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
+				     ret, pfault->token, address,
+				     prefetch_len);
 		}
 	}
 }
 
-void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
-			       struct mlx5_ib_pfault *pfault)
+void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
+		    struct mlx5_pagefault *pfault)
 {
-	u8 event_subtype = pfault->mpfault.event_subtype;
+	struct mlx5_ib_dev *dev = context;
+	u8 event_subtype = pfault->event_subtype;
 
 	switch (event_subtype) {
 	case MLX5_PFAULT_SUBTYPE_WQE:
-		mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
+		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
 		break;
 	case MLX5_PFAULT_SUBTYPE_RDMA:
-		mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
+		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
 		break;
 	default:
-		pr_warn("Invalid page fault event subtype: 0x%x\n",
-			event_subtype);
-		mlx5_ib_page_fault_resume(qp, pfault, 1);
-		break;
+		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
+			    event_subtype);
+		mlx5_ib_page_fault_resume(dev, pfault, 1);
 	}
 }
 
-static void mlx5_ib_qp_pfault_action(struct work_struct *work)
-{
-	struct mlx5_ib_pfault *pfault = container_of(work,
-						     struct mlx5_ib_pfault,
-						     work);
-	enum mlx5_ib_pagefault_context context =
-		mlx5_ib_get_pagefault_context(&pfault->mpfault);
-	struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
-					     pagefaults[context]);
-	mlx5_ib_mr_pfault_handler(qp, pfault);
-}
-
-void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
-	qp->disable_page_faults = 1;
-	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
-
-	/*
-	 * Note that at this point, we are guarenteed that no more
-	 * work queue elements will be posted to the work queue with
-	 * the QP we are closing.
-	 */
-	flush_workqueue(mlx5_ib_page_fault_wq);
-}
-
-void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
-	qp->disable_page_faults = 0;
-	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
-}
-
-static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
-				   struct mlx5_pagefault *pfault)
-{
-	/*
-	 * Note that we will only get one fault event per QP per context
-	 * (responder/initiator, read/write), until we resolve the page fault
-	 * with the mlx5_ib_page_fault_resume command. Since this function is
-	 * called from within the work element, there is no risk of missing
-	 * events.
-	 */
-	struct mlx5_ib_qp *mibqp = to_mibqp(qp);
-	enum mlx5_ib_pagefault_context context =
-		mlx5_ib_get_pagefault_context(pfault);
-	struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
-
-	qp_pfault->mpfault = *pfault;
-
-	/* No need to stop interrupts here since we are in an interrupt */
-	spin_lock(&mibqp->disable_page_faults_lock);
-	if (!mibqp->disable_page_faults)
-		queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
-	spin_unlock(&mibqp->disable_page_faults_lock);
-}
-
-void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
-{
-	int i;
-
-	qp->disable_page_faults = 1;
-	spin_lock_init(&qp->disable_page_faults_lock);
-
-	qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler;
-
-	for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
-		INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
-}
-
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
 {
 	int ret;
@@ -793,17 +727,3 @@ void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
 	cleanup_srcu_struct(&ibdev->mr_srcu);
 }
 
-int __init mlx5_ib_odp_init(void)
-{
-	mlx5_ib_page_fault_wq = alloc_ordered_workqueue("mlx5_ib_page_faults",
-							WQ_MEM_RECLAIM);
-	if (!mlx5_ib_page_fault_wq)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void mlx5_ib_odp_cleanup(void)
-{
-	destroy_workqueue(mlx5_ib_page_fault_wq);
-}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6d5f5d4..8562c90 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1526,9 +1526,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	       &qp->raw_packet_qp.rq.base :
 	       &qp->trans_qp.base;
 
-	if (init_attr->qp_type != IB_QPT_RAW_PACKET)
-		mlx5_ib_odp_create_qp(qp);
-
 	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->rq.lock);
@@ -1923,7 +1920,6 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 
 	if (qp->state != IB_QPS_RESET) {
 		if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) {
-			mlx5_ib_qp_disable_pagefaults(qp);
 			err = mlx5_core_qp_modify(dev->mdev,
 						  MLX5_CMD_OP_2RST_QP, 0,
 						  NULL, &base->mqp);
@@ -2823,16 +2819,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (mlx5_st < 0)
 		goto out;
 
-	/* If moving to a reset or error state, we must disable page faults on
-	 * this QP and flush all current page faults. Otherwise a stale page
-	 * fault may attempt to work on this QP after it is reset and moved
-	 * again to RTS, and may cause the driver and the device to get out of
-	 * sync. */
-	if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
-	    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) &&
-	    (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
-		mlx5_ib_qp_disable_pagefaults(qp);
-
 	if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE ||
 	    !optab[mlx5_cur][mlx5_new])
 		goto out;
@@ -2864,10 +2850,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (err)
 		goto out;
 
-	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT &&
-	    (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
-		mlx5_ib_qp_enable_pagefaults(qp);
-
 	qp->state = new_state;
 
 	if (attr_mask & IB_QP_ACCESS_FLAGS)
@@ -4533,14 +4515,6 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 		return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask,
 					    qp_init_attr);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	/*
-	 * Wait for any outstanding page faults, in case the user frees memory
-	 * based upon this query's result.
-	 */
-	flush_workqueue(mlx5_ib_page_fault_wq);
-#endif
-
 	mutex_lock(&qp->mutex);
 
 	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index a9dbc28..a62f4b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -71,6 +71,16 @@ void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
 	if (dev_ctx->context) {
 		spin_lock_irq(&priv->ctx_lock);
 		list_add_tail(&dev_ctx->list, &priv->ctx_list);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+		if (dev_ctx->intf->pfault) {
+			if (priv->pfault) {
+				mlx5_core_err(dev, "multiple page fault handlers not supported");
+			} else {
+				priv->pfault_ctx = dev_ctx->context;
+				priv->pfault = dev_ctx->intf->pfault;
+			}
+		}
+#endif
 		spin_unlock_irq(&priv->ctx_lock);
 	} else {
 		kfree(dev_ctx);
@@ -97,6 +107,15 @@ void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
 	if (!dev_ctx)
 		return;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	spin_lock_irq(&priv->ctx_lock);
+	if (priv->pfault == dev_ctx->intf->pfault)
+		priv->pfault = NULL;
+	spin_unlock_irq(&priv->ctx_lock);
+
+	synchronize_srcu(&priv->pfault_srcu);
+#endif
+
 	spin_lock_irq(&priv->ctx_lock);
 	list_del(&dev_ctx->list);
 	spin_unlock_irq(&priv->ctx_lock);
@@ -329,6 +348,20 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
 	spin_unlock_irqrestore(&priv->ctx_lock, flags);
 }
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+void mlx5_core_page_fault(struct mlx5_core_dev *dev,
+			  struct mlx5_pagefault *pfault)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&priv->pfault_srcu);
+	if (priv->pfault)
+		priv->pfault(dev, priv->pfault_ctx, pfault);
+	srcu_read_unlock(&priv->pfault_srcu, srcu_idx);
+}
+#endif
+
 void mlx5_dev_list_lock(void)
 {
 	mutex_lock(&mlx5_intf_mutex);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 8ffcc88..4aff8ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -54,6 +54,7 @@ enum {
 	MLX5_NUM_SPARE_EQE	= 0x80,
 	MLX5_NUM_ASYNC_EQE	= 0x100,
 	MLX5_NUM_CMD_EQE	= 32,
+	MLX5_NUM_PF_DRAIN	= 64,
 };
 
 enum {
@@ -188,10 +189,193 @@ static void eq_update_ci(struct mlx5_eq *eq, int arm)
 	mb();
 }
 
-static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+static void eqe_pf_action(struct work_struct *work)
+{
+	struct mlx5_pagefault *pfault = container_of(work,
+						     struct mlx5_pagefault,
+						     work);
+	struct mlx5_eq *eq = pfault->eq;
+
+	mlx5_core_page_fault(eq->dev, pfault);
+	mempool_free(pfault, eq->pf_ctx.pool);
+}
+
+static void eq_pf_process(struct mlx5_eq *eq)
+{
+	struct mlx5_core_dev *dev = eq->dev;
+	struct mlx5_eqe_page_fault *pf_eqe;
+	struct mlx5_pagefault *pfault;
+	struct mlx5_eqe *eqe;
+	int set_ci = 0;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		pfault = mempool_alloc(eq->pf_ctx.pool, GFP_ATOMIC);
+		if (!pfault) {
+			schedule_work(&eq->pf_ctx.work);
+			break;
+		}
+
+		dma_rmb();
+		pf_eqe = &eqe->data.page_fault;
+		pfault->event_subtype = eqe->sub_type;
+		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
+
+		mlx5_core_dbg(dev,
+			      "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
+			      eqe->sub_type, pfault->bytes_committed);
+
+		switch (eqe->sub_type) {
+		case MLX5_PFAULT_SUBTYPE_RDMA:
+			/* RDMA based event */
+			pfault->type =
+				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
+			pfault->token =
+				be32_to_cpu(pf_eqe->rdma.pftype_token) &
+				MLX5_24BIT_MASK;
+			pfault->rdma.r_key =
+				be32_to_cpu(pf_eqe->rdma.r_key);
+			pfault->rdma.packet_size =
+				be16_to_cpu(pf_eqe->rdma.packet_length);
+			pfault->rdma.rdma_op_len =
+				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
+			pfault->rdma.rdma_va =
+				be64_to_cpu(pf_eqe->rdma.rdma_va);
+			mlx5_core_dbg(dev,
+				      "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
+				      pfault->type, pfault->token,
+				      pfault->rdma.r_key);
+			mlx5_core_dbg(dev,
+				      "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
+				      pfault->rdma.rdma_op_len,
+				      pfault->rdma.rdma_va);
+			break;
+
+		case MLX5_PFAULT_SUBTYPE_WQE:
+			/* WQE based event */
+			pfault->type =
+				be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24;
+			pfault->token =
+				be32_to_cpu(pf_eqe->wqe.token);
+			pfault->wqe.wq_num =
+				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
+				MLX5_24BIT_MASK;
+			pfault->wqe.wqe_index =
+				be16_to_cpu(pf_eqe->wqe.wqe_index);
+			pfault->wqe.packet_size =
+				be16_to_cpu(pf_eqe->wqe.packet_length);
+			mlx5_core_dbg(dev,
+				      "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
+				      pfault->type, pfault->token,
+				      pfault->wqe.wq_num,
+				      pfault->wqe.wqe_index);
+			break;
+
+		default:
+			mlx5_core_warn(dev,
+				       "Unsupported page fault event sub-type: 0x%02hhx\n",
+				       eqe->sub_type);
+			/* Unsupported page faults should still be
+			 * resolved by the page fault handler
+			 */
+		}
+
+		pfault->eq = eq;
+		INIT_WORK(&pfault->work, eqe_pf_action);
+		queue_work(eq->pf_ctx.wq, &pfault->work);
+
+		++eq->cons_index;
+		++set_ci;
+
+		if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) {
+			eq_update_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_update_ci(eq, 1);
+}
+
+static irqreturn_t mlx5_eq_pf_int(int irq, void *eq_ptr)
+{
+	struct mlx5_eq *eq = eq_ptr;
+	unsigned long flags;
+
+	if (spin_trylock_irqsave(&eq->pf_ctx.lock, flags)) {
+		eq_pf_process(eq);
+		spin_unlock_irqrestore(&eq->pf_ctx.lock, flags);
+	} else {
+		schedule_work(&eq->pf_ctx.work);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* mempool_refill() was proposed but unfortunately wasn't accepted
+ * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
+ * Chip workaround.
+ */
+static void mempool_refill(mempool_t *pool)
+{
+	while (pool->curr_nr < pool->min_nr)
+		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
+}
+
+static void eq_pf_action(struct work_struct *work)
+{
+	struct mlx5_eq *eq = container_of(work, struct mlx5_eq, pf_ctx.work);
+
+	mempool_refill(eq->pf_ctx.pool);
+
+	spin_lock_irq(&eq->pf_ctx.lock);
+	eq_pf_process(eq);
+	spin_unlock_irq(&eq->pf_ctx.lock);
+}
+
+static int init_pf_ctx(struct mlx5_eq_pagefault *pf_ctx, const char *name)
 {
+	spin_lock_init(&pf_ctx->lock);
+	INIT_WORK(&pf_ctx->work, eq_pf_action);
+
+	pf_ctx->wq = alloc_ordered_workqueue(name,
+					     WQ_MEM_RECLAIM);
+	if (!pf_ctx->wq)
+		return -ENOMEM;
+
+	pf_ctx->pool = mempool_create_kmalloc_pool
+		(MLX5_NUM_PF_DRAIN, sizeof(struct mlx5_pagefault));
+	if (!pf_ctx->pool)
+		goto err_wq;
+
+	return 0;
+err_wq:
+	destroy_workqueue(pf_ctx->wq);
+	return -ENOMEM;
+}
+
+int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
+				u32 wq_num, u8 type, int error)
+{
+	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)]   = {0};
+
+	MLX5_SET(page_fault_resume_in, in, opcode,
+		 MLX5_CMD_OP_PAGE_FAULT_RESUME);
+	MLX5_SET(page_fault_resume_in, in, error, !!error);
+	MLX5_SET(page_fault_resume_in, in, page_fault_type, type);
+	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
+	MLX5_SET(page_fault_resume_in, in, token, token);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
+#endif
+
+static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
+{
+	struct mlx5_eq *eq = eq_ptr;
+	struct mlx5_core_dev *dev = eq->dev;
 	struct mlx5_eqe *eqe;
-	int eqes_found = 0;
 	int set_ci = 0;
 	u32 cqn = -1;
 	u32 rsn;
@@ -276,12 +460,6 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 			}
 			break;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-		case MLX5_EVENT_TYPE_PAGE_FAULT:
-			mlx5_eq_pagefault(dev, eqe);
-			break;
-#endif
-
 #ifdef CONFIG_MLX5_CORE_EN
 		case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE:
 			mlx5_eswitch_vport_event(dev->priv.eswitch, eqe);
@@ -299,7 +477,6 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 		}
 
 		++eq->cons_index;
-		eqes_found = 1;
 		++set_ci;
 
 		/* The HCA will think the queue has overflowed if we
@@ -319,17 +496,6 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 	if (cqn != -1)
 		tasklet_schedule(&eq->tasklet_ctx.task);
 
-	return eqes_found;
-}
-
-static irqreturn_t mlx5_msix_handler(int irq, void *eq_ptr)
-{
-	struct mlx5_eq *eq = eq_ptr;
-	struct mlx5_core_dev *dev = eq->dev;
-
-	mlx5_eq_int(dev, eq);
-
-	/* MSI-X vectors always belong to us */
 	return IRQ_HANDLED;
 }
 
@@ -345,22 +511,32 @@ static void init_eq_buf(struct mlx5_eq *eq)
 }
 
 int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
-		       int nent, u64 mask, const char *name, struct mlx5_uar *uar)
+		       int nent, u64 mask, const char *name,
+		       struct mlx5_uar *uar, enum mlx5_eq_type type)
 {
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
 	struct mlx5_priv *priv = &dev->priv;
+	irq_handler_t handler;
 	__be64 *pas;
 	void *eqc;
 	int inlen;
 	u32 *in;
 	int err;
 
+	eq->type = type;
 	eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
 	eq->cons_index = 0;
 	err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
 	if (err)
 		return err;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (type == MLX5_EQ_TYPE_PF)
+		handler = mlx5_eq_pf_int;
+	else
+#endif
+		handler = mlx5_eq_int;
+
 	init_eq_buf(eq);
 
 	inlen = MLX5_ST_SZ_BYTES(create_eq_in) +
@@ -396,7 +572,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 	eq->irqn = priv->msix_arr[vecidx].vector;
 	eq->dev = dev;
 	eq->doorbell = uar->map + MLX5_EQ_DOORBEL_OFFSET;
-	err = request_irq(eq->irqn, mlx5_msix_handler, 0,
+	err = request_irq(eq->irqn, handler, 0,
 			  priv->irq_info[vecidx].name, eq);
 	if (err)
 		goto err_eq;
@@ -405,11 +581,20 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 	if (err)
 		goto err_irq;
 
-	INIT_LIST_HEAD(&eq->tasklet_ctx.list);
-	INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
-	spin_lock_init(&eq->tasklet_ctx.lock);
-	tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb,
-		     (unsigned long)&eq->tasklet_ctx);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (type == MLX5_EQ_TYPE_PF) {
+		err = init_pf_ctx(&eq->pf_ctx, name);
+		if (err)
+			goto err_irq;
+	} else
+#endif
+	{
+		INIT_LIST_HEAD(&eq->tasklet_ctx.list);
+		INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
+		spin_lock_init(&eq->tasklet_ctx.lock);
+		tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb,
+			     (unsigned long)&eq->tasklet_ctx);
+	}
 
 	/* EQs are created in ARMED state
 	 */
@@ -444,7 +629,16 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
 			       eq->eqn);
 	synchronize_irq(eq->irqn);
-	tasklet_disable(&eq->tasklet_ctx.task);
+
+	if (eq->type == MLX5_EQ_TYPE_COMP) {
+		tasklet_disable(&eq->tasklet_ctx.task);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	} else if (eq->type == MLX5_EQ_TYPE_PF) {
+		cancel_work_sync(&eq->pf_ctx.work);
+		destroy_workqueue(eq->pf_ctx.wq);
+		mempool_destroy(eq->pf_ctx.pool);
+#endif
+	}
 	mlx5_buf_free(dev, &eq->buf);
 
 	return err;
@@ -479,8 +673,6 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 	u64 async_event_mask = MLX5_ASYNC_EVENT_MASK;
 	int err;
 
-	if (MLX5_CAP_GEN(dev, pg))
-		async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT);
 
 	if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH &&
 	    MLX5_CAP_GEN(dev, vport_group_manager) &&
@@ -494,7 +686,8 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
-				 "mlx5_cmd_eq", &dev->priv.uuari.uars[0]);
+				 "mlx5_cmd_eq", &dev->priv.uuari.uars[0],
+				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
 		return err;
@@ -504,7 +697,8 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 
 	err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
 				 MLX5_NUM_ASYNC_EQE, async_event_mask,
-				 "mlx5_async_eq", &dev->priv.uuari.uars[0]);
+				 "mlx5_async_eq", &dev->priv.uuari.uars[0],
+				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
 		goto err1;
@@ -514,13 +708,35 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 				 MLX5_EQ_VEC_PAGES,
 				 /* TODO: sriov max_vf + */ 1,
 				 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq",
-				 &dev->priv.uuari.uars[0]);
+				 &dev->priv.uuari.uars[0],
+				 MLX5_EQ_TYPE_ASYNC);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
 		goto err2;
 	}
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (MLX5_CAP_GEN(dev, pg)) {
+		err = mlx5_create_map_eq(dev, &table->pfault_eq,
+					 MLX5_EQ_VEC_PFAULT,
+					 MLX5_NUM_ASYNC_EQE,
+					 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
+					 "mlx5_page_fault_eq",
+					 &dev->priv.uuari.uars[0],
+					 MLX5_EQ_TYPE_PF);
+		if (err) {
+			mlx5_core_warn(dev, "failed to create page fault EQ %d\n",
+				       err);
+			goto err3;
+		}
+	}
+
 	return err;
+err3:
+	mlx5_destroy_unmap_eq(dev, &table->pages_eq);
+#else
+	return err;
+#endif
 
 err2:
 	mlx5_destroy_unmap_eq(dev, &table->async_eq);
@@ -536,6 +752,14 @@ int mlx5_stop_eqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_table *table = &dev->priv.eq_table;
 	int err;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (MLX5_CAP_GEN(dev, pg)) {
+		err = mlx5_destroy_unmap_eq(dev, &table->pfault_eq);
+		if (err)
+			return err;
+	}
+#endif
+
 	err = mlx5_destroy_unmap_eq(dev, &table->pages_eq);
 	if (err)
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 1713bd8..f411513 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -753,7 +753,8 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
 		err = mlx5_create_map_eq(dev, eq,
 					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
-					 name, &dev->priv.uuari.uars[0]);
+					 name, &dev->priv.uuari.uars[0],
+					 MLX5_EQ_TYPE_COMP);
 		if (err) {
 			kfree(eq);
 			goto clean;
@@ -1295,10 +1296,19 @@ static int init_one(struct pci_dev *pdev,
 	spin_lock_init(&priv->ctx_lock);
 	mutex_init(&dev->pci_status_mutex);
 	mutex_init(&dev->intf_state_mutex);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	err = init_srcu_struct(&priv->pfault_srcu);
+	if (err) {
+		dev_err(&pdev->dev, "init_srcu_struct failed with error code %d\n",
+			err);
+		goto clean_dev;
+	}
+#endif
 	err = mlx5_pci_init(dev, priv);
 	if (err) {
 		dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
-		goto clean_dev;
+		goto clean_srcu;
 	}
 
 	err = mlx5_health_init(dev);
@@ -1332,7 +1342,11 @@ static int init_one(struct pci_dev *pdev,
 	mlx5_health_cleanup(dev);
 close_pci:
 	mlx5_pci_close(dev, priv);
+clean_srcu:
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	cleanup_srcu_struct(&priv->pfault_srcu);
 clean_dev:
+#endif
 	pci_set_drvdata(pdev, NULL);
 	devlink_free(devlink);
 
@@ -1357,6 +1371,9 @@ static void remove_one(struct pci_dev *pdev)
 	mlx5_pagealloc_cleanup(dev);
 	mlx5_health_cleanup(dev);
 	mlx5_pci_close(dev, priv);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	cleanup_srcu_struct(&priv->pfault_srcu);
+#endif
 	pci_set_drvdata(pdev, NULL);
 	devlink_free(devlink);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index d4a99c9..74241e8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -86,6 +86,8 @@ int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
 void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
 		     unsigned long param);
+void mlx5_core_page_fault(struct mlx5_core_dev *dev,
+			  struct mlx5_pagefault *pfault);
 void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
 void mlx5_enter_error_state(struct mlx5_core_dev *dev);
 void mlx5_disable_device(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index 5378a5f..cbbcef2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -143,95 +143,6 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
 	mlx5_core_put_rsc(common);
 }
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
-{
-	struct mlx5_eqe_page_fault *pf_eqe = &eqe->data.page_fault;
-	int qpn = be32_to_cpu(pf_eqe->flags_qpn) & MLX5_QPN_MASK;
-	struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, qpn);
-	struct mlx5_core_qp *qp =
-		container_of(common, struct mlx5_core_qp, common);
-	struct mlx5_pagefault pfault;
-
-	if (!qp) {
-		mlx5_core_warn(dev, "ODP event for non-existent QP %06x\n",
-			       qpn);
-		return;
-	}
-
-	pfault.event_subtype = eqe->sub_type;
-	pfault.flags = (be32_to_cpu(pf_eqe->flags_qpn) >> MLX5_QPN_BITS) &
-		(MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA);
-	pfault.bytes_committed = be32_to_cpu(
-		pf_eqe->bytes_committed);
-
-	mlx5_core_dbg(dev,
-		      "PAGE_FAULT: subtype: 0x%02x, flags: 0x%02x,\n",
-		      eqe->sub_type, pfault.flags);
-
-	switch (eqe->sub_type) {
-	case MLX5_PFAULT_SUBTYPE_RDMA:
-		/* RDMA based event */
-		pfault.rdma.r_key =
-			be32_to_cpu(pf_eqe->rdma.r_key);
-		pfault.rdma.packet_size =
-			be16_to_cpu(pf_eqe->rdma.packet_length);
-		pfault.rdma.rdma_op_len =
-			be32_to_cpu(pf_eqe->rdma.rdma_op_len);
-		pfault.rdma.rdma_va =
-			be64_to_cpu(pf_eqe->rdma.rdma_va);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: qpn: 0x%06x, r_key: 0x%08x,\n",
-			      qpn, pfault.rdma.r_key);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: rdma_op_len: 0x%08x,\n",
-			      pfault.rdma.rdma_op_len);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: rdma_va: 0x%016llx,\n",
-			      pfault.rdma.rdma_va);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: bytes_committed: 0x%06x\n",
-			      pfault.bytes_committed);
-		break;
-
-	case MLX5_PFAULT_SUBTYPE_WQE:
-		/* WQE based event */
-		pfault.wqe.wqe_index =
-			be16_to_cpu(pf_eqe->wqe.wqe_index);
-		pfault.wqe.packet_size =
-			be16_to_cpu(pf_eqe->wqe.packet_length);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: qpn: 0x%06x, wqe_index: 0x%04x,\n",
-			      qpn, pfault.wqe.wqe_index);
-		mlx5_core_dbg(dev,
-			      "PAGE_FAULT: bytes_committed: 0x%06x\n",
-			      pfault.bytes_committed);
-		break;
-
-	default:
-		mlx5_core_warn(dev,
-			       "Unsupported page fault event sub-type: 0x%02hhx, QP %06x\n",
-			       eqe->sub_type, qpn);
-		/* Unsupported page faults should still be resolved by the
-		 * page fault handler
-		 */
-	}
-
-	if (qp->pfault_handler) {
-		qp->pfault_handler(qp, &pfault);
-	} else {
-		mlx5_core_err(dev,
-			      "ODP event for QP %08x, without a fault handler in QP\n",
-			      qpn);
-		/* Page fault will remain unresolved. QP will hang until it is
-		 * destroyed
-		 */
-	}
-
-	mlx5_core_put_rsc(common);
-}
-#endif
-
 static int create_qprqsq_common(struct mlx5_core_dev *dev,
 				struct mlx5_core_qp *qp,
 				int rsc_type)
@@ -506,25 +417,6 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn)
 }
 EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
-				u8 flags, int error)
-{
-	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {0};
-	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)]   = {0};
-
-	MLX5_SET(page_fault_resume_in, in, opcode,
-		 MLX5_CMD_OP_PAGE_FAULT_RESUME);
-	MLX5_SET(page_fault_resume_in, in, wq_number, qpn);
-	MLX5_SET(page_fault_resume_in, in, page_fault_type, flags);
-	if (error)
-		MLX5_SET(page_fault_resume_in, in, error, 1);
-
-	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
-}
-EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
-#endif
-
 int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 				struct mlx5_core_qp *rq)
 {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 9f48936..3ccaeff 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -534,7 +534,9 @@ struct mlx5_eqe_page_fault {
 			__be16  wqe_index;
 			u16	reserved2;
 			__be16  packet_length;
-			u8	reserved3[12];
+			__be32  token;
+			u8	reserved4[8];
+			__be32  pftype_wq;
 		} __packed wqe;
 		struct {
 			__be32  r_key;
@@ -542,9 +544,9 @@ struct mlx5_eqe_page_fault {
 			__be16  packet_length;
 			__be32  rdma_op_len;
 			__be64  rdma_va;
+			__be32  pftype_token;
 		} __packed rdma;
 	} __packed;
-	__be32 flags_qpn;
 } __packed;
 
 struct mlx5_eqe_vport_change {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ec52f3b..b52d074 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -42,6 +42,7 @@
 #include <linux/vmalloc.h>
 #include <linux/radix-tree.h>
 #include <linux/workqueue.h>
+#include <linux/mempool.h>
 #include <linux/interrupt.h>
 
 #include <linux/mlx5/device.h>
@@ -83,6 +84,7 @@ enum {
 	MLX5_EQ_VEC_PAGES	 = 0,
 	MLX5_EQ_VEC_CMD		 = 1,
 	MLX5_EQ_VEC_ASYNC	 = 2,
+	MLX5_EQ_VEC_PFAULT	 = 3,
 	MLX5_EQ_VEC_COMP_BASE,
 };
 
@@ -178,6 +180,14 @@ enum mlx5_port_status {
 	MLX5_PORT_DOWN      = 2,
 };
 
+enum mlx5_eq_type {
+	MLX5_EQ_TYPE_COMP,
+	MLX5_EQ_TYPE_ASYNC,
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	MLX5_EQ_TYPE_PF,
+#endif
+};
+
 struct mlx5_uuar_info {
 	struct mlx5_uar	       *uars;
 	int			num_uars;
@@ -333,6 +343,14 @@ struct mlx5_eq_tasklet {
 	spinlock_t lock;
 };
 
+struct mlx5_eq_pagefault {
+	struct work_struct       work;
+	/* Pagefaults lock */
+	spinlock_t		 lock;
+	struct workqueue_struct *wq;
+	mempool_t		*pool;
+};
+
 struct mlx5_eq {
 	struct mlx5_core_dev   *dev;
 	__be32 __iomem	       *doorbell;
@@ -346,7 +364,13 @@ struct mlx5_eq {
 	struct list_head	list;
 	int			index;
 	struct mlx5_rsc_debug	*dbg;
-	struct mlx5_eq_tasklet	tasklet_ctx;
+	enum mlx5_eq_type	type;
+	union {
+		struct mlx5_eq_tasklet   tasklet_ctx;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+		struct mlx5_eq_pagefault pf_ctx;
+#endif
+	};
 };
 
 struct mlx5_core_psv {
@@ -377,6 +401,8 @@ struct mlx5_core_mkey {
 	u32			pd;
 };
 
+#define MLX5_24BIT_MASK		((1 << 24) - 1)
+
 enum mlx5_res_type {
 	MLX5_RES_QP	= MLX5_EVENT_QUEUE_TYPE_QP,
 	MLX5_RES_RQ	= MLX5_EVENT_QUEUE_TYPE_RQ,
@@ -411,6 +437,9 @@ struct mlx5_eq_table {
 	struct mlx5_eq		pages_eq;
 	struct mlx5_eq		async_eq;
 	struct mlx5_eq		cmd_eq;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	struct mlx5_eq		pfault_eq;
+#endif
 	int			num_comp_vectors;
 	/* protect EQs list
 	 */
@@ -497,6 +526,7 @@ struct mlx5_fc_stats {
 
 struct mlx5_eswitch;
 struct mlx5_lag;
+struct mlx5_pagefault;
 
 struct mlx5_rl_entry {
 	u32                     rate;
@@ -601,6 +631,14 @@ struct mlx5_priv {
 	struct mlx5_rl_table            rl_table;
 
 	struct mlx5_port_module_event_stats  pme_stats;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	void		      (*pfault)(struct mlx5_core_dev *dev,
+					void *context,
+					struct mlx5_pagefault *pfault);
+	void		       *pfault_ctx;
+	struct srcu_struct      pfault_srcu;
+#endif
 };
 
 enum mlx5_device_state {
@@ -619,6 +657,50 @@ enum mlx5_pci_status {
 	MLX5_PCI_STATUS_ENABLED,
 };
 
+enum mlx5_pagefault_type_flags {
+	MLX5_PFAULT_REQUESTOR = 1 << 0,
+	MLX5_PFAULT_WRITE     = 1 << 1,
+	MLX5_PFAULT_RDMA      = 1 << 2,
+};
+
+/* Contains the details of a pagefault. */
+struct mlx5_pagefault {
+	u32			bytes_committed;
+	u32			token;
+	u8			event_subtype;
+	u8			type;
+	union {
+		/* Initiator or send message responder pagefault details. */
+		struct {
+			/* Received packet size, only valid for responders. */
+			u32	packet_size;
+			/*
+			 * Number of resource holding WQE, depends on type.
+			 */
+			u32	wq_num;
+			/*
+			 * WQE index. Refers to either the send queue or
+			 * receive queue, according to event_subtype.
+			 */
+			u16	wqe_index;
+		} wqe;
+		/* RDMA responder pagefault details */
+		struct {
+			u32	r_key;
+			/*
+			 * Received packet size, minimal size page fault
+			 * resolution required for forward progress.
+			 */
+			u32	packet_size;
+			u32	rdma_op_len;
+			u64	rdma_va;
+		} rdma;
+	};
+
+	struct mlx5_eq	       *eq;
+	struct work_struct	work;
+};
+
 struct mlx5_td {
 	struct list_head tirs_list;
 	u32              tdn;
@@ -879,15 +961,13 @@ void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
 void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas);
 void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
 void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
-#endif
 void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type);
 struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn);
 void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec);
 void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type);
 int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
-		       int nent, u64 mask, const char *name, struct mlx5_uar *uar);
+		       int nent, u64 mask, const char *name,
+		       struct mlx5_uar *uar, enum mlx5_eq_type type);
 int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 int mlx5_start_eqs(struct mlx5_core_dev *dev);
 int mlx5_stop_eqs(struct mlx5_core_dev *dev);
@@ -926,6 +1006,10 @@ int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
 			struct mlx5_odp_caps *odp_caps);
 int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev,
 			     u8 port_num, void *out, size_t sz);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
+				u32 wq_num, u8 type, int error);
+#endif
 
 int mlx5_init_rl_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
@@ -974,6 +1058,9 @@ struct mlx5_interface {
 	void			(*detach)(struct mlx5_core_dev *dev, void *context);
 	void			(*event)(struct mlx5_core_dev *dev, void *context,
 					 enum mlx5_dev_event event, unsigned long param);
+	void			(*pfault)(struct mlx5_core_dev *dev,
+					  void *context,
+					  struct mlx5_pagefault *pfault);
 	void *                  (*get_dev)(void *context);
 	int			protocol;
 	struct list_head	list;
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 693811e..9ed775f 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -50,9 +50,6 @@
 #define MLX5_BSF_APPTAG_ESCAPE	0x1
 #define MLX5_BSF_APPREF_ESCAPE	0x2
 
-#define MLX5_QPN_BITS		24
-#define MLX5_QPN_MASK		((1 << MLX5_QPN_BITS) - 1)
-
 enum mlx5_qp_optpar {
 	MLX5_QP_OPTPAR_ALT_ADDR_PATH		= 1 << 0,
 	MLX5_QP_OPTPAR_RRE			= 1 << 1,
@@ -418,46 +415,9 @@ struct mlx5_stride_block_ctrl_seg {
 	__be16		num_entries;
 };
 
-enum mlx5_pagefault_flags {
-	MLX5_PFAULT_REQUESTOR = 1 << 0,
-	MLX5_PFAULT_WRITE     = 1 << 1,
-	MLX5_PFAULT_RDMA      = 1 << 2,
-};
-
-/* Contains the details of a pagefault. */
-struct mlx5_pagefault {
-	u32			bytes_committed;
-	u8			event_subtype;
-	enum mlx5_pagefault_flags flags;
-	union {
-		/* Initiator or send message responder pagefault details. */
-		struct {
-			/* Received packet size, only valid for responders. */
-			u32	packet_size;
-			/*
-			 * WQE index. Refers to either the send queue or
-			 * receive queue, according to event_subtype.
-			 */
-			u16	wqe_index;
-		} wqe;
-		/* RDMA responder pagefault details */
-		struct {
-			u32	r_key;
-			/*
-			 * Received packet size, minimal size page fault
-			 * resolution required for forward progress.
-			 */
-			u32	packet_size;
-			u32	rdma_op_len;
-			u64	rdma_va;
-		} rdma;
-	};
-};
-
 struct mlx5_core_qp {
 	struct mlx5_core_rsc_common	common; /* must be first */
 	void (*event)		(struct mlx5_core_qp *, int);
-	void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *);
 	int			qpn;
 	struct mlx5_rsc_debug	*dbg;
 	int			pid;
@@ -557,10 +517,6 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev);
 int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
 void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
-				u8 context, int error);
-#endif
 int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
 				struct mlx5_core_qp *rq);
 void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev,
-- 
2.7.4

^ permalink raw reply related

* [PATCH for-next 06/11] IB/mlx5: Add support for big MRs
From: Saeed Mahameed @ 2017-01-01 21:10 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Ilya Lesokhin,
	Artemy Kovalyov, Leon Romanovsky, Saeed Mahameed
In-Reply-To: <1483305027-2573-1-git-send-email-saeedm@mellanox.com>

From: Artemy Kovalyov <artemyko@mellanox.com>

Make use of extended UMR translation offset.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 +
 drivers/infiniband/hw/mlx5/mr.c      | 8 +++++---
 drivers/infiniband/hw/mlx5/odp.c     | 5 +++++
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d79580d..73bff77 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -634,6 +634,7 @@ struct mlx5_ib_dev {
 	int				fill_delay;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	struct ib_odp_caps	odp_caps;
+	u64			odp_max_size;
 	/*
 	 * Sleepable RCU that prevents destruction of MRs while they are still
 	 * being used by a page fault handler.
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 738ba13..271b78e 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1121,8 +1121,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 		goto err_1;
 	}
 	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
-	mlx5_ib_populate_pas(dev, umem, page_shift, pas,
-			     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
+	if (!(access_flags & IB_ACCESS_ON_DEMAND))
+		mlx5_ib_populate_pas(dev, umem, page_shift, pas,
+				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
 
 	/* The pg_access bit allows setting the access flags
 	 * in the page list submitted with the command. */
@@ -1210,7 +1211,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 			mlx5_ib_dbg(dev, "cache empty for order %d", order);
 			mr = NULL;
 		}
-	} else if (access_flags & IB_ACCESS_ON_DEMAND) {
+	} else if (access_flags & IB_ACCESS_ON_DEMAND &&
+		   !MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) {
 		err = -EINVAL;
 		pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
 		goto error;
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 67651ec..1e73c12 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -121,6 +121,11 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 
 	caps->general_caps = IB_ODP_SUPPORT;
 
+	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+		dev->odp_max_size = U64_MAX;
+	else
+		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);
+
 	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
 		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH for-next 04/11] net/mlx5: Support new MR features
From: Saeed Mahameed @ 2017-01-01 21:10 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Ilya Lesokhin,
	Artemy Kovalyov, Leon Romanovsky, Saeed Mahameed
In-Reply-To: <1483305027-2573-1-git-send-email-saeedm@mellanox.com>

From: Artemy Kovalyov <artemyko@mellanox.com>

This patch adds the following items to IFC file.

1. MLX5_MKC_ACCESS_MODE_KSM enum value for creating KSM memory keys.
KSM access mode used when indirect MKey associated with fixed memory
size entries.

2. null_mkey field that is used to indicate non-present KLM/KSM
entries, where it causes the device to generate page fault event
when trying to access it.

3. struct mlx5_ifc_cmd_hca_cap_bits capability bits indicating
related value/field is supported:
* fixed_buffer_size - MLX5_MKC_ACCESS_MODE_KSM
* umr_extended_translation_offset - translation_offset_42_16
    in UMR ctrl segment
* null_mkey - null_mkey in QUERY_SPECIAL_CONTEXTS

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4792c85..7c760e5 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -782,11 +782,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_eq[0x4];
 
 	u8         max_indirection[0x8];
-	u8         reserved_at_108[0x1];
+	u8         fixed_buffer_size[0x1];
 	u8         log_max_mrw_sz[0x7];
 	u8         reserved_at_110[0x2];
 	u8         log_max_bsf_list_size[0x6];
-	u8         reserved_at_118[0x2];
+	u8         umr_extended_translation_offset[0x1];
+	u8         null_mkey[0x1];
 	u8         log_max_klm_list_size[0x6];
 
 	u8         reserved_at_120[0xa];
@@ -2569,6 +2570,7 @@ enum {
 	MLX5_MKC_ACCESS_MODE_PA    = 0x0,
 	MLX5_MKC_ACCESS_MODE_MTT   = 0x1,
 	MLX5_MKC_ACCESS_MODE_KLMS  = 0x2,
+	MLX5_MKC_ACCESS_MODE_KSM   = 0x3,
 };
 
 struct mlx5_ifc_mkc_bits {
@@ -3677,6 +3679,10 @@ struct mlx5_ifc_query_special_contexts_out_bits {
 	u8         dump_fill_mkey[0x20];
 
 	u8         resd_lkey[0x20];
+
+	u8         null_mkey[0x20];
+
+	u8         reserved_at_a0[0x60];
 };
 
 struct mlx5_ifc_query_special_contexts_in_bits {
-- 
2.7.4

^ permalink raw reply related

* [PATCH for-next 00/11] Mellanox mlx5 core and ODP updates 2017-01-01
From: Saeed Mahameed @ 2017-01-01 21:10 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Ilya Lesokhin,
	Artemy Kovalyov, Saeed Mahameed

Hi Dave and Doug,

The following eleven patches mainly come from Artemy Kovalyov
who expanded mlx5 on-demand-paging (ODP) support. In addition
there are three cleanup patches which don't change any functionality,
but are needed to align codebase prior accepting other patches.

Memory region (MR) in IB can be huge and ODP (on-demand paging)
technique allows to use unpinned memory, which can be consumed and
released on demand. This allows to applications do not pin down
the underlying physical pages of the address space, and save from them
need to track the validity of the mappings.

Rather, the HCA requests the latest translations from the OS when pages
are not present, and the OS invalidates translations which are no longer
valid due to either non-present pages or mapping changes.

In existing ODP implementation applications is needed to register
memory buffers for communication, though registered memory regions
need not have valid mappings at registration time.

This patch set performs the following steps to expand
current ODP implementation:

1. It refactors UMR to support large regions, by introducing generic
   function to perform HCA translation table modifications. This
   function supports both atomic and process contexts and is not limited
   by number of modified entries.

   This function allows to enable reallocated memory regions of
   arbitrary size, so adding MR cache buckets to support up to 16GB MRs.

2. It changes page fault event format and refactor page faults logic
   together with addition of atomic support.

3. It prepares mlx5 core code to support implicit registration with
   simplified and relaxed semantics.

   Implicit ODP semantics allows to applications provide special memory
   key that represents their complete address space. Thus all IO accesses
   referencing to this key (with proper access rights associated with the key)
   wouldn't need not register any virtual address range.

Thanks,
        Artemy, Ilya and Leon

The following changes since commit 7ce7d89f48834cefece7804d38fc5d85382edf77
    Linux 4.10-rc1

are available in the git repository at:
    git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git tags/mlx5-odp-for-4.11

for you to fetch changes up to 4ca4c0713ca3097f1be94355d9a36bd1fc7243a2
    IB/mlx5: Improve MR check

Regards,
Saeed.

Artemy Kovalyov (8):
  net/mlx5: Support new MR features
  IB/mlx5: Refactor UMR post send format
  IB/mlx5: Add support for big MRs
  IB/mlx5: Add MR cache for large UMR regions
  net/mlx5: Update PAGE_FAULT_RESUME layout
  {net,IB}/mlx5: Refactor page fault handling
  IB/mlx5: Add ODP atomics support
  IB/mlx5: Improve MR check

Binoy Jayan (1):
  IB/mlx5: Add helper mlx5_ib_post_send_wait

Leon Romanovsky (1):
  IB/mlx5: Reorder code in query device command

Max Gurtovoy (1):
  net/mlx5: Fix offset naming for reserved fields in hca_cap_bits

 drivers/infiniband/hw/mlx5/main.c                  |  50 +-
 drivers/infiniband/hw/mlx5/mem.c                   |  32 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |  89 ++--
 drivers/infiniband/hw/mlx5/mr.c                    | 518 ++++++++-------------
 drivers/infiniband/hw/mlx5/odp.c                   | 424 ++++++++---------
 drivers/infiniband/hw/mlx5/qp.c                    | 154 ++----
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      |  33 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       | 290 ++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  41 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/qp.c       | 114 -----
 include/linux/mlx5/device.h                        |   6 +-
 include/linux/mlx5/driver.h                        | 105 ++++-
 include/linux/mlx5/mlx5_ifc.h                      |  31 +-
 include/linux/mlx5/qp.h                            |  76 ++-
 16 files changed, 1000 insertions(+), 967 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH for-next 08/11] net/mlx5: Update PAGE_FAULT_RESUME layout
From: Saeed Mahameed @ 2017-01-01 21:10 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Ilya Lesokhin,
	Artemy Kovalyov, Leon Romanovsky, Saeed Mahameed
In-Reply-To: <1483305027-2573-1-git-send-email-saeedm@mellanox.com>

From: Artemy Kovalyov <artemyko@mellanox.com>

Update PAGE_FAULT_RESUME command layout.

Three bit fields describing page fault: rdma, rdma_write, req_res gave 8
possible combinations, while only a few were legal. Now they
are interpreted as three-bit type field, where former legal
combinations turns into corresponding types and unused were added as new
types.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 10 ++--------
 include/linux/mlx5/mlx5_ifc.h                |  9 ++++-----
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index d0a4005..5378a5f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -515,14 +515,8 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
 
 	MLX5_SET(page_fault_resume_in, in, opcode,
 		 MLX5_CMD_OP_PAGE_FAULT_RESUME);
-	MLX5_SET(page_fault_resume_in, in, qpn, qpn);
-
-	if (flags & MLX5_PAGE_FAULT_RESUME_REQUESTOR)
-		MLX5_SET(page_fault_resume_in, in, req_res, 1);
-	if (flags & MLX5_PAGE_FAULT_RESUME_WRITE)
-		MLX5_SET(page_fault_resume_in, in, read_write, 1);
-	if (flags & MLX5_PAGE_FAULT_RESUME_RDMA)
-		MLX5_SET(page_fault_resume_in, in, rdma, 1);
+	MLX5_SET(page_fault_resume_in, in, wq_number, qpn);
+	MLX5_SET(page_fault_resume_in, in, page_fault_type, flags);
 	if (error)
 		MLX5_SET(page_fault_resume_in, in, error, 1);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 7c760e5..608dc98 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4775,12 +4775,11 @@ struct mlx5_ifc_page_fault_resume_in_bits {
 
 	u8         error[0x1];
 	u8         reserved_at_41[0x4];
-	u8         rdma[0x1];
-	u8         read_write[0x1];
-	u8         req_res[0x1];
-	u8         qpn[0x18];
+	u8         page_fault_type[0x3];
+	u8         wq_number[0x18];
 
-	u8         reserved_at_60[0x20];
+	u8         reserved_at_60[0x8];
+	u8         token[0x18];
 };
 
 struct mlx5_ifc_nop_out_bits {
-- 
2.7.4

^ permalink raw reply related

* [PATCH for-next 05/11] IB/mlx5: Refactor UMR post send format
From: Saeed Mahameed @ 2017-01-01 21:10 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Ilya Lesokhin,
	Artemy Kovalyov, Leon Romanovsky, Saeed Mahameed
In-Reply-To: <1483305027-2573-1-git-send-email-saeedm@mellanox.com>

From: Artemy Kovalyov <artemyko@mellanox.com>

* Update struct mlx5_wqe_umr_ctrl_seg.
* Currenlty UMR send_flags aim only certain use cases: enabled/disable
  cached MR, modifying XLT for ODP. By making flags independent make UMR
  more flexible allowing arbitrary manipulations.
* Since different UMR formats have different entry sizes UMR request
  should receive exact size of translation table update instead of
  number of entries. Rename field npages to xlt_size in struct mlx5_umr_wr
  and update relevant code accordingly.
* Add support of length64 bit.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h              |  24 ++--
 drivers/infiniband/hw/mlx5/mr.c                   |  50 +++++----
 drivers/infiniband/hw/mlx5/odp.c                  |   3 +-
 drivers/infiniband/hw/mlx5/qp.c                   | 128 +++++++++-------------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   2 +-
 include/linux/mlx5/qp.h                           |  14 ++-
 6 files changed, 103 insertions(+), 118 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 6c6057e..d79580d 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -174,13 +174,12 @@ struct mlx5_ib_flow_db {
  * enum ib_send_flags and enum ib_qp_type for low-level driver
  */
 
-#define MLX5_IB_SEND_UMR_UNREG	IB_SEND_RESERVED_START
-#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1)
-#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2)
-
-#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION	(IB_SEND_RESERVED_START << 3)
-#define MLX5_IB_SEND_UMR_UPDATE_PD		(IB_SEND_RESERVED_START << 4)
-#define MLX5_IB_SEND_UMR_UPDATE_ACCESS		IB_SEND_RESERVED_END
+#define MLX5_IB_SEND_UMR_ENABLE_MR	       (IB_SEND_RESERVED_START << 0)
+#define MLX5_IB_SEND_UMR_DISABLE_MR	       (IB_SEND_RESERVED_START << 1)
+#define MLX5_IB_SEND_UMR_FAIL_IF_FREE	       (IB_SEND_RESERVED_START << 2)
+#define MLX5_IB_SEND_UMR_UPDATE_XLT	       (IB_SEND_RESERVED_START << 3)
+#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION    (IB_SEND_RESERVED_START << 4)
+#define MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS       IB_SEND_RESERVED_END
 
 #define MLX5_IB_QPT_REG_UMR	IB_QPT_RESERVED1
 /*
@@ -190,6 +189,9 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_QPT_HW_GSI	IB_QPT_RESERVED2
 #define MLX5_IB_WR_UMR		IB_WR_RESERVED1
 
+#define MLX5_IB_UMR_OCTOWORD	       16
+#define MLX5_IB_UMR_XLT_ALIGNMENT      64
+
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *
  * These flags are intended for internal use by the mlx5_ib driver, and they
@@ -414,13 +416,11 @@ enum mlx5_ib_qp_flags {
 
 struct mlx5_umr_wr {
 	struct ib_send_wr		wr;
-	union {
-		u64			virt_addr;
-		u64			offset;
-	} target;
+	u64				virt_addr;
+	u64				offset;
 	struct ib_pd		       *pd;
 	unsigned int			page_shift;
-	unsigned int			npages;
+	unsigned int			xlt_size;
 	u64				length;
 	int				access_flags;
 	u32				mkey;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index afb6dc1..738ba13 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -774,7 +774,7 @@ static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	 * To avoid copying garbage after the pas array, we allocate
 	 * a little more.
 	 */
-	*size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
+	*size = ALIGN(sizeof(struct mlx5_mtt) * npages, MLX5_UMR_MTT_ALIGNMENT);
 	*mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
 	if (!(*mr_pas))
 		return -ENOMEM;
@@ -782,7 +782,7 @@ static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 	pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN);
 	mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
 	/* Clear padding after the actual pages. */
-	memset(pas + npages, 0, *size - npages * sizeof(u64));
+	memset(pas + npages, 0, *size - npages * sizeof(struct mlx5_mtt));
 
 	*dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE);
 	if (dma_mapping_error(ddev, *dma)) {
@@ -801,7 +801,8 @@ static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
 	struct mlx5_umr_wr *umrwr = umr_wr(wr);
 
 	sg->addr = dma;
-	sg->length = ALIGN(sizeof(u64) * n, 64);
+	sg->length = ALIGN(sizeof(struct mlx5_mtt) * n,
+			   MLX5_IB_UMR_XLT_ALIGNMENT);
 	sg->lkey = dev->umrc.pd->local_dma_lkey;
 
 	wr->next = NULL;
@@ -813,7 +814,7 @@ static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
 
 	wr->opcode = MLX5_IB_WR_UMR;
 
-	umrwr->npages = n;
+	umrwr->xlt_size = sg->length;
 	umrwr->page_shift = page_shift;
 	umrwr->mkey = key;
 }
@@ -827,9 +828,11 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
 
 	prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift);
 
-	wr->send_flags = 0;
+	wr->send_flags = MLX5_IB_SEND_UMR_ENABLE_MR |
+			 MLX5_IB_SEND_UMR_UPDATE_TRANSLATION |
+			 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
 
-	umrwr->target.virt_addr = virt_addr;
+	umrwr->virt_addr = virt_addr;
 	umrwr->length = len;
 	umrwr->access_flags = access_flags;
 	umrwr->pd = pd;
@@ -840,7 +843,8 @@ static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
 {
 	struct mlx5_umr_wr *umrwr = umr_wr(wr);
 
-	wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
+	wr->send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
+			 MLX5_IB_SEND_UMR_FAIL_IF_FREE;
 	wr->opcode = MLX5_IB_WR_UMR;
 	umrwr->mkey = key;
 }
@@ -993,7 +997,8 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 	struct mlx5_umr_wr wr;
 	struct ib_sge sg;
 	int err = 0;
-	const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
+	const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT /
+					 sizeof(struct mlx5_mtt);
 	const int page_index_mask = page_index_alignment - 1;
 	size_t pages_mapped = 0;
 	size_t pages_to_map = 0;
@@ -1012,7 +1017,7 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 	if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
 		return -EINVAL;
 
-	size = sizeof(u64) * pages_to_map;
+	size = sizeof(struct mlx5_mtt) * pages_to_map;
 	size = min_t(int, PAGE_SIZE, size);
 	/* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
 	 * code, when we are called from an invalidation. The pas buffer must
@@ -1026,7 +1031,7 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 		mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
 		memset(pas, 0, size);
 	}
-	pages_iter = size / sizeof(u64);
+	pages_iter = size / sizeof(struct mlx5_mtt);
 	dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
 	if (dma_mapping_error(ddev, dma)) {
 		mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
@@ -1049,7 +1054,8 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 					       MLX5_IB_MTT_PRESENT);
 			/* Clear padding after the pages brought from the
 			 * umem. */
-			memset(pas + npages, 0, size - npages * sizeof(u64));
+			memset(pas + npages, 0, size - npages *
+			       sizeof(struct mlx5_mtt));
 		}
 
 		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
@@ -1057,19 +1063,19 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 		memset(&wr, 0, sizeof(wr));
 
 		sg.addr = dma;
-		sg.length = ALIGN(npages * sizeof(u64),
+		sg.length = ALIGN(npages * sizeof(struct mlx5_mtt),
 				MLX5_UMR_MTT_ALIGNMENT);
 		sg.lkey = dev->umrc.pd->local_dma_lkey;
 
 		wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
-				MLX5_IB_SEND_UMR_UPDATE_MTT;
+				   MLX5_IB_SEND_UMR_UPDATE_XLT;
 		wr.wr.sg_list = &sg;
 		wr.wr.num_sge = 1;
 		wr.wr.opcode = MLX5_IB_WR_UMR;
-		wr.npages = sg.length / sizeof(u64);
+		wr.xlt_size = sg.length;
 		wr.page_shift = PAGE_SHIFT;
 		wr.mkey = mr->mmkey.key;
-		wr.target.offset = start_page_index;
+		wr.offset = start_page_index * sizeof(struct mlx5_mtt);
 
 		err = mlx5_ib_post_send_wait(dev, &wr);
 	}
@@ -1272,7 +1278,7 @@ static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
 		if (err)
 			return err;
 
-		umrwr.target.virt_addr = virt_addr;
+		umrwr.virt_addr = virt_addr;
 		umrwr.length = length;
 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
 	}
@@ -1280,14 +1286,10 @@ static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
 	prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
 			    page_shift);
 
-	if (flags & IB_MR_REREG_PD) {
+	if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) {
 		umrwr.pd = pd;
-		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD;
-	}
-
-	if (flags & IB_MR_REREG_ACCESS) {
 		umrwr.access_flags = access_flags;
-		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS;
+		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
 	}
 
 	/* post send request to UMR QP */
@@ -1552,11 +1554,11 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
 		mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 		MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
 		err = mlx5_alloc_priv_descs(pd->device, mr,
-					    ndescs, sizeof(u64));
+					    ndescs, sizeof(struct mlx5_mtt));
 		if (err)
 			goto err_free_in;
 
-		mr->desc_size = sizeof(u64);
+		mr->desc_size = sizeof(struct mlx5_mtt);
 		mr->max_descs = ndescs;
 	} else if (mr_type == IB_MR_TYPE_SG_GAPS) {
 		mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index cacb631..67651ec 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -47,7 +47,8 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end)
 {
 	struct mlx5_ib_mr *mr;
-	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
+				    sizeof(struct mlx5_mtt)) - 1;
 	u64 idx = 0, blk_start_idx = 0;
 	int in_block = 0;
 	u64 addr;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index a1b3125..6d5f5d4 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3080,9 +3080,10 @@ static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
 	dseg->addr       = cpu_to_be64(sg->addr);
 }
 
-static __be16 get_klm_octo(int npages)
+static inline u64 get_xlt_octo(u64 bytes)
 {
-	return cpu_to_be16(ALIGN(npages, 8) / 2);
+	return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) /
+	       MLX5_IB_UMR_OCTOWORD;
 }
 
 static __be64 frwr_mkey_mask(void)
@@ -3127,18 +3128,14 @@ static __be64 sig_mkey_mask(void)
 }
 
 static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
-				struct mlx5_ib_mr *mr)
+			    struct mlx5_ib_mr *mr)
 {
-	int ndescs = mr->ndescs;
+	int size = mr->ndescs * mr->desc_size;
 
 	memset(umr, 0, sizeof(*umr));
 
-	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
-		/* KLMs take twice the size of MTTs */
-		ndescs *= 2;
-
 	umr->flags = MLX5_UMR_CHECK_NOT_FREE;
-	umr->klm_octowords = get_klm_octo(ndescs);
+	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
 	umr->mkey_mask = frwr_mkey_mask();
 }
 
@@ -3149,37 +3146,17 @@ static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr)
 	umr->flags = MLX5_UMR_INLINE;
 }
 
-static __be64 get_umr_reg_mr_mask(int atomic)
+static __be64 get_umr_enable_mr_mask(void)
 {
 	u64 result;
 
-	result = MLX5_MKEY_MASK_LEN		|
-		 MLX5_MKEY_MASK_PAGE_SIZE	|
-		 MLX5_MKEY_MASK_START_ADDR	|
-		 MLX5_MKEY_MASK_PD		|
-		 MLX5_MKEY_MASK_LR		|
-		 MLX5_MKEY_MASK_LW		|
-		 MLX5_MKEY_MASK_KEY		|
-		 MLX5_MKEY_MASK_RR		|
-		 MLX5_MKEY_MASK_RW		|
+	result = MLX5_MKEY_MASK_KEY |
 		 MLX5_MKEY_MASK_FREE;
 
-	if (atomic)
-		result |= MLX5_MKEY_MASK_A;
-
-	return cpu_to_be64(result);
-}
-
-static __be64 get_umr_unreg_mr_mask(void)
-{
-	u64 result;
-
-	result = MLX5_MKEY_MASK_FREE;
-
 	return cpu_to_be64(result);
 }
 
-static __be64 get_umr_update_mtt_mask(void)
+static __be64 get_umr_disable_mr_mask(void)
 {
 	u64 result;
 
@@ -3194,23 +3171,22 @@ static __be64 get_umr_update_translation_mask(void)
 
 	result = MLX5_MKEY_MASK_LEN |
 		 MLX5_MKEY_MASK_PAGE_SIZE |
-		 MLX5_MKEY_MASK_START_ADDR |
-		 MLX5_MKEY_MASK_KEY |
-		 MLX5_MKEY_MASK_FREE;
+		 MLX5_MKEY_MASK_START_ADDR;
 
 	return cpu_to_be64(result);
 }
 
-static __be64 get_umr_update_access_mask(void)
+static __be64 get_umr_update_access_mask(int atomic)
 {
 	u64 result;
 
-	result = MLX5_MKEY_MASK_LW |
+	result = MLX5_MKEY_MASK_LR |
+		 MLX5_MKEY_MASK_LW |
 		 MLX5_MKEY_MASK_RR |
-		 MLX5_MKEY_MASK_RW |
-		 MLX5_MKEY_MASK_A |
-		 MLX5_MKEY_MASK_KEY |
-		 MLX5_MKEY_MASK_FREE;
+		 MLX5_MKEY_MASK_RW;
+
+	if (atomic)
+		result |= MLX5_MKEY_MASK_A;
 
 	return cpu_to_be64(result);
 }
@@ -3219,9 +3195,7 @@ static __be64 get_umr_update_pd_mask(void)
 {
 	u64 result;
 
-	result = MLX5_MKEY_MASK_PD |
-		 MLX5_MKEY_MASK_KEY |
-		 MLX5_MKEY_MASK_FREE;
+	result = MLX5_MKEY_MASK_PD;
 
 	return cpu_to_be64(result);
 }
@@ -3238,24 +3212,24 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
 	else
 		umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */
 
-	if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) {
-		umr->klm_octowords = get_klm_octo(umrwr->npages);
-		if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) {
-			umr->mkey_mask = get_umr_update_mtt_mask();
-			umr->bsf_octowords = get_klm_octo(umrwr->target.offset);
-			umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
-		}
-		if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION)
-			umr->mkey_mask |= get_umr_update_translation_mask();
-		if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS)
-			umr->mkey_mask |= get_umr_update_access_mask();
-		if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD)
-			umr->mkey_mask |= get_umr_update_pd_mask();
-		if (!umr->mkey_mask)
-			umr->mkey_mask = get_umr_reg_mr_mask(atomic);
-	} else {
-		umr->mkey_mask = get_umr_unreg_mr_mask();
+	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size));
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) {
+		u64 offset = get_xlt_octo(umrwr->offset);
+
+		umr->xlt_offset = cpu_to_be16(offset & 0xffff);
+		umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16);
+		umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
 	}
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION)
+		umr->mkey_mask |= get_umr_update_translation_mask();
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) {
+		umr->mkey_mask |= get_umr_update_access_mask(atomic);
+		umr->mkey_mask |= get_umr_update_pd_mask();
+	}
+	if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR)
+		umr->mkey_mask |= get_umr_enable_mr_mask();
+	if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR)
+		umr->mkey_mask |= get_umr_disable_mr_mask();
 
 	if (!wr->num_sge)
 		umr->flags |= MLX5_UMR_INLINE;
@@ -3303,17 +3277,17 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w
 	struct mlx5_umr_wr *umrwr = umr_wr(wr);
 
 	memset(seg, 0, sizeof(*seg));
-	if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) {
+	if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR)
 		seg->status = MLX5_MKEY_STATUS_FREE;
-		return;
-	}
 
 	seg->flags = convert_access(umrwr->access_flags);
-	if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) {
-		if (umrwr->pd)
-			seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
-		seg->start_addr = cpu_to_be64(umrwr->target.virt_addr);
-	}
+	if (umrwr->pd)
+		seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION &&
+	    !umrwr->length)
+		seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64);
+
+	seg->start_addr = cpu_to_be64(umrwr->virt_addr);
 	seg->len = cpu_to_be64(umrwr->length);
 	seg->log2_page_size = umrwr->page_shift;
 	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
@@ -3611,7 +3585,7 @@ static int set_sig_data_segment(struct ib_sig_handover_wr *wr,
 }
 
 static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
-				 struct ib_sig_handover_wr *wr, u32 nelements,
+				 struct ib_sig_handover_wr *wr, u32 size,
 				 u32 length, u32 pdn)
 {
 	struct ib_mr *sig_mr = wr->sig_mr;
@@ -3626,17 +3600,17 @@ static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
 	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 |
 				    MLX5_MKEY_BSF_EN | pdn);
 	seg->len = cpu_to_be64(length);
-	seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements)));
+	seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size));
 	seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
 }
 
 static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
-				u32 nelements)
+				u32 size)
 {
 	memset(umr, 0, sizeof(*umr));
 
 	umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE;
-	umr->klm_octowords = get_klm_octo(nelements);
+	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
 	umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE);
 	umr->mkey_mask = sig_mkey_mask();
 }
@@ -3648,7 +3622,7 @@ static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp,
 	struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
 	struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr);
 	u32 pdn = get_pd(qp)->pdn;
-	u32 klm_oct_size;
+	u32 xlt_size;
 	int region_len, ret;
 
 	if (unlikely(wr->wr.num_sge != 1) ||
@@ -3670,15 +3644,15 @@ static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp,
 	 * then we use strided block format (3 octowords),
 	 * else we use single KLM (1 octoword)
 	 **/
-	klm_oct_size = wr->prot ? 3 : 1;
+	xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm);
 
-	set_sig_umr_segment(*seg, klm_oct_size);
+	set_sig_umr_segment(*seg, xlt_size);
 	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
 	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
 	if (unlikely((*seg == qp->sq.qend)))
 		*seg = mlx5_get_send_wqe(qp, 0);
 
-	set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn);
+	set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn);
 	*seg += sizeof(struct mlx5_mkey_seg);
 	*size += sizeof(struct mlx5_mkey_seg) / 16;
 	if (unlikely((*seg == qp->sq.qend)))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index cbfa38f..5ff86f0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -396,7 +396,7 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, struct mlx5e_sq *sq,
 	cseg->imm       = rq->mkey_be;
 
 	ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN;
-	ucseg->klm_octowords =
+	ucseg->xlt_octowords =
 		cpu_to_be16(MLX5_MTT_OCTW(MLX5_MPWRQ_PAGES_PER_WQE));
 	ucseg->bsf_octowords =
 		cpu_to_be16(MLX5_MTT_OCTW(umr_wqe_mtt_offset));
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 0aacb2a..693811e 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -292,10 +292,14 @@ struct mlx5_wqe_data_seg {
 struct mlx5_wqe_umr_ctrl_seg {
 	u8		flags;
 	u8		rsvd0[3];
-	__be16		klm_octowords;
-	__be16		bsf_octowords;
+	__be16		xlt_octowords;
+	union {
+		__be16	xlt_offset;
+		__be16	bsf_octowords;
+	};
 	__be64		mkey_mask;
-	u8		rsvd1[32];
+	__be32		xlt_offset_47_16;
+	u8		rsvd1[28];
 };
 
 struct mlx5_seg_set_psv {
@@ -389,6 +393,10 @@ struct mlx5_bsf {
 	struct mlx5_bsf_inl	m_inl;
 };
 
+struct mlx5_mtt {
+	__be64		ptag;
+};
+
 struct mlx5_klm {
 	__be32		bcount;
 	__be32		key;
-- 
2.7.4

^ permalink raw reply related

* [PATCH for-next 01/11] net/mlx5: Fix offset naming for reserved fields in hca_cap_bits
From: Saeed Mahameed @ 2017-01-01 21:10 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Ilya Lesokhin,
	Artemy Kovalyov, Max Gurtovoy, Leon Romanovsky, Saeed Mahameed
In-Reply-To: <1483305027-2573-1-git-send-email-saeedm@mellanox.com>

From: Max Gurtovoy <maxg@mellanox.com>

Fix offset for reserved fields.

Fixes: 7486216b3a0b ("{net,IB}/mlx5: mlx5_ifc updates")
Fixes: b4ff3a36d3e4 ("net/mlx5: Use offset based reserved field names in the IFC header file")
Fixes: 7d5e14237a55 ("net/mlx5: Update mlx5_ifc hardware features")
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 57bec54..4792c85 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -826,9 +826,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
 	u8         port_module_event[0x1];
-	u8         reserved_at_1b0[0x1];
+	u8         reserved_at_1b1[0x1];
 	u8         ports_check[0x1];
-	u8         reserved_at_1b2[0x1];
+	u8         reserved_at_1b3[0x1];
 	u8         disable_link_up[0x1];
 	u8         beacon_led[0x1];
 	u8         port_type[0x2];
@@ -858,7 +858,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         compact_address_vector[0x1];
 	u8         striding_rq[0x1];
-	u8         reserved_at_201[0x2];
+	u8         reserved_at_202[0x2];
 	u8         ipoib_basic_offloads[0x1];
 	u8         reserved_at_205[0xa];
 	u8         drain_sigerr[0x1];
@@ -1009,10 +1009,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         rndv_offload_rc[0x1];
 	u8         rndv_offload_dc[0x1];
 	u8         log_tag_matching_list_sz[0x5];
-	u8         reserved_at_5e8[0x3];
+	u8         reserved_at_5f8[0x3];
 	u8         log_max_xrq[0x5];
 
-	u8         reserved_at_5f0[0x200];
+	u8         reserved_at_600[0x200];
 };
 
 enum mlx5_flow_destination_type {
-- 
2.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox