Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH v3 2/3] net/smc: bound the receive length to the RMB in smc_rx_recvmsg()
From: Bryam Vargas @ 2026-06-19 22:17 UTC (permalink / raw)
  To: Dust Li
  Cc: Wenjia Zhang, D . Wythe, Sidraya Jayagond, Eric Dumazet,
	David S . Miller, Mahanta Jambigi, Wen Gu, Simon Horman,
	Ursula Braun, Stefan Raspl, Tony Lu, Paolo Abeni, Jakub Kicinski,
	netdev, linux-s390, linux-rdma, linux-kernel
In-Reply-To: <ajS4BgnyzRsa7HVm@linux.alibaba.com>

On Fri, 19 Jun 2026 11:31:18 +0800, Dust Li wrote:
> I think we can decide after we see the real issue.

Here it is, as a truth table over the real smc_curs_diff. cons is fixed (the app
isn't reading), bytes_to_rcv is the running sum of per-CDC smc_curs_diff(prod_old,
prod_new), len = 65504:

  scenario                     b2r       count>=len  diff>len  occ>len  OOB no-clamp  OOB clamp
  honest steady / full / wrap  <= len    no          no        no       no            no
  attack single big diff       131007    no          yes       yes      yes           no
  attack count=len-1 wrapflip  327519    no          yes       yes      yes           no
  attack wrap++ count=0        327520    no          no        no       yes           no

Every attack row has count < len, so an input count check accepts it. The last
row is the one that matters: a peer that just increments prod.wrap with count=0
adds len to bytes_to_rcv every CDC, unbounded, and no cursor-level check sees it.
The per-CDC diff is exactly len, and smc_curs_diff(cons, prod) stays at len
because it can't see the wrap accumulation. The only thing that bounds it is
clamping bytes_to_rcv at the consumer. So #2 isn't subsumed by validating cursors
at the input -- the cursor view can't see the accumulator.

> should we also abort the connection like what we did in patch #1 ?

Yes for net-next. Two caveats: First, the detection
has to be on bytes_to_rcv itself, not on a cursor recompute -- the wrap++ row
walks past every cursor check, so an occupancy gate at the input wouldn't catch
it. Second, the abort supplements the clamp, it doesn't replace it: the clamp is
synchronous, the abort via queue_work isn't. The producer add runs in the tasklet
under bh_lock_sock, the consumer sub runs in smc_recvmsg under lock_sock which
drops the spinlock, so they race; between queue_work and abort_work running
smc_conn_kill, smc_recvmsg can read the inflated bytes_to_rcv and copy past the
RMB. The clamp at the consumer is what closes that window.

So v4: -stable keeps the consumer-side clamp on #2, and the same shape on #3 for
sndbuf_space and peer_rmbe_space -- no control-flow change. net-next keeps the
clamp and, when bytes_to_rcv goes over len (which an honest peer never does),
queues the abort the way patch #1 does. Patch #1 keeps its count-based abort for
the urgent index.

Bryam

The table above is this program (gcc -O2 -Wall -Wextra -fwrapv; self-checks, exit 0):

  #include <stdio.h>
  #include <stdint.h>
  /* Short aliases for the fixed-width integer types used by the cursor fields. */
  typedef uint16_t u16; typedef uint32_t u32;
  /* Cursor model: a 16-bit wrap counter and a 32-bit byte count. Only `wrap`
   * and `count` are consulted by the checks in this program. */
  union hc { struct { u16 reserved; u16 wrap; u32 count; }; };

  /* verbatim net/smc/smc_cdc.h:149-158 */
  /* Distance from cursor `old` to cursor `new` in a ring of `size` bytes.
   * Returns the difference of the two counts, with `size` added when the wrap
   * counters differ; a negative result is reported as 0. Only inequality of
   * the wrap counters is tested, never how far apart they are. */
  static int smc_curs_diff(unsigned int size, const union hc *old, const union hc *new)
  {
          if (old->wrap != new->wrap) {
                  /* wraps differ: bytes left up to `size` plus the new count */
                  int v = (int)((size - old->count) + new->count);
                  return v > 0 ? v : 0;
          }
          /* same wrap: plain count difference, clamped at 0 */
          { int v = (int)(new->count - old->count); return v > 0 ? v : 0; }
  }

  /* Buffer length: passed as `size` to every smc_curs_diff() call and used as
   * the upper bound in every check of this program. */
  #define LEN 65504
  /* Compact (wrap, count) pair used to write the scenario tables; `w` feeds
   * the cursor's wrap field and `c` its count field. */
  struct cur { u16 w; u32 c; };

  /* prod[]/cons[]: cursor positions after each CDC. honest=app drains so
   * occupancy stays <= len; attack=cons stuck. */
  static int run(const char *name, int honest, int n,
                 const struct cur *prod, const struct cur *cons)
  {
          union hc po = {0}, co = {0};
          long b2r = 0; int i, cnt_rej = 0, raw_rej = 0, occ_rej = 0, fail = 0;
          for (i = 0; i < n; i++) {
                  union hc p = { .wrap = prod[i].w, .count = prod[i].c };
                  union hc c = { .wrap = cons[i].w, .count = cons[i].c };
                  int dp = smc_curs_diff(LEN, &po, &p);
                  if (prod[i].c >= (u32)LEN) cnt_rej = 1;
                  if (dp > LEN) raw_rej = 1;
                  if (smc_curs_diff(LEN, &c, &p) > LEN) occ_rej = 1;
                  b2r += dp; b2r -= smc_curs_diff(LEN, &co, &c);
                  po = p; co = c;
          }
          int oob_noclamp = b2r > LEN;
          int oob_clamp   = (b2r > LEN ? LEN : b2r) > LEN;   /* always 0 */
          printf("  %-30s b2r=%-8ld cnt_rej=%d raw_rej=%d occ_rej=%d oob_noclamp=%d oob_clamp=%d\n",
                 name, b2r, cnt_rej, raw_rej, occ_rej, oob_noclamp, oob_clamp);
          if (honest) fail = (cnt_rej || raw_rej || occ_rej || oob_noclamp);
          else        fail = (oob_clamp || !oob_noclamp);
          return fail;
  }

  /* Drive every scenario through run(), print the overall verdict, and exit
   * with status 0 only when all scenarios pass their self-check. */
  int main(void)
  {
          /* One entry per scenario: label, honest flag, number of CDC steps,
           * then the producer and consumer cursor after each step. */
          struct {
                  const char *label;
                  int honest;
                  int steps;
                  struct cur prod[5];
                  struct cur cons[5];
          } scen[] = {
                  { "honest: steady", 1, 1,
                    {{0,5000}},
                    {{0,4000}} },
                  { "honest: full ring", 1, 1,
                    {{1,0}},
                    {{0,0}} },
                  { "honest: wrapping", 1, 3,
                    {{0,30000},{0,60000},{1,10000}},
                    {{0,0},{0,30000},{0,50000}} },
                  { "attack: single big diff", 0, 1,
                    {{1,LEN-1}},
                    {{0,0}} },
                  { "attack: count=len-1 wrapflip", 0, 4,
                    {{1,LEN-1},{0,LEN-1},{1,LEN-1},{0,LEN-1}},
                    {{0,0},{0,0},{0,0},{0,0}} },
                  { "attack: wrap++ count=0", 0, 5,
                    {{1,0},{2,0},{3,0},{4,0},{5,0}},
                    {{0,0},{0,0},{0,0},{0,0},{0,0}} },
          };
          int total = (int)(sizeof(scen) / sizeof(scen[0]));
          int fails = 0;
          int k = 0;

          while (k < total) {
                  fails += run(scen[k].label, scen[k].honest, scen[k].steps,
                               scen[k].prod, scen[k].cons);
                  k++;
          }

          if (fails) {
                  printf("RESULT: %s\n", "FAIL");
                  return 1;
          }
          printf("RESULT: %s\n", "PASS");
          return 0;
  }

(In-kernel KASAN confirming the over-read at count=65503 is available on request;
a small out-of-tree module driving the same smc_curs_diff over a real
rmb_desc->len allocation -- bytes_to_rcv 131007 -> 327519, slab-out-of-bounds in
the recv copy, clean with the clamp.)


^ permalink raw reply

* Re: [PATCH bpf] bpf, sockmap: fix lock inversion between stab->lock and sk_callback_lock
From: Alexei Starovoitov @ 2026-06-19 21:55 UTC (permalink / raw)
  To: John Fastabend, Sechang Lim
  Cc: Jiayuan Chen, Jakub Sitnicki, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	David S . Miller, Jakub Kicinski, Simon Horman, netdev, bpf,
	linux-kernel
In-Reply-To: <ajLR9CRn6O27Ound@john-p8>

On Wed Jun 17, 2026 at 9:59 AM PDT, John Fastabend wrote:
>
> The bot also thinks it found another locking issue. I'm not sure
> supporting 'tc' is really needed here. sockmap is much easier
> to reason about from the socket layer. What about just blocking sockmap
> manipulations from these prog types.
>
> My current thinking on sockmap at the moment is it has sprawled
> across so many layers the locking is overly tricky to reason about.
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index d9bdc3b32c05..5e08d3e03453 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -8567,11 +8567,7 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
>                          return true;
>                  break;
>          case BPF_PROG_TYPE_SOCKET_FILTER:
> -       case BPF_PROG_TYPE_SCHED_CLS:
> -       case BPF_PROG_TYPE_SCHED_ACT:
> -       case BPF_PROG_TYPE_XDP:
>          case BPF_PROG_TYPE_SK_REUSEPORT:
> -       case BPF_PROG_TYPE_FLOW_DISSECTOR:
>          case BPF_PROG_TYPE_SK_LOOKUP:
>                  return true;

+1. Let's disable.

^ permalink raw reply

* Re: [PATCH net v2 01/10] rxrpc: input: reject ACKALL outside transmit phase
From: Jeffrey E Altman @ 2026-06-19 21:32 UTC (permalink / raw)
  To: David Howells, netdev
  Cc: Marc Dionne, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, linux-afs, linux-kernel, Wyatt Feng,
	stable, Yuan Tan, Yifan Wu, Juefei Pu, Zhengchuan Liang, Xin Liu,
	Ren Wei
In-Reply-To: <20260618134802.2477777-2-dhowells@redhat.com>

On 6/18/2026 9:47 AM, David Howells wrote:
> From: Wyatt Feng <bronzed_45_vested@icloud.com>
>
> rxrpc_input_ackall() accepts ACKALL packets without checking whether
> the call is in a state that can legitimately have outstanding transmit
> buffers.  A forged ACKALL can therefore reach a new service call in
> RXRPC_CALL_SERVER_RECV_REQUEST before any reply packets have been
> queued.
>
> In that state call->tx_top is zero and call->tx_queue is NULL, so
> rxrpc_rotate_tx_window() dereferences a NULL txqueue and triggers a
> null-pointer dereference.
>
> Fix rxrpc_input_ackall() to mirror the transmit-state gating already
> used for normal ACK processing, and ignore ACKALL when there is no
> outstanding transmit window to rotate.
>
> Fixes: b341a0263b1b ("rxrpc: Implement progressive transmission queue struct")
> Cc: stable@vger.kernel.org
> Reported-by: Yuan Tan <yuantan098@gmail.com>
> Reported-by: Yifan Wu <yifanwucs@gmail.com>
> Reported-by: Juefei Pu <tomapufckgml@gmail.com>
> Reported-by: Zhengchuan Liang <zcliangcn@gmail.com>
> Reported-by: Xin Liu <bird@lzu.edu.cn>
> Assisted-by: Codex:GPT-5.4
> Signed-off-by: Wyatt Feng <bronzed_45_vested@icloud.com>
> Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Marc Dionne <marc.dionne@auristor.com>
> cc: linux-afs@lists.infradead.org
> ---
>   net/rxrpc/input.c | 16 +++++++++++++++-
>   1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
> index ce761466b02d..37881dffa898 100644
> --- a/net/rxrpc/input.c
> +++ b/net/rxrpc/input.c
> @@ -1214,8 +1214,22 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
>   static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
>   {
>   	struct rxrpc_ack_summary summary = { 0 };
> +	rxrpc_seq_t top = READ_ONCE(call->tx_top);
> +
> +	switch (__rxrpc_call_state(call)) {
> +	case RXRPC_CALL_CLIENT_SEND_REQUEST:
> +	case RXRPC_CALL_CLIENT_AWAIT_REPLY:
> +	case RXRPC_CALL_SERVER_SEND_REPLY:
> +	case RXRPC_CALL_SERVER_AWAIT_ACK:
> +		break;
> +	default:
> +		return;
> +	}
> +
> +	if (call->tx_bottom == top)
> +		return;
>   
> -	if (rxrpc_rotate_tx_window(call, call->tx_top, &summary))
> +	if (rxrpc_rotate_tx_window(call, top, &summary))
>   		rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ackall);
>   }
>   

Wyatt,

Thank you for identifying the NULL pointer dereference but I do not 
believe the patch is correct from an RxRPC protocol perspective.

The rxrpc protocol is not formally standardized.  Linux rxrpc is a clean 
room implementation of Transarc/IBM RxRPC protocol used by AFS 3.0.

I've been spelunking through old source code trees dating back to 
mid-1988.  The original usage of the ACKALL packet was a form of delayed 
acknowledgement only to be sent after all of the DATA packets inclusive 
of the LAST_PACKET had been received.  Your expectation of how the 
packet type is intended to be used is consistent with that behavior.

However, in Nov 1988 the DATA acknowledgement logic was altered in a 
backward incompatible manner.   Instead of immediately sending ACK 
packets in response to every DATA packet except when the final DATA 
packet inclusive of LAST_PACKET was received, the ACK packet usage was 
extended to permit delayed transmissions. From Nov 1988 onward ACK 
packets were scheduled to be sent with a 200ms delay unless the received 
DATA packet was a duplicate, out-of-sequence, out-of-window, etc OR 
unless the received DATA packet had the RX_REQUEST_ACK flag set.   The 
delayed ACKs replaced the ACKALL usage in the general case.

But it appears there was a bug introduced which resulted in the sending 
of arbitrary ACKALL packets at any point in the call lifetime.   This 
bug was not identified until Nov 2001 [OpenAFS 
db2ddfaf1b322710e1bd4edce6d7519157c3c9eb] at which point the sending of 
ACKALL packets was further restricted.  One of the reasons why the 
sending of ACKALL packets at arbitrary times was not identified as a 
problem for more than a decade is that ACKALL packets received when 
there were no transmitted packets waiting for acknowledgement had no 
impact on the call state.   If there were transmitted packets waiting 
for acknowledgement and they were successfully delivered, then the call 
continued successfully.

OpenAFS 1.6 pre-releases attempted to resume use of ACKALL packets as a 
performance enhancement only to revert the change because of 
compatibility problems.

I think the best change at this point would be to accept the ACKALL packets 
without generating an error regardless of the call state. If there are 
transmitted DATA packets waiting for acknowledgement, acknowledge them.  
If there are DATA packets which have yet to be sent, leave them alone.  
Only complete the call in response to an ACKALL if the ACKALL is 
received by the acceptor (incoming call) and all DATA packets inclusive 
of LAST_PACKET have been transmitted at least once.

Sincerely,

Jeffrey Altman




^ permalink raw reply

* Re: [PATCH net] net: sit: require CAP_NET_ADMIN in the device netns for changelink
From: Kuniyuki Iwashima @ 2026-06-19 21:29 UTC (permalink / raw)
  To: Maoyi Xie
  Cc: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Xiao Liang, Nicolas Dichtel, Kees Cook, netdev,
	linux-kernel, stable
In-Reply-To: <20260618070817.3378283-1-maoyixie.tju@gmail.com>

On Thu, Jun 18, 2026 at 12:08 AM Maoyi Xie <maoyixie.tju@gmail.com> wrote:
>
> ipip6_changelink() operates on at most two netns, dev_net(dev) and the
> tunnel link netns t->net. They differ once the device is created in or
> moved to a netns other than the one the request runs in. The rtnl
> changelink path checks CAP_NET_ADMIN only against dev_net(dev), so a
> caller privileged there but not in t->net can rewrite a tunnel that
> lives in t->net.
>
> Gate ipip6_changelink() on rtnl_dev_link_net_capable() at its top,
> before any attribute is parsed. sit was the one tunnel type not covered
> by the recent series that added this check to the other changelink()
> handlers.
>
> Fixes: 5e6700b3bf98 ("sit: add support of x-netns")
> Link: https://lore.kernel.org/netdev/20260612085941.3158249-1-maoyixie.tju@gmail.com/
> Cc: stable@vger.kernel.org
> Signed-off-by: Maoyi Xie <maoyixie.tju@gmail.com>

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>

^ permalink raw reply

* Re: [PATCH net v2] eth: bnxt: improve the timing of stats
From: Michael Chan @ 2026-06-19 21:28 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: davem, netdev, edumazet, pabeni, andrew+netdev, horms,
	pavan.chebbi
In-Reply-To: <20260619191538.104165-1-kuba@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 1159 bytes --]

On Fri, Jun 19, 2026 at 12:15 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> Kernel selftests wait 1.25x of the promised stats refresh time
> (as read from ethtool -c). bnxt reports 1sec by default, but
> the stats update process has two steps. First device DMAs the
> new values, then the service task performs update in full-width
> SW counters. So the worst case delay is actually 2x.
>
> Note that the behavior is different for ring stats and port stats.
> Port stats are fetched synchronously by the service worker, so
> there's no risk of doubling up the delay there.
>
> The problem of stale stats impacts not only tests but real workloads
> which monitor egress bandwidth of a NIC. The inaccuracy causes double
> counting in the next cycle and spurious overload alarms.
>
> Try to read from the DMA buffer more aggressively, to mitigate
> timing issues between DMA and service task. The SW update should
> be cheap.
>
> Fixes: 51f307856b60 ("bnxt_en: Allow statistics DMA to be configurable using ethtool -C.")
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Thanks.
Reviewed-by: Michael Chan <michael.chan@broadcom.com>

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5469 bytes --]

^ permalink raw reply

* [PATCH v1 net] ipv4: fib: Don't ignore error route in local/main tables.
From: Kuniyuki Iwashima @ 2026-06-19 21:27 UTC (permalink / raw)
  To: David Ahern, Ido Schimmel, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

When CONFIG_IP_MULTIPLE_TABLES is enabled but no rule is added,
fib_lookup() performs route lookup directly on two tables.

Since the first lookup does not properly bail out, the result
of an error route in the merged local/main table could be
overwritten by another route in the default table:

  # unshare -n
  # ip link set lo up
  # ip route add 192.168.0.0/24 dev lo table 253
  # ip route add unreachable 192.168.0.0/24
  # ip route get 192.168.0.1
  192.168.0.1 dev lo table default uid 0
      cache <local>

Once a random rule is added, the error route is respected:

  # ip rule add table 0
  # ip rule del table 0
  # ip route get 192.168.0.1
  RTNETLINK answers: No route to host

Let's fix the inconsistent behaviour.

Fixes: f4530fa574df ("ipv4: Avoid overhead when no custom FIB rules are installed.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 include/net/ip_fib.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a71a98505650..c63a3c4967ae 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -374,7 +374,7 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp,
 			     struct fib_result *res, unsigned int flags)
 {
 	struct fib_table *tb;
-	int err = -ENETUNREACH;
+	int err = -EAGAIN;
 
 	flags |= FIB_LOOKUP_NOREF;
 	if (net->ipv4.fib_has_custom_rules)
@@ -388,17 +388,16 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp,
 	if (tb)
 		err = fib_table_lookup(tb, flp, res, flags);
 
-	if (!err)
+	if (err != -EAGAIN)
 		goto out;
 
 	tb = rcu_dereference_rtnl(net->ipv4.fib_default);
 	if (tb)
 		err = fib_table_lookup(tb, flp, res, flags);
 
-out:
 	if (err == -EAGAIN)
 		err = -ENETUNREACH;
-
+out:
 	rcu_read_unlock();
 
 	return err;
-- 
2.55.0.rc0.786.g65d90a0328-goog


^ permalink raw reply related

* Re: [PATCH net 0/2] ieee802154: admin-gate legacy LLSEC dumps + un-deaden ADD/DEL
From: Stefan Schmidt @ 2026-06-19 21:06 UTC (permalink / raw)
  To: Alexander Aring, Miquel Raynal, Michael Bommarito
  Cc: Stefan Schmidt, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Phoebe Buckheister, linux-wpan, netdev,
	linux-kernel
In-Reply-To: <20260520141640.1149513-1-michael.bommarito@gmail.com>

Hello Michael Bommarito.

On Wed, 20 May 2026 10:16:38 -0400, Michael Bommarito wrote:
> The legacy IEEE802154_NL family (net/ieee802154/netlink.c) builds its
> ops table from two macros in net/ieee802154/ieee802154.h. IEEE802154_OP()
> sets .flags = GENL_ADMIN_PERM; IEEE802154_DUMP() sets no flags. Among
> the IEEE802154_DUMP() consumers are four LLSEC dump ops (LIST_KEY,
> LIST_DEV, LIST_DEVKEY, LIST_SECLEVEL), and the LLSEC_LIST_KEY dump
> handler at net/ieee802154/nl-mac.c emits the raw 16-byte AES-128
> keytable bytes (IEEE802154_ATTR_LLSEC_KEY_BYTES, .len = 16, copied
> verbatim from struct ieee802154_llsec_key.key) into the reply skb.
> The modern nl802154 family admin-gates the equivalent reads
> (NL802154_CMD_GET_SEC_KEY at net/ieee802154/nl802154.c:2978 with
> .flags = GENL_ADMIN_PERM) so the legacy interface is the open side.
> 
> [...]

Applied to wpan/wpan-next.git, thanks!

[1/2] ieee802154: admin-gate legacy LLSEC dump operations
      https://git.kernel.org/wpan/wpan-next/c/9c1e0b6d4947
[2/2] ieee802154: allow legacy LLSEC ADD/DEL ops to pass strict validation
      https://git.kernel.org/wpan/wpan-next/c/a6bfdfcc6711

regards,
Stefan Schmidt

^ permalink raw reply

* Re: [PATCH] mac802154: Prevent overwrite return code in mac802154_perform_association()
From: Stefan Schmidt @ 2026-06-19 20:58 UTC (permalink / raw)
  To: alex.aring, miquel.raynal, Robertus Diawan Chris
  Cc: Stefan Schmidt, davem, edumazet, kuba, pabeni, horms, linux-wpan,
	netdev, linux-kernel, linux-kernel-mentees, skhan, me
In-Reply-To: <20260602054133.470293-1-robertusdchris@gmail.com>

Hello Robertus Diawan Chris.

On Tue, 02 Jun 2026 12:41:33 +0700, Robertus Diawan Chris wrote:
> When assoc_status is not equal to IEEE802154_ASSOCIATION_SUCCESSFUL, the
> return value is assigned either "-ERANGE" or "-EPERM", but this return
> value is overwritten with 0 after exiting the conditional scope.
> So, jump to the clear_assoc label to preserve the return value when
> assoc_status is not equal to IEEE802154_ASSOCIATION_SUCCESSFUL.
> 
> This is reported by Coverity Scan as "Unused value".
> 
> [...]

Applied to wpan/wpan-next.git, thanks!

[1/1] mac802154: Prevent overwrite return code in mac802154_perform_association()
      https://git.kernel.org/wpan/wpan-next/c/649147cb3f8b

regards,
Stefan Schmidt

^ permalink raw reply

* Re: [RFC] Enabling CONFIG_NTP_PPS for NOHZ by adding ntp_error to system_time_snapshot
From: David Woodhouse @ 2026-06-19 20:57 UTC (permalink / raw)
  To: Thomas Gleixner, John Stultz, Stephen Boyd, Miroslav Lichvar,
	Richard Cochran, linux-kernel, netdev
  Cc: Rodolfo Giometti, Alexander Gordeev
In-Reply-To: <87v7beb7s3.ffs@fw13>

[-- Attachment #1: Type: text/plain, Size: 2298 bytes --]

On Fri, 2026-06-19 at 22:21 +0200, Thomas Gleixner wrote:
> On Fri, Jun 19 2026 at 16:34, David Woodhouse wrote:
> > On Fri, 2026-06-19 at 15:34 +0200, Thomas Gleixner wrote:
> > > 
> > > This formatting makes my brain hurt. Can you please split that out into
> > > a separate function?
> > 
> > Yep. There's also a potential error there — an *additional* discrepancy
> > comes from the enforced monotonicity that timekeeping_cycles_to_ns()
> > applies (the case where it just returns tkr->xtime_nsec >> tkr_shift).
> > 
> > I couldn't work out if I cared about the clocksource-is-non-monotonic
> > case, and even if I did, what I should do about it.
> 
> I think the right thing is just to ignore it.

Yeah, that was basically my conclusion; I had just meant to *mention*
it when posting the RFC.

> The problem is very narrow and mostly related to the historically badly
> synchronized TSC between sockets. The TSC_ADJUST fixup is obviously
> error prone as it adjusts only to the point where the error is no
> longer observable. But in the update transition phase it can result in
> time going backwards because the readout on the other CPU is slightly
> behind tk::tkr_mono::cycles_last. That happens only once in a while and
> we talk about a very low single digit number of TSC cycles.
> 
> > I also wasn't sure if this should be a new CLOCK_REALTIME_NONMONOTONIC
> > or something like that, such that e.g. PTP clients could *ask* for it.
> 
> Hell no!

That was not about the above clocksource nonsense; that was the
question of what the caller (in my example case, the vmclock PTP
snapshot) should *do* with the reported error value.

If I just unconditionally "correct" the CLOCK_REALTIME values then
that's arguably an ABI change. We're silently reporting something
*different* to what we did before.

Maybe that's OK... as I said, in the PPS case we can justify it and
just call it a bug fix?

Or maybe we want a way for callers (not of ktime_get_snapshot_id()
itself, but *their* callers) to *ask* for the "corrected" value
instead. I happened to call that CLOCK_REALTIME_NONMONOTONIC as a straw
man, just because monotonicity is *one* of the reasons why we present
the xtime values that we do, not always the raw "corrected" values.

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* Re: [PATCH] ieee802154: fix kernel-infoleak in dgram_recvmsg()
From: Stefan Schmidt @ 2026-06-19 20:53 UTC (permalink / raw)
  To: syzkaller-bugs, Alexander Aring, David S. Miller, Eric Dumazet,
	Jakub Kicinski, linux-wpan, Miquel Raynal, netdev, Paolo Abeni,
	syzbot
  Cc: Stefan Schmidt, horms, linux-kernel, syzbot
In-Reply-To: <62795fd9-fc0c-48eb-bb82-05ffc5a57104@mail.kernel.org>

Hello syzbot.

On Wed, 27 May 2026 20:18:18 +0000, syzbot wrote:
> KMSAN reported a kernel-infoleak in move_addr_to_user():
> 
> BUG: KMSAN: kernel-infoleak in instrument_copy_to_user
> include/linux/instrumented.h:131 [inline]
> BUG: KMSAN: kernel-infoleak in _inline_copy_to_user
> include/linux/uaccess.h:205 [inline]
> BUG: KMSAN: kernel-infoleak in _copy_to_user+0xcc/0x120
> lib/usercopy.c:26
>  instrument_copy_to_user include/linux/instrumented.h:131 [inline]
>  _inline_copy_to_user include/linux/uaccess.h:205 [inline]
>  _copy_to_user+0xcc/0x120 lib/usercopy.c:26
>  copy_to_user include/linux/uaccess.h:236 [inline]
>  move_addr_to_user+0x2e7/0x440 net/socket.c:302
>  ____sys_recvmsg+0x232/0x610 net/socket.c:2925
>  ...
>  Uninit was stored to memory at:
>  ieee802154_addr_to_sa include/net/ieee802154_netdev.h:369 [inline]
>  dgram_recvmsg+0xa09/0xbe0 net/ieee802154/socket.c:739
> 
> [...]

Applied to wpan/wpan-next.git, thanks!

[1/1] ieee802154: fix kernel-infoleak in dgram_recvmsg()
      https://git.kernel.org/wpan/wpan-next/c/4db86f8ab11b

regards,
Stefan Schmidt

^ permalink raw reply

* Re: [PATCH 2/2] selftests/bpf: validate rx_queue_index in xdp_metadata
From: bot+bpf-ci @ 2026-06-19 20:50 UTC (permalink / raw)
  To: siddharthcibi, ast
  Cc: siddharthcibi, daniel, davem, kuba, hawk, john.fastabend, sdf,
	andrii, eddyz87, memxor, martin.lau, song, yonghong.song, jolsa,
	emil, shuah, netdev, bpf, linux-kselftest, linux-kernel, ast,
	andrii, daniel, martin.lau, eddyz87, yonghong.song, clm,
	ihor.solodrai
In-Reply-To: <20260619195759.41254-3-siddharthcibi@icloud.com>

[-- Attachment #1: Type: text/plain, Size: 3010 bytes --]

> Extend xdp_metadata selftest coverage to validate that
> ctx->rx_queue_index is preserved and observable after XDP redirect
> execution.
>
> Capture rx_queue_index in metadata and assert that it matches the
> expected queue during packet verification.
>
> Signed-off-by: Siddharth_Cibi <siddharthcibi@icloud.com>

The changelog says the test validates that ctx->rx_queue_index is
"preserved and observable after XDP redirect execution".

Looking at the program in progs/xdp_metadata.c, rx() reads
ctx->rx_queue_index and then redirects into an XSKMAP:

	meta->rx_queue_index = ctx->rx_queue_index;
	return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);

The AF_XDP delivery path copies the frame and metadata straight into
the umem and does not re-run an XDP program after the redirect, so the
value stored in meta->rx_queue_index is the one read in the original
ingress context rather than a value restored on a later re-execution.

Does the assert in verify_xsk_metadata() actually exercise a
redirect-restore path, or does it pass purely from the value captured
in the ingress program?

This isn't a bug, but could the changelog describe which redirect path
the test exercises, since the AF_XDP (XSKMAP) path does not re-run an
XDP program after the redirect?

> diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
> index 5c31054ad4a4..f8cabbbe7bb7 100644
> --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
> +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
> @@ -309,7 +309,8 @@ static int verify_xsk_metadata(struct xsk *xsk, bool sent_from_af_xdp)
>
>  	if (!ASSERT_NEQ(meta->rx_hash, 0, "rx_hash"))
>  		return -1;
> -
> +	if (!ASSERT_EQ(meta->rx_queue_index, QUEUE_ID, "rx_queue_index"))
> +        	return -1;

This isn't a bug, but the new return is indented with eight spaces
followed by a tab, while the other return -1; statements in
verify_xsk_metadata() use two tabs. Could it use tabs to match the
surrounding lines?

>  	if (!sent_from_af_xdp) {
>  		if (!ASSERT_NEQ(meta->rx_hash_type & XDP_RSS_TYPE_L4, 0, "rx_hash_type"))
>  			return -1;
> diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata.c b/tools/testing/selftests/bpf/progs/xdp_metadata.c
> index 09bb8a038d52..62ae83860d7f 100644
> --- a/tools/testing/selftests/bpf/progs/xdp_metadata.c
> +++ b/tools/testing/selftests/bpf/progs/xdp_metadata.c
> @@ -98,7 +98,7 @@ int rx(struct xdp_md *ctx)
>  	bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash, &meta->rx_hash_type);
>  	bpf_xdp_metadata_rx_vlan_tag(ctx, &meta->rx_vlan_proto,
>  				     &meta->rx_vlan_tci);
> -
> +	meta->rx_queue_index = ctx->rx_queue_index;
>  	return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);
>  }


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/27846609854

^ permalink raw reply

* Re: [PATCH 1/2] bpf: preserve rx_queue_index across XDP redirects
From: bot+bpf-ci @ 2026-06-19 20:50 UTC (permalink / raw)
  To: siddharthcibi, ast
  Cc: siddharthcibi, daniel, davem, kuba, hawk, john.fastabend, sdf,
	edumazet, pabeni, horms, andrii, eddyz87, memxor, martin.lau,
	song, yonghong.song, jolsa, emil, netdev, bpf, linux-kernel, ast,
	andrii, daniel, martin.lau, eddyz87, yonghong.song, clm,
	ihor.solodrai
In-Reply-To: <20260619195759.41254-2-siddharthcibi@icloud.com>

[-- Attachment #1: Type: text/plain, Size: 1429 bytes --]

> Store rx_queue_index in struct xdp_frame during xdp_buff to
> xdp_frame conversion and restore it when rebuilding xdp_rxq_info
> for cpumap and devmap execution paths.
> This preserves ingress RX queue information for XDP programs
> executed after redirect, allowing access to the original
> rx_queue_index instead of losing queue context.
> Also propagate rx_queue_index for zero-copy XDP frame conversion.
> Signed-off-by: Siddharth_Cibi <siddharthcibi@icloud.com>

This looks like a fix for the cpumap path, where the synthetic
xdp_rxq_info was previously built without a queue_index, leaving the
placeholder:

kernel/bpf/cpumap.c:cpu_map_bpf_prog_run_xdp() {
	rxq.dev = xdpf->dev_rx;
	rxq.mem.type = xdpf->mem_type;
	/* TODO: report queue_index to xdp_rxq_info */
	...
}

That synthetic xdp_rxq_info, with the missing queue_index and the TODO,
came in when cpu_map_bpf_prog_run_xdp() was added.  Should this carry:

  Fixes: 9216477449f3 ("bpf: cpumap: Add the possibility to attach an eBPF program to cpumap")

The devmap path (dev_map_bpf_prog_run) shares the same missing
queue_index, which was added in fbee97feed9b ("bpf: Add support to attach
bpf program to a devmap entry").


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/27846609854

^ permalink raw reply

* Re: [PATCH net] net: wwan: iosm: bound device offsets in the MUX downlink decoder
From: Loic Poulain @ 2026-06-19 20:48 UTC (permalink / raw)
  To: Maoyi Xie
  Cc: Sergey Ryazanov, Johannes Berg, Andrew Lunn, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, netdev, linux-kernel,
	stable
In-Reply-To: <178185979029.4044562.9993615975949055530@maoyixie.com>

On Fri, Jun 19, 2026 at 11:03 AM Maoyi Xie <maoyixie.tju@gmail.com> wrote:
>
> mux_dl_adb_decode() walks a chain of aggregated datagram tables using
> offsets and lengths taken from the modem. first_table_index,
> next_table_index, table_length, datagram_index and datagram_length are
> all device supplied le values. Only first_table_index was checked, and
> only for being non zero. The decoder then formed adth = block +
> adth_index and read the table header and the datagram entries with no
> bound against the received skb. A modem that reports an index or a
> length past the downlink buffer makes the decoder read out of bounds.
>
> The buffer is IPC_MEM_MAX_DL_MUX_LITE_BUF_SIZE and skb->len is at most
> that, so skb->len is the real limit, but none of these in band offsets
> were checked against it.
>
> Validate every device offset and length against skb->len before use.
> The block header must fit. Each table header, on entry and after every
> next_table_index, must lie inside the skb. The datagram table must fit.
> Each datagram index and length must stay inside the skb. The header
> padding must not exceed the datagram length so the receive length does
> not wrap.
>
> This was reproduced under KASAN as a slab out of bounds read on a normal
> downlink receive once the iosm net device is up.
>
> Fixes: 1f52d7b62285 ("net: wwan: iosm: Enable M.2 7360 WWAN card support")
> Cc: stable@vger.kernel.org
> Signed-off-by: Maoyi Xie <maoyixie.tju@gmail.com>
> ---
>  drivers/net/wwan/iosm/iosm_ipc_mux_codec.c | 23 ++++++++++++++++++++--
>  1 file changed, 21 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
> index bff46f7ca59f..1c021bb0aa7a 100644
> --- a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
> +++ b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
> @@ -557,15 +557,21 @@ static int mux_dl_process_dg(struct iosm_mux *ipc_mux, struct mux_adbh *adbh,
>                                 < sizeof(struct mux_adbh))
>                         goto dg_error;
>
> -               /* Is the packet inside of the ADB */
> +               /* Is the packet inside of the ADB and the received skb ? */
>                 if (le32_to_cpu(dg->datagram_index) >=
> -                                       le32_to_cpu(adbh->block_length)) {
> +                                       le32_to_cpu(adbh->block_length) ||
> +                   le32_to_cpu(dg->datagram_index) >= skb->len ||
> +                   le16_to_cpu(dg->datagram_length) >
> +                           skb->len - le32_to_cpu(dg->datagram_index)) {

The logic is OK, but for readability I would suggest converting
dg->datagram_index and dg->datagram_length into intermediate
native-endian local variables (e.g. dg_index, dg_len), which makes the
if condition cleaner and avoids repeated conversions.


>                         goto dg_error;
>                 } else {
>                         packet_offset =
>                                 le32_to_cpu(dg->datagram_index) +
>                                 dl_head_pad_len;
>                         dg_len = le16_to_cpu(dg->datagram_length);
> +                       /* The header padding must not exceed the datagram. */
> +                       if (dl_head_pad_len >= dg_len)
> +                               goto dg_error;
>                         /* Pass the packet to the netif layer. */
>                         rc = ipc_mux_net_receive(ipc_mux, if_id, ipc_mux->wwan,
>                                                  packet_offset,
> @@ -595,6 +601,10 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux,
>         block = skb->data;
>         adbh = (struct mux_adbh *)block;
>
> +       /* The block header itself must fit in the received skb. */
> +       if (skb->len < sizeof(struct mux_adbh))
> +               goto adb_decode_err;
> +
>         /* Process the aggregated datagram tables. */
>         adth_index = le32_to_cpu(adbh->first_table_index);
>
> @@ -606,6 +616,11 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux,
>
>         /* Loop through mixed session tables. */
>         while (adth_index) {
> +               /* The table header must lie within the received skb. */
> +               if (adth_index < sizeof(struct mux_adbh) ||
> +                   adth_index > skb->len - sizeof(struct mux_adth))
> +                       goto adb_decode_err;
> +
>                 /* Get the reference to the table header. */
>                 adth = (struct mux_adth *)(block + adth_index);
>
> @@ -629,6 +644,10 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux,
>                 if (le16_to_cpu(adth->table_length) < sizeof(struct mux_adth))
>                         goto adb_decode_err;
>
> +               /* The whole datagram table must fit in the received skb. */
> +               if (le16_to_cpu(adth->table_length) > skb->len - adth_index)
> +                       goto adb_decode_err;
> +
>                 /* Calculate the number of datagrams. */
>                 nr_of_dg = (le16_to_cpu(adth->table_length) -
>                                         sizeof(struct mux_adth)) /
> --
> 2.34.1
>

^ permalink raw reply

* Re: [PATCH net v2] mac802154: llsec: add skb_cow_data() before in-place crypto
From: Stefan Schmidt @ 2026-06-19 20:47 UTC (permalink / raw)
  To: alex.aring, miquel.raynal, Doruk Tan Ozturk
  Cc: Stefan Schmidt, aleksander.lobakin, linux-wpan, netdev, security,
	stable
In-Reply-To: <20260526183726.56100-1-doruk@0sec.ai>

Hello Doruk Tan Ozturk.

On Tue, 26 May 2026 20:37:26 +0200, Doruk Tan Ozturk wrote:
> llsec_do_encrypt_unauth(), llsec_do_encrypt_auth(),
> llsec_do_decrypt_unauth(), and llsec_do_decrypt_auth() all perform
> in-place cryptographic transformations on skb data.  They build a
> scatterlist with sg_init_one() pointing into the skb's linear data area
> and then pass the same scatterlist as both src and dst to the crypto API
> (e.g. crypto_skcipher_encrypt/decrypt, crypto_aead_encrypt/decrypt).
> 
> [...]

Applied to wpan/wpan-next.git, thanks!

[1/1] mac802154: llsec: add skb_cow_data() before in-place crypto
      https://git.kernel.org/wpan/wpan-next/c/84a04eb5b210

regards,
Stefan Schmidt

^ permalink raw reply

* Re: [PATCH net v3 0/3] Avoid calling WARN_ON() on allocation failure in cfg802154_switch_netns()
From: Stefan Schmidt @ 2026-06-19 20:29 UTC (permalink / raw)
  To: Alexander Aring, Ivan Abramov
  Cc: Stefan Schmidt, Miquel Raynal, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, linux-wpan, netdev,
	linux-kernel, lvc-project
In-Reply-To: <20250403101935.991385-1-i.abramov@mt-integration.ru>

Hello Ivan Abramov.

On Thu, 03 Apr 2025 13:19:31 +0300, Ivan Abramov wrote:
> This series was inspired by Syzkaller report on warning in
> cfg802154_switch_netns().
> 
> WARNING: CPU: 0 PID: 5837 at net/ieee802154/core.c:258 cfg802154_switch_netns+0x3c7/0x3d0 net/ieee802154/core.c:258
> Modules linked in:
> CPU: 0 UID: 0 PID: 5837 Comm: syz-executor125 Not tainted 6.13.0-rc6-syzkaller-00918-g7b24f164cf00 #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024
> RIP: 0010:cfg802154_switch_netns+0x3c7/0x3d0 net/ieee802154/core.c:258
> Call Trace:
>  <TASK>
>  nl802154_wpan_phy_netns+0x13d/0x210 net/ieee802154/nl802154.c:1292
>  genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline]
>  genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline]
>  genl_rcv_msg+0xb14/0xec0 net/netlink/genetlink.c:1210
>  netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543
>  genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219
>  netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline]
>  netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1348
>  netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1892
>  sock_sendmsg_nosec net/socket.c:711 [inline]
>  __sock_sendmsg+0x221/0x270 net/socket.c:726
>  ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2594
>  ___sys_sendmsg net/socket.c:2648 [inline]
>  __sys_sendmsg+0x269/0x350 net/socket.c:2680
>  do_syscall_x64 arch/x86/entry/common.c:52 [inline]
>  do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
>  entry_SYSCALL_64_after_hwframe+0x77/0x7f
> 
> [...]

Applied to wpan/wpan-next.git, thanks!

[1/3] ieee802154: Restore initial state on failed device_rename() in cfg802154_switch_netns()
      https://git.kernel.org/wpan/wpan-next/c/a2e06b4bef20
[2/3] ieee802154: Avoid calling WARN_ON() on -ENOMEM in cfg802154_switch_netns()
      https://git.kernel.org/wpan/wpan-next/c/0569f67ed6a7
[3/3] ieee802154: Remove WARN_ON() in cfg802154_pernet_exit()
      https://git.kernel.org/wpan/wpan-next/c/e69ed6fc9fb3

regards,
Stefan Schmidt

^ permalink raw reply

* Re: [PATCH] ieee802154: ca8210: fix cas_ctl leak on spi_async failure
From: Stefan Schmidt @ 2026-06-19 20:29 UTC (permalink / raw)
  To: alex.aring, miquel.raynal, Shitalkumar Gandhi
  Cc: Stefan Schmidt, andrew+netdev, davem, edumazet, kuba, pabeni,
	linux-wpan, netdev, linux-kernel, stable, Shitalkumar Gandhi
In-Reply-To: <20260421073259.2259783-1-shitalkumar.gandhi@cambiumnetworks.com>

Hello Shitalkumar Gandhi.

On Tue, 21 Apr 2026 13:02:59 +0530, Shitalkumar Gandhi wrote:
> ca8210_spi_transfer() allocates cas_ctl with kzalloc_obj(GFP_ATOMIC)
> and relies entirely on the SPI completion callback
> ca8210_spi_transfer_complete() to free it.
> 
> The spi_async() API only invokes the completion callback on successful
> submission.  On failure it returns a negative error code without ever
> queuing the callback, which leaves cas_ctl and its embedded spi_message
> and spi_transfer orphaned.  Every kfree(cas_ctl) in the driver is
> inside the completion callback, so there is no other reclamation path.
> 
> [...]

Applied to wpan/wpan-next.git, thanks!

[1/1] ieee802154: ca8210: fix cas_ctl leak on spi_async failure
      https://git.kernel.org/wpan/wpan-next/c/e09390e439bd

regards,
Stefan Schmidt

^ permalink raw reply

* Re: [PATCH wpan v3] ieee802154: ca8210: fix pointer truncation in kfifo on 64-bit
From: Stefan Schmidt @ 2026-06-19 20:37 UTC (permalink / raw)
  To: Miquel Raynal, Alexander Aring, Shitalkumar Gandhi
  Cc: Stefan Schmidt, Simon Horman, Andrew Lunn, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, linux-wpan, netdev,
	linux-kernel, stable, Shitalkumar Gandhi
In-Reply-To: <20260520105750.30144-1-shitalkumar.gandhi@cambiumnetworks.com>

Hello Shitalkumar Gandhi.

On Wed, 20 May 2026 16:27:50 +0530, Shitalkumar Gandhi wrote:
> ca8210_test_int_driver_write() and ca8210_test_int_user_read() exchange
> a kmalloc'd buffer pointer through a struct kfifo, but pass a literal
> '4' as the byte count to kfifo_in()/kfifo_out().
> 
> This is correct on 32-bit (pointer = 4 bytes), but on 64-bit only the
> low 4 bytes of the 8-byte pointer are written into the FIFO. The reader
> then reads back 4 bytes into an 8-byte local pointer variable, leaving
> the upper 4 bytes uninitialized stack data. The first dereference of
> the reconstructed pointer (fifo_buffer[1]) accesses an arbitrary kernel
> address and generally results in an oops.
> 
> [...]

Applied to wpan/wpan-next.git, thanks!

[1/1] ieee802154: ca8210: fix pointer truncation in kfifo on 64-bit
      https://git.kernel.org/wpan/wpan-next/c/6d7f7bcf225b

regards,
Stefan Schmidt

^ permalink raw reply

* [PATCH rdma-next v8] RDMA: Change capability fields in ib_device_attr from int to u32
From: Erni Sri Satya Vennela @ 2026-06-19 20:30 UTC (permalink / raw)
  To: Jason Gunthorpe, Leon Romanovsky, mkalderon, zyjzyj2000, sagi,
	mgurtovoy, haris.iqbal, jinpu.wang, bvanassche, kbusch,
	Jens Axboe, Christoph Hellwig, kch, smfrench, linkinjeon, metze,
	tom, trondmy, anna, chuck.lever, jlayton, neil, okorniev, Dai.Ngo,
	achender, davem, edumazet, kuba, pabeni, horms, kees, markzhang,
	andriy.shevchenko, ebadger, linux-rdma, linux-kernel,
	target-devel, linux-nvme, linux-cifs, samba-technical, linux-nfs,
	netdev, rds-devel
  Cc: Erni Sri Satya Vennela, Jason Gunthorpe

The capability counter fields in struct ib_device_attr are declared
as signed int, but these values are inherently non-negative. Drivers
maintain their cached caps as u32 and assign them directly into these
int fields; if a cap exceeds INT_MAX the implicit narrowing yields a
negative value visible to the IB core.

Change the signed int capability fields to u32 to match the
underlying nature of the data. Also update consumers across the IB
core, ULPs, NVMe-oF target, RDS, and NFS/RDMA so the new u32 values
are not forced back through signed int or u8 via min()/min_t() or
narrowing local variables.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Acked-by: Stefan Metzmacher <metze@samba.org> # smbdirect
---
Changes in v8:
* Convert the remaining non-negative counter fields max_ee_rd_atom,
  max_ee_init_rd_atom, max_ee, max_rdd, max_raw_ipv6_qp and max_srq_wr
  to u32; keep max_srq as int (its consumer compares it against
  ib_device.num_comp_vectors, still int).
* Drop all remaining min_t() where plain min() now works.
* Make the srq_size module parameters unsigned int so the srq_size min()
  stays a plain min().
* Replace the ternary-inside-min() with the simpler "if (x) x--;".
* Reorder the send_queue_depth min() to min(value, CONST) to match the
  sibling site.
* Restore reverse xmas-tree declaration order.
* Collapse the min()/min3() assignments that now fit onto a single line
  within 100 columns.
* Print the now-u32 fields with %u instead of %d.
Changes in v7:
* Drop min_t() in all sites where a plain min() (or min3()) works
  cleanly
* Guard nvme/host/rdma.c num_inline_segments computation against a
  device reporting max_send_sge == 0, so the u32 subtract
  cannot wrap to UINT_MAX.
* Use %u when printing the newly-u32 capability fields
  in diagnostic messages.
Changes in v6:
* Fix subject prefix: net-next -> rdma-next.
Changes in v5:
* Add U8_MAX clamps in iser_verbs, nvme/host, nvme/target, isert,
  rds/ib_cm, smbdirect/connect and smbdirect/accept where u32 capability
  fields were directly narrowed into u8 rdma_conn_param fields without
  clamping.
* Guard the inline_sge_count calculation in nvmet_rdma_find_get_device()
  to prevent u32 underflow when both max_sge_rd and max_recv_sge are zero.
* Expand type migration to 9 additional fields (max_mw, max_raw_ethy_qp,
  max_mcast_grp, max_mcast_qp_attach, max_total_mcast_qp_attach, max_ah,
  max_srq, max_srq_wr, max_srq_sge)
* Fix min_t(int,...) in svc_rdma_transport; min_t(u32,...) in ipoib,
  srpt, nvme/target, rds/ib, rtrs-clt, rtrs-srv, xprtrdma/verbs.
* Fix frwr_ops.c u32 underflow guard (reorder check before subtraction)
* Change sc_max_send_sges to unsigned int, inline_sge_count to u32
* Fix %d -> %u in rxe_qp, rxe_srq, ipoib_cm, ib_isert, svc_rdma_transport
* Update commit message.
Changes in v4:
* Drop clamping the values in mana_ib_query_device, instead update
  the props values from int to u32.
Changes in v3:
* Drop clamping from mana_ib_gd_query_adapter_caps(). The internal u32
  caps cache does not need to be clamped.
* Move all clamping exclusively to mana_ib_query_device(), which is the
  only place the cached u32 values are narrowed into the signed int
  fields of struct ib_device_attr.
* Reframe commit message: this is a u32-to-int type boundary fix, not a
  CVM/untrusted-hardware hardening patch.
Changes in v2:
* Update patch title.
---
 drivers/infiniband/core/cq.c               |  3 +-
 drivers/infiniband/hw/qedr/verbs.c         |  2 +-
 drivers/infiniband/sw/rxe/rxe_qp.c         | 22 +++++-----
 drivers/infiniband/sw/rxe/rxe_srq.c        | 16 +++----
 drivers/infiniband/ulp/ipoib/ipoib_cm.c    | 10 ++---
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |  3 +-
 drivers/infiniband/ulp/iser/iser_verbs.c   |  5 +--
 drivers/infiniband/ulp/isert/ib_isert.c    |  7 ++-
 drivers/infiniband/ulp/rtrs/rtrs-clt.c     | 11 ++---
 drivers/infiniband/ulp/rtrs/rtrs-srv.c     | 11 ++---
 drivers/infiniband/ulp/srp/ib_srp.c        |  2 +-
 drivers/infiniband/ulp/srpt/ib_srpt.c      | 21 +++++----
 drivers/nvme/host/rdma.c                   |  8 ++--
 drivers/nvme/target/rdma.c                 | 13 +++---
 fs/smb/smbdirect/accept.c                  |  5 ++-
 fs/smb/smbdirect/connect.c                 |  5 ++-
 fs/smb/smbdirect/connection.c              |  8 ++--
 include/linux/sunrpc/svc_rdma.h            |  4 +-
 include/rdma/ib_verbs.h                    | 50 +++++++++++-----------
 net/rds/ib.c                               | 10 ++---
 net/rds/ib_cm.c                            | 10 ++---
 net/sunrpc/xprtrdma/frwr_ops.c             |  7 +--
 net/sunrpc/xprtrdma/svc_rdma_transport.c   |  5 +--
 net/sunrpc/xprtrdma/verbs.c                |  2 +-
 24 files changed, 117 insertions(+), 123 deletions(-)

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index 3d7b6cddd131..ee98188e57fb 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -393,8 +393,7 @@ static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes,
 	 * a reasonable batch size so that we can share CQs between
 	 * multiple users instead of allocating a larger number of CQs.
 	 */
-	nr_cqes = min_t(unsigned int, dev->attrs.max_cqe,
-			max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
+	nr_cqes = min(dev->attrs.max_cqe, max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
 	nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
 	for (i = 0; i < nr_cqs; i++) {
 		cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 679aa6f3a63b..a85ad0171134 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -151,7 +151,7 @@ int qedr_query_device(struct ib_device *ibdev,
 	attr->max_qp_init_rd_atom =
 	    1 << (fls(qattr->max_qp_req_rd_atomic_resc) - 1);
 	attr->max_qp_rd_atom =
-	    min(1 << (fls(qattr->max_qp_resp_rd_atomic_resc) - 1),
+	    min(1U << (fls(qattr->max_qp_resp_rd_atomic_resc) - 1),
 		attr->max_qp_init_rd_atom);
 
 	attr->max_srq = qattr->max_srq;
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index f3dff1aea96a..7a0529a17992 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -67,27 +67,27 @@ static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap,
 			  int has_srq)
 {
 	if (cap->max_send_wr > rxe->attr.max_qp_wr) {
-		rxe_dbg_dev(rxe, "invalid send wr = %u > %d\n",
-			 cap->max_send_wr, rxe->attr.max_qp_wr);
+		rxe_dbg_dev(rxe, "invalid send wr = %u > %u\n",
+			    cap->max_send_wr, rxe->attr.max_qp_wr);
 		goto err1;
 	}
 
 	if (cap->max_send_sge > rxe->attr.max_send_sge) {
-		rxe_dbg_dev(rxe, "invalid send sge = %u > %d\n",
-			 cap->max_send_sge, rxe->attr.max_send_sge);
+		rxe_dbg_dev(rxe, "invalid send sge = %u > %u\n",
+			    cap->max_send_sge, rxe->attr.max_send_sge);
 		goto err1;
 	}
 
 	if (!has_srq) {
 		if (cap->max_recv_wr > rxe->attr.max_qp_wr) {
-			rxe_dbg_dev(rxe, "invalid recv wr = %u > %d\n",
-				 cap->max_recv_wr, rxe->attr.max_qp_wr);
+			rxe_dbg_dev(rxe, "invalid recv wr = %u > %u\n",
+				    cap->max_recv_wr, rxe->attr.max_qp_wr);
 			goto err1;
 		}
 
 		if (cap->max_recv_sge > rxe->attr.max_recv_sge) {
-			rxe_dbg_dev(rxe, "invalid recv sge = %u > %d\n",
-				 cap->max_recv_sge, rxe->attr.max_recv_sge);
+			rxe_dbg_dev(rxe, "invalid recv sge = %u > %u\n",
+				    cap->max_recv_sge, rxe->attr.max_recv_sge);
 			goto err1;
 		}
 	}
@@ -537,9 +537,9 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
 
 	if (mask & IB_QP_MAX_QP_RD_ATOMIC) {
 		if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) {
-			rxe_dbg_qp(qp, "invalid max_rd_atomic %d > %d\n",
-				 attr->max_rd_atomic,
-				 rxe->attr.max_qp_rd_atom);
+			rxe_dbg_qp(qp, "invalid max_rd_atomic %u > %u\n",
+				   attr->max_rd_atomic,
+				   rxe->attr.max_qp_rd_atom);
 			goto err1;
 		}
 	}
diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
index c9a7cd38953d..74904a6fdf2b 100644
--- a/drivers/infiniband/sw/rxe/rxe_srq.c
+++ b/drivers/infiniband/sw/rxe/rxe_srq.c
@@ -13,8 +13,8 @@ int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init)
 	struct ib_srq_attr *attr = &init->attr;
 
 	if (attr->max_wr > rxe->attr.max_srq_wr) {
-		rxe_dbg_dev(rxe, "max_wr(%d) > max_srq_wr(%d)\n",
-			attr->max_wr, rxe->attr.max_srq_wr);
+		rxe_dbg_dev(rxe, "max_wr(%u) > max_srq_wr(%u)\n",
+			    attr->max_wr, rxe->attr.max_srq_wr);
 		goto err1;
 	}
 
@@ -27,8 +27,8 @@ int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init)
 		attr->max_wr = RXE_MIN_SRQ_WR;
 
 	if (attr->max_sge > rxe->attr.max_srq_sge) {
-		rxe_dbg_dev(rxe, "max_sge(%d) > max_srq_sge(%d)\n",
-			attr->max_sge, rxe->attr.max_srq_sge);
+		rxe_dbg_dev(rxe, "max_sge(%u) > max_srq_sge(%u)\n",
+			    attr->max_sge, rxe->attr.max_srq_sge);
 		goto err1;
 	}
 
@@ -107,8 +107,8 @@ int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
 
 	if (mask & IB_SRQ_MAX_WR) {
 		if (attr->max_wr > rxe->attr.max_srq_wr) {
-			rxe_dbg_srq(srq, "max_wr(%d) > max_srq_wr(%d)\n",
-				attr->max_wr, rxe->attr.max_srq_wr);
+			rxe_dbg_srq(srq, "max_wr(%u) > max_srq_wr(%u)\n",
+				    attr->max_wr, rxe->attr.max_srq_wr);
 			goto err1;
 		}
 
@@ -129,8 +129,8 @@ int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
 
 	if (mask & IB_SRQ_LIMIT) {
 		if (attr->srq_limit > rxe->attr.max_srq_wr) {
-			rxe_dbg_srq(srq, "srq_limit(%d) > max_srq_wr(%d)\n",
-				attr->srq_limit, rxe->attr.max_srq_wr);
+			rxe_dbg_srq(srq, "srq_limit(%u) > max_srq_wr(%u)\n",
+				    attr->srq_limit, rxe->attr.max_srq_wr);
 			goto err1;
 		}
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 57fec88a1629..ed0592898384 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1071,8 +1071,7 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
 	struct ib_qp *tx_qp;
 
 	if (dev->features & NETIF_F_SG)
-		attr.cap.max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge,
-					      MAX_SKB_FRAGS + 1);
+		attr.cap.max_send_sge = min(priv->ca->attrs.max_send_sge, MAX_SKB_FRAGS + 1);
 
 	tx_qp = ib_create_qp(priv->pd, &attr);
 	tx->max_send_sge = attr.cap.max_send_sge;
@@ -1582,7 +1581,8 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
 int ipoib_cm_dev_init(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
-	int max_srq_sge, i;
+	u32 max_srq_sge;
+	int i;
 	u8 addr;
 
 	INIT_LIST_HEAD(&priv->cm.passive_ids);
@@ -1600,9 +1600,9 @@ int ipoib_cm_dev_init(struct net_device *dev)
 
 	skb_queue_head_init(&priv->cm.skb_queue);
 
-	ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge);
+	ipoib_dbg(priv, "max_srq_sge=%u\n", priv->ca->attrs.max_srq_sge);
 
-	max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge);
+	max_srq_sge = min(priv->ca->attrs.max_srq_sge, IPOIB_CM_RX_SG);
 	ipoib_cm_create_srq(dev, max_srq_sge);
 	if (ipoib_cm_has_srq(dev)) {
 		priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 3ed1ea566690..2490696a1aab 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -147,8 +147,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 		.cap = {
 			.max_send_wr  = ipoib_sendq_size,
 			.max_recv_wr  = ipoib_recvq_size,
-			.max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge,
-					      MAX_SKB_FRAGS + 1),
+			.max_send_sge = min(priv->ca->attrs.max_send_sge, MAX_SKB_FRAGS + 1),
 			.max_recv_sge = IPOIB_UD_RX_SG
 		},
 		.sq_sig_type = IB_SIGNAL_ALL_WR,
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index f03b3bb3c0c4..55fe68e5b837 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -244,8 +244,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
 		max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
 	else
 		max_send_wr = ISER_QP_MAX_REQ_DTOS + 1;
-	max_send_wr = min_t(unsigned int, max_send_wr,
-			    (unsigned int)ib_dev->attrs.max_qp_wr);
+	max_send_wr = min(max_send_wr, ib_dev->attrs.max_qp_wr);
 
 	cq_size = max_send_wr + ISER_QP_MAX_RECV_DTOS;
 	ib_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_SOFTIRQ);
@@ -589,7 +588,7 @@ static void iser_route_handler(struct rdma_cm_id *cma_id)
 		goto failure;
 
 	memset(&conn_param, 0, sizeof conn_param);
-	conn_param.responder_resources = ib_dev->attrs.max_qp_rd_atom;
+	conn_param.responder_resources = min(ib_dev->attrs.max_qp_rd_atom, U8_MAX);
 	conn_param.initiator_depth = 1;
 	conn_param.retry_count = 7;
 	conn_param.rnr_retry_count = 6;
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 1015a51f750a..4691845bf815 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -214,9 +214,9 @@ isert_create_device_ib_res(struct isert_device *device)
 	struct ib_device *ib_dev = device->ib_device;
 	int ret;
 
-	isert_dbg("devattr->max_send_sge: %d devattr->max_recv_sge %d\n",
+	isert_dbg("devattr->max_send_sge: %u devattr->max_recv_sge %u\n",
 		  ib_dev->attrs.max_send_sge, ib_dev->attrs.max_recv_sge);
-	isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd);
+	isert_dbg("devattr->max_sge_rd: %u\n", ib_dev->attrs.max_sge_rd);
 
 	device->pd = ib_alloc_pd(ib_dev, 0);
 	if (IS_ERR(device->pd)) {
@@ -381,8 +381,7 @@ isert_set_nego_params(struct isert_conn *isert_conn,
 	struct ib_device_attr *attr = &isert_conn->device->ib_device->attrs;
 
 	/* Set max inflight RDMA READ requests */
-	isert_conn->initiator_depth = min_t(u8, param->initiator_depth,
-				attr->max_qp_init_rd_atom);
+	isert_conn->initiator_depth = min(param->initiator_depth, attr->max_qp_init_rd_atom);
 	isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth);
 
 	if (param->private_data) {
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
index e351552733df..80b08697f96b 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
@@ -1681,8 +1681,7 @@ static int create_con_cq_qp(struct rtrs_clt_con *con)
 		 * + 2 for drain and heartbeat
 		 * in case qp gets into error state.
 		 */
-		max_send_wr =
-			min_t(int, wr_limit, SERVICE_CON_QUEUE_DEPTH * 2 + 2);
+		max_send_wr = min(wr_limit, SERVICE_CON_QUEUE_DEPTH * 2 + 2);
 		max_recv_wr = max_send_wr;
 	} else {
 		/*
@@ -1698,11 +1697,9 @@ static int create_con_cq_qp(struct rtrs_clt_con *con)
 		wr_limit = clt_path->s.dev->ib_dev->attrs.max_qp_wr;
 		/* Shared between connections */
 		clt_path->s.dev_ref++;
-		max_send_wr = min_t(int, wr_limit,
-			      /* QD * (REQ + RSP + FR REGS or INVS) + drain */
-			      clt_path->queue_depth * 4 + 1);
-		max_recv_wr = min_t(int, wr_limit,
-			      clt_path->queue_depth * 3 + 1);
+		/* QD * (REQ + RSP + FR REGS or INVS) + drain */
+		max_send_wr = min(wr_limit, clt_path->queue_depth * 4 + 1);
+		max_recv_wr = min(wr_limit, clt_path->queue_depth * 3 + 1);
 		max_send_sge = 2;
 	}
 	atomic_set(&con->c.sq_wr_avail, max_send_wr);
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
index 6482ad859bd1..f5a6890235bc 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
@@ -1731,21 +1731,16 @@ static int create_con(struct rtrs_srv_path *srv_path,
 		 * All receive and all send (each requiring invalidate)
 		 * + 2 for drain and heartbeat
 		 */
-		max_send_wr = min_t(int, wr_limit,
-				    SERVICE_CON_QUEUE_DEPTH * 2 + 2);
+		max_send_wr = min(wr_limit, SERVICE_CON_QUEUE_DEPTH * 2 + 2);
 		max_recv_wr = max_send_wr;
 		s->signal_interval = min_not_zero(srv->queue_depth,
 						  (size_t)SERVICE_CON_QUEUE_DEPTH);
 	} else {
 		/* when always_invlaidate enalbed, we need linv+rinv+mr+imm */
 		if (always_invalidate)
-			max_send_wr =
-				min_t(int, wr_limit,
-				      srv->queue_depth * (1 + 4) + 1);
+			max_send_wr = min(wr_limit, srv->queue_depth * (1 + 4) + 1);
 		else
-			max_send_wr =
-				min_t(int, wr_limit,
-				      srv->queue_depth * (1 + 2) + 1);
+			max_send_wr = min(wr_limit, srv->queue_depth * (1 + 2) + 1);
 
 		max_recv_wr = srv->queue_depth + 1;
 	}
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index acbd787de265..0caebbc2810f 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -557,7 +557,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 	init_attr->cap.max_send_wr     = m * target->queue_size;
 	init_attr->cap.max_recv_wr     = target->queue_size + 1;
 	init_attr->cap.max_recv_sge    = 1;
-	init_attr->cap.max_send_sge    = min(SRP_MAX_SGE, attr->max_send_sge);
+	init_attr->cap.max_send_sge    = min(attr->max_send_sge, SRP_MAX_SGE);
 	init_attr->sq_sig_type         = IB_SIGNAL_REQ_WR;
 	init_attr->qp_type             = IB_QPT_RC;
 	init_attr->send_cq             = send_cq;
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 9aec5d80117f..a4e4feba4a02 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -77,8 +77,8 @@ module_param(srp_max_req_size, int, 0444);
 MODULE_PARM_DESC(srp_max_req_size,
 		 "Maximum size of SRP request messages in bytes.");
 
-static int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE;
-module_param(srpt_srq_size, int, 0444);
+static unsigned int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE;
+module_param(srpt_srq_size, uint, 0444);
 MODULE_PARM_DESC(srpt_srq_size,
 		 "Shared receive queue (SRQ) size.");
 
@@ -405,8 +405,7 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot,
 	if (sdev->use_srq)
 		send_queue_depth = sdev->srq_size;
 	else
-		send_queue_depth = min(MAX_SRPT_RQ_SIZE,
-				       sdev->device->attrs.max_qp_wr);
+		send_queue_depth = min(sdev->device->attrs.max_qp_wr, MAX_SRPT_RQ_SIZE);
 
 	memset(iocp, 0, sizeof(*iocp));
 	strcpy(iocp->id_string, SRPT_ID_STRING);
@@ -1850,7 +1849,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
 	struct srpt_port *sport = ch->sport;
 	struct srpt_device *sdev = sport->sdev;
 	const struct ib_device_attr *attrs = &sdev->device->attrs;
-	int sq_size = sport->port_attrib.srp_sq_size;
+	u32 sq_size = sport->port_attrib.srp_sq_size;
 	int i, ret;
 
 	WARN_ON(ch->rq_size < 1);
@@ -1911,13 +1910,13 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
 		bool retry = sq_size > MIN_SRPT_SQ_SIZE;
 
 		if (retry) {
-			pr_debug("failed to create queue pair with sq_size = %d (%d) - retrying\n",
+			pr_debug("failed to create queue pair with sq_size = %u (%d) - retrying\n",
 				 sq_size, ret);
 			ib_cq_pool_put(ch->cq, ch->cq_size);
 			sq_size = max(sq_size / 2, MIN_SRPT_SQ_SIZE);
 			goto retry;
 		} else {
-			pr_err("failed to create queue pair with sq_size = %d (%d)\n",
+			pr_err("failed to create queue pair with sq_size = %u (%d)\n",
 			       sq_size, ret);
 			goto err_destroy_cq;
 		}
@@ -1925,7 +1924,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
 
 	atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr);
 
-	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d ch= %p\n",
+	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %u ch= %p\n",
 		 __func__, ch->cq->cqe, qp_init->cap.max_send_sge,
 		 qp_init->cap.max_send_wr, ch);
 
@@ -2298,7 +2297,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
 	 * depth to avoid that the initiator driver has to report QUEUE_FULL
 	 * to the SCSI mid-layer.
 	 */
-	ch->rq_size = min(MAX_SRPT_RQ_SIZE, sdev->device->attrs.max_qp_wr);
+	ch->rq_size = min(sdev->device->attrs.max_qp_wr, MAX_SRPT_RQ_SIZE);
 	spin_lock_init(&ch->spinlock);
 	ch->state = CH_CONNECTING;
 	INIT_LIST_HEAD(&ch->cmd_wait_list);
@@ -3136,7 +3135,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev)
 		return PTR_ERR(srq);
 	}
 
-	pr_debug("create SRQ #wr= %d max_allow=%d dev= %s\n", sdev->srq_size,
+	pr_debug("create SRQ #wr= %d max_allow=%u dev= %s\n", sdev->srq_size,
 		 sdev->device->attrs.max_srq_wr, dev_name(&device->dev));
 
 	sdev->req_buf_cache = srpt_cache_get(srp_max_req_size);
@@ -3951,7 +3950,7 @@ static int __init srpt_init_module(void)
 
 	if (srpt_srq_size < MIN_SRPT_SRQ_SIZE
 	    || srpt_srq_size > MAX_SRPT_SRQ_SIZE) {
-		pr_err("invalid value %d for kernel module parameter srpt_srq_size -- must be in the range [%d..%d].\n",
+		pr_err("invalid value %u for kernel module parameter srpt_srq_size -- must be in the range [%d..%d].\n",
 		       srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE);
 		goto out;
 	}
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 6909e3542794..56cd228af1d5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -394,8 +394,10 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
 		goto out_free_pd;
 	}
 
-	ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
-					ndev->dev->attrs.max_send_sge - 1);
+	ndev->num_inline_segments = ndev->dev->attrs.max_send_sge;
+	if (ndev->num_inline_segments)
+		ndev->num_inline_segments--;
+	ndev->num_inline_segments = min(ndev->num_inline_segments, NVME_RDMA_MAX_INLINE_SEGMENTS);
 	list_add(&ndev->entry, &device_list);
 out_unlock:
 	mutex_unlock(&device_list_mutex);
@@ -1847,7 +1849,7 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
 	param.qp_num = queue->qp->qp_num;
 	param.flow_control = 1;
 
-	param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
+	param.responder_resources = min(queue->device->dev->attrs.max_qp_rd_atom, U8_MAX);
 	/* maximum retry count */
 	param.retry_count = 7;
 	param.rnr_retry_count = 7;
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index ac26f4f774c4..1c332d66222a 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -152,7 +152,7 @@ static const struct kernel_param_ops srq_size_ops = {
 	.get = param_get_int,
 };
 
-static int nvmet_rdma_srq_size = 1024;
+static unsigned int nvmet_rdma_srq_size = 1024;
 module_param_cb(srq_size, &srq_size_ops, &nvmet_rdma_srq_size, 0644);
 MODULE_PARM_DESC(srq_size, "set Shared Receive Queue (SRQ) size, should >= 256 (default: 1024)");
 
@@ -1197,7 +1197,7 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
 	struct nvmet_port *nport = port->nport;
 	struct nvmet_rdma_device *ndev;
 	int inline_page_count;
-	int inline_sge_count;
+	u32 inline_sge_count;
 	int ret;
 
 	mutex_lock(&device_list_mutex);
@@ -1213,7 +1213,9 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
 
 	inline_page_count = num_pages(nport->inline_data_size);
 	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
-				cm_id->device->attrs.max_recv_sge) - 1;
+				cm_id->device->attrs.max_recv_sge);
+	if (inline_sge_count)
+		inline_sge_count--;
 	if (inline_page_count > inline_sge_count) {
 		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
 			nport->inline_data_size, cm_id->device->name,
@@ -1553,8 +1555,9 @@ static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
 
 	param.rnr_retry_count = 7;
 	param.flow_control = 1;
-	param.initiator_depth = min_t(u8, p->initiator_depth,
-		queue->dev->device->attrs.max_qp_init_rd_atom);
+	param.initiator_depth = min3(p->initiator_depth,
+				     queue->dev->device->attrs.max_qp_init_rd_atom,
+				     U8_MAX);
 	param.private_data = &priv;
 	param.private_data_len = sizeof(priv);
 	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
diff --git a/fs/smb/smbdirect/accept.c b/fs/smb/smbdirect/accept.c
index 529740005838..44b681a20725 100644
--- a/fs/smb/smbdirect/accept.c
+++ b/fs/smb/smbdirect/accept.c
@@ -32,8 +32,9 @@ int smbdirect_accept_connect_request(struct smbdirect_socket *sc,
 	/*
 	 * First set what the we as server are able to support
 	 */
-	sp->initiator_depth = min_t(u8, sp->initiator_depth,
-				    sc->ib.dev->attrs.max_qp_rd_atom);
+	sp->initiator_depth = min3(sp->initiator_depth,
+				   sc->ib.dev->attrs.max_qp_rd_atom,
+				   U8_MAX);
 
 	peer_initiator_depth = param->initiator_depth;
 	peer_responder_resources = param->responder_resources;
diff --git a/fs/smb/smbdirect/connect.c b/fs/smb/smbdirect/connect.c
index cd726b399afe..34a3e72c38fb 100644
--- a/fs/smb/smbdirect/connect.c
+++ b/fs/smb/smbdirect/connect.c
@@ -182,8 +182,9 @@ static int smbdirect_connect_rdma_connect(struct smbdirect_socket *sc)
 	if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
 		sc->mr_io.type = IB_MR_TYPE_SG_GAPS;
 
-	sp->responder_resources = min_t(u8, sp->responder_resources,
-					sc->ib.dev->attrs.max_qp_rd_atom);
+	sp->responder_resources = min3(sp->responder_resources,
+				       sc->ib.dev->attrs.max_qp_rd_atom,
+				       U8_MAX);
 	smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_INFO,
 		"responder_resources=%d\n",
 		sp->responder_resources);
diff --git a/fs/smb/smbdirect/connection.c b/fs/smb/smbdirect/connection.c
index 8adf58097534..690acb84e1b5 100644
--- a/fs/smb/smbdirect/connection.c
+++ b/fs/smb/smbdirect/connection.c
@@ -287,7 +287,7 @@ int smbdirect_connection_create_qp(struct smbdirect_socket *sc)
 	    qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) {
 		pr_err("Possible CQE overrun: max_send_wr %d\n",
 		       qp_cap.max_send_wr);
-		pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+		pr_err("device %.*s reporting max_cqe %u max_qp_wr %u\n",
 		       IB_DEVICE_NAME_MAX,
 		       sc->ib.dev->name,
 		       sc->ib.dev->attrs.max_cqe,
@@ -302,7 +302,7 @@ int smbdirect_connection_create_qp(struct smbdirect_socket *sc)
 	     max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) {
 		pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n",
 		       rdma_send_wr, qp_cap.max_send_wr, max_send_wr);
-		pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+		pr_err("device %.*s reporting max_cqe %u max_qp_wr %u\n",
 		       IB_DEVICE_NAME_MAX,
 		       sc->ib.dev->name,
 		       sc->ib.dev->attrs.max_cqe,
@@ -316,7 +316,7 @@ int smbdirect_connection_create_qp(struct smbdirect_socket *sc)
 	    qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) {
 		pr_err("Possible CQE overrun: max_recv_wr %d\n",
 		       qp_cap.max_recv_wr);
-		pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
+		pr_err("device %.*s reporting max_cqe %u max_qp_wr %u\n",
 		       IB_DEVICE_NAME_MAX,
 		       sc->ib.dev->name,
 		       sc->ib.dev->attrs.max_cqe,
@@ -328,7 +328,7 @@ int smbdirect_connection_create_qp(struct smbdirect_socket *sc)
 
 	if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge ||
 	    qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) {
-		pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
+		pr_err("device %.*s max_send_sge/max_recv_sge = %u/%u too small\n",
 		       IB_DEVICE_NAME_MAX,
 		       sc->ib.dev->name,
 		       sc->ib.dev->attrs.max_send_sge,
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index df6e08aaad57..217f000be5d6 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -78,8 +78,8 @@ struct svcxprt_rdma {
 	struct rdma_cm_id    *sc_cm_id;		/* RDMA connection id */
 	struct list_head     sc_accept_q;	/* Conn. waiting accept */
 	struct rpcrdma_notification sc_rn;	/* removal notification */
-	int		     sc_ord;		/* RDMA read limit */
-	int                  sc_max_send_sges;
+	u32		     sc_ord;		/* RDMA read limit */
+	unsigned int         sc_max_send_sges;
 	bool		     sc_snd_w_inv;	/* OK to use Send With Invalidate */
 
 	atomic_t             sc_sq_avail;	/* SQEs ready to be consumed */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9dd76f489a0b..b8b221b5f564 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -406,36 +406,36 @@ struct ib_device_attr {
 	u32			vendor_id;
 	u32			vendor_part_id;
 	u32			hw_ver;
-	int			max_qp;
-	int			max_qp_wr;
+	u32			max_qp;
+	u32			max_qp_wr;
 	u64			device_cap_flags;
 	u64			kernel_cap_flags;
-	int			max_send_sge;
-	int			max_recv_sge;
-	int			max_sge_rd;
-	int			max_cq;
-	int			max_cqe;
-	int			max_mr;
-	int			max_pd;
-	int			max_qp_rd_atom;
-	int			max_ee_rd_atom;
-	int			max_res_rd_atom;
-	int			max_qp_init_rd_atom;
-	int			max_ee_init_rd_atom;
+	u32			max_send_sge;
+	u32			max_recv_sge;
+	u32			max_sge_rd;
+	u32			max_cq;
+	u32			max_cqe;
+	u32			max_mr;
+	u32			max_pd;
+	u32			max_qp_rd_atom;
+	u32			max_ee_rd_atom;
+	u32			max_res_rd_atom;
+	u32			max_qp_init_rd_atom;
+	u32			max_ee_init_rd_atom;
 	enum ib_atomic_cap	atomic_cap;
 	enum ib_atomic_cap	masked_atomic_cap;
-	int			max_ee;
-	int			max_rdd;
-	int			max_mw;
-	int			max_raw_ipv6_qp;
-	int			max_raw_ethy_qp;
-	int			max_mcast_grp;
-	int			max_mcast_qp_attach;
-	int			max_total_mcast_qp_attach;
-	int			max_ah;
+	u32			max_ee;
+	u32			max_rdd;
+	u32			max_mw;
+	u32			max_raw_ipv6_qp;
+	u32			max_raw_ethy_qp;
+	u32			max_mcast_grp;
+	u32			max_mcast_qp_attach;
+	u32			max_total_mcast_qp_attach;
+	u32			max_ah;
 	int			max_srq;
-	int			max_srq_wr;
-	int			max_srq_sge;
+	u32			max_srq_wr;
+	u32			max_srq_sge;
 	unsigned int		max_fast_reg_page_list_len;
 	unsigned int		max_pi_fast_reg_page_list_len;
 	u16			max_pkeys;
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 39f87272e071..c62684d4259c 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -162,12 +162,12 @@ static int rds_ib_add_one(struct ib_device *device)
 		   IB_ODP_SUPPORT_READ);
 
 	rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
-		min_t(unsigned int, (device->attrs.max_mr / 2),
-		      rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
+		min(device->attrs.max_mr / 2,
+		    rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
 
 	rds_ibdev->max_8k_mrs = device->attrs.max_mr ?
-		min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
-		      rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size;
+		min((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE,
+		    rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size;
 
 	rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
 	rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
@@ -204,7 +204,7 @@ static int rds_ib_add_one(struct ib_device *device)
 		goto put_dev;
 	}
 
-	rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
+	rdsdebug("RDS/IB: max_mr = %u, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
 		 device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
 		 rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs);
 
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 5667f0173b47..17e587c30076 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -173,11 +173,11 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 
 	memset(conn_param, 0, sizeof(struct rdma_conn_param));
 
-	conn_param->responder_resources =
-		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
-	conn_param->initiator_depth =
-		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
-	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
+	conn_param->responder_resources = min3(rds_ibdev->max_responder_resources,
+					       max_responder_resources, U8_MAX);
+	conn_param->initiator_depth = min3(rds_ibdev->max_initiator_depth,
+					   max_initiator_depth, U8_MAX);
+	conn_param->retry_count = min(rds_ib_retry_count, 7U);
 	conn_param->rnr_retry_count = 7;
 
 	if (dp) {
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 7f79a0a2601e..b2e437afe09d 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -172,8 +172,9 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
 int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
 {
 	const struct ib_device_attr *attrs = &device->attrs;
-	int max_qp_wr, depth, delta;
 	unsigned int max_sge;
+	u32 max_qp_wr;
+	int depth, delta;
 
 	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
 	    attrs->max_fast_reg_page_list_len == 0) {
@@ -229,10 +230,10 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
 	}
 
 	max_qp_wr = attrs->max_qp_wr;
+	if (max_qp_wr < RPCRDMA_BACKWARD_WRS + 1 + RPCRDMA_MIN_SLOT_TABLE)
+		return -ENOMEM;
 	max_qp_wr -= RPCRDMA_BACKWARD_WRS;
 	max_qp_wr -= 1;
-	if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
-		return -ENOMEM;
 	if (ep->re_max_requests > max_qp_wr)
 		ep->re_max_requests = max_qp_wr;
 	ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index f18bc60d9f4f..c768cda2e544 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -544,8 +544,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
 	memset(&conn_param, 0, sizeof conn_param);
 	conn_param.responder_resources = 0;
-	conn_param.initiator_depth = min_t(int, newxprt->sc_ord,
-					   dev->attrs.max_qp_init_rd_atom);
+	conn_param.initiator_depth = min(newxprt->sc_ord, dev->attrs.max_qp_init_rd_atom);
 	if (!conn_param.initiator_depth) {
 		ret = -EINVAL;
 		trace_svcrdma_initdepth_err(newxprt, ret);
@@ -570,7 +569,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		dprintk("    local address   : %pIS:%u\n", sap, rpc_get_port(sap));
 		sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
 		dprintk("    remote address  : %pIS:%u\n", sap, rpc_get_port(sap));
-		dprintk("    max_sge         : %d\n", newxprt->sc_max_send_sges);
+		dprintk("    max_sge         : %u\n", newxprt->sc_max_send_sges);
 		dprintk("    sq_depth        : %d\n", newxprt->sc_sq_depth);
 		dprintk("    rdma_rw_ctxs    : %d\n", ctxts);
 		dprintk("    max_requests    : %d\n", newxprt->sc_max_requests);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index aecf9c0a153f..8ed9da6d2d2f 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -453,7 +453,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 	/* Client offers RDMA Read but does not initiate */
 	ep->re_remote_cma.initiator_depth = 0;
 	ep->re_remote_cma.responder_resources =
-		min_t(int, U8_MAX, device->attrs.max_qp_rd_atom);
+		min(device->attrs.max_qp_rd_atom, U8_MAX);
 
 	/* Limit transport retries so client can detect server
 	 * GID changes quickly. RPC layer handles re-establishing
-- 
2.34.1


^ permalink raw reply related

* Re: [RFC] Enabling CONFIG_NTP_PPS for NOHZ by adding ntp_error to system_time_snapshot
From: Thomas Gleixner @ 2026-06-19 20:21 UTC (permalink / raw)
  To: David Woodhouse, John Stultz, Stephen Boyd, Miroslav Lichvar,
	Richard Cochran, linux-kernel, netdev
  Cc: Rodolfo Giometti, Alexander Gordeev
In-Reply-To: <02564e5f0b6be4aeb6198af87b46269963985768.camel@infradead.org>

On Fri, Jun 19 2026 at 16:34, David Woodhouse wrote:
> On Fri, 2026-06-19 at 15:34 +0200, Thomas Gleixner wrote:
>> 
>> This formatting makes my brain hurt. Can you please split that out into
>> a separate function?
>
> Yep. There's also a potential error there — an *additional* discrepancy
> comes from the enforced monotonicity that timekeeping_cycles_to_ns()
> applies (the case where it just returns tkr->xtime_nsec >> tkr_shift).
>
> I couldn't work out if I cared about the clocksource-is-non-monotonic
> case, and even if I did, what I should do about it.

I think the right thing is just to ignore it.

The problem is very narrow and mostly related to the historically badly
synchronized TSC between sockets. The TSC_ADJUST fixup is obviously
error prone as it adjusts only to the point where the error is no
longer observable. But in the update transition phase it can result in
time going backwards because the readout on the other CPU is slightly
behind tk::tkr_mono::cycles_last. That happens only once in a while and
we talk about a very low single digit number of TSC cycles.

> I also wasn't sure if this should be a new CLOCK_REALTIME_NONMONOTONIC
> or something like that, such that e.g. PTP clients could *ask* for it.

Hell no!

> It's all very well hard-coding it in pps_get_ts() and unconditionally
> changing the behaviour... I *think* we could justify that. But the
> example I actually used in the patch was PTP, and that's slightly
> harder to justify the behavioural change.

Just leave it alone.

If the TSCs between sockets are slightly out of [mostly unobservable]
sync then if you don't hit this corner case at the edge of the update
then you have to live with that discrepancy anyway as you don't know
about it at all. So making a magic extra case for this unlikely event is
overkill. Due to speculation, caches etc. pp the snapshot is anyway in
that low single digit TSC cycles margin of inaccuracy.

Don't try to defeat reality and the underlying physics. Perfect is the
enemy of good.

Thanks,

        tglx

^ permalink raw reply

* [PATCH 2/2] selftests/bpf: validate rx_queue_index in xdp_metadata
From: Siddharth_Cibi @ 2026-06-19 19:57 UTC (permalink / raw)
  To: ast
  Cc: Siddharth_Cibi, Daniel Borkmann, David S. Miller, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend, Stanislav Fomichev,
	Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Martin KaFai Lau, Song Liu, Yonghong Song, Jiri Olsa,
	Emil Tsalapatis, Shuah Khan, open list:XDP (eXpress Data Path),
	open list:XDP (eXpress Data Path),
	open list:KERNEL SELFTEST FRAMEWORK, open list
In-Reply-To: <20260619195759.41254-1-siddharthcibi@icloud.com>

Extend xdp_metadata selftest coverage to validate that
ctx->rx_queue_index is preserved and observable after XDP redirect
execution.

Capture rx_queue_index in metadata and assert that it matches the
expected queue during packet verification.

Signed-off-by: Siddharth_Cibi <siddharthcibi@icloud.com>
---
 tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 3 ++-
 tools/testing/selftests/bpf/progs/xdp_metadata.c      | 2 +-
 tools/testing/selftests/bpf/xdp_metadata.h            | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
index 5c31054ad4a4..f8cabbbe7bb7 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -309,7 +309,8 @@ static int verify_xsk_metadata(struct xsk *xsk, bool sent_from_af_xdp)
 
 	if (!ASSERT_NEQ(meta->rx_hash, 0, "rx_hash"))
 		return -1;
-
+	if (!ASSERT_EQ(meta->rx_queue_index, QUEUE_ID, "rx_queue_index"))
+        	return -1;
 	if (!sent_from_af_xdp) {
 		if (!ASSERT_NEQ(meta->rx_hash_type & XDP_RSS_TYPE_L4, 0, "rx_hash_type"))
 			return -1;
diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata.c b/tools/testing/selftests/bpf/progs/xdp_metadata.c
index 09bb8a038d52..62ae83860d7f 100644
--- a/tools/testing/selftests/bpf/progs/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/progs/xdp_metadata.c
@@ -98,7 +98,7 @@ int rx(struct xdp_md *ctx)
 	bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash, &meta->rx_hash_type);
 	bpf_xdp_metadata_rx_vlan_tag(ctx, &meta->rx_vlan_proto,
 				     &meta->rx_vlan_tci);
-
+	meta->rx_queue_index = ctx->rx_queue_index;
 	return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);
 }
 
diff --git a/tools/testing/selftests/bpf/xdp_metadata.h b/tools/testing/selftests/bpf/xdp_metadata.h
index 87318ad1117a..1f0ae4c00091 100644
--- a/tools/testing/selftests/bpf/xdp_metadata.h
+++ b/tools/testing/selftests/bpf/xdp_metadata.h
@@ -49,4 +49,5 @@ struct xdp_meta {
 		__s32 rx_vlan_tag_err;
 	};
 	enum xdp_meta_field hint_valid;
+	__u32 rx_queue_index;
 };
-- 
2.53.0


^ permalink raw reply related

* [PATCH 1/2] bpf: preserve rx_queue_index across XDP redirects
From: Siddharth_Cibi @ 2026-06-19 19:57 UTC (permalink / raw)
  To: ast
  Cc: Siddharth C, Daniel Borkmann, David S. Miller, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend, Stanislav Fomichev,
	Eric Dumazet, Paolo Abeni, Simon Horman, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Martin KaFai Lau,
	Song Liu, Yonghong Song, Jiri Olsa, Emil Tsalapatis,
	open list:XDP (eXpress Data Path),
	open list:XDP (eXpress Data Path), open list
In-Reply-To: <20260619195759.41254-1-siddharthcibi@icloud.com>

From: Siddharth C <siddharthcibi@icloud.com>

Store rx_queue_index in struct xdp_frame during xdp_buff to
xdp_frame conversion and restore it when rebuilding xdp_rxq_info
for cpumap and devmap execution paths.

This preserves ingress RX queue information for XDP programs
executed after redirect, allowing access to the original
rx_queue_index instead of losing queue context.

Also propagate rx_queue_index for zero-copy XDP frame conversion.

Signed-off-by: Siddharth_Cibi <siddharthcibi@icloud.com>
---
 include/net/xdp.h   | 2 ++
 kernel/bpf/cpumap.c | 2 +-
 kernel/bpf/devmap.c | 5 ++++-
 net/core/xdp.c      | 1 +
 4 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index aa742f413c35..90318b2b76dc 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -301,6 +301,7 @@ struct xdp_frame {
 	 */
 	enum xdp_mem_type mem_type:32;
 	struct net_device *dev_rx; /* used by cpumap */
+	u32 rx_queue_index;
 	u32 frame_sz;
 	u32 flags; /* supported values defined in xdp_buff_flags */
 };
@@ -441,6 +442,7 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
 
 	/* rxq only valid until napi_schedule ends, convert to xdp_mem_type */
 	xdp_frame->mem_type = xdp->rxq->mem.type;
+	xdp_frame->rx_queue_index = xdp->rxq->queue_index;
 
 	return xdp_frame;
 }
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5e59ab896f05..8f2d7013620f 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -197,7 +197,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 
 		rxq.dev = xdpf->dev_rx;
 		rxq.mem.type = xdpf->mem_type;
-		/* TODO: report queue_index to xdp_rxq_info */
+		rxq.queue_index = xdpf->rx_queue_index;
 
 		xdp_convert_frame_to_buff(xdpf, &xdp);
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index dc7b859e8bbf..f419fa0e53e5 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -339,7 +339,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
 				struct net_device *rx_dev)
 {
 	struct xdp_txq_info txq = { .dev = tx_dev };
-	struct xdp_rxq_info rxq = { .dev = rx_dev };
+	struct xdp_rxq_info rxq = { };
 	struct xdp_buff xdp;
 	int i, nframes = 0;
 
@@ -349,6 +349,9 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
 		int err;
 
 		xdp_convert_frame_to_buff(xdpf, &xdp);
+		rxq.dev = rx_dev;
+		rxq.mem.type = xdpf->mem_type;
+		rxq.queue_index = xdpf->rx_queue_index;
 		xdp.txq = &txq;
 		xdp.rxq = &rxq;
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 9890a30584ba..9691d8dfadf3 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -606,6 +606,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
 	xdpf->metasize = metasize;
 	xdpf->frame_sz = PAGE_SIZE;
 	xdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
+	xdpf->rx_queue_index = xdp->rxq->queue_index;
 
 	xsk_buff_free(xdp);
 	return xdpf;
-- 
2.53.0


^ permalink raw reply related

* (no subject)
From: Siddharth_Cibi @ 2026-06-19 19:57 UTC (permalink / raw)
  To: ast
  Cc: Siddharth_Cibi, Daniel Borkmann, David S. Miller, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend, Stanislav Fomichev,
	open list:XDP (eXpress Data Path):Keyword:(?:b|_)xdp(?:b|_),
	open list:XDP (eXpress Data Path):Keyword:(?:b|_)xdp(?:b|_)

Subject: [PATCH 0/2] preserve rx_queue_index across XDP redirects

XDP programs executed after redirect through cpumap and devmap
currently lose ingress RX queue information because rx_queue_index
is not preserved across xdp_buff to xdp_frame conversion.

Preserve rx_queue_index in struct xdp_frame and restore it when
rebuilding xdp_rxq_info for redirected execution paths.

Add a selftest validating that ctx->rx_queue_index remains available
through xdp_metadata after redirect.

Testing:

* Built modified kernel objects
* Ran tools/testing/selftests/bpf/test_progs -t xdp_metadata -v
* Verified xdp_metadata passes
* Added explicit rx_queue_index assertion

Siddharth C (1):
  bpf: preserve rx_queue_index across XDP redirects

Siddharth_Cibi (1):
  selftests/bpf: validate rx_queue_index in xdp_metadata

 include/net/xdp.h                                     | 2 ++
 kernel/bpf/cpumap.c                                   | 2 +-
 kernel/bpf/devmap.c                                   | 5 ++++-
 net/core/xdp.c                                        | 1 +
 tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 3 ++-
 tools/testing/selftests/bpf/progs/xdp_metadata.c      | 2 +-
 tools/testing/selftests/bpf/xdp_metadata.h            | 1 +
 7 files changed, 12 insertions(+), 4 deletions(-)

-- 
2.53.0


^ permalink raw reply

* Re: [PATCH net] ipv6: ioam: fix type confusion of dst_entry
From: Justin Iurman @ 2026-06-19 19:42 UTC (permalink / raw)
  To: Jiayuan Chen, netdev
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, linux-kernel
In-Reply-To: <20260618104336.48934-1-jiayuan.chen@linux.dev>

On 6/18/26 12:43, Jiayuan Chen wrote:
> IOAM uses a dummy dst_entry(null_dst) to mark that the destination should
> not be changed after the transformation. This dst is stored in the IOAM lwt
> state and may be passed to dst_cache_set_ip6().
> 
> However, the IPv6 dst cache path eventually calls rt6_get_cookie(), which
> treats the dst_entry as part of a struct rt6_info. Since the null_dst was
> embedded directly as a struct dst_entry in struct ioam6_lwt, this resulted
> in an invalid cast and rt6_get_cookie() reading fields from the wrong
> object.
> 
> In practice, the wrong cookie is not used while dst->obsolete is zero, but
> rt6_get_cookie() may also access per-cpu value when rt->sernum is
> zero. In this case, rt->sernum aliases ioam6_lwt::cache::reset_ts, which
> can become zero, making this a potential invalid pointer access.
> 
> Fix this by embedding a full struct rt6_info for the dummy IPv6 route and
> passing its dst member to the dst APIs.

Good catch, thanks!

> Fixes: 47ce7c854563 ("net: ipv6: ioam6: fix double reallocation")
> Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>

Reviewed-by: Justin Iurman <justin.iurman@gmail.com>

^ permalink raw reply

* Re: [PATCH rdma-next v7] RDMA: Change capability fields in ib_device_attr from int to u32
From: Erni Sri Satya Vennela @ 2026-06-19 19:32 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: mkalderon, Jason Gunthorpe, Leon Romanovsky, zyjzyj2000, sagi,
	mgurtovoy, haris.iqbal, jinpu.wang, bvanassche, kbusch,
	Jens Axboe, Christoph Hellwig, kch, smfrench, linkinjeon, metze,
	tom, trondmy, anna, chuck.lever, jlayton, neil, okorniev, Dai.Ngo,
	achender, davem, edumazet, kuba, pabeni, horms, kees, ebadger,
	linux-rdma, linux-kernel, target-devel, linux-nvme, linux-cifs,
	samba-technical, linux-nfs, netdev, rds-devel, Jason Gunthorpe
In-Reply-To: <aigwONAwxQx6rLef@ashevche-desk.local>

Hi Andy,

Sorry for the delayed response.

> >  	attr->max_qp_init_rd_atom =
> >  	    1 << (fls(qattr->max_qp_req_rd_atomic_resc) - 1);
> 
> FWIW, this one and below looks like reinvention of rounddown_pow_of_two().

Acked.
> 
> >  	attr->max_qp_rd_atom =
> > -	    min(1 << (fls(qattr->max_qp_resp_rd_atomic_resc) - 1),
> > +	    min(1U << (fls(qattr->max_qp_resp_rd_atomic_resc) - 1),
> >  		attr->max_qp_init_rd_atom);
> 
> ...
> 
> >  int ipoib_cm_dev_init(struct net_device *dev)
> >  {
> >  	struct ipoib_dev_priv *priv = ipoib_priv(dev);
> > -	int max_srq_sge, i;
> > +	int i;
> > +	u32 max_srq_sge;
> >  	u8 addr;
> 
> It seems the order is reversed xmas tree, why not preserving it?
> 
Right. I'll fix it in the next version.
> ...
> 
> > --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c
> > +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
> 
> >  		max_send_wr =
> > -			min_t(int, wr_limit, SERVICE_CON_QUEUE_DEPTH * 2 + 2);
> > +			min(wr_limit, SERVICE_CON_QUEUE_DEPTH * 2 + 2);
> 
> Now perfectly a single line
> 
> 		max_send_wr = min(wr_limit, SERVICE_CON_QUEUE_DEPTH * 2 + 2);
> 
> >  		max_recv_wr = max_send_wr;
> 
> ...
> 
> > -		max_send_wr = min_t(int, wr_limit,
> > -			      /* QD * (REQ + RSP + FR REGS or INVS) + drain */
> > -			      clt_path->queue_depth * 4 + 1);
> > -		max_recv_wr = min_t(int, wr_limit,
> > -			      clt_path->queue_depth * 3 + 1);
> > +		max_send_wr = min_t(u32, wr_limit,
> > +				    /* QD * (REQ + RSP + FR REGS or INVS) + drain */
> > +				    clt_path->queue_depth * 4 + 1);
> > +		max_recv_wr = min_t(u32, wr_limit,
> > +				    clt_path->queue_depth * 3 + 1);
> 
> Can we rather update the type of one of them and use min() instead?
> 
I'll remove all the min_t usages in the next version.
> ...
> 
> > --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
> > +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
> 
> Ditto.
> 
> ...
> 
> > -static int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE;
> > -module_param(srpt_srq_size, int, 0444);
> > +static unsigned int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE;
> > +module_param(srpt_srq_size, uint, 0444);
> 
> Theoretically this might break ABI (if somebody uses negative values for
> > anything). I don't think it's the case, but just be informed.
> 
Okay. Thank you for the information.

> >  MODULE_PARM_DESC(srpt_srq_size,
> >  		 "Shared receive queue (SRQ) size.");
> 
> ...
> 
> > --- a/drivers/nvme/target/rdma.c
> > +++ b/drivers/nvme/target/rdma.c
> 
> > -	ndev->srq_size = min(ndev->device->attrs.max_srq_wr,
> > -			     nvmet_rdma_srq_size);
> > -	ndev->srq_count = min(ndev->device->num_comp_vectors,
> > -			      ndev->device->attrs.max_srq);
> > +	ndev->srq_size = min_t(u32, ndev->device->attrs.max_srq_wr,
> > +			       nvmet_rdma_srq_size);
> > +	ndev->srq_count = min_t(u32, ndev->device->num_comp_vectors,
> > +				ndev->device->attrs.max_srq);
> 
> > Same question, can we change the type of the variables instead?
>
Yes. I'll be doing it in the next version.
 
> >  	mutex_lock(&device_list_mutex);
> 
> ...
> 
> >  	inline_page_count = num_pages(nport->inline_data_size);
> >  	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
> > -				cm_id->device->attrs.max_recv_sge) - 1;
> > +				cm_id->device->attrs.max_recv_sge);
> > +	inline_sge_count = inline_sge_count ? inline_sge_count - 1 : 0;
> 
> Simple conditional might be better
> 
> 	if (inline_sge_count)
> 		inline_sge_count--;
> 	OR
> 		inline_sge_count -= 1;
Okay. I'll update all such instances.

> 
> ...
> 
> > +++ b/include/rdma/ib_verbs.h
> 
> > -	int			max_qp;
> > -	int			max_qp_wr;
> > +	u32			max_qp;
> > +	u32			max_qp_wr;
> 
> > Nice, but please check that none of these (and beyond) were used in signed
> > multiplication or (which is more disastrous) division. Otherwise there might be
> > subtle issues that will be hard to debug.
Yes I have checked that for all the variables I updated.

> 
> ...
> 
> >  	conn_param->responder_resources =
> > -		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
> > +		min3(rds_ibdev->max_responder_resources,
> > +		     max_responder_resources, U8_MAX);
> >  	conn_param->initiator_depth =
> > -		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
> > +		min3(rds_ibdev->max_initiator_depth,
> > +		     max_initiator_depth, U8_MAX);
> 
> I believe we can go a few characters over and leave them to be single lines.
> 
Okay.

> >  	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
> 
> What about this one?
Sorry. I missed this one, I'll update it.

> 
> >  	conn_param->rnr_retry_count = 7;
> 
> ...
> 
> >  int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
> >  {
> >  	const struct ib_device_attr *attrs = &device->attrs;
> > -	int max_qp_wr, depth, delta;
> > +	u32 max_qp_wr;
> > +	int depth, delta;
> >  	unsigned int max_sge;
> 
> Reversed xmas tree order.
Okay

Thank you for all your suggestions.
The next version will incorporate all these changes.

- Vennela
> 
> -- 
> With Best Regards,
> Andy Shevchenko
> 

^ permalink raw reply

* [PATCH net v2] eth: bnxt: improve the timing of stats
From: Jakub Kicinski @ 2026-06-19 19:15 UTC (permalink / raw)
  To: davem
  Cc: netdev, edumazet, pabeni, andrew+netdev, horms, Jakub Kicinski,
	michael.chan, pavan.chebbi

Kernel selftests wait 1.25x of the promised stats refresh time
(as read from ethtool -c). bnxt reports 1sec by default, but
the stats update process has two steps. First device DMAs the
new values, then the service task performs update in full-width
SW counters. So the worst case delay is actually 2x.

Note that the behavior is different for ring stats and port stats.
Port stats are fetched synchronously by the service worker, so
there's no risk of doubling up the delay there.

The problem of stale stats impacts not only tests but real workloads
which monitor egress bandwidth of a NIC. The inaccuracy causes double
counting in the next cycle and spurious overload alarms.

Try to read from the DMA buffer more aggressively, to mitigate
timing issues between DMA and service task. The SW update should
be cheap.

Fixes: 51f307856b60 ("bnxt_en: Allow statistics DMA to be configurable using ethtool -C.")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
CC: michael.chan@broadcom.com
CC: pavan.chebbi@broadcom.com

v2:
 - split the accumulate into port and ring
 - make the sync only cover rings
 - remove sync from callbacks which use port stats (which are fetched
   synchronously by the service worker)
v1: https://lore.kernel.org/20260618181358.3037661-1-kuba@kernel.org

With this patch I had 50 clean runs of ntuple.py in a row.
Previously it'd fail within 5 runs at most.

Hopefully this is good enough, in the past I sent an RFC to
convert the driver to use SW stats for everything. That felt
a little drastic.
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  5 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     | 48 ++++++++++++++++++-
 .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  1 +
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 6d312259f852..6335dfc14c98 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -2620,6 +2620,10 @@ struct bnxt {
 #define BNXT_MIN_STATS_COAL_TICKS	  250000
 #define BNXT_MAX_STATS_COAL_TICKS	 1000000
 
+	/* Protects stats_updated_jiffies and writes to sw_stats */
+	spinlock_t		stats_lock;
+	unsigned long		stats_updated_jiffies;
+
 	struct work_struct	sp_task;
 	unsigned long		sp_event;
 #define BNXT_RX_NTP_FLTR_SP_EVENT	1
@@ -3027,6 +3031,7 @@ void bnxt_reenable_sriov(struct bnxt *bp);
 void bnxt_close_nic(struct bnxt *, bool, bool);
 void bnxt_get_ring_drv_stats(struct bnxt *bp,
 			     struct bnxt_total_ring_drv_stats *stats);
+void bnxt_sync_ring_stats(struct bnxt *bp);
 bool bnxt_rfs_capable(struct bnxt *bp, bool new_rss_ctx);
 int bnxt_dbg_hwrm_rd_reg(struct bnxt *bp, u32 reg_off, u16 num_words,
 			 u32 *reg_buf);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 055e93a417b6..7513618793da 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -10530,7 +10530,7 @@ static void bnxt_accumulate_stats(struct bnxt_stats_mem *stats)
 				stats->hw_masks, stats->len / 8, false);
 }
 
-static void bnxt_accumulate_all_stats(struct bnxt *bp)
+static void bnxt_accumulate_ring_stats(struct bnxt *bp)
 {
 	struct bnxt_stats_mem *ring0_stats;
 	bool ignore_zero = false;
@@ -10553,6 +10553,10 @@ static void bnxt_accumulate_all_stats(struct bnxt *bp)
 					ring0_stats->hw_masks,
 					ring0_stats->len / 8, ignore_zero);
 	}
+}
+
+static void bnxt_accumulate_port_stats(struct bnxt *bp)
+{
 	if (bp->flags & BNXT_FLAG_PORT_STATS) {
 		struct bnxt_stats_mem *stats = &bp->port_stats;
 		__le64 *hw_stats = stats->hw_stats;
@@ -10575,6 +10579,41 @@ static void bnxt_accumulate_all_stats(struct bnxt *bp)
 	}
 }
 
+static void bnxt_accumulate_all_stats(struct bnxt *bp)
+{
+	bnxt_accumulate_ring_stats(bp);
+	bnxt_accumulate_port_stats(bp);
+}
+
+/* Re-accumulate ring stats from DMA buffers if stale.
+ * uAPIs for reading sw_stats should call this first.
+ *
+ * We promise user space update frequency of bp->stats_coal_ticks but
+ * the update is a two step process - first device updates the DMA buffer,
+ * then we have to update from that buffer to driver stats in the service work.
+ * Worst case we would be 2x off from the desired frequency.
+ * Sync the stats sooner, if stale. The 20% threshold was chosen arbitrarily.
+ *
+ * Ideally we would split the user-configured time into two portions,
+ * i.e. also lower the DMA period by the 20%. But the DMA timer seems to have
+ * too coarse granularity to play such tricks.
+ */
+void bnxt_sync_ring_stats(struct bnxt *bp)
+{
+	unsigned long stale;
+
+	if (!netif_running(bp->dev) || !bp->stats_coal_ticks)
+		return;
+
+	spin_lock(&bp->stats_lock);
+	stale = usecs_to_jiffies(bp->stats_coal_ticks / 5);
+	if (time_after_eq(jiffies, bp->stats_updated_jiffies + stale)) {
+		bnxt_accumulate_ring_stats(bp);
+		bp->stats_updated_jiffies = jiffies;
+	}
+	spin_unlock(&bp->stats_lock);
+}
+
 static int bnxt_hwrm_port_qstats(struct bnxt *bp, u8 flags)
 {
 	struct hwrm_port_qstats_input *req;
@@ -13577,6 +13616,7 @@ bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		return;
 	}
 
+	bnxt_sync_ring_stats(bp);
 	bnxt_get_ring_stats(bp, stats);
 	bnxt_add_prev_stats(bp, stats);
 
@@ -14753,7 +14793,10 @@ static void bnxt_sp_task(struct work_struct *work)
 	if (test_and_clear_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event)) {
 		bnxt_hwrm_port_qstats(bp, 0);
 		bnxt_hwrm_port_qstats_ext(bp, 0);
+		spin_lock(&bp->stats_lock);
 		bnxt_accumulate_all_stats(bp);
+		bp->stats_updated_jiffies = jiffies;
+		spin_unlock(&bp->stats_lock);
 	}
 
 	if (test_and_clear_bit(BNXT_LINK_CHNG_SP_EVENT, &bp->sp_event)) {
@@ -15488,6 +15531,7 @@ static int bnxt_init_board(struct pci_dev *pdev, struct net_device *dev)
 	INIT_DELAYED_WORK(&bp->fw_reset_task, bnxt_fw_reset_task);
 
 	spin_lock_init(&bp->ntp_fltr_lock);
+	spin_lock_init(&bp->stats_lock);
 #if BITS_PER_LONG == 32
 	spin_lock_init(&bp->db_lock);
 #endif
@@ -16056,6 +16100,7 @@ static void bnxt_get_queue_stats_rx(struct net_device *dev, int i,
 	if (!bp->bnapi)
 		return;
 
+	bnxt_sync_ring_stats(bp);
 	cpr = &bp->bnapi[i]->cp_ring;
 	sw = cpr->stats.sw_stats;
 
@@ -16084,6 +16129,7 @@ static void bnxt_get_queue_stats_tx(struct net_device *dev, int i,
 	if (!bp->tx_ring)
 		return;
 
+	bnxt_sync_ring_stats(bp);
 	bnapi = bp->tx_ring[bp->tx_ring_map[i]].bnapi;
 	sw = bnapi->cp_ring.stats.sw_stats;
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 56d74a3c24b7..62bc9cae613c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -606,6 +606,7 @@ static void bnxt_get_ethtool_stats(struct net_device *dev,
 		goto skip_ring_stats;
 	}
 
+	bnxt_sync_ring_stats(bp);
 	tpa_stats = bnxt_get_num_tpa_ring_stats(bp);
 	for (i = 0; i < bp->cp_nr_rings; i++) {
 		struct bnxt_napi *bnapi = bp->bnapi[i];
-- 
2.54.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox