All of lore.kernel.org
 help / color / mirror / Atom feed
From: Zhu Yanjun <yanjun.zhu@linux.dev>
To: zhenwei pi <zhenwei.pi@linux.dev>,
	linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
	"yanjun.zhu@linux.dev" <yanjun.zhu@linux.dev>
Cc: zyjzyj2000@gmail.com, jgg@ziepe.ca, leon@kernel.org
Subject: Re: [PATCH v2 3/4] RDMA/rxe: support perf mgmt GET method
Date: Sat, 28 Mar 2026 21:54:21 -0700	[thread overview]
Message-ID: <bd9d513d-3d75-4bff-b759-a29669e6651a@linux.dev> (raw)
In-Reply-To: <20260329025552.122946-4-zhenwei.pi@linux.dev>

在 2026/3/28 19:55, zhenwei pi 写道:
> In RXE, hardware counters are already supported, but not in a
> standardized manner. For instance, user-space monitoring tools like
> atop only read from the *counters* directory. Therefore, it is
> necessary to add perf management support to RXE.
> 
> Signed-off-by: zhenwei pi <zhenwei.pi@linux.dev>
> ---
>   drivers/infiniband/sw/rxe/Makefile    |  1 +
>   drivers/infiniband/sw/rxe/rxe_loc.h   |  6 ++
>   drivers/infiniband/sw/rxe/rxe_mad.c   | 93 +++++++++++++++++++++++++++
>   drivers/infiniband/sw/rxe/rxe_verbs.c |  1 +
>   drivers/infiniband/sw/rxe/rxe_verbs.h |  5 ++
>   5 files changed, 106 insertions(+)
>   create mode 100644 drivers/infiniband/sw/rxe/rxe_mad.c
> 
> diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile
> index 93134f1d1d0c..3c47e5b982c2 100644
> --- a/drivers/infiniband/sw/rxe/Makefile
> +++ b/drivers/infiniband/sw/rxe/Makefile
> @@ -22,6 +22,7 @@ rdma_rxe-y := \
>   	rxe_mcast.o \
>   	rxe_task.o \
>   	rxe_net.o \
> +	rxe_mad.o \
>   	rxe_hw_counters.o
>   
>   rdma_rxe-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += rxe_odp.o
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index 7992290886e1..a8ce85147c1f 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -245,4 +245,10 @@ static inline int rxe_ib_advise_mr(struct ib_pd *pd,
>   
>   #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>   
> +/* rxe-mad.c */
> +int rxe_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
> +		    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
> +		    const struct ib_mad *in, struct ib_mad *out,
> +		    size_t *out_mad_size, u16 *out_mad_pkey_index);
> +
>   #endif /* RXE_LOC_H */
> diff --git a/drivers/infiniband/sw/rxe/rxe_mad.c b/drivers/infiniband/sw/rxe/rxe_mad.c
> new file mode 100644
> index 000000000000..5e0567806c02
> --- /dev/null
> +++ b/drivers/infiniband/sw/rxe/rxe_mad.c
> @@ -0,0 +1,93 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/*
> + * Copyright (c) 2026 zhenwei pi <zhenwei.pi@linux.dev>
> + */
> +
> +#include <rdma/ib_pma.h>
> +#include "rxe.h"
> +#include "rxe_hw_counters.h"
> +
> +static int rxe_get_pma_info(struct ib_mad *out)
> +{
> +	struct ib_class_port_info cpi = {};
> +
> +	cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
> +	memcpy((out->data + 40), &cpi, sizeof(cpi));
> +
> +	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
> +}
> +
> +static int rxe_get_pma_counters(struct rxe_dev *rxe, struct ib_mad *out)
> +{
> +	struct ib_pma_portcounters *pma_cnt = (struct ib_pma_portcounters *)(out->data + 40);
> +	s64 val;
> +
> +	/* IBA release 1.8, 16.1.3.5: During operation, instead of overflowing, they shall stop
> +	 * at all ones.
> +	 */
> +	val = rxe_counter_get(rxe, RXE_CNT_LINK_DOWNED);
> +	if (val > U8_MAX)
> +		pma_cnt->link_downed_counter = U8_MAX;
> +	else
> +		pma_cnt->link_downed_counter = (u8)val;
> +
> +	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
> +}
> +
> +static int rxe_get_pma_counters_ext(struct rxe_dev *rxe, struct ib_mad *out)
> +{
> +	struct ib_pma_portcounters_ext *pma_cnt_ext =
> +		(struct ib_pma_portcounters_ext *)(out->data + 40);
> +
> +	pma_cnt_ext->port_xmit_data = cpu_to_be64(rxe_counter_get(rxe, RXE_CNT_SENT_BYTES) >> 2);
> +	pma_cnt_ext->port_rcv_data = cpu_to_be64(rxe_counter_get(rxe, RXE_CNT_RCVD_BYTES) >> 2);
> +	pma_cnt_ext->port_xmit_packets = cpu_to_be64(rxe_counter_get(rxe, RXE_CNT_SENT_PKTS));
> +	pma_cnt_ext->port_rcv_packets = cpu_to_be64(rxe_counter_get(rxe, RXE_CNT_RCVD_PKTS));
> +
> +	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
> +}
> +
> +static int rxe_get_perf_mgmt(struct rxe_dev *rxe, const struct ib_mad *in,
> +				 struct ib_mad *out)
> +{
> +	switch (in->mad_hdr.attr_id) {
> +	case IB_PMA_CLASS_PORT_INFO:
> +		return rxe_get_pma_info(out);
> +
> +	case IB_PMA_PORT_COUNTERS:
> +		return rxe_get_pma_counters(rxe, out);
> +
> +	case IB_PMA_PORT_COUNTERS_EXT:
> +		return rxe_get_pma_counters_ext(rxe, out);
> +
> +	default:
> +		break;
> +	}
> +
> +	return IB_MAD_RESULT_FAILURE;

In rxe_get_perf_mgmt, the default case returns IB_MAD_RESULT_FAILURE.

Returning FAILURE in the MAD stack often results in the packet being 
dropped without a response. This causes management tools (like ibstat or 
perfquery) to hang until a timeout occurs.

According to IBA specs, the agent should return a reply with a proper 
error status.

Instead, set out->mad_hdr.status = cpu_to_be16(IB_MGMT_MAD_STATUS_BAD_ATTR)
and return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY.

The code in the default branch should be:

"
default:
out->mad_hdr.status = cpu_to_be16(IB_MGMT_MAD_STATUS_BAD_ATTR);
return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
"

> +}
> +
> +int rxe_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
> +		    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
> +		    const struct ib_mad *in, struct ib_mad *out,
> +		    size_t *out_mad_size, u16 *out_mad_pkey_index)
> +{
> +	struct rxe_dev *rxe = to_rdev(ibdev);
> +	u8 mgmt_class = in->mad_hdr.mgmt_class;
> +	u8 method = in->mad_hdr.method;
> +
> +	if (port_num != RXE_PORT)
> +		return IB_MAD_RESULT_FAILURE;
> +

In the IB Management Datagram (MAD) spec, many Reserved fields or unused 
counter fields are strictly required to be zero-filled.

By using memset(out, 0, sizeof(*out)), you ensure that all fields in the 
response packet—including those not currently supported by the 
driver—have a predefined initial value. This prevents the transmission 
of packets containing "random noise" (residual kernel data) to the 
querying entity.

Add "memset(out, 0, sizeof(*out));" here.

Zhu Yanjun

> +	switch (mgmt_class) {
> +	case IB_MGMT_CLASS_PERF_MGMT:
> +		if (method == IB_MGMT_METHOD_GET)
> +			return rxe_get_perf_mgmt(rxe, in, out);
> +		break;
> +
> +	default:
> +		break;
> +	}
> +
> +	return IB_MAD_RESULT_FAILURE;
> +}
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index bcd486e8668b..7df0cb5a09a3 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1509,6 +1509,7 @@ static const struct ib_device_ops rxe_dev_ops = {
>   	.post_recv = rxe_post_recv,
>   	.post_send = rxe_post_send,
>   	.post_srq_recv = rxe_post_srq_recv,
> +	.process_mad = rxe_process_mad,
>   	.query_ah = rxe_query_ah,
>   	.query_device = rxe_query_device,
>   	.query_pkey = rxe_query_pkey,
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index 2bcfb919a40b..1c4fa8eaa733 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -466,6 +466,11 @@ static inline void rxe_counter_add(struct rxe_dev *rxe, enum rxe_counters index,
>   	atomic64_add(val, &rxe->stats_counters[index]);
>   }
>   
> +static inline s64 rxe_counter_get(struct rxe_dev *rxe, enum rxe_counters index)
> +{
> +	return atomic64_read(&rxe->stats_counters[index]);
> +}
> +
>   static inline struct rxe_dev *to_rdev(struct ib_device *dev)
>   {
>   	return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL;


  reply	other threads:[~2026-03-29  4:54 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-29  2:55 [PATCH v2 0/4] Support PERF MGMT for RXE zhenwei pi
2026-03-29  2:55 ` [PATCH v2 1/4] RDMA/rxe: use RXE_PORT instead of magic number 1 zhenwei pi
2026-03-29  2:55 ` [PATCH v2 2/4] RDMA/rxe: add SENT/RCVD bytes zhenwei pi
2026-03-29  2:55 ` [PATCH v2 3/4] RDMA/rxe: support perf mgmt GET method zhenwei pi
2026-03-29  4:54   ` Zhu Yanjun [this message]
2026-03-29  2:55 ` [PATCH v2 4/4] RDMA/rxe: use rxe_counter_get zhenwei pi
2026-03-29  4:56   ` Zhu Yanjun

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bd9d513d-3d75-4bff-b759-a29669e6651a@linux.dev \
    --to=yanjun.zhu@linux.dev \
    --cc=jgg@ziepe.ca \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=zhenwei.pi@linux.dev \
    --cc=zyjzyj2000@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.