All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jason Gunthorpe <jgg@nvidia.com>
To: Leon Romanovsky <leon@kernel.org>
Cc: Or Har-Toov <ohartoov@nvidia.com>,
	linux-rdma@vger.kernel.org, Maher Sanalla <msanalla@nvidia.com>
Subject: Re: [PATCH rdma-next 1/3] IB/mad: Replace MAD's refcount with a state machine
Date: Tue, 10 Dec 2024 12:12:41 -0400	[thread overview]
Message-ID: <20241210161241.GJ2347147@nvidia.com> (raw)
In-Reply-To: <b6b991b75a7d8cce465a6c917ef0db1c264165bd.1733233636.git.leonro@nvidia.com>

On Tue, Dec 03, 2024 at 03:52:21PM +0200, Leon Romanovsky wrote:

> Thus, replace with a state machine as the following:
>  * IB_MAD_STATE_SEND_START - MAD was sent to the QP and is waiting
>    for completion notification
>  * IB_MAD_STATE_WAIT_RESP - MAD send completed successfully, waiting
>    for a response
>  * IB_MAD_STATE_EARLY_RESP - Response came early, before send
>    completion notification
>  * IB_MAD_STATE_DONE - MAD processing completed

This is a good idea, that refcounting stuff is totally inscrutable.

> @@ -1773,9 +1772,13 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
>  void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
>  {
>  	mad_send_wr->timeout = 0;
> -	if (mad_send_wr->refcount == 1)
> +	if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) {
> +		mad_send_wr->state = IB_MAD_STATE_DONE;
>  		list_move_tail(&mad_send_wr->agent_list,
>  			      &mad_send_wr->mad_agent_priv->done_list);
> +	} else {
> +		mad_send_wr->state = IB_MAD_STATE_EARLY_RESP;

Let's be even more explicit on all these transitions:

      	      WARN_ON(mad_send_wr->state != IB_MAD_STATE_SEND_START);

Maybe with some macro compiled out unless lockdep is on or
something. But there are clear rules here how the FSM should work,
lets document and check them.

Let me try to guess what they should be at each point..

>  static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
> @@ -2195,6 +2198,7 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
>  		list_item = &mad_agent_priv->wait_list;
>  	}
>  
> +	mad_send_wr->state = IB_MAD_STATE_WAIT_RESP;
>  	list_add(&mad_send_wr->agent_list, list_item);

      	WARN_ON(mad_send_wr->state != IB_MAD_STATE_SEND_START);

> @@ -2222,6 +2226,11 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
>  
>  	mad_agent_priv = mad_send_wr->mad_agent_priv;
>  	spin_lock_irqsave(&mad_agent_priv->lock, flags);
> +	if (mad_send_wr->state == IB_MAD_STATE_EARLY_RESP) {
> +		mad_send_wr->state = IB_MAD_STATE_DONE;
> +		goto done;
> +	}

     else
      	      WARN_ON(mad_send_wr->state != IB_MAD_STATE_SEND_START);


This flow doesn't seem to touch the agent_list? If we got to
EARLY_RESP then mad_done didn't remove it from the agent list and we
goto done which skips it? Seems like a bug?

> @@ -2232,14 +2241,10 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
>  	if (mad_send_wc->status != IB_WC_SUCCESS &&
>  	    mad_send_wr->status == IB_WC_SUCCESS) {
>  		mad_send_wr->status = mad_send_wc->status;

mad_send_wr->state = IB_MAD_STATE_DONE 

??

> -		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
> -	}
> -
> -	if (--mad_send_wr->refcount > 0) {
> -		if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
> -		    mad_send_wr->status == IB_WC_SUCCESS) {
> -			wait_for_response(mad_send_wr);
> -		}
> +	} else if (mad_send_wr->status == IB_WC_SUCCESS &&
> +		   mad_send_wr->timeout &&
> +		   mad_send_wr->state == IB_MAD_STATE_SEND_START) {
> +		wait_for_response(mad_send_wr);
>  		goto done;
>  	}

This sequence looks a little off, if we got a EARLY_RESP then we skip
everything else. It is possible for the send WC to indicate fail even
if we got a matching response and all that checking got skipped.

Maybe like this:

	if (mad_send_wc->status != IB_WC_SUCCESS &&
	    mad_send_wr->status == IB_WC_SUCCESS) {
		mad_send_wr->status = mad_send_wc->status;
		WARN_ON(mad_send_wr->state != IB_MAD_STATE_EARLY_RESP &&
			mad_send_wr->state != IB_MAD_STATE_SEND_START);
	} else if (mad_send_wr->status == IB_WC_SUCCESS &&
		   mad_send_wr->timeout) {
		if (mad_send_wr->state == IB_MAD_STATE_SEND_START) {
			wait_for_response(mad_send_wr);
			goto done;
		}
		WARN_ON(mad_send_wr->state == IB_MAD_STATE_EARLY_RESP);
	}

	/* Remove send from MAD agent and notify client of completion */
	mad_send_wr->state = IB_MAD_STATE_DONE;
	list_del(&mad_send_wr->agent_list);

 
> @@ -2407,12 +2412,9 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
>  
>  	spin_lock_irqsave(&mad_agent_priv->lock, flags);
>  	list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
> -				 &mad_agent_priv->send_list, agent_list) {
> -		if (mad_send_wr->status == IB_WC_SUCCESS) {
> +				 &mad_agent_priv->send_list, agent_list)
> +		if (mad_send_wr->status == IB_WC_SUCCESS)
>  			mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
> -			mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
> -		}

That status check feels like it should be a state check? Should it be
a new state instead of storing status?

What states are valid here anyhow? IB_MAD_STATE_SEND_START and
IB_MAD_STATE_EARLY_RESP I guess?

> @@ -2459,7 +2461,6 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
>  	struct ib_mad_agent_private *mad_agent_priv;
>  	struct ib_mad_send_wr_private *mad_send_wr;
>  	unsigned long flags;
> -	int active;
>  
>  	if (!send_buf)
>  		return -EINVAL;
> @@ -2473,14 +2474,11 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
>  		return -EINVAL;
>  	}
>  
> -	active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
> -	if (!timeout_ms) {
> +	if (!timeout_ms)
>  		mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
> -		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
> -	}

Here again, should FLUSH_ERR be its own state? This is basically
canceling the mad right? Then why do we reset the timeout?

>  	mad_send_wr->send_buf.timeout_ms = timeout_ms;
> -	if (active)
> +	if (mad_send_wr->state == IB_MAD_STATE_SEND_START)
>  		mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
>  	else
>  		ib_reset_mad_timeout(mad_send_wr, timeout_ms);

So this else is IB_MAD_STATE_WAIT_RESP ?

> @@ -2607,7 +2605,7 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
>  		ret = ib_send_mad(mad_send_wr);
>  
>  	if (!ret) {
> -		mad_send_wr->refcount++;
> +		mad_send_wr->state = IB_MAD_STATE_SEND_START;

      	      WARN_ON(mad_send_wr->state != IB_MAD_STATE_WAIT_RESP);

?

Does the caller reliably ensure this?

> diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
> index 8af0619a39cd..dff264e9bb68 100644
> --- a/drivers/infiniband/core/mad_rmpp.c
> +++ b/drivers/infiniband/core/mad_rmpp.c
> @@ -717,13 +717,13 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
>  			ib_mad_complete_send_wr(mad_send_wr, &wc);
>  			return;
>  		}
> -		if (mad_send_wr->refcount == 1)
> +		if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
>  			ib_reset_mad_timeout(mad_send_wr,
>  					     mad_send_wr->send_buf.timeout_ms);

else
      	      WARN_ON(mad_send_wr->state != IB_MAD_STATE_SEND_START);

>  		spin_unlock_irqrestore(&agent->lock, flags);
>  		ack_ds_ack(agent, mad_recv_wc);
>  		return;
> -	} else if (mad_send_wr->refcount == 1 &&
> +	} else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP &&
>  		   mad_send_wr->seg_num < mad_send_wr->newwin &&
>  		   mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
>  		/* Send failure will just result in a timeout/retry */
> @@ -731,7 +731,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
>  		if (ret)
>  			goto out;
>  
> -		mad_send_wr->refcount++;
> +		mad_send_wr->state = IB_MAD_STATE_SEND_START;

      	      WARN_ON(mad_send_wr->state != IB_MAD_STATE_WAIT_RESP);

?

>  		list_move_tail(&mad_send_wr->agent_list,
>  			      &mad_send_wr->mad_agent_priv->send_list);
>  	}
> @@ -890,7 +890,6 @@ int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
>  	mad_send_wr->newwin = init_newwin(mad_send_wr);
>  
>  	/* We need to wait for the final ACK even if there isn't a response */
> -	mad_send_wr->refcount += (mad_send_wr->timeout == 0);

      WARN_ON(mad_send_wr->state != IB_MAD_STATE_WAIT_RESP); 
??

Jason

  reply	other threads:[~2024-12-10 16:12 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-03 13:52 [PATCH rdma-next 0/3] Add Flow Control for Solicited MADs Leon Romanovsky
2024-12-03 13:52 ` [PATCH rdma-next 1/3] IB/mad: Replace MAD's refcount with a state machine Leon Romanovsky
2024-12-10 16:12   ` Jason Gunthorpe [this message]
2024-12-03 13:52 ` [PATCH rdma-next 2/3] IB/mad: Remove unnecessary done list by utilizing MAD states Leon Romanovsky
2024-12-10 19:00   ` Jason Gunthorpe
2024-12-10 23:51     ` Or Har-Toov
2024-12-03 13:52 ` [PATCH rdma-next 3/3] IB/mad: Add flow control for solicited MADs Leon Romanovsky
2024-12-10 20:12   ` Jason Gunthorpe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241210161241.GJ2347147@nvidia.com \
    --to=jgg@nvidia.com \
    --cc=leon@kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=msanalla@nvidia.com \
    --cc=ohartoov@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.