All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Toke Høiland-Jørgensen" <toke@redhat.com>
To: Hou Tao <houtao@huaweicloud.com>, bpf@vger.kernel.org
Cc: netdev@vger.kernel.org, "David S . Miller" <davem@davemloft.net>,
	"Jakub Kicinski" <kuba@kernel.org>,
	"Jesper Dangaard Brouer" <hawk@kernel.org>,
	"John Fastabend" <john.fastabend@gmail.com>,
	"Björn Töpel" <bjorn.topel@gmail.com>,
	"Martin KaFai Lau" <martin.lau@linux.dev>,
	"Alexei Starovoitov" <alexei.starovoitov@gmail.com>,
	"Andrii Nakryiko" <andrii@kernel.org>,
	"Song Liu" <song@kernel.org>, "Hao Luo" <haoluo@google.com>,
	"Yonghong Song" <yonghong.song@linux.dev>,
	"Daniel Borkmann" <daniel@iogearbox.net>,
	"KP Singh" <kpsingh@kernel.org>,
	"Stanislav Fomichev" <sdf@google.com>,
	"Jiri Olsa" <jolsa@kernel.org>,
	houtao1@huawei.com
Subject: Re: [RFC PATCH bpf-next 1/2] bpf, cpumap: Use queue_rcu_work() to remove unnecessary rcu_barrier()
Date: Thu, 10 Aug 2023 12:16:26 +0200	[thread overview]
Message-ID: <87il9nfbid.fsf@toke.dk> (raw)
In-Reply-To: <20230728023030.1906124-2-houtao@huaweicloud.com>

Hou Tao <houtao@huaweicloud.com> writes:

> From: Hou Tao <houtao1@huawei.com>
>
> As for now __cpu_map_entry_replace() uses call_rcu() to wait for the
> inflight xdp program and NAPI poll to exit the RCU read critical
> section, and then launch kworker cpu_map_kthread_stop() to call
> kthread_stop() to handle all pending xdp frames or skbs.
>
> But it is unnecessary to use rcu_barrier() in cpu_map_kthread_stop() to
> wait for the completion of __cpu_map_entry_free(), because rcu_barrier()
> will wait for all pending RCU callbacks and cpu_map_kthread_stop() only
> needs to wait for the completion of a specific __cpu_map_entry_free().
>
> So use queue_rcu_work() to replace call_rcu(), schedule_work() and
> rcu_barrier(). queue_rcu_work() will queue a __cpu_map_entry_free()
> kworker after a RCU grace period. Because __cpu_map_entry_free() is
> running in a kworker context, so it is OK to do all of these freeing
> procedures include kthread_stop() in it.
>
> After the update, there is no need to do reference-counting for
> bpf_cpu_map_entry, because bpf_cpu_map_entry is freed directly in
> __cpu_map_entry_free(), so just remove it.
>
> Signed-off-by: Hou Tao <houtao1@huawei.com>

I think your analysis is correct, and this is a nice cleanup of what is
really a bit of an over-complicated cleanup flow - well done!

I have a few nits below, but with those feel free to resend as non-RFC
and add my:

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>

> ---
>  kernel/bpf/cpumap.c | 93 +++++++++++----------------------------------
>  1 file changed, 23 insertions(+), 70 deletions(-)
>
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index 0a16e30b16ef..24f39c37526f 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -67,10 +67,7 @@ struct bpf_cpu_map_entry {
>  	struct bpf_cpumap_val value;
>  	struct bpf_prog *prog;
>  
> -	atomic_t refcnt; /* Control when this struct can be free'ed */
> -	struct rcu_head rcu;
> -
> -	struct work_struct kthread_stop_wq;
> +	struct rcu_work free_work;
>  };
>  
>  struct bpf_cpu_map {
> @@ -115,11 +112,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
>  	return &cmap->map;
>  }
>  
> -static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
> -{
> -	atomic_inc(&rcpu->refcnt);
> -}
> -
>  static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
>  {
>  	/* The tear-down procedure should have made sure that queue is
> @@ -134,43 +126,6 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
>  			xdp_return_frame(xdpf);
>  }
>  
> -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
> -{
> -	if (atomic_dec_and_test(&rcpu->refcnt)) {
> -		if (rcpu->prog)
> -			bpf_prog_put(rcpu->prog);
> -		/* The queue should be empty at this point */
> -		__cpu_map_ring_cleanup(rcpu->queue);
> -		ptr_ring_cleanup(rcpu->queue, NULL);
> -		kfree(rcpu->queue);
> -		kfree(rcpu);
> -	}
> -}
> -
> -/* called from workqueue, to workaround syscall using preempt_disable */
> -static void cpu_map_kthread_stop(struct work_struct *work)
> -{
> -	struct bpf_cpu_map_entry *rcpu;
> -	int err;
> -
> -	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
> -
> -	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
> -	 * as it waits until all in-flight call_rcu() callbacks complete.
> -	 */
> -	rcu_barrier();
> -
> -	/* kthread_stop will wake_up_process and wait for it to complete */
> -	err = kthread_stop(rcpu->kthread);
> -	if (err) {
> -		/* kthread_stop may be called before cpu_map_kthread_run
> -		 * is executed, so we need to release the memory related
> -		 * to rcpu.
> -		 */
> -		put_cpu_map_entry(rcpu);
> -	}
> -}
> -
>  static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
>  				     struct list_head *listp,
>  				     struct xdp_cpumap_stats *stats)
> @@ -395,7 +350,6 @@ static int cpu_map_kthread_run(void *data)
>  	}
>  	__set_current_state(TASK_RUNNING);
>  
> -	put_cpu_map_entry(rcpu);
>  	return 0;
>  }
>  
> @@ -471,9 +425,6 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
>  	if (IS_ERR(rcpu->kthread))
>  		goto free_prog;
>  
> -	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
> -	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
> -
>  	/* Make sure kthread runs on a single CPU */
>  	kthread_bind(rcpu->kthread, cpu);
>  	wake_up_process(rcpu->kthread);
> @@ -494,7 +445,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
>  	return NULL;
>  }
>  
> -static void __cpu_map_entry_free(struct rcu_head *rcu)
> +static void __cpu_map_entry_free(struct work_struct *work)
>  {
>  	struct bpf_cpu_map_entry *rcpu;
>  
> @@ -503,30 +454,33 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
>  	 * new packets and cannot change/set flush_needed that can
>  	 * find this entry.
>  	 */
> -	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
> +	rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
>  
>  	free_percpu(rcpu->bulkq);

Let's move this free down to the end along with the others.

> -	/* Cannot kthread_stop() here, last put free rcpu resources */
> -	put_cpu_map_entry(rcpu);
> +
> +	/* kthread_stop will wake_up_process and wait for it to complete */

Suggest adding to this comment: "cpu_map_kthread_run() makes sure the
pointer ring is empty before exiting."

> +	kthread_stop(rcpu->kthread);
> +
> +	if (rcpu->prog)
> +		bpf_prog_put(rcpu->prog);
> +	/* The queue should be empty at this point */
> +	__cpu_map_ring_cleanup(rcpu->queue);
> +	ptr_ring_cleanup(rcpu->queue, NULL);
> +	kfree(rcpu->queue);
> +	kfree(rcpu);
>  }
>  
>  /* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
> - * ensure any driver rcu critical sections have completed, but this
> - * does not guarantee a flush has happened yet. Because driver side
> - * rcu_read_lock/unlock only protects the running XDP program.  The
> - * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
> - * pending flush op doesn't fail.
> + * ensure both any driver rcu critical sections and xdp_do_flush()
> + * have completed.
>   *
>   * The bpf_cpu_map_entry is still used by the kthread, and there can
> - * still be pending packets (in queue and percpu bulkq).  A refcnt
> - * makes sure to last user (kthread_stop vs. call_rcu) free memory
> - * resources.
> + * still be pending packets (in queue and percpu bulkq).
>   *
> - * The rcu callback __cpu_map_entry_free flush remaining packets in
> - * percpu bulkq to queue.  Due to caller map_delete_elem() disable
> - * preemption, cannot call kthread_stop() to make sure queue is empty.
> - * Instead a work_queue is started for stopping kthread,
> - * cpu_map_kthread_stop, which waits for an RCU grace period before
> + * Due to caller map_delete_elem() is in RCU read critical section,
> + * cannot call kthread_stop() to make sure queue is empty. Instead
> + * a work_struct is started for stopping kthread,
> + * __cpu_map_entry_free, which waits for a RCU grace period before
>   * stopping kthread, emptying the queue.
>   */

I think the above comment is a bit too convoluted, still. I'd suggest
just replacing the whole thing with this:

/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
 * entry is no longer in use before freeing. We use queue_rcu_work() to call
 * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
 * period. This means that (a) all pending enqueue and flush operations have
 * completed (because or the RCU callback), and (b) we are in a workqueue
 * context where we can stop the kthread and wait for it to exit before freeing
 * everything.
 */

>  static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
> @@ -536,9 +490,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
>  
>  	old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
>  	if (old_rcpu) {
> -		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
> -		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
> -		schedule_work(&old_rcpu->kthread_stop_wq);
> +		INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
> +		queue_rcu_work(system_wq, &old_rcpu->free_work);
>  	}
>  }
>  
> -- 
> 2.29.2


  reply	other threads:[~2023-08-10 10:16 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-28  2:30 [RFC PATCH bpf-next 0/2] Remove unnecessary synchronizations in cpumap Hou Tao
2023-07-28  2:30 ` [RFC PATCH bpf-next 1/2] bpf, cpumap: Use queue_rcu_work() to remove unnecessary rcu_barrier() Hou Tao
2023-08-10 10:16   ` Toke Høiland-Jørgensen [this message]
2023-08-11 10:22     ` Hou Tao
2023-07-28  2:30 ` [RFC PATCH bpf-next 2/2] bpf, cpumap: Clean up bpf_cpu_map_entry directly in cpu_map_free Hou Tao
2023-08-10 10:22   ` Toke Høiland-Jørgensen
2023-08-11 10:23     ` Hou Tao
2023-08-10  3:22 ` [RFC PATCH bpf-next 0/2] Remove unnecessary synchronizations in cpumap Hou Tao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87il9nfbid.fsf@toke.dk \
    --to=toke@redhat.com \
    --cc=alexei.starovoitov@gmail.com \
    --cc=andrii@kernel.org \
    --cc=bjorn.topel@gmail.com \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=haoluo@google.com \
    --cc=hawk@kernel.org \
    --cc=houtao1@huawei.com \
    --cc=houtao@huaweicloud.com \
    --cc=john.fastabend@gmail.com \
    --cc=jolsa@kernel.org \
    --cc=kpsingh@kernel.org \
    --cc=kuba@kernel.org \
    --cc=martin.lau@linux.dev \
    --cc=netdev@vger.kernel.org \
    --cc=sdf@google.com \
    --cc=song@kernel.org \
    --cc=yonghong.song@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.