Re: [PATCH RFC] ipc/mqueue: introduce msg cache

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Brian Foster <bfoster@redhat.com>
To: Roman Gushchin <roman.gushchin@linux.dev>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Shakeel Butt <shakeelb@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>,
	Muchun Song <muchun.song@linux.dev>,
	Andrew Morton <akpm@linux-foundation.org>,
	Waiman Long <longman@redhat.com>,
	Sven Luther <Sven.Luther@windriver.com>
Subject: Re: [PATCH RFC] ipc/mqueue: introduce msg cache
Date: Thu, 22 Dec 2022 06:52:06 -0500	[thread overview]
Message-ID: <Y6RE5iUHSinUJxDt@bfoster> (raw)
In-Reply-To: <20221220184813.1908318-1-roman.gushchin@linux.dev>

On Tue, Dec 20, 2022 at 10:48:13AM -0800, Roman Gushchin wrote:
> Sven Luther reported a regression in the posix message queues
> performance caused by switching to the per-object tracking of
> slab objects introduced by patch series ending with the
> commit 10befea91b61 ("mm: memcg/slab: use a single set of kmem_caches for all
> allocations").
> 
> To mitigate the regression cache allocated mqueue messages on a small
> percpu cache instead of releasing and re-allocating them every time.
> 
> This change brings the performance measured by a benchmark kindly
> provided by Sven [1] almost back to the original (or even better)
> numbers. Measurements results are also provided by Sven.
> 
> +------------+---------------------------------+--------------------------------+
> | kernel     | mqueue_rcv (ns)     variation   | mqueue_send (ns)   variation   |
> | version    | min avg  max      min      avg  | min avg max       min     avg  |
> +------------+--------------------------+---------------------------------------+
> | 4.18.45    | 351 382 17533    base     base  | 383 410 13178     base    base |
> | 5.8-good   | 380 392  7156   -7,63%  -2,55%  | 376 384  6225    1,86%   6,77% |
> | 5.8-bad    | 524 530  5310  -33,02% -27,92%  | 512 519  8775  -25,20% -21,00% |
> | 5.10       | 520 533  4078  -32,20% -28,33%  | 518 534  8108  -26,06% -23,22% |
> | 5.15       | 431 444  8440  -18,56% -13,96%  | 425 437  6170   -9,88%  -6,18% |
> | 6.0.3      | 474 614  3881  -25,95% -37,79%  | 482 693   931  -20,54% -40,84% |
> | 6.1-rc8    | 496 509  8804  -29,23% -24,95%  | 493 512  5748  -22,31% -19,92% |
> | 6.1-rc8+p  | 392 397  5479  -10,46%  -3,78%  | 364 369 10776    5,22%  11,11% |
> +------------+---------------------------------+--------------------------------+
> 
> The last line reflects the result with this patch ("6.1-rc8+p").
> 
> [1]: https://lore.kernel.org/linux-mm/Y46lqCToUa%2FBgt%2Fc@P9FQF9L96D/T/
> 
> Reported-by: Sven Luther <Sven.Luther@windriver.com>
> Tested-by: Sven Luther <Sven.Luther@windriver.com>
> Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> ---
>  include/linux/memcontrol.h |  12 +++++
>  ipc/mqueue.c               |  20 ++++++--
>  ipc/msg.c                  |  12 ++---
>  ipc/msgutil.c              | 101 +++++++++++++++++++++++++++++++++----
>  ipc/util.h                 |   8 ++-
>  mm/memcontrol.c            |  62 +++++++++++++++++++++++
>  6 files changed, 194 insertions(+), 21 deletions(-)
> 
...
> diff --git a/ipc/msgutil.c b/ipc/msgutil.c
> index d0a0e877cadd..8667708fc00a 100644
> --- a/ipc/msgutil.c
> +++ b/ipc/msgutil.c
...
> @@ -39,16 +40,76 @@ struct msg_msgseg {
...
> +static struct msg_msg *alloc_msg(size_t len, struct msg_cache *cache)
>  {
>  	struct msg_msg *msg;
>  	struct msg_msgseg **pseg;
>  	size_t alen;
>  
> +	if (cache) {
> +		struct pcpu_msg_cache *pc;
> +
> +		msg = NULL;
> +		pc = get_cpu_ptr(cache->pcpu_cache);
> +		if (pc->msg && pc->len == len) {
> +			struct obj_cgroup *objcg;
> +
> +			rcu_read_lock();
> +			objcg = obj_cgroup_from_current();
> +			if (objcg == pc->objcg) {
> +				msg = pc->msg;
> +				pc->msg = NULL;
> +				obj_cgroup_put(pc->objcg);
> +			}
> +			rcu_read_unlock();
> +		}
> +		put_cpu_ptr(cache->pcpu_cache);
> +		if (msg)
> +			return msg;
> +	}
> +
>  	alen = min(len, DATALEN_MSG);
>  	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT);
>  	if (msg == NULL)
> @@ -77,18 +138,19 @@ static struct msg_msg *alloc_msg(size_t len)
>  	return msg;
>  
>  out_err:
> -	free_msg(msg);
> +	free_msg(msg, NULL);
>  	return NULL;
>  }
>  
> -struct msg_msg *load_msg(const void __user *src, size_t len)
> +struct msg_msg *load_msg(const void __user *src, size_t len,
> +			 struct msg_cache *cache)
>  {
>  	struct msg_msg *msg;
>  	struct msg_msgseg *seg;
>  	int err = -EFAULT;
>  	size_t alen;
>  
> -	msg = alloc_msg(len);
> +	msg = alloc_msg(len, cache);
>  	if (msg == NULL)
>  		return ERR_PTR(-ENOMEM);
>  
> @@ -104,14 +166,16 @@ struct msg_msg *load_msg(const void __user *src, size_t len)
>  			goto out_err;
>  	}
>  
> -	err = security_msg_msg_alloc(msg);
> -	if (err)
> -		goto out_err;
> +	if (!msg->security) {
> +		err = security_msg_msg_alloc(msg);
> +		if (err)
> +			goto out_err;
> +	}
>  
>  	return msg;
>  
>  out_err:
> -	free_msg(msg);
> +	free_msg(msg, NULL);
>  	return ERR_PTR(err);
>  }
>  #ifdef CONFIG_CHECKPOINT_RESTORE
> @@ -166,10 +230,29 @@ int store_msg(void __user *dest, struct msg_msg *msg, size_t len)
>  	return 0;
>  }
>  
> -void free_msg(struct msg_msg *msg)
> +void free_msg(struct msg_msg *msg, struct msg_cache *cache)
>  {
>  	struct msg_msgseg *seg;
>  
> +	if (cache) {
> +		struct pcpu_msg_cache *pc;
> +		bool cached = false;
> +
> +		pc = get_cpu_ptr(cache->pcpu_cache);
> +		if (!pc->msg) {
> +			pc->objcg = get_obj_cgroup_from_slab_obj(msg);
> +			pc->len = msg->m_ts;
> +			pc->msg = msg;
> +
> +			if (pc->objcg)
> +				cached = true;
> +		}

Hi Roman,

It seems that this is kind of tailored to the ideal conditions
implemented by the test case: i.e., a single, fixed size message being
passed back and forth on a single cpu. Does that actually represent the
production workload?

I'm a little curious if/how this might work for workloads that might
involve more variable sized messages, deeper queue depths (i.e.  sending
more than one message before attempting a recv) and more tasks across
different cpus. For example, it looks like if an "uncommonly" sized
message ended up cached on a cpu, this would always result in subsequent
misses because the alloc side requires an exact size match and the free
side never replaces a cached msg. Hm?

Brian

> +		put_cpu_ptr(cache->pcpu_cache);
> +
> +		if (cached)
> +			return;
> +	}
> +
>  	security_msg_msg_free(msg);
>  
>  	seg = msg->next;
> diff --git a/ipc/util.h b/ipc/util.h
> index b2906e366539..a2da266386aa 100644
> --- a/ipc/util.h
> +++ b/ipc/util.h
> @@ -196,8 +196,12 @@ static inline void ipc_update_pid(struct pid **pos, struct pid *pid)
>  int ipc_parse_version(int *cmd);
>  #endif
>  
> -extern void free_msg(struct msg_msg *msg);
> -extern struct msg_msg *load_msg(const void __user *src, size_t len);
> +struct msg_cache;
> +extern int init_msg_cache(struct msg_cache *cache);
> +extern void free_msg_cache(struct msg_cache *cache);
> +extern void free_msg(struct msg_msg *msg, struct msg_cache *cache);
> +extern struct msg_msg *load_msg(const void __user *src, size_t len,
> +				struct msg_cache *cache);
>  extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst);
>  extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len);
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a1a35c12635e..28528b4da0fb 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3004,6 +3004,28 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
>  	return objcg;
>  }
>  
> +__always_inline struct obj_cgroup *obj_cgroup_from_current(void)
> +{
> +	struct obj_cgroup *objcg = NULL;
> +	struct mem_cgroup *memcg;
> +
> +	if (memcg_kmem_bypass())
> +		return NULL;
> +
> +	if (unlikely(active_memcg()))
> +		memcg = active_memcg();
> +	else
> +		memcg = mem_cgroup_from_task(current);
> +
> +	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
> +		objcg = rcu_dereference(memcg->objcg);
> +		if (likely(objcg))
> +			return objcg;
> +	}
> +
> +	return NULL;
> +}
> +
>  __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
>  {
>  	struct obj_cgroup *objcg = NULL;
> @@ -3046,6 +3068,46 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
>  	return objcg;
>  }
>  
> +/*
> + * A passed kernel object must be a slab object or a generic kernel page.
> + * Not suitable for objects, allocated using vmalloc().
> + */
> +struct obj_cgroup *get_obj_cgroup_from_slab_obj(void *p)
> +{
> +	struct obj_cgroup *objcg = NULL;
> +	struct folio *folio;
> +
> +	if (mem_cgroup_disabled())
> +		return NULL;
> +
> +	folio = virt_to_folio(p);
> +	/*
> +	 * Slab object can be either a true slab object, which are accounted
> +	 * individually with objcg pointers stored in a separate objcg array,
> +	 * or it can a generic folio with MEMCG_DATA_KMEM flag set.
> +	 */
> +	if (folio_test_slab(folio)) {
> +		struct obj_cgroup **objcgs;
> +		struct slab *slab;
> +		unsigned int off;
> +
> +		slab = folio_slab(folio);
> +		objcgs = slab_objcgs(slab);
> +		if (!objcgs)
> +			return NULL;
> +
> +		off = obj_to_index(slab->slab_cache, slab, p);
> +		objcg = objcgs[off];
> +	} else {
> +		objcg = __folio_objcg(folio);
> +	}
> +
> +	if (objcg)
> +		obj_cgroup_get(objcg);
> +
> +	return objcg;
> +}
> +
>  static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
>  {
>  	mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
> -- 
> 2.39.0
> 
>

next prev parent reply	other threads:[~2022-12-22 11:52 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-12-20 18:48 [PATCH RFC] ipc/mqueue: introduce msg cache Roman Gushchin
2022-12-20 19:53 ` Shakeel Butt
2022-12-20 20:59   ` Roman Gushchin
2022-12-20 23:28     ` Shakeel Butt
2022-12-20 23:56       ` Roman Gushchin
2022-12-21  9:29 ` Thorsten Leemhuis
2022-12-22 11:52 ` Brian Foster [this message]
2022-12-22 16:37   ` Roman Gushchin
2022-12-22 17:54     ` Brian Foster
2023-02-16 12:29 ` Linux regression tracking (Thorsten Leemhuis)
2023-02-17 18:26   ` Roman Gushchin
2023-02-24 10:47     ` Linux regression tracking (Thorsten Leemhuis)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Y6RE5iUHSinUJxDt@bfoster \
    --to=bfoster@redhat.com \
    --cc=Sven.Luther@windriver.com \
    --cc=akpm@linux-foundation.org \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=longman@redhat.com \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeelb@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.