From: "Christian König" <christian.koenig@amd.com>
To: Dave Airlie <airlied@gmail.com>,
	dri-devel@lists.freedesktop.org, tj@kernel.org,
	Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>,
	Roman Gushchin <roman.gushchin@linux.dev>,
	Shakeel Butt <shakeel.butt@linux.dev>,
	Muchun Song <muchun.song@linux.dev>
Cc: cgroups@vger.kernel.org, Dave Chinner <david@fromorbit.com>,
	Waiman Long <longman@redhat.com>,
	simona@ffwll.ch
Subject: Re: [PATCH 11/15] ttm/pool: enable memcg tracking and shrinker. (v2)
Date: Tue, 2 Sep 2025 16:23:31 +0200	[thread overview]
Message-ID: <4e462912-64de-461c-8c4b-204e6f58dde8@amd.com> (raw)
In-Reply-To: <20250902041024.2040450-12-airlied@gmail.com>

On 02.09.25 06:06, Dave Airlie wrote:
> From: Dave Airlie <airlied@redhat.com>
> 
> This enables all the backend code to use the list lru in memcg mode,
> and set the shrinker to be memcg aware.
> 
> It adds the loop case for when pooled pages end up being reparented
> to a higher memcg group, so that a newer memcg can search for them
> there and take them back.
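
For reference, the parent walk described above is the new ttm_pool_type_take()/pool_lru_get_page() pair in the diff further down; condensed to its core (helpers exactly as named in the patch, error handling and the failed-charge path omitted), the loop is roughly:

	static struct page *ttm_pool_type_take(struct ttm_pool_type *pt, int nid,
					       struct obj_cgroup *orig_objcg)
	{
		struct mem_cgroup *orig_memcg = orig_objcg ?
			get_mem_cgroup_from_objcg(orig_objcg) : NULL;
		struct mem_cgroup *memcg = orig_memcg;
		struct page *p = NULL;

		while (!p) {
			/* try this memcg's list_lru for a pooled page */
			if (pool_lru_get_page(pt, nid, &p, orig_objcg, memcg) == 1)
				break;
			if (!memcg)
				break;
			/* nothing here: retry one level up, where reparented pages live */
			memcg = parent_mem_cgroup(memcg);
			if (!memcg)
				break;
		}

		mem_cgroup_put(orig_memcg);
		return p;
	}
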

I can only repeat that, as far as I can see, this makes no sense at all.

This just enables stealing pages from the page pool per cgroup and won't give them back if another cgroup runs into a low memory situation.

Maybe Thomas and the XE guys have a use case for that, but as far as I can see that behavior is not something we would ever want.

Regards,
Christian.

> 
> Signed-off-by: Dave Airlie <airlied@redhat.com>
> 
> ---
> v2: just use the proper stats.
> ---
>  drivers/gpu/drm/ttm/ttm_pool.c | 127 ++++++++++++++++++++++++++-------
>  mm/list_lru.c                  |   1 +
>  2 files changed, 104 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
> index 2c9969de7517..1e6da2cc1f06 100644
> --- a/drivers/gpu/drm/ttm/ttm_pool.c
> +++ b/drivers/gpu/drm/ttm/ttm_pool.c
> @@ -142,7 +142,9 @@ static int ttm_pool_nid(struct ttm_pool *pool) {
>  }
>  
>  /* Allocate pages of size 1 << order with the given gfp_flags */
> -static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
> +static struct page *ttm_pool_alloc_page(struct ttm_pool *pool,
> +					struct obj_cgroup *objcg,
> +					gfp_t gfp_flags,
>  					unsigned int order)
>  {
>  	unsigned long attr = DMA_ATTR_FORCE_CONTIGUOUS;
> @@ -162,7 +164,10 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
>  		p = alloc_pages_node(pool->nid, gfp_flags, order);
>  		if (p) {
>  			p->private = order;
> -			mod_lruvec_page_state(p, NR_GPU_ACTIVE, 1 << order);
> +			if (!mem_cgroup_charge_gpu_page(objcg, p, order, gfp_flags, false)) {
> +				__free_pages(p, order);
> +				return NULL;
> +			}
>  		}
>  		return p;
>  	}
> @@ -213,8 +218,7 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
>  #endif
>  
>  	if (!pool || !pool->use_dma_alloc) {
> -		mod_lruvec_page_state(p, reclaim ? NR_GPU_RECLAIM : NR_GPU_ACTIVE,
> -				      -(1 << order));
> +		mem_cgroup_uncharge_gpu_page(p, order, reclaim);
>  		__free_pages(p, order);
>  		return;
>  	}
> @@ -301,12 +305,11 @@ static void ttm_pool_type_give(struct ttm_pool_type *pt, struct page *p)
>  
>  	INIT_LIST_HEAD(&p->lru);
>  	rcu_read_lock();
> -	list_lru_add(&pt->pages, &p->lru, nid, NULL);
> +	list_lru_add(&pt->pages, &p->lru, nid, page_memcg_check(p));
>  	rcu_read_unlock();
>  
> -	atomic_long_add(num_pages, &allocated_pages[nid]);	
> -	mod_lruvec_page_state(p, NR_GPU_ACTIVE, -num_pages);
> -	mod_lruvec_page_state(p, NR_GPU_RECLAIM, num_pages);
> +	atomic_long_add(num_pages, &allocated_pages[nid]);
> +	mem_cgroup_move_gpu_page_reclaim(NULL, p, pt->order, true);
>  }
>  
>  static enum lru_status take_one_from_lru(struct list_head *item,
> @@ -321,20 +324,56 @@ static enum lru_status take_one_from_lru(struct list_head *item,
>  	return LRU_REMOVED;
>  }
>  
> -/* Take pages from a specific pool_type, return NULL when nothing available */
> -static struct page *ttm_pool_type_take(struct ttm_pool_type *pt, int nid)
> +static int pool_lru_get_page(struct ttm_pool_type *pt, int nid,
> +			     struct page **page_out,
> +			     struct obj_cgroup *objcg,
> +			     struct mem_cgroup *memcg)
>  {
>  	int ret;
>  	struct page *p = NULL;
>  	unsigned long nr_to_walk = 1;
> +	unsigned int num_pages = 1 << pt->order;
>  
> -	ret = list_lru_walk_node(&pt->pages, nid, take_one_from_lru, (void *)&p, &nr_to_walk);
> +	ret = list_lru_walk_one(&pt->pages, nid, memcg, take_one_from_lru, (void *)&p, &nr_to_walk);
>  	if (ret == 1 && p) {
> -		atomic_long_sub(1 << pt->order, &allocated_pages[nid]);
> -		mod_lruvec_page_state(p, NR_GPU_ACTIVE, (1 << pt->order));
> -		mod_lruvec_page_state(p, NR_GPU_RECLAIM, -(1 << pt->order));		
> +		atomic_long_sub(num_pages, &allocated_pages[nid]);
> +
> +		if (!mem_cgroup_move_gpu_page_reclaim(objcg, p, pt->order, false)) {
> +			__free_pages(p, pt->order);
> +			p = NULL;
> +		}
>  	}
> -	return p;
> +	*page_out = p;
> +	return ret;
> +}
> +
> +/* Take pages from a specific pool_type, return NULL when nothing available */
> +static struct page *ttm_pool_type_take(struct ttm_pool_type *pt, int nid,
> +				       struct obj_cgroup *orig_objcg)
> +{
> +	struct page *page_out = NULL;
> +	int ret;
> +	struct mem_cgroup *orig_memcg = orig_objcg ? get_mem_cgroup_from_objcg(orig_objcg) : NULL;
> +	struct mem_cgroup *memcg = orig_memcg;
> +
> +	/*
> +	 * Attempt to get a page from the current memcg, but if it hasn't got any in it's level,
> +	 * go up to the parent and check there. This helps the scenario where multiple apps get
> +	 * started into their own cgroup from a common parent and want to reuse the pools.
> +	 */
> +	while (!page_out) {
> +		ret = pool_lru_get_page(pt, nid, &page_out, orig_objcg, memcg);
> +		if (ret == 1)
> +			break;
> +		if (!memcg)
> +			break;
> +		memcg = parent_mem_cgroup(memcg);
> +		if (!memcg)
> +			break;
> +	}
> +
> +	mem_cgroup_put(orig_memcg);
> +	return page_out;
>  }
>  
>  /* Initialize and add a pool type to the global shrinker list */
> @@ -344,7 +383,7 @@ static void ttm_pool_type_init(struct ttm_pool_type *pt, struct ttm_pool *pool,
>  	pt->pool = pool;
>  	pt->caching = caching;
>  	pt->order = order;
> -	list_lru_init(&pt->pages);
> +	list_lru_init_memcg(&pt->pages, mm_shrinker);
>  
>  	spin_lock(&shrinker_lock);
>  	list_add_tail(&pt->shrinker_list, &shrinker_list);
> @@ -387,6 +426,30 @@ static void ttm_pool_type_fini(struct ttm_pool_type *pt)
>  	ttm_pool_dispose_list(pt, &dispose);
>  }
>  
> +static int ttm_pool_check_objcg(struct obj_cgroup *objcg)
> +{
> +#ifdef CONFIG_MEMCG
> +	int r = 0;
> +	struct mem_cgroup *memcg;
> +	if (!objcg)
> +		return 0;
> +
> +	memcg = get_mem_cgroup_from_objcg(objcg);
> +	for (unsigned i = 0; i < NR_PAGE_ORDERS; i++) {
> +		r = memcg_list_lru_alloc(memcg, &global_write_combined[i].pages, GFP_KERNEL);
> +		if (r) {
> +			break;
> +		}
> +		r = memcg_list_lru_alloc(memcg, &global_uncached[i].pages, GFP_KERNEL);
> +		if (r) {
> +			break;
> +		}
> +	}
> +	mem_cgroup_put(memcg);
> +#endif
> +	return 0;
> +}
> +
>  /* Return the pool_type to use for the given caching and order */
>  static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool,
>  						  enum ttm_caching caching,
> @@ -416,7 +479,9 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool,
>  }
>  
>  /* Free pages using the per-node shrinker list */
> -static unsigned int ttm_pool_shrink(int nid, unsigned long num_to_free)
> +static unsigned int ttm_pool_shrink(int nid,
> +				    struct mem_cgroup *memcg,
> +				    unsigned long num_to_free)
>  {
>  	LIST_HEAD(dispose);
>  	struct ttm_pool_type *pt;
> @@ -428,7 +493,11 @@ static unsigned int ttm_pool_shrink(int nid, unsigned long num_to_free)
>  	list_move_tail(&pt->shrinker_list, &shrinker_list);
>  	spin_unlock(&shrinker_lock);
>  
> -	num_pages = list_lru_walk_node(&pt->pages, nid, pool_move_to_dispose_list, &dispose, &num_to_free);
> +	if (!memcg) {
> +		num_pages = list_lru_walk_node(&pt->pages, nid, pool_move_to_dispose_list, &dispose, &num_to_free);
> +	} else {
> +		num_pages = list_lru_walk_one(&pt->pages, nid, memcg, pool_move_to_dispose_list, &dispose, &num_to_free);
> +	}
>  	num_pages *= 1 << pt->order;
>  
>  	ttm_pool_dispose_list(pt, &dispose);
> @@ -593,6 +662,7 @@ static int ttm_pool_restore_commit(struct ttm_pool_tt_restore *restore,
>  			 */
>  			ttm_pool_split_for_swap(restore->pool, p);
>  			copy_highpage(restore->alloced_page + i, p);
> +			p->memcg_data = 0;
>  			__free_pages(p, 0);
>  		}
>  
> @@ -754,6 +824,7 @@ static int __ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>  	bool allow_pools;
>  	struct page *p;
>  	int r;
> +	struct obj_cgroup *objcg = memcg_account ? tt->objcg : NULL;
>  
>  	WARN_ON(!alloc->remaining_pages || ttm_tt_is_populated(tt));
>  	WARN_ON(alloc->dma_addr && !pool->dev);
> @@ -771,6 +842,9 @@ static int __ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>  
>  	page_caching = tt->caching;
>  	allow_pools = true;
> +
> +	ttm_pool_check_objcg(objcg);
> +
>  	for (order = ttm_pool_alloc_find_order(MAX_PAGE_ORDER, alloc);
>  	     alloc->remaining_pages;
>  	     order = ttm_pool_alloc_find_order(order, alloc)) {
> @@ -780,7 +854,7 @@ static int __ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>  		p = NULL;
>  		pt = ttm_pool_select_type(pool, page_caching, order);
>  		if (pt && allow_pools)
> -			p = ttm_pool_type_take(pt, ttm_pool_nid(pool));
> +			p = ttm_pool_type_take(pt, ttm_pool_nid(pool), objcg);
>  
>  		/*
>  		 * If that fails or previously failed, allocate from system.
> @@ -791,7 +865,7 @@ static int __ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
>  		if (!p) {
>  			page_caching = ttm_cached;
>  			allow_pools = false;
> -			p = ttm_pool_alloc_page(pool, gfp_flags, order);
> +			p = ttm_pool_alloc_page(pool, objcg, gfp_flags, order);
>  		}
>  		/* If that fails, lower the order if possible and retry. */
>  		if (!p) {
> @@ -935,7 +1009,7 @@ void ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt)
>  
>  	while (atomic_long_read(&allocated_pages[nid]) > pool_node_limit[nid]) {
>  		unsigned long diff = pool_node_limit[nid] - atomic_long_read(&allocated_pages[nid]);
> -		ttm_pool_shrink(nid, diff);
> +		ttm_pool_shrink(nid, NULL, diff);
>  	}
>  }
>  EXPORT_SYMBOL(ttm_pool_free);
> @@ -1055,6 +1129,7 @@ long ttm_pool_backup(struct ttm_pool *pool, struct ttm_tt *tt,
>  			if (flags->purge) {
>  				shrunken += num_pages;
>  				page->private = 0;
> +				page->memcg_data = 0;
>  				__free_pages(page, order);
>  				memset(tt->pages + i, 0,
>  				       num_pages * sizeof(*tt->pages));
> @@ -1191,10 +1266,14 @@ static unsigned long ttm_pool_shrinker_scan(struct shrinker *shrink,
>  					    struct shrink_control *sc)
>  {
>  	unsigned long num_freed = 0;
> +	int num_pools;
> +	spin_lock(&shrinker_lock);
> +	num_pools = list_count_nodes(&shrinker_list);
> +	spin_unlock(&shrinker_lock);
>  
>  	do
> -		num_freed += ttm_pool_shrink(sc->nid, sc->nr_to_scan);
> -	while (num_freed < sc->nr_to_scan &&
> +		num_freed += ttm_pool_shrink(sc->nid, sc->memcg, sc->nr_to_scan);
> +	while (num_pools-- >= 0 && num_freed < sc->nr_to_scan &&
>  	       atomic_long_read(&allocated_pages[sc->nid]));
>  
>  	sc->nr_scanned = num_freed;
> @@ -1381,7 +1460,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
>  	spin_lock_init(&shrinker_lock);
>  	INIT_LIST_HEAD(&shrinker_list);
>  
> -	mm_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "drm-ttm_pool");
> +	mm_shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE | SHRINKER_NUMA_AWARE, "drm-ttm_pool");
>  	if (!mm_shrinker)
>  		return -ENOMEM;
>  
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 627589d75320..6a277f479dc3 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -562,6 +562,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
>  
>  	return xas_error(&xas);
>  }
> +EXPORT_SYMBOL_GPL(memcg_list_lru_alloc);
>  #else
>  static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
>  {


Thread overview: 22+ messages
2025-09-02  4:06 drm/ttm/memcg/lru: enable memcg tracking for ttm and amdgpu driver (complete series v3) Dave Airlie
2025-09-02  4:06 ` [PATCH 01/15] mm: add gpu active/reclaim per-node stat counters (v2) Dave Airlie
2025-09-02  4:06 ` [PATCH 02/15] drm/ttm: use gpu mm stats to track gpu memory allocations. (v4) Dave Airlie
2025-09-02  4:06 ` [PATCH 03/15] ttm/pool: port to list_lru. (v2) Dave Airlie
2025-09-03  0:44   ` kernel test robot
2025-09-02  4:06 ` [PATCH 04/15] ttm/pool: drop numa specific pools Dave Airlie
2025-09-02  4:06 ` [PATCH 05/15] ttm/pool: make pool shrinker NUMA aware Dave Airlie
2025-09-02  4:06 ` [PATCH 06/15] ttm/pool: track allocated_pages per numa node Dave Airlie
2025-09-02  4:06 ` [PATCH 07/15] memcg: add support for GPU page counters. (v3) Dave Airlie
2025-09-02  4:06 ` [PATCH 08/15] ttm: add a memcg accounting flag to the alloc/populate APIs Dave Airlie
2025-09-02  4:06 ` [PATCH 09/15] ttm/pool: initialise the shrinker earlier Dave Airlie
2025-09-02 14:07   ` Christian König
2025-09-04  2:21     ` Dave Airlie
2025-09-02  4:06 ` [PATCH 10/15] ttm: add objcg pointer to bo and tt Dave Airlie
2025-09-02  4:06 ` [PATCH 11/15] ttm/pool: enable memcg tracking and shrinker. (v2) Dave Airlie
2025-09-02 14:23   ` Christian König [this message]
2025-09-04  2:25     ` Dave Airlie
2025-09-04 11:29       ` Christian König
2025-09-02  4:06 ` [PATCH 12/15] ttm: hook up memcg placement flags Dave Airlie
2025-09-02  4:06 ` [PATCH 13/15] memcontrol: allow objcg api when memcg is config off Dave Airlie
2025-09-02  4:06 ` [PATCH 14/15] amdgpu: add support for memory cgroups Dave Airlie
2025-09-02  4:06 ` [PATCH 15/15] ttm: add support for a module option to disable memcg integration Dave Airlie
