Re: [PATCH] mm/fake-numa: per-phys node fake size

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Mike Rapoport <rppt@kernel.org>
To: Bruno Faccini <bfaccini@nvidia.com>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	akpm@linux-foundation.org, ziy@nvidia.com, ttabi@nvidia.com,
	jhubbard@nvidia.com
Subject: Re: [PATCH] mm/fake-numa: per-phys node fake size
Date: Tue, 24 Sep 2024 13:40:22 +0300	[thread overview]
Message-ID: <ZvKXFnriMlH2y5Oo@kernel.org> (raw)
In-Reply-To: <20240921081348.10016-1-bfaccini@nvidia.com>

On Sat, Sep 21, 2024 at 01:13:49AM -0700, Bruno Faccini wrote:
> Determine fake numa node size on a per-phys node basis to
> handle cases where there are big differences in reserved
> memory size across physical nodes. This makes it possible
> to get the expected number of nodes evenly interleaved.
> 
> Consider a system with 2 physical NUMA nodes where almost
> all reserved memory sits in a single node: computing the
> fake-numa node (fake=N) size from the total
> available/non-reserved memory can make it impossible to
> create N/2 fake-numa nodes in that physical node.

I'm not sure I understand the problem you are trying to solve.
Can you provide a more specific example?

> Signed-off-by: Bruno Faccini <bfaccini@nvidia.com>
> ---
>  mm/numa_emulation.c | 66 ++++++++++++++++++++++++++-------------------
>  1 file changed, 39 insertions(+), 27 deletions(-)
> 
> diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
> index 031fb9961bf7..0c72c85cfc10 100644
> --- a/mm/numa_emulation.c
> +++ b/mm/numa_emulation.c
> @@ -77,20 +77,19 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
>  }
>  
>  /*
> - * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
> - * to max_addr.
> + * Sets up nr_nodes fake nodes interleaved over all physical nodes
>   *
>   * Returns zero on success or negative on error.
>   */
>  static int __init split_nodes_interleave(struct numa_meminfo *ei,
>  					 struct numa_meminfo *pi,
> -					 u64 addr, u64 max_addr, int nr_nodes)
> +					 int nr_nodes)
>  {
>  	nodemask_t physnode_mask = numa_nodes_parsed;
> -	u64 size;
> -	int big;
> -	int nid = 0;
> -	int i, ret;
> +	int nid = 0, physnodes_with_mem = 0;
> +	int i, ret, phys_blk;
> +	static u64 sizes[MAX_NUMNODES] __initdata;
> +	static int bigs[MAX_NUMNODES] __initdata;
>  
>  	if (nr_nodes <= 0)
>  		return -1;
> @@ -100,25 +99,41 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
>  		nr_nodes = MAX_NUMNODES;
>  	}
>  
> -	/*
> -	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
> -	 * the division in ulong number of pages and convert back.
> -	 */
> -	size = max_addr - addr - mem_hole_size(addr, max_addr);
> -	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
> +	/* count physical nodes with memory */
> +	for_each_node_mask(i, physnode_mask) {
> +		phys_blk = emu_find_memblk_by_nid(i, pi);
> +		if (phys_blk < 0)
> +			continue;
> +		physnodes_with_mem++;
> +	}
>  
>  	/*
> -	 * Calculate the number of big nodes that can be allocated as a result
> -	 * of consolidating the remainder.
> +	 * Calculate target fake nodes sizes for each physical node with memory.
> +	 * x86_32 freaks on __udivdi3() so do the division in ulong number of
> +	 * pages and convert back.
>  	 */
> -	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
> -		FAKE_NODE_MIN_SIZE;
> +	for_each_node_mask(i, physnode_mask) {
> +		phys_blk = emu_find_memblk_by_nid(i, pi);
> +		if (phys_blk < 0)
> +			continue;
>  
> -	size &= FAKE_NODE_MIN_HASH_MASK;
> -	if (!size) {
> -		pr_err("Not enough memory for each node.  "
> -			"NUMA emulation disabled.\n");
> -		return -1;
> +		sizes[i] = pi->blk[phys_blk].end - pi->blk[phys_blk].start -
> +			   mem_hole_size(pi->blk[phys_blk].start, pi->blk[phys_blk].end);
> +		sizes[i] = PFN_PHYS((unsigned long)(sizes[i] >> PAGE_SHIFT) /
> +			   nr_nodes * physnodes_with_mem);
> +
> +		/*
> +		 * Calculate the number of big nodes that can be allocated as a result
> +		 * of consolidating the remainder.
> +		 */
> +		bigs[i] = ((sizes[i] & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / physnodes_with_mem /
> +			  FAKE_NODE_MIN_SIZE;
> +		sizes[i] &= FAKE_NODE_MIN_HASH_MASK;
> +		if (!sizes[i]) {
> +			pr_err("Not enough memory for each node inside physical numa node %d. NUMA emulation disabled.\n",
> +			       i);
> +			return -1;
> +		}
>  	}
>  
>  	/*
> @@ -138,16 +150,16 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
>  			}
>  			start = pi->blk[phys_blk].start;
>  			limit = pi->blk[phys_blk].end;
> -			end = start + size;
> +			end = start + sizes[i];
>  
> -			if (nid < big)
> +			if (nid < bigs[i])
>  				end += FAKE_NODE_MIN_SIZE;
>  
>  			/*
>  			 * Continue to add memory to this fake node if its
>  			 * non-reserved memory is less than the per-node size.
>  			 */
> -			while (end - start - mem_hole_size(start, end) < size) {
> +			while (end - start - mem_hole_size(start, end) < sizes[i]) {
>  				end += FAKE_NODE_MIN_SIZE;
>  				if (end > limit) {
>  					end = limit;
> @@ -169,7 +181,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
>  			 * next node, this one must extend to the end of the
>  			 * physical node.
>  			 */
> -			if (limit - end - mem_hole_size(end, limit) < size)
> +			if (limit - end - mem_hole_size(end, limit) < sizes[i])
>  				end = limit;
>  
>  			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
> @@ -432,7 +444,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
>  		unsigned long n;
>  
>  		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
> -		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
> +		ret = split_nodes_interleave(&ei, &pi, n);
>  	}
>  	if (*emu_cmdline == ':')
>  		emu_cmdline++;
> -- 
> 2.34.1
> 

-- 
Sincerely yours,
Mike.

next prev parent reply	other threads:[~2024-09-24 10:43 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-21  8:13 [PATCH] mm/fake-numa: per-phys node fake size Bruno Faccini
2024-09-24 10:40 ` Mike Rapoport [this message]
2024-09-24 15:27   ` Bruno Faccini
2024-09-25  9:28     ` Mike Rapoport
2024-09-29 15:43       ` Bruno Faccini
2024-10-01  7:15         ` Mike Rapoport

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZvKXFnriMlH2y5Oo@kernel.org \
    --to=rppt@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=bfaccini@nvidia.com \
    --cc=jhubbard@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ttabi@nvidia.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.