linux-mm.kvack.org archive mirror
* [RFC]numa: improve I/O performance by optimizing numa interleave allocation
@ 2011-11-18  7:12 Shaohua Li
  2011-11-18 15:56 ` Christoph Lameter
  2011-11-18 17:30 ` Andi Kleen
  0 siblings, 2 replies; 5+ messages in thread
From: Shaohua Li @ 2011-11-18  7:12 UTC (permalink / raw)
  To: lkml, linux-mm
  Cc: Andrew Morton, ak, Jens Axboe, Christoph Lameter,
	lee.schermerhorn

If the mempolicy is interleave, we allocate pages from nodes in a round-robin
way. This interleaves fairly, but it is not optimal.

Say the pages will be used for I/O later. With interleave allocation, two
consecutive pages come from two different nodes, so they are not physically
contiguous. Each page then needs its own segment for DMA scatter-gather, but
the number of hardware segments is limited. The non-contiguous pages quickly
use up the maximum number of hardware segments, and we can't merge I/O into
bigger DMA transfers. Allocating pages from one node doesn't have this issue:
thanks to the allocator's per-CPU page (pcp) lists, several consecutive
allocations are quite likely to return physically contiguous pages.

So can we make both interleave fairness and contiguous allocation happy?
We can simply adjust the round-robin algorithm: switch to the next node only
after several (N) allocations have happened. If N isn't too big, allocation
is still fair, and we get N contiguous pages. I use N=8 in the patch below;
8 doesn't seem too big for a modern NUMA machine. Applications that use
interleave are unlikely to run for only a short time, so fairness should
still hold.
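
To illustrate the idea outside the kernel, here is a minimal userspace-style
C sketch (the function name, the nodes[] array and alloc_cnt are stand-ins
for the real nodemask/task_struct handling; the actual change is the patch
below):

#define IL_ALLOC_STRIP  8   /* pages handed out per node before switching */

/* Sketch only: alloc_cnt plays the role of il_alloc_cnt in the patch;
 * the current node index is alloc_cnt / IL_ALLOC_STRIP. */
static int pick_interleave_node(const int *nodes, int nr_nodes,
                                unsigned int *alloc_cnt)
{
        unsigned int idx = *alloc_cnt / IL_ALLOC_STRIP;

        if ((*alloc_cnt + 1) % IL_ALLOC_STRIP != 0) {
                /* Stay on the current node until N pages were handed out. */
                (*alloc_cnt)++;
                return nodes[idx];
        }
        /* Stripe used up: hand out the last page, then advance the node. */
        *alloc_cnt = ((idx + 1) % nr_nodes) * IL_ALLOC_STRIP;
        return nodes[idx];
}

With two nodes this hands out 8 consecutive pages from the first node, then
8 from the second, and so on, instead of alternating on every page.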

Run a sequential read workload which accesses disks sdc - sdf,
iostat -x -m 5 shows:

without numactl --interleave=0,1:
Device:         rrqm/s   wrqm/s     r/s     w/s    rMB/s    wMB/s avgrq-sz avgqu-sz   await  svctm  %util
sdc              13.40     0.00  259.00    0.00    67.05     0.00   530.19     5.00   19.38   3.86 100.00
sdd              13.00     0.00  249.00    0.00    64.95     0.00   534.21     5.05   19.73   4.02 100.00
sde              13.60     0.00  258.60    0.00    67.40     0.00   533.78     4.96   18.98   3.87 100.00
sdf              13.00     0.00  261.60    0.00    67.50     0.00   528.44     5.24   19.77   3.82 100.00

with numactl --interleave=0,1:
sdc               6.80     0.00  419.60    0.00    64.90     0.00   316.77    14.17   34.04   2.38 100.00
sdd               6.00     0.00  423.40    0.00    65.58     0.00   317.23    17.33   41.14   2.36 100.00
sde               5.60     0.00  419.60    0.00    64.90     0.00   316.77    17.29   40.94   2.38 100.00
sdf               5.20     0.00  417.80    0.00    64.17     0.00   314.55    16.69   39.42   2.39 100.00

with numactl --interleave=0,1 and the patch below:
sdc              13.00     0.00  261.20    0.00    68.20     0.00   534.74     5.05   19.19   3.83 100.00
sde              13.40     0.00  259.00    0.00    67.85     0.00   536.52     4.85   18.80   3.86 100.00
sdf              13.00     0.00  260.60    0.00    68.20     0.00   535.97     4.85   18.61   3.84 100.00
sdd              13.20     0.00  251.60    0.00    66.00     0.00   537.23     4.95   19.45   3.97 100.00

The avgrq-sz (average request size, in 512-byte sectors) is increased a lot:
from roughly 316 sectors (~158KB) to roughly 535 sectors (~267KB) per request.
Performance is boosted a little too.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
 include/linux/sched.h |    2 +-
 mm/mempolicy.c        |   33 +++++++++++++++++++++++----------
 2 files changed, 24 insertions(+), 11 deletions(-)

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h	2011-11-18 13:38:50.000000000 +0800
+++ linux/include/linux/sched.h	2011-11-18 13:42:41.000000000 +0800
@@ -1506,7 +1506,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
-	short il_next;
+	int il_alloc_cnt;
 	short pref_node_fork;
 #endif
 	struct rcu_head rcu;
Index: linux/mm/mempolicy.c
===================================================================
--- linux.orig/mm/mempolicy.c	2011-11-18 13:38:50.000000000 +0800
+++ linux/mm/mempolicy.c	2011-11-18 13:44:57.000000000 +0800
@@ -97,6 +97,10 @@
 
 #include "internal.h"
 
+#define IL_ALLOC_STRIP (8)
+#define IL_CNT_TO_NODE(il_alloc_cnt) ((il_alloc_cnt) / IL_ALLOC_STRIP)
+#define IL_NODE_TO_CNT(node) ((node) * IL_ALLOC_STRIP)
+
 /* Internal flags */
 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
@@ -335,12 +339,15 @@ static void mpol_rebind_nodemask(struct
 	else
 		BUG();
 
-	if (!node_isset(current->il_next, tmp)) {
-		current->il_next = next_node(current->il_next, tmp);
-		if (current->il_next >= MAX_NUMNODES)
-			current->il_next = first_node(tmp);
-		if (current->il_next >= MAX_NUMNODES)
-			current->il_next = numa_node_id();
+	if (!node_isset(IL_CNT_TO_NODE(current->il_alloc_cnt), tmp)) {
+		int newnode;
+
+		newnode = next_node(IL_CNT_TO_NODE(current->il_alloc_cnt), tmp);
+		if (newnode >= MAX_NUMNODES)
+			newnode = first_node(tmp);
+		if (newnode >= MAX_NUMNODES)
+			newnode = numa_node_id();
+		current->il_alloc_cnt = IL_NODE_TO_CNT(newnode);
 	}
 }
 
@@ -744,7 +751,8 @@ static long do_set_mempolicy(unsigned sh
 	mpol_set_task_struct_flag();
 	if (new && new->mode == MPOL_INTERLEAVE &&
 	    nodes_weight(new->v.nodes))
-		current->il_next = first_node(new->v.nodes);
+		current->il_alloc_cnt =
+			IL_NODE_TO_CNT(first_node(new->v.nodes));
 	task_unlock(current);
 	if (mm)
 		up_write(&mm->mmap_sem);
@@ -849,7 +857,7 @@ static long do_get_mempolicy(int *policy
 			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->mode == MPOL_INTERLEAVE) {
-			*policy = current->il_next;
+			*policy = IL_CNT_TO_NODE(current->il_alloc_cnt);
 		} else {
 			err = -EINVAL;
 			goto out;
@@ -1553,12 +1561,17 @@ static unsigned interleave_nodes(struct
 	unsigned nid, next;
 	struct task_struct *me = current;
 
-	nid = me->il_next;
+	if (((me->il_alloc_cnt + 1) % IL_ALLOC_STRIP) != 0) {
+		me->il_alloc_cnt++;
+		return IL_CNT_TO_NODE(me->il_alloc_cnt);
+	}
+
+	nid = IL_CNT_TO_NODE(me->il_alloc_cnt);
 	next = next_node(nid, policy->v.nodes);
 	if (next >= MAX_NUMNODES)
 		next = first_node(policy->v.nodes);
 	if (next < MAX_NUMNODES)
-		me->il_next = next;
+		me->il_alloc_cnt = IL_NODE_TO_CNT(next);
 	return nid;
 }
 



* Re: [RFC]numa: improve I/O performance by optimizing numa interleave allocation
  2011-11-18  7:12 [RFC]numa: improve I/O performance by optimizing numa interleave allocation Shaohua Li
@ 2011-11-18 15:56 ` Christoph Lameter
  2011-11-18 17:30 ` Andi Kleen
  1 sibling, 0 replies; 5+ messages in thread
From: Christoph Lameter @ 2011-11-18 15:56 UTC (permalink / raw)
  To: Shaohua Li
  Cc: lkml, linux-mm, Andrew Morton, ak, Jens Axboe, lee.schermerhorn

On Fri, 18 Nov 2011, Shaohua Li wrote:

> So can we make both interleave fairness and continuous allocation happy?

Maybe.

> Simplily we can adjust the round robin algorithm. We switch to another node
> after several (N) allocation happens. If N isn't too big, we can still get
> fair allocation. And we get N continuous pages. I use N=8 in below patch.
> I thought 8 isn't too big for modern NUMA machine. Applications which use
> interleave are unlikely run short time, so I thought fairness still works.

People are already complaining that the 4k interleaving is too coarse.
BIOSes can often interleave on a cacheline level. A smaller size balances
the load better over multiple nodes. Large sizes can result in imbalances
since e.g. a whole array may end up on one node. Maybe make it tunable
by expanding the numa_policy structure to include a size parameter?
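
As a rough sketch of what that could look like (the field name and the helper
below are purely illustrative, not an existing interface):

/*
 * Hypothetical sketch only: carry the stripe size in the policy itself
 * instead of a compile-time constant.
 */
struct mempolicy {
        /* ... existing fields (refcnt, mode, flags, v.nodes, ...) ... */
        unsigned int il_stripe; /* pages per node before switching;
                                   1 == today's page-by-page interleave */
};

static inline unsigned int interleave_stripe(struct mempolicy *pol)
{
        /* Fall back to plain per-page interleave if no stripe was set. */
        return pol->il_stripe ? pol->il_stripe : 1;
}

interleave_nodes() would then use interleave_stripe(pol) where the proposed
patch uses IL_ALLOC_STRIP, and set_mempolicy()/mbind() would need some way
(presumably a new MPOL_F_* flag or similar) for userspace to set the value.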



* Re: [RFC]numa: improve I/O performance by optimizing numa interleave allocation
  2011-11-18  7:12 [RFC]numa: improve I/O performance by optimizing numa interleave allocation Shaohua Li
  2011-11-18 15:56 ` Christoph Lameter
@ 2011-11-18 17:30 ` Andi Kleen
  2011-11-21  1:39   ` Shaohua Li
  1 sibling, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2011-11-18 17:30 UTC (permalink / raw)
  To: Shaohua Li
  Cc: lkml, linux-mm, Andrew Morton, Jens Axboe, Christoph Lameter,
	lee.schermerhorn

On Fri, Nov 18, 2011 at 03:12:12PM +0800, Shaohua Li wrote:
> If mem plicy is interleaves, we will allocated pages from nodes in a round
> robin way. This surely can do interleave fairly, but not optimal.
> 
> Say the pages will be used for I/O later. Interleave allocation for two pages
> are allocated from two nodes, so the pages are not physically continuous. Later
> each page needs one segment for DMA scatter-gathering. But maxium hardware
> segment number is limited. The non-continuous pages will use up maxium
> hardware segment number soon and we can't merge I/O to bigger DMA. Allocating
> pages from one node hasn't such issue. The memory allocator pcp list makes
> we can get physically continuous pages in several alloc quite likely.

FWIW it depends a lot on the IO hardware if the SG limitation
really makes a measurable difference for IO performance. I saw some wins from 
clustering using the IOMMU before, but that was a long time ago. I wouldn't 
consider it a truth without strong numbers, and then also only
for that particular device measured.

My understanding is that modern IO devices like NHM Express will
be faster at large SG lists.

> So can we make both interleave fairness and continuous allocation happy?
> Simplily we can adjust the round robin algorithm. We switch to another node
> after several (N) allocation happens. If N isn't too big, we can still get
> fair allocation. And we get N continuous pages. I use N=8 in below patch.
> I thought 8 isn't too big for modern NUMA machine. Applications which use
> interleave are unlikely run short time, so I thought fairness still works.

It depends a lot on the CPU access pattern.

Some workloads seem to do reasonably well with 2MB huge page interleaving.
But others actually prefer the cache line interleaving supplied by 
the BIOS.

So you can have a trade off between IO and CPU performance.
When in doubt I usually opt for CPU performance by default.

I definitely wouldn't make it the default, but if there are workloads
that benefit a lot, it could be an additional parameter to the
interleave policy.

> Run a sequential read workload which accesses disks sdc - sdf,

What IO device is that?

-Andi


* Re: [RFC]numa: improve I/O performance by optimizing numa interleave allocation
  2011-11-18 17:30 ` Andi Kleen
@ 2011-11-21  1:39   ` Shaohua Li
  2011-11-23  3:36     ` Shaohua Li
  0 siblings, 1 reply; 5+ messages in thread
From: Shaohua Li @ 2011-11-21  1:39 UTC (permalink / raw)
  To: Andi Kleen
  Cc: lkml, linux-mm, Andrew Morton, Jens Axboe, Christoph Lameter,
	lee.schermerhorn@hp.com

On Sat, 2011-11-19 at 01:30 +0800, Andi Kleen wrote:
> On Fri, Nov 18, 2011 at 03:12:12PM +0800, Shaohua Li wrote:
> > If the mempolicy is interleave, we allocate pages from nodes in a
> > round-robin way. This interleaves fairly, but it is not optimal.
> > 
> > Say the pages will be used for I/O later. With interleave allocation, two
> > consecutive pages come from two different nodes, so they are not
> > physically contiguous. Each page then needs its own segment for DMA
> > scatter-gather, but the number of hardware segments is limited. The
> > non-contiguous pages quickly use up the maximum number of hardware
> > segments, and we can't merge I/O into bigger DMA transfers. Allocating
> > pages from one node doesn't have this issue: thanks to the allocator's
> > per-CPU page (pcp) lists, several consecutive allocations are quite
> > likely to return physically contiguous pages.
> 
> FWIW it depends a lot on the IO hardware if the SG limitation
> really makes a measurable difference for IO performance. I saw some wins from 
> clustering using the IOMMU before, but that was a long time ago. I wouldn't 
> consider it a truth without strong numbers, and then also only
> for that particular device measured.
> 
> My understanding is that modern IO devices like NHM Express will
> be faster at large SG lists.
This is an LSI SAS1068E HBA card with some hard disks attached. The
clustering has a real benefit here: I/O throughput increases by 3% or so.
I'm not sure about NHM Express, and I wonder why a large SG list could be
faster. Doesn't a large SG list mean a large DMA descriptor?

> > So can we make both interleave fairness and contiguous allocation happy?
> > We can simply adjust the round-robin algorithm: switch to the next node
> > only after several (N) allocations have happened. If N isn't too big,
> > allocation is still fair, and we get N contiguous pages. I use N=8 in the
> > patch below; 8 doesn't seem too big for a modern NUMA machine.
> > Applications that use interleave are unlikely to run for only a short
> > time, so fairness should still hold.
> 
> It depends a lot on the CPU access pattern.
> 
> Some workloads seem to do reasonably well with 2MB huge page interleaving.
> But others actually prefer the cache line interleaving supplied by 
> the BIOS.
> 
> So you can have a trade off between IO and CPU performance.
> When in doubt I usually opt for CPU performance by default.
Can you elaborate on this more? Cache-line interleaving can only be supplied
by the BIOS; the OS can provide N*PAGE_SIZE interleaving. I'm wondering what
difference, for example, a 4k versus an 8k interleave makes for CPU
performance. Actually, adjacent pages interleaved across two nodes could end
up with the same cache coloring, while two adjacent pages allocated from one
node do not, so clustering could even be more cache efficient from a coloring
point of view.

> I definitely wouldn't make it the default, but if there are workloads
> that benefit a lot, it could be an additional parameter to the
> interleave policy.
Christoph suggested the same approach. The problem is that we need to change
the API, right? And how are users supposed to use it? It would be difficult
for them to determine the correct parameter.

If 8-page clustering is too big, maybe we can use something smaller. I guess
a 2-page clustering would be a big win too.

And I didn't change the allocation for the VMA case, which is supposed to be
used for anonymous pages.

Thanks,
Shaohua


* Re: [RFC]numa: improve I/O performance by optimizing numa interleave allocation
  2011-11-21  1:39   ` Shaohua Li
@ 2011-11-23  3:36     ` Shaohua Li
  0 siblings, 0 replies; 5+ messages in thread
From: Shaohua Li @ 2011-11-23  3:36 UTC (permalink / raw)
  To: Andi Kleen
  Cc: lkml, linux-mm, Andrew Morton, Jens Axboe, Christoph Lameter,
	lee.schermerhorn@hp.com

On Mon, 2011-11-21 at 09:39 +0800, Shaohua Li wrote:
> On Sat, 2011-11-19 at 01:30 +0800, Andi Kleen wrote:
> > On Fri, Nov 18, 2011 at 03:12:12PM +0800, Shaohua Li wrote:
> > > If the mempolicy is interleave, we allocate pages from nodes in a
> > > round-robin way. This interleaves fairly, but it is not optimal.
> > > 
> > > Say the pages will be used for I/O later. With interleave allocation,
> > > two consecutive pages come from two different nodes, so they are not
> > > physically contiguous. Each page then needs its own segment for DMA
> > > scatter-gather, but the number of hardware segments is limited. The
> > > non-contiguous pages quickly use up the maximum number of hardware
> > > segments, and we can't merge I/O into bigger DMA transfers. Allocating
> > > pages from one node doesn't have this issue: thanks to the allocator's
> > > per-CPU page (pcp) lists, several consecutive allocations are quite
> > > likely to return physically contiguous pages.
> > 
> > FWIW it depends a lot on the IO hardware if the SG limitation
> > really makes a measurable difference for IO performance. I saw some wins from 
> > clustering using the IOMMU before, but that was a long time ago. I wouldn't 
> > consider it a truth without strong numbers, and then also only
> > for that particular device measured.
> > 
> > My understanding is that modern IO devices like NHM Express will
> > be faster at large SG lists.
> This is an LSI SAS1068E HBA card with some hard disks attached. The
> clustering has a real benefit here: I/O throughput increases by 3% or so.
> I'm not sure about NHM Express, and I wonder why a large SG list could be
> faster. Doesn't a large SG list mean a large DMA descriptor?
> 
> > > So can we make both interleave fairness and contiguous allocation
> > > happy? We can simply adjust the round-robin algorithm: switch to the
> > > next node only after several (N) allocations have happened. If N isn't
> > > too big, allocation is still fair, and we get N contiguous pages. I use
> > > N=8 in the patch below; 8 doesn't seem too big for a modern NUMA
> > > machine. Applications that use interleave are unlikely to run for only
> > > a short time, so fairness should still hold.
> > 
> > It depends a lot on the CPU access pattern.
> > 
> > Some workloads seem to do reasonably well with 2MB huge page interleaving.
> > But others actually prefer the cache line interleaving supplied by 
> > the BIOS.
> > 
> > So you can have a trade off between IO and CPU performance.
> > When in doubt I usually opt for CPU performance by default.
> Can you elaborate on this more? Cache-line interleaving can only be
> supplied by the BIOS; the OS can provide N*PAGE_SIZE interleaving. I'm
> wondering what difference, for example, a 4k versus an 8k interleave makes
> for CPU performance. Actually, adjacent pages interleaved across two nodes
> could end up with the same cache coloring, while two adjacent pages
> allocated from one node do not, so clustering could even be more cache
> efficient from a coloring point of view.
> 
> > I definitely wouldn't make it the default, but if there are workloads
> > that benefit a lot, it could be an additional parameter to the
> > interleave policy.
> Christoph suggested the same approach. The problem is that we need to
> change the API, right? And how are users supposed to use it? It would be
> difficult for them to determine the correct parameter.
> 
> If 8-page clustering is too big, maybe we can use something smaller. I
> guess a 2-page clustering would be a big win too.
> 
> And I didn't change the allocation for the VMA case, which is supposed to
> be used for anonymous pages.
I tried a 2-page clustering; it has the same effect as the 8-page clustering
in my test environment. Would making the clustering a config option or a
sysctl be better?
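
For instance, a sysctl could look roughly like the sketch below (the names
are made up for illustration and this is untested, just to show the shape):

static int sysctl_numa_interleave_stripe = 8;

static struct ctl_table interleave_table[] = {
        {
                .procname       = "numa_interleave_stripe",
                .data           = &sysctl_numa_interleave_stripe,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};

interleave_nodes() would then read sysctl_numa_interleave_stripe instead of
the IL_ALLOC_STRIP constant, and the table would be hooked into the existing
vm sysctls (or registered with register_sysctl_paths()).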

Thanks,
Shaohua

