* [PATCH 1/5] md: add a 'md_numa_node' module parameter
From: Zhengyuan Liu @ 2017-07-01 7:20 UTC
To: linux-raid; +Cc: shli, neilb, colyli, liuyun01
This module parameter lets the user control which NUMA node the memory
for the main structures (e.g. mddev, md_rdev, request_queue, gendisk)
is allocated from. The idea comes from commit 115485e83f49 ("dm: add
'dm_numa_node' module parameter"). If the parameter is not set when the
module is loaded, md's default behavior is unchanged.

A numa_node_id field is added to struct mddev so that personalities
such as raid0 can inherit the node from mddev; for personalities that
run their own handling threads, a dedicated module parameter can be
added as well.

This patch includes a fix for the parameter range check, suggested by
Coly Li <colyli@suse.de>.
Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
---
drivers/md/md.c | 18 ++++++++++++++----
drivers/md/md.h | 2 ++
2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 092b48f..fae93db 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -184,6 +184,8 @@ static int start_readonly;
*/
static bool create_on_open = true;
+static int md_numa_node = NUMA_NO_NODE;
+
/* bio_clone_mddev
* like bio_clone, but with a local bio set
*/
@@ -588,7 +590,7 @@ static struct mddev *mddev_find(dev_t unit)
}
spin_unlock(&all_mddevs_lock);
- new = kzalloc(sizeof(*new), GFP_KERNEL);
+ new = kzalloc_node(sizeof(*new), GFP_KERNEL, md_numa_node);
if (!new)
return NULL;
@@ -598,6 +600,7 @@ static struct mddev *mddev_find(dev_t unit)
else
new->md_minor = MINOR(unit) >> MdpMinorShift;
+ new->numa_node_id = md_numa_node;
mddev_init(new);
goto retry;
@@ -3387,7 +3390,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
struct md_rdev *rdev;
sector_t size;
- rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
+ rdev = kzalloc_node(sizeof(*rdev), GFP_KERNEL, md_numa_node);
if (!rdev)
return ERR_PTR(-ENOMEM);
@@ -5256,7 +5259,7 @@ static int md_alloc(dev_t dev, char *name)
mddev->hold_active = UNTIL_STOP;
error = -ENOMEM;
- mddev->queue = blk_alloc_queue(GFP_KERNEL);
+ mddev->queue = blk_alloc_queue_node(GFP_KERNEL, md_numa_node);
if (!mddev->queue)
goto abort;
mddev->queue->queuedata = mddev;
@@ -5264,7 +5267,7 @@ static int md_alloc(dev_t dev, char *name)
blk_queue_make_request(mddev->queue, md_make_request);
blk_set_stacking_limits(&mddev->queue->limits);
- disk = alloc_disk(1 << shift);
+ disk = alloc_disk_node(1 << shift, md_numa_node);
if (!disk) {
blk_cleanup_queue(mddev->queue);
mddev->queue = NULL;
@@ -8969,6 +8972,11 @@ static int __init md_init(void)
register_reboot_notifier(&md_notifier);
raid_table_header = register_sysctl_table(raid_root_table);
+ if (md_numa_node < NUMA_NO_NODE)
+ md_numa_node = NUMA_NO_NODE;
+ else if (md_numa_node > (num_online_nodes() - 1))
+ md_numa_node = num_online_nodes() - 1;
+
md_geninit();
return 0;
@@ -9252,6 +9260,8 @@ module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
+module_param(md_numa_node, int, S_IRUGO);
+MODULE_PARM_DESC(md_numa_node, "NUMA node for md device memory allocations");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 991f0fe..5c91033 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -459,6 +459,8 @@ struct mddev {
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */
+
+ int numa_node_id;
};
enum recovery_flags {
--
2.7.4
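
A note on the range check added to md_init() above: clamping an
out-of-range value to num_online_nodes() - 1 assumes that online node
IDs are contiguous starting at 0, which the kernel does not guarantee
(node IDs may be sparse). A stricter variant, shown below purely as a
hypothetical sketch and not as part of this patch, would fall back to
NUMA_NO_NODE for any ID that is not currently online:

/* Hypothetical stricter check, not in the patch: fall back to
 * NUMA_NO_NODE (the default allocation policy) instead of clamping,
 * so a mistyped node ID cannot silently land on another node.
 */
static int md_validate_numa_node(int node)
{
	if (node == NUMA_NO_NODE)
		return NUMA_NO_NODE;
	if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
		return NUMA_NO_NODE;
	return node;
}

Either way, the parameter is given at load time (e.g.
"modprobe md_mod md_numa_node=1") or, with md built in, on the kernel
command line as md_mod.md_numa_node=1.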
* [PATCH 2/5] raid0: make memory allocation from md's numa node
From: Zhengyuan Liu @ 2017-07-01 7:20 UTC
To: linux-raid; +Cc: shli, neilb, colyli, liuyun01
raid0 handles I/O in the same context as md itself (it has no handling
thread of its own), so there is no need for a separate module parameter;
just inherit the node from mddev.
Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
---
drivers/md/raid0.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 94d9ae9..7a301b3 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -89,7 +89,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
int cnt;
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
- struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+ struct r0conf *conf = kzalloc_node(sizeof(*conf), GFP_KERNEL,
+ mddev->numa_node_id);
unsigned short blksize = 512;
*private_conf = ERR_PTR(-ENOMEM);
@@ -158,13 +159,14 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
err = -ENOMEM;
- conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
- conf->nr_strip_zones, GFP_KERNEL);
+ conf->strip_zone = kzalloc_node(sizeof(struct strip_zone)*
+ conf->nr_strip_zones, GFP_KERNEL,
+ mddev->numa_node_id);
if (!conf->strip_zone)
goto abort;
- conf->devlist = kzalloc(sizeof(struct md_rdev*)*
+ conf->devlist = kzalloc_node(sizeof(struct md_rdev *)*
conf->nr_strip_zones*mddev->raid_disks,
- GFP_KERNEL);
+ GFP_KERNEL, mddev->numa_node_id);
if (!conf->devlist)
goto abort;
--
2.7.4
* [PATCH 3/5] md/linear: make memory allocation from md's numa node
From: Zhengyuan Liu @ 2017-07-01 7:20 UTC
To: linux-raid; +Cc: shli, neilb, colyli, liuyun01
linear handles I/O in the same context as md itself (it has no handling
thread of its own), so there is no need for a separate module parameter;
just inherit the node from mddev.
Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
---
drivers/md/linear.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5f1eb91..43ea6eb 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -96,8 +96,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
int i, cnt;
bool discard_supported = false;
- conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
- GFP_KERNEL);
+ conf = kzalloc_node(sizeof (*conf) + raid_disks*sizeof(struct dev_info),
+ GFP_KERNEL, mddev->numa_node_id);
if (!conf)
return NULL;
--
2.7.4
* [PATCH 4/5] raid10: add a 'r10_numa_node' module parameter
From: Zhengyuan Liu @ 2017-07-01 7:20 UTC
To: linux-raid; +Cc: shli, neilb, colyli, liuyun01
This module parameter lets the user control which NUMA node the memory
for raid10's main structures is allocated from.
Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
---
drivers/md/raid10.c | 32 ++++++++++++++++++++++----------
1 file changed, 22 insertions(+), 10 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 305b0db..f49ab2a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -97,6 +97,8 @@
*/
static int max_queued_requests = 1024;
+static int r10_numa_node = NUMA_NO_NODE;
+
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
@@ -135,7 +137,7 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
/* allocate a r10bio with room for raid_disks entries in the
* bios array */
- return kzalloc(size, gfp_flags);
+ return kzalloc_node(size, gfp_flags, r10_numa_node);
}
static void r10bio_pool_free(void *r10_bio, void *data)
@@ -179,7 +181,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
nalloc_rp = nalloc;
else
nalloc_rp = nalloc * 2;
- rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
+ rps = kmalloc_node(sizeof(struct resync_pages) * nalloc_rp, gfp_flags,
+ r10_numa_node);
if (!rps)
goto out_free_r10bio;
@@ -2801,7 +2804,8 @@ static int init_resync(struct r10conf *conf)
for (i = 0; i < conf->geo.raid_disks; i++)
if (conf->mirrors[i].replacement)
conf->have_replacement = 1;
- conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
+ conf->r10buf_pool = mempool_create_node(buffs, r10buf_pool_alloc, r10buf_pool_free,
+ conf, GFP_KERNEL, r10_numa_node);
if (!conf->r10buf_pool)
return -ENOMEM;
conf->next_resync = 0;
@@ -3532,14 +3536,14 @@ static struct r10conf *setup_conf(struct mddev *mddev)
}
err = -ENOMEM;
- conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
+ conf = kzalloc_node(sizeof(struct r10conf), GFP_KERNEL, r10_numa_node);
if (!conf)
goto out;
/* FIXME calc properly */
- conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
+ conf->mirrors = kzalloc_node(sizeof(struct raid10_info)*(mddev->raid_disks +
max(0,-mddev->delta_disks)),
- GFP_KERNEL);
+ GFP_KERNEL, r10_numa_node);
if (!conf->mirrors)
goto out;
@@ -3549,8 +3553,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
conf->geo = geo;
conf->copies = copies;
- conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
- r10bio_pool_free, conf);
+ conf->r10bio_pool = mempool_create_node(NR_RAID10_BIOS, r10bio_pool_alloc,
+ r10bio_pool_free, conf,
+ GFP_KERNEL, r10_numa_node);
if (!conf->r10bio_pool)
goto out;
@@ -3971,11 +3976,11 @@ static int raid10_check_reshape(struct mddev *mddev)
conf->mirrors_new = NULL;
if (mddev->delta_disks > 0) {
/* allocate new 'mirrors' list */
- conf->mirrors_new = kzalloc(
+ conf->mirrors_new = kzalloc_node(
sizeof(struct raid10_info)
*(mddev->raid_disks +
mddev->delta_disks),
- GFP_KERNEL);
+ GFP_KERNEL, r10_numa_node);
if (!conf->mirrors_new)
return -ENOMEM;
}
@@ -4728,6 +4733,11 @@ static struct md_personality raid10_personality =
static int __init raid_init(void)
{
+ if (r10_numa_node < NUMA_NO_NODE)
+ r10_numa_node = NUMA_NO_NODE;
+ else if (r10_numa_node > (num_online_nodes() - 1))
+ r10_numa_node = num_online_nodes() - 1;
+
return register_md_personality(&raid10_personality);
}
@@ -4745,3 +4755,5 @@ MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");
module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
+module_param(r10_numa_node, int, S_IRUGO);
+MODULE_PARM_DESC(r10_numa_node, "NUMA node for raid10 memory allocations");
--
2.7.4
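
One observation on the conversions above: the node passed to
mempool_create_node() controls where the mempool's own bookkeeping is
allocated, while the elements still come from r10bio_pool_alloc(),
which reads the file-scope r10_numa_node. A per-array alternative would
thread the node through the pool's private data instead; the sketch
below assumes a hypothetical conf->numa_node field (not part of this
series) and would also let several arrays use different nodes:

/* Hypothetical per-array variant of r10bio_pool_alloc(): take the
 * node from the r10conf passed in as pool_data rather than from a
 * module-wide variable.  conf->numa_node is an assumed new field.
 */
static void *r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	/* a r10bio with room for 'copies' entries in the bios array */
	int size = offsetof(struct r10bio, devs[conf->copies]);

	return kzalloc_node(size, gfp_flags, conf->numa_node);
}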
* [PATCH 5/5] raid5: add a 'r5_numa_node' module parameter
From: Zhengyuan Liu @ 2017-07-01 7:20 UTC
To: linux-raid; +Cc: shli, neilb, colyli, liuyun01
This module parameter lets the user control which NUMA node the memory
for raid5's main structures is allocated from.
Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
---
drivers/md/raid5.c | 54 ++++++++++++++++++++++++++++++++++++------------------
1 file changed, 36 insertions(+), 18 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 547d5fa..b61417f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -75,6 +75,11 @@ static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
"Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
+
+static int r5_numa_node = NUMA_NO_NODE;
+module_param(r5_numa_node, int, S_IRUGO);
+MODULE_PARM_DESC(r5_numa_node, "NUMA node for raid5 memory allocations");
+
static struct workqueue_struct *raid5_wq;
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
@@ -484,7 +489,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
for (i = 0; i < num; i++) {
struct page *page;
- if (!(page = alloc_page(gfp))) {
+ if (!(page = alloc_pages_node(r5_numa_node, gfp, 0))) {
return 1;
}
sh->dev[i].page = page;
@@ -2135,7 +2140,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
struct stripe_head *sh;
int i;
- sh = kmem_cache_zalloc(sc, gfp);
+ sh = kmem_cache_alloc_node(sc, gfp | __GFP_ZERO, r5_numa_node);
if (sh) {
spin_lock_init(&sh->stripe_lock);
spin_lock_init(&sh->batch_lock);
@@ -2154,7 +2159,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
}
if (raid5_has_ppl(conf)) {
- sh->ppl_page = alloc_page(gfp);
+ sh->ppl_page = alloc_pages_node(r5_numa_node, gfp, 0);
if (!sh->ppl_page) {
free_stripe(sc, sh);
sh = NULL;
@@ -2383,13 +2388,15 @@ static int resize_stripes(struct r5conf *conf, int newsize)
* is completely stalled, so now is a good time to resize
* conf->disks and the scribble region
*/
- ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
+ ndisks = kzalloc_node(newsize * sizeof(struct disk_info), GFP_NOIO,
+ r5_numa_node);
if (ndisks) {
for (i = 0; i < conf->pool_size; i++)
ndisks[i] = conf->disks[i];
for (i = conf->pool_size; i < newsize; i++) {
- ndisks[i].extra_page = alloc_page(GFP_NOIO);
+ ndisks[i].extra_page = alloc_pages_node(r5_numa_node,
+ GFP_NOIO, 0);
if (!ndisks[i].extra_page)
err = -ENOMEM;
}
@@ -2418,7 +2425,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)
for (i=conf->raid_disks; i < newsize; i++)
if (nsh->dev[i].page == NULL) {
- struct page *p = alloc_page(GFP_NOIO);
+ struct page *p = alloc_pages_node(r5_numa_node,
+ GFP_NOIO, 0);
nsh->dev[i].page = p;
nsh->dev[i].orig_page = p;
if (!p)
@@ -3921,7 +3929,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
dev->page == dev->orig_page &&
!test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
/* alloc page for prexor */
- struct page *p = alloc_page(GFP_NOIO);
+ struct page *p = alloc_pages_node(r5_numa_node,
+ GFP_NOIO, 0);
if (p) {
dev->orig_page = p;
@@ -6653,9 +6662,9 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
}
*group_cnt = num_possible_nodes();
size = sizeof(struct r5worker) * cnt;
- workers = kzalloc(size * *group_cnt, GFP_NOIO);
- *worker_groups = kzalloc(sizeof(struct r5worker_group) *
- *group_cnt, GFP_NOIO);
+ workers = kzalloc_node(size * *group_cnt, GFP_NOIO, r5_numa_node);
+ *worker_groups = kzalloc_node(sizeof(struct r5worker_group) *
+ *group_cnt, GFP_NOIO, r5_numa_node);
if (!*worker_groups || !workers) {
kfree(workers);
kfree(*worker_groups);
@@ -6720,7 +6729,8 @@ static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
if (conf->level == 6 && !percpu->spare_page)
- percpu->spare_page = alloc_page(GFP_KERNEL);
+ percpu->spare_page = alloc_pages_node(r5_numa_node,
+ GFP_KERNEL, 0);
if (!percpu->scribble)
percpu->scribble = scribble_alloc(max(conf->raid_disks,
conf->previous_raid_disks),
@@ -6880,13 +6890,13 @@ static struct r5conf *setup_conf(struct mddev *mddev)
return ERR_PTR(-EINVAL);
}
- conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
+ conf = kzalloc_node(sizeof(struct r5conf), GFP_KERNEL, r5_numa_node);
if (conf == NULL)
goto abort;
INIT_LIST_HEAD(&conf->free_list);
INIT_LIST_HEAD(&conf->pending_list);
- conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
- PENDING_IO_MAX, GFP_KERNEL);
+ conf->pending_data = kzalloc_node(sizeof(struct r5pending_data) *
+ PENDING_IO_MAX, GFP_KERNEL, r5_numa_node);
if (!conf->pending_data)
goto abort;
for (i = 0; i < PENDING_IO_MAX; i++)
@@ -6935,14 +6945,15 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
max_disks = max(conf->raid_disks, conf->previous_raid_disks);
- conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
- GFP_KERNEL);
+ conf->disks = kzalloc_node(max_disks * sizeof(struct disk_info),
+ GFP_KERNEL, r5_numa_node);
if (!conf->disks)
goto abort;
for (i = 0; i < max_disks; i++) {
- conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
+ conf->disks[i].extra_page = alloc_pages_node(r5_numa_node,
+ GFP_KERNEL, 0);
if (!conf->disks[i].extra_page)
goto abort;
}
@@ -6952,7 +6963,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
conf->mddev = mddev;
- if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
+ if ((conf->stripe_hashtbl = kzalloc_node(PAGE_SIZE, GFP_KERNEL, r5_numa_node))
+ == NULL)
goto abort;
/* We init hash_locks[0] separately to that it can be used
@@ -8445,6 +8457,12 @@ static int __init raid5_init(void)
destroy_workqueue(raid5_wq);
return ret;
}
+
+ if (r5_numa_node < NUMA_NO_NODE)
+ r5_numa_node = NUMA_NO_NODE;
+ else if (r5_numa_node > (num_online_nodes() - 1))
+ r5_numa_node = num_online_nodes() - 1;
+
register_md_personality(&raid6_personality);
register_md_personality(&raid5_personality);
register_md_personality(&raid4_personality);
--
2.7.4
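
One detail of this patch worth flagging: percpu->spare_page is per-CPU
scratch memory, so allocating it on a single module-wide node makes it
remote for every CPU on the other nodes. A locality-preserving
alternative, sketched below under the assumption of an extra cpu
argument (the in-tree alloc_scratch_buffer() does not take one), would
allocate on the owning CPU's node:

/* Hypothetical sketch: place each CPU's spare page on that CPU's
 * own node via cpu_to_node() instead of on r5_numa_node.
 */
static struct page *alloc_spare_page_for_cpu(int cpu, gfp_t gfp)
{
	return alloc_pages_node(cpu_to_node(cpu), gfp, 0);
}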
* Re: [PATCH 1/5] md: add a 'md_numa_node' module parameter
From: Shaohua Li @ 2017-07-10 17:28 UTC
To: Zhengyuan Liu; +Cc: linux-raid, neilb, colyli, liuyun01
On Sat, Jul 01, 2017 at 03:20:54PM +0800, Zhengyuan Liu wrote:
> This module parameter lets the user control which NUMA node the memory
> for the main structures (e.g. mddev, md_rdev, request_queue, gendisk)
> is allocated from. The idea comes from commit 115485e83f49 ("dm: add
> 'dm_numa_node' module parameter"). If the parameter is not set when the
> module is loaded, md's default behavior is unchanged.
>
> A numa_node_id field is added to struct mddev so that personalities
> such as raid0 can inherit the node from mddev; for personalities that
> run their own handling threads, a dedicated module parameter can be
> added as well.
>
> This patch includes a fix for the parameter range check, suggested by
> Coly Li <colyli@suse.de>.
>
> Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Can you give an example of why this is useful?

If we have several RAID arrays, we can't control the NUMA node for each
of them separately. And with patch 4/5, one array could have two
different parameters controlling its memory allocation, which is weird.
I have no idea how users would use this feature.

If we really want to support NUMA placement, would it be better to
deduce the node from the combination of all the underlying disks?
Thanks,
Shaohua
> [full patch quoted; snipped]
* Re: [PATCH 1/5] md: add a 'md_numa_node' module parameter
From: Zhengyuan Liu @ 2017-07-18 2:03 UTC
To: Shaohua Li; +Cc: linux-raid, neilb, colyli, liuyun01
2017-07-11 1:28 GMT+08:00 Shaohua Li <shli@kernel.org>:
> On Sat, Jul 01, 2017 at 03:20:54PM +0800, Zhengyuan Liu wrote:
>> This module parameter lets the user control which NUMA node the memory
>> for the main structures (e.g. mddev, md_rdev, request_queue, gendisk)
>> is allocated from. The idea comes from commit 115485e83f49 ("dm: add
>> 'dm_numa_node' module parameter"). If the parameter is not set when the
>> module is loaded, md's default behavior is unchanged.
>>
>> A numa_node_id field is added to struct mddev so that personalities
>> such as raid0 can inherit the node from mddev; for personalities that
>> run their own handling threads, a dedicated module parameter can be
>> added as well.
>>
>> This patch includes a fix for the parameter range check, suggested by
>> Coly Li <colyli@suse.de>.
>>
>> Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
>
> Can you give an example of why this is useful?

To be honest, I didn't observe a clear performance improvement on my
arm64 NUMA server with this patch set. Theoretically, though, it should
help, especially for direct I/O, when every memory allocation along the
I/O path is bound to a local NUMA node. I ran an fio test on a raid0
array built from two ramdisk devices (I added a NUMA module parameter to
drivers/block/brd.c as well): IOPS improved slightly (about 2%) when md
and brd were bound to the same node compared with no module parameter,
but not consistently, since NUMA is handled transparently by memory
management in most cases.
>
> If we have several RAID arrays, we can't control the NUMA node for each
> of them separately.

Yes, the current patches don't consider this scenario. To let different
arrays fall on different NUMA nodes, we could make the module parameter
writable, just as commit 115485e83f49 does.

> And with patch 4/5, one array could have two different parameters
> controlling its memory allocation, which is weird. I have no idea how
> users would use this feature.
>
> If we really want to support NUMA placement, would it be better to
> deduce the node from the combination of all the underlying disks?

I think that is feasible, since we can get the NUMA node of each
md_rdev's underlying device at creation time. If any two md_rdevs report
different NUMA nodes, we can simply stop and not deduce a node.
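
A minimal sketch of that deduction, written here only to illustrate the
idea and assuming the mainline field layout (reaching the queue's node
through rdev->bdev->bd_disk->queue), could look like:

/* Illustrative only: derive a common node from the members'
 * request queues, giving up (NUMA_NO_NODE) if they disagree.
 */
static int mddev_deduce_numa_node(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int node = NUMA_NO_NODE;

	rdev_for_each(rdev, mddev) {
		int n = rdev->bdev->bd_disk->queue->node;

		if (node == NUMA_NO_NODE)
			node = n;
		else if (n != node)
			return NUMA_NO_NODE;	/* members span nodes */
	}
	return node;
}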
Thanks,
Zhengyuan
>
> Thanks,
> Shaohua
>
>> [full patch quoted; snipped]