linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Jerome Glisse <jglisse@google.com>
To: Andrew Morton <akpm@linux-foundation.org>, linux-mm@kvack.org
Cc: Jerome Glisse <jglisse@google.com>,
	linux-kernel@vger.kernel.org,  stable@vger.kernel.org
Subject: [PATCH] mm: fix maxnode for mbind(), set_mempolicy() and migrate_pages()
Date: Sat, 20 Jul 2024 10:35:43 -0700	[thread overview]
Message-ID: <20240720173543.897972-1-jglisse@google.com> (raw)

Because maxnode bug there is no way to bind or migrate_pages to the
last node in multi-node NUMA system unless you lie about maxnodes
when making the mbind, set_mempolicy or migrate_pages syscall.

Manpage for those syscall describe maxnodes as the number of bits in
the node bitmap ("bit mask of nodes containing up to maxnode bits").
Thus if maxnode is n then we expect to have a n bit(s) bitmap which
means that the mask of valid bits is ((1 << n) - 1). The get_nodes()
decrement lead to the mask being ((1 << (n - 1)) - 1).

The three syscalls use a common helper get_nodes() and first things
this helper do is decrement maxnode by 1 which leads to using n-1 bits
in the provided mask of nodes (see get_bitmap() an helper function to
get_nodes()).

The lead to two bugs, either the last node in the bitmap provided will
not be use in either of the three syscalls, or the syscalls will error
out and return EINVAL if the only bit set in the bitmap was the last
bit in the mask of nodes (which is ignored because of the bug and an
empty mask of nodes is an invalid argument).

I am surprised this bug was never caught ... it has been in the kernel
since forever.

People can use the following function to detect if the kernel has the
bug:

bool kernel_has_maxnodes_bug(void)
{
    unsigned long nodemask = 1;
    bool has_bug;
    long res;

    res = set_mempolicy(MPOL_BIND, &nodemask, 1);
    has_bug = res && (errno == EINVAL);
    set_mempolicy(MPOL_DEFAULT, NULL, 0);
    return has_bug;
}

You can tested with any of the three program below:

gcc mbind.c -o mbind -lnuma
gcc set_mempolicy.c -o set_mempolicy -lnuma
gcc migrate_pages.c -o migrate_pages -lnuma

First argument is maxnode, second argument is the bit index to set in
the mask of node (0 set the first bit, 1 the second bit, ...).

./mbind 2 1 & sleep 2 && numastat -n -p `pidof mbind` && fg
./set_mempolicy 2 1 & sleep 2 && numastat -n -p `pidof set_mempolicy` && fg
./migrate_pages 2 1 & sleep 2 && numastat -n -p `pidof migrate_pages` && fg

mbind.c %< ----------------------------------------------------------

void *anon_mem(size_t size)
{
    void *ret;

    ret = mmap(NULL, size, PROT_READ|
               PROT_WRITE, MAP_PRIVATE|
               MAP_ANON, -1, 0);
    return ret == MAP_FAILED ? NULL : ret;
}

unsigned long mround(unsigned long v, unsigned long m)
{
    if (m == 0) {
        return v;
    }

    return v + m - (v % m);
}

void bitmap_set(void *_bitmap, unsigned long b)
{
    uint8_t *bitmap = _bitmap;

    bitmap[b >> 3] |= (1 << (b & 7));
}

int main(int argc, char *argv[])
{
    unsigned long *nodemask, maxnode, node, i;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argv[1] == NULL || argv[2] == NULL) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    maxnode = atoi(argv[1]);
    node = atoi(argv[2]);

    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    nodemask = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !nodemask) {
        return -1;
    }

    // Try to bind memory to node
    bitmap_set(nodemask, node);
    res = mbind(mem, NPAGES << 12, MPOL_BIND,
                nodemask, maxnode, 0);
    if (res) {
        printf("mbind(mem, NPAGES << 12, MPOL_BIND, "
               "nodemask, %d, 0) failed with %d\n",
               maxnode, errno);
        return -1;
    }

    // Write something to breakup from the zero page
    for (unsigned i = 0; i < NPAGES; i++) {
        mem[i << 12] = i + 1;
    }

    // Allow numastats to gather statistics
    getchar();

    return 0;
}

set_mempolicy %< ----------------------------------------------------

void *anon_mem(size_t size)
{
    void *ret;

    ret = mmap(NULL, size, PROT_READ|
               PROT_WRITE, MAP_PRIVATE|
               MAP_ANON, -1, 0);
    return ret == MAP_FAILED ? NULL : ret;
}

unsigned long mround(unsigned long v, unsigned long m)
{
    if (m == 0) {
        return v;
    }

    return v + m - (v % m);
}

void bitmap_set(void *_bitmap, unsigned long b)
{
    uint8_t *bitmap = _bitmap;

    bitmap[b >> 3] |= (1 << (b & 7));
}

int main(int argc, char *argv[])
{
    unsigned long *nodemask, maxnode, node, i;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argv[1] == NULL || argv[2] == NULL) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    maxnode = atoi(argv[1]);
    node = atoi(argv[2]);

    // bind memory to node 0 ...
    i = 1;
    res = set_mempolicy(MPOL_BIND, i, 2);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, []=1, %d) "
               "failed with %d\n", maxnode, errno);
        return -1;
    }

    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    nodemask = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !nodemask) {
        return -1;
    }

    // Try to bind memory to node
    bitmap_set(nodemask, node);
    res = set_mempolicy(MPOL_BIND, nodemask, maxnode);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, nodemask, %d) "
               "failed with %d\n", maxnode, errno);
        return -1;
    }

    // Write something to breakup from the zero page
    for (unsigned i = 0; i < NPAGES; i++) {
        mem[i << 12] = i + 1;
    }

    // Allow numastats to gather statistics
    getchar();

    return 0;
}

migrate_pages %< ----------------------------------------------------

void *anon_mem(size_t size)
{
    void *ret;

    ret = mmap(NULL, size, PROT_READ|
               PROT_WRITE, MAP_PRIVATE|
               MAP_ANON, -1, 0);
    return ret == MAP_FAILED ? NULL : ret;
}

unsigned long mround(unsigned long v, unsigned long m)
{
    if (m == 0) {
        return v;
    }

    return v + m - (v % m);
}

void bitmap_set(void *_bitmap, unsigned long b)
{
    uint8_t *bitmap = _bitmap;

    bitmap[b >> 3] |= (1 << (b & 7));
}

int main(int argc, char *argv[])
{
    unsigned long *old_nodes, *new_nodes, maxnode, node, i;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argv[1] == NULL || argv[2] == NULL) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    maxnode = atoi(argv[1]);
    node = atoi(argv[2]);

    // bind memory to node 0 ...
    i = 1;
    res = set_mempolicy(MPOL_BIND, &i, 2);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, []=1, %d) "
               "failed with %d\n", maxnode, errno);
        return -1;
    }

    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    old_nodes = calloc(bytes, 1);
    new_nodes = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !new_nodes || !old_nodes) {
        return -1;
    }

    // Write something to breakup from the zero page
    for (unsigned i = 0; i < NPAGES; i++) {
        mem[i << 12] = i + 1;
    }

    // Try to bind memory to node
    bitmap_set(old_nodes, 0);
    bitmap_set(new_nodes, node);
    res = migrate_pages(getpid(), maxnode,
                        old_nodes, new_nodes);
    if (res) {
        printf("migrate_pages(pid, %d, old_nodes, "
               "new_nodes) failed with %d\n",
               maxnode, errno);
        return -1;
    }

    // Allow numastats to gather statistics
    getchar();

    return 0;
}

Signed-off-by: Jérôme Glisse <jglisse@google.com>
To: Andrew Morton <akpm@linux-foundation.org>
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
---
 mm/mempolicy.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aec756ae5637..658e5366d266 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1434,7 +1434,6 @@ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
-	--maxnode;
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
-- 
2.45.2.1089.g2a221341d9-goog



             reply	other threads:[~2024-07-20 17:35 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-07-20 17:35 Jerome Glisse [this message]
2024-07-20 17:55 ` [PATCH] mm: fix maxnode for mbind(), set_mempolicy() and migrate_pages() Matthew Wilcox
2024-07-22 21:21   ` Gregory Price
     [not found] ` <0c390494-e6ba-4cde-aace-cd726f2409a1@redhat.com>
2024-07-23 16:19   ` Jerome Glisse
2024-07-23 16:33   ` Jerome Glisse
2024-07-23 17:37     ` David Hildenbrand
2024-07-23 18:24       ` David Hildenbrand
2024-07-24  4:15       ` Jerome Glisse
2024-07-24  6:27         ` David Hildenbrand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240720173543.897972-1-jglisse@google.com \
    --to=jglisse@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).