linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* Bug: Performance regression in 1013af4f585f: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race
@ 2025-08-29 14:30 Uschakow, Stanislav
  2025-09-01 10:58 ` Jann Horn
  0 siblings, 1 reply; 4+ messages in thread
From: Uschakow, Stanislav @ 2025-08-29 14:30 UTC (permalink / raw)
  To: linux-mm@kvack.org
  Cc: trix@redhat.com, ndesaulniers@google.com, nathan@kernel.org,
	akpm@linux-foundation.org, muchun.song@linux.dev,
	mike.kravetz@oracle.com, jannh@google.com,
	lorenzo.stoakes@oracle.com, liam.howlett@oracle.com,
	muchun.song@linux.dev, osalvador@suse.de, vbabka@suse.cz,
	stable@vger.kernel.org, akpm@linux-foundation.org,
	jannh@google.com

Hello.

We have observed a huge latency increase using `fork()` after ingesting the CVE-2025-38085 fix, which led us to the commit `1013af4f585f: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race`. On large machines with 1.5TB of memory and 196 cores, we found that when a process mmaps 1.2TB of shared memory and forks itself dozens or hundreds of times, execution time increases by a factor of 4. The reproducer is at the end of the email.

Comparing a kernel without this patch against a kernel with this patch applied, we see these execution times when spawning 1000 children:


Patched kernel: 
$ time make stress
...
real    0m11.275s
user    0m0.177s
sys     0m23.905s

Original kernel : 

$ time make stress
...
real    0m2.475s
user    0m1.398s
sys     0m2.501s


The patch in question: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1013af4f585fccc4d3e5c5824d174de2257f7d6d


My observation/assumption is:

each child touches 100 random pages and despawns
on each despawn `huge_pmd_unshare()` is called
each call to `huge_pmd_unshare()` synchronizes all threads using `tlb_remove_table_sync_one()`, leading to the regression



I'm happy to provide more information.




Thank you
Stanislav Uschakow








=== Reproducer ===

Setup:


#!/bin/bash
# Prepare the machine for the hugepage reproducer: make sure enough 2MB
# hugepages are reserved, write "never" to the transparent hugepage switch,
# and print the hugepage lines from /proc/meminfo.
# Writes to /proc and /sys go through sudo.

# Print the value currently stored in /proc/sys/vm/nr_hugepages.
read_nr_hugepages() {
    cat /proc/sys/vm/nr_hugepages
}

echo "Setting up hugepages for reproduction..."

# 1.2TB / 2MB = 614400 hugepages
REQUIRED_PAGES=614400

pages_before=$(read_nr_hugepages)
echo "Current hugepages: $pages_before"

# Only raise the reservation when the existing pool is too small.
if [ "$pages_before" -lt "$REQUIRED_PAGES" ]; then
    echo "Allocating $REQUIRED_PAGES hugepages..."
    printf '%s\n' "$REQUIRED_PAGES" | sudo tee /proc/sys/vm/nr_hugepages

    # Re-read the value to see how many pages were actually granted.
    pages_after=$(read_nr_hugepages)
    echo "Allocated hugepages: $pages_after"

    if [ "$pages_after" -lt "$REQUIRED_PAGES" ]; then
        echo "Warning: Could not allocate all required hugepages"
        echo "Available: $pages_after, Required: $REQUIRED_PAGES"
    fi
fi

printf '%s\n' never | sudo tee /sys/kernel/mm/transparent_hugepage/enabled

echo -e "\nHugepage information:"
grep -i huge /proc/meminfo

echo -e "\nSetup complete. You can now run the reproduction test."



Makefile:


# Build and run helpers for the hugepage fork() reproducer.
#
# NOTE: every recipe line below starts with a literal TAB character. make
# rejects space-indented recipes with "missing separator".

# Compiler and flags. The variable names are kept as-is so existing
# command-line overrides (e.g. `make CXX=clang`) keep working.
CXX = gcc
CXXFLAGS = -O2 -Wall
TARGET = hugepage_repro
SOURCE = hugepage_repro.c

# Default target: build the reproducer binary.
$(TARGET): $(SOURCE)
	$(CXX) $(CXXFLAGS) -o $(TARGET) $(SOURCE)

# Remove the built binary.
clean:
	rm -f $(TARGET)

# Run the hugepage setup script.
setup:
	chmod +x setup_hugepages.sh
	./setup_hugepages.sh

# Small run: 20 processes, 3 iterations.
test: $(TARGET)
	./$(TARGET) 20 3

# Large run: 1000 processes, 1 iteration.
stress: $(TARGET)
	./$(TARGET) 1000 1

.PHONY: clean setup test stress



hugepage_repro.c:


#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdio.h>

/*
 * Sizes used by the reproducer, all in bytes unless noted.
 * TOTAL_SIZE is an unsigned long long constant, so NUM_HUGEPAGES is too.
 * NUM_HUGEPAGES evaluates to 614400, the same figure the setup script
 * uses for REQUIRED_PAGES.
 */
#define HUGEPAGE_SIZE (2 * 1024 * 1024) // 2MB
#define TOTAL_SIZE (1200ULL * 1024 * 1024 * 1024) // 1.2TB
#define NUM_HUGEPAGES (TOTAL_SIZE / HUGEPAGE_SIZE)

/*
 * Create the mapping used by the reproducer.
 *
 * Maps TOTAL_SIZE bytes readable and writable with
 * MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB.
 *
 * Returns the address returned by mmap(). If mmap() fails, the error is
 * reported with perror() and the process exits with status 1, so the
 * function never returns MAP_FAILED.
 *
 * Declared with (void) so the definition provides a real prototype; an
 * empty parameter list in C11 leaves the arguments unchecked.
 */
void* create_hugepage_mapping(void) {
    void* addr = mmap(NULL, TOTAL_SIZE, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (addr == MAP_FAILED) {
        perror("mmap hugepages failed");
        exit(1);
    }
    return addr;
}

/*
 * Read one byte from the start of num_touches pseudo-randomly chosen
 * hugepage-sized slots inside the region starting at addr.
 *
 * addr        - base of a region holding at least NUM_HUGEPAGES slots of
 *               HUGEPAGE_SIZE bytes each.
 * num_touches - number of reads to perform; zero or negative does nothing.
 *
 * Slots are picked with rand(). This function does not seed the generator,
 * so the sequence depends on whatever rand() state the caller has.
 */
void touch_random_pages(void* addr, int num_touches) {
    const char* region = addr;
    int remaining = num_touches;

    while (remaining > 0) {
        size_t slot = rand() % NUM_HUGEPAGES;
        size_t byte_offset = slot * HUGEPAGE_SIZE;

        /* Storing into a volatile keeps the compiler from dropping the read. */
        volatile char sink = region[byte_offset];
        (void)sink;

        remaining--;
    }
}

/*
 * Work done by each forked child: touch 100 pages of the shared mapping
 * and print how long that took.
 *
 * shared_mem - base address handed to touch_random_pages().
 * child_id   - number printed in the completion message.
 *
 * The elapsed time is measured with CLOCK_MONOTONIC and reported in
 * microseconds. The function returns normally; exiting is left to the caller.
 */
void child_process(void* shared_mem, int child_id) {
    struct timespec t_begin;
    struct timespec t_finish;

    clock_gettime(CLOCK_MONOTONIC, &t_begin);
    touch_random_pages(shared_mem, 100);
    clock_gettime(CLOCK_MONOTONIC, &t_finish);

    /* Seconds scaled up to microseconds, nanoseconds scaled down. */
    long from_seconds = (t_finish.tv_sec - t_begin.tv_sec) * 1000000;
    long from_nanos = (t_finish.tv_nsec - t_begin.tv_nsec) / 1000;
    long elapsed_us = from_seconds + from_nanos;

    printf("Child %d completed in %ld μs\n", child_id, elapsed_us);
}

/* Largest process or iteration count accepted on the command line. */
enum { REPRO_MAX_COUNT = 1000000 };

/*
 * Parse a decimal count from a command-line argument.
 *
 * arg       - the argument text.
 * what      - short description used in the error message.
 * min_value - smallest value accepted.
 *
 * Returns the parsed value. Prints a message to stderr and exits with
 * status 1 if arg is not a complete decimal integer in the range
 * min_value..REPRO_MAX_COUNT.
 */
static int parse_count(const char* arg, const char* what, int min_value) {
    char* end = NULL;
    long value = strtol(arg, &end, 10);

    /*
     * On overflow strtol returns LONG_MIN or LONG_MAX, both of which fall
     * outside the accepted range, so no separate errno check is needed.
     */
    if (end == arg || *end != '\0' || value < min_value || value > REPRO_MAX_COUNT) {
        fprintf(stderr, "Invalid %s '%s': expected an integer between %d and %d\n",
                what, arg, min_value, REPRO_MAX_COUNT);
        exit(1);
    }
    return (int)value;
}

/*
 * Reproducer entry point.
 *
 * Usage: hugepage_repro [num_processes [iterations]]
 *   num_processes - children forked per iteration (default 50, at least 1)
 *   iterations    - number of fork/wait rounds (default 5, at least 0)
 *
 * Creates the shared hugepage mapping once, then for each iteration forks
 * the children, lets each run child_process() and exit, waits for all of
 * them and prints the wall-clock time of the round in milliseconds.
 *
 * Returns 0 on success, 1 if the pid array cannot be allocated. Invalid
 * arguments and mmap failure terminate the process with status 1.
 */
int main(int argc, char* argv[]) {
    int num_processes = argc > 1 ? parse_count(argv[1], "process count", 1) : 50;
    int iterations = argc > 2 ? parse_count(argv[2], "iteration count", 0) : 5;

    printf("Creating %lluGB hugepage mapping...\n", TOTAL_SIZE / (1024*1024*1024));
    void* shared_mem = create_hugepage_mapping();

    /* Heap-allocated so a large process count cannot overflow the stack. */
    pid_t* children = malloc((size_t)num_processes * sizeof *children);
    if (children == NULL) {
        perror("malloc children failed");
        munmap(shared_mem, TOTAL_SIZE);
        return 1;
    }

    for (int iter = 0; iter < iterations; ++iter) {
        printf("\nIteration %d: Forking %d processes\n", iter + 1, num_processes);

        /*
         * Flush before forking: otherwise, when stdout is not a terminal,
         * every child inherits the parent's unflushed buffer and writes it
         * out again when it exits.
         */
        fflush(stdout);

        struct timespec iter_start, iter_end;
        clock_gettime(CLOCK_MONOTONIC, &iter_start);

        /* Number of children actually created; only these are waited for. */
        int forked = 0;
        for (int i = 0; i < num_processes; ++i) {
            pid_t pid = fork();
            if (pid == 0) {
                child_process(shared_mem, i);
                exit(0);
            }
            if (pid < 0) {
                /* Stop forking, but still reap the children created so far. */
                perror("fork failed");
                break;
            }
            children[forked++] = pid;
        }

        for (int i = 0; i < forked; ++i) {
            waitpid(children[i], NULL, 0);
        }

        clock_gettime(CLOCK_MONOTONIC, &iter_end);
        long iter_duration = (iter_end.tv_sec - iter_start.tv_sec) * 1000 +
                            (iter_end.tv_nsec - iter_start.tv_nsec) / 1000000;
        printf("Iteration completed in %ld ms\n", iter_duration);
    }

    free(children);
    munmap(shared_mem, TOTAL_SIZE);
    return 0;
}




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-09-04 12:39 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-29 14:30 Bug: Performance regression in 1013af4f585f: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race Uschakow, Stanislav
2025-09-01 10:58 ` Jann Horn
2025-09-01 11:26   ` David Hildenbrand
2025-09-04 12:39     ` Uschakow, Stanislav

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).