Re: [Bugme-new] [Bug 13302] New: "bad pmd" on fork() of process with hugepage shared memory segments attached

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Mel Gorman <mel@csn.ul.ie>
To: starlight@binnacle.cx
Cc: Andrew Morton <akpm@linux-foundation.org>,
	linux-mm@kvack.org, bugzilla-daemon@bugzilla.kernel.org,
	bugme-daemon@bugzilla.kernel.org, Adam Litke <agl@us.ibm.com>,
	Eric B Munson <ebmunson@us.ibm.com>,
	riel@redhat.com, lee.schermerhorn@hp.com
Subject: Re: [Bugme-new] [Bug 13302] New: "bad pmd" on fork() of process with hugepage shared memory segments attached
Date: Wed, 20 May 2009 12:35:25 +0100	[thread overview]
Message-ID: <20090520113525.GA4409@csn.ul.ie> (raw)
In-Reply-To: <6.2.5.6.2.20090515145151.03a55298@binnacle.cx>

[-- Attachment #1: Type: text/plain, Size: 1565 bytes --]

On Fri, May 15, 2009 at 02:53:27PM -0400, starlight@binnacle.cx wrote:
> Here's another possible clue:
> 
> I tried the first 'tcbm' testcase on a 2.6.27.7
> kernel that was hanging around from a few months
> ago and it breaks it 100% of the time.
> 
> Completely hoses huge memory.  Enough "bad pmd"
> errors to fill the kernel log.
> 

So I investigated what's wrong with 2.6.27.7. The problem is a race between
exec() and the handling of mlock()ed VMAs but I can't see where. The normal
teardown of pages is applied to a shared memory segment as if VM_HUGETLB
was not set.

This was fixed between 2.6.27 and 2.6.28 but apparently by accident during the
introduction of CONFIG_UNEVICTABLE_LRU. This patchset made a number of changes
to how mlock()ed pages are handled but I didn't spot which was the relevant change
that fixed the problem and reverse bisecting didn't help. I've added two people
that were working on the unevictable LRU patches to see if they spot something.

For context, the two attached files are used to reproduce a problem
where bad pmd messages are scribbled all over the console on 2.6.27.7.
Do something like

echo 64 > /proc/sys/vm/nr_hugepages
mount -t hugetlbfs none /mnt
sh ./test-tcbm.sh

I did confirm that it didn't matter to 2.6.29.1 if CONFIG_UNEVICTABLE_LRU is
set or not.  It's possible the race is still there but I don't know where
it is.

Any ideas where the race might be?

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

[-- Attachment #2: tcbm.c --]
[-- Type: text/x-csrc, Size: 4618 bytes --]

#include <errno.h>
#include <fcntl.h>
#include <memory.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sched.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/shm.h>
#include <sys/resource.h>
#include <sys/mman.h>

/*
 * First System V shared memory segment used by main(): IPC key, size in
 * bytes (0x40000000 = 1 GiB) and the fixed address handed to shmat().
 */
#define LARGE_SHARED_SEGMENT_KEY	0x12345600
#define LARGE_SHARED_SEGMENT_SIZE	((size_t)0x40000000)
#define LARGE_SHARED_SEGMENT_ADDR	((void *)0x40000000)

/*
 * Second shared memory segment used by main(): IPC key, size in bytes
 * (0x20000000 = 512 MiB) and the fixed address handed to shmat().
 */
#define SMALL_SHARED_SEGMENT_KEY	0x12345601
#define SMALL_SHARED_SEGMENT_SIZE	((size_t)0x20000000)
#define SMALL_SHARED_SEGMENT_ADDR	((void *)0x94000000)

/*
 * Loop count used by main() both for the anonymous mmap() buffers it
 * creates and for the number of children it forks via rabbits().
 */
#define NUM_SMALL_BUFFERS		50

/*
 * Program and argument vector that each child passes to execvp() in
 * rabbits().
 * NOTE(review): helper_args is passed as the complete argv, so "-n" ends up
 * in argv[0] instead of being seen as an option -- confirm whether the
 * program name was meant to be the first element.
 */
char *helper_program = "echo";
char *helper_args[] = { "-n", ".", NULL };

/*
 * SIGCHLD handler: reap every child that has already exited.
 *
 * waitpid() is polled with WNOHANG until it either reports that no further
 * exited child is available (returns 0) or fails with ECHILD because there
 * are no children left. Any other waitpid() failure terminates the process
 * with EXIT_FAILURE.
 *
 * errno is saved on entry and restored before returning so that the code
 * interrupted by the signal does not observe a changed errno.
 *
 * Only async-signal-safe functions (waitpid, write, _exit) are called here;
 * perror() and exit() are not safe to call from a signal handler.
 *
 * unused: signal number delivered by the kernel; not inspected.
 */
void child_signal_handler(const int unused)
{
	static const char waitpid_failed[] = "waitpid: unexpected failure\n";
	const int errno_save = errno;
	pid_t dead_pid;
	int dead_status;

	(void)unused;

	do {
		dead_pid = waitpid(-1, &dead_status, WNOHANG);
		if (dead_pid == -1) {
			/* No children remain: nothing more to reap */
			if (errno == ECHILD)
				break;

			/* Best-effort diagnostic; we are exiting regardless */
			if (write(STDERR_FILENO, waitpid_failed,
				  sizeof(waitpid_failed) - 1) < 0) {
				/* nothing further can be done here */
			}
			_exit(EXIT_FAILURE);
		}
	} while (dead_pid != 0);

	errno = errno_save;
}

/*
 * Fork one child that execs the helper program.
 *
 * Parent: returns immediately after fork() -- 0 on success, -1 if fork()
 * itself failed (the failure is reported with perror()).
 *
 * Child: never returns. It switches itself to SCHED_OTHER if it is running
 * under any other policy, raises its nice value to -10 if it is currently
 * below -10, and then execs helper_program with helper_args. Failures of
 * the scheduling/priority calls are reported but are not fatal; a failed
 * execvp() terminates the child with EXIT_FAILURE.
 */
int rabbits(void)
{
	const pid_t pid = fork();

	if (pid == -1) {
		/* Previously a failed fork() was indistinguishable from success */
		perror("fork");
		return -1;
	}
	if (pid != 0)
		return 0;

	/* Everything below runs in the child only */
	const int sched_policy = sched_getscheduler(0);
	if (sched_policy == -1)
		perror("sched_getscheduler");

	/* Set the child's policy to SCHED_OTHER */
	if (sched_policy != SCHED_OTHER) {
		struct sched_param sched;

		memset(&sched, 0, sizeof(sched));
		sched.sched_priority = 0;
		if (sched_setscheduler(0, SCHED_OTHER, &sched) != 0)
			perror("sched_setscheduler");
	}

	/*
	 * getpriority() can legitimately return -1, so errno has to be
	 * cleared beforehand and checked afterwards to detect failure.
	 */
	errno = 0;
	const int prio = getpriority(PRIO_PROCESS, 0);
	if (errno != 0)
		perror("getpriority");
	if (prio < -10) {
		if (setpriority(PRIO_PROCESS, 0, -10) != 0)
			perror("setpriority");
	}

	/* Launch helper program; execvp() only returns on failure */
	execvp(helper_program, helper_args);
	perror("execvp");
	exit(EXIT_FAILURE);
}

/*
 * Test driver.
 *
 * Sequence:
 *   1. switch to SCHED_RR at priority 26;
 *   2. install child_signal_handler() for SIGCHLD;
 *   3. create and attach two SHM_HUGETLB shared memory segments at fixed
 *      addresses, touch them, and mlock() the large one;
 *   4. create NUM_SMALL_BUFFERS private anonymous mappings;
 *   5. dump /proc/self/maps to stdout;
 *   6. fork NUM_SMALL_BUFFERS children through rabbits();
 *   7. wait for the SIGCHLD traffic to stop, then detach and remove both
 *      segments.
 *
 * Returns 1 if any of the setup steps (1-4) fails, otherwise 0. Failures
 * during the final detach/remove are reported but do not change the
 * return value.
 */
int main(int argc, const char** argv, const char** envp)
{
	struct sched_param sched;
	struct sigaction sas_child;
	int i;

	(void)argc;
	(void)argv;
	(void)envp;

	/* Set the round robin scheduler */
	memset(&sched, 0, sizeof(sched));
	sched.sched_priority = 26;
	if (sched_setscheduler(0, SCHED_RR, &sched) != 0) {
		perror("sched_setscheduler(SCHED_RR, 26)");
		return 1;
	}

	/* Set a signal handler for children exiting */
	memset(&sas_child, 0, sizeof(sas_child));
	sas_child.sa_handler = child_signal_handler;
	if (sigaction(SIGCHLD, &sas_child, NULL) != 0) {
		perror("sigaction(SIGCHLD)");
		return 1;
	}

	/* Create a large shared memory segment */
	int seg1id = shmget(LARGE_SHARED_SEGMENT_KEY,
				LARGE_SHARED_SEGMENT_SIZE,
				IPC_CREAT|SHM_HUGETLB|0640);
	if (seg1id == -1) {
		perror("shmget(LARGE_SEGMENT)");
		return 1;
	}

	/* Attach it at the fixed address LARGE_SHARED_SEGMENT_ADDR */
	void* seg1adr = shmat(seg1id, LARGE_SHARED_SEGMENT_ADDR, 0);
	if (seg1adr == (void*)-1) {
		perror("shmat(LARGE_SEGMENT)");
		return 1;
	}

	/* Touch the first half of the segment, then mlock all of it */
	memset(seg1adr, 0xFF, LARGE_SHARED_SEGMENT_SIZE/2);
	if (mlock(seg1adr, LARGE_SHARED_SEGMENT_SIZE) != 0) {
		perror("mlock(LARGE_SEGMENT)");
		return 1;
	}

	/* Create a second smaller segment */
	int seg2id = shmget(SMALL_SHARED_SEGMENT_KEY,
				SMALL_SHARED_SEGMENT_SIZE,
				IPC_CREAT|SHM_HUGETLB|0640);
	if (seg2id == -1) {
		perror("shmget(SMALL_SEGMENT)");
		return 1;
	}

	/* Attach small segment */
	void *seg2adr = shmat(seg2id, SMALL_SHARED_SEGMENT_ADDR, 0);
	if (seg2adr == (void*) -1) {
		perror("shmat(SMALL_SEGMENT)");
		return 1;
	}

	/*
	 * Initialise all of the small segment. The mlock() of this segment
	 * is commented out below, so it stays unlocked.
	 */
	memset(seg2adr, 0xFF, (size_t) SMALL_SHARED_SEGMENT_SIZE);
/*
	if (mlock(seg2adr, (size_t) SMALL_SHARED_SEGMENT_SIZE) != 0) {
		perror("mlock(SMALL_SEGMENT)");
		return 1;
	}
*/

	/* Create a number of 516K (528384-byte) private anonymous buffers */
	for (i = 0; i < NUM_SMALL_BUFFERS; i++) {
		void* mmtarg = mmap(NULL, 528384,
				PROT_READ|PROT_WRITE,
				MAP_PRIVATE|MAP_ANONYMOUS,
				-1, 0);
		if (mmtarg == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
	}

	/* Dump maps */
	{
		char buf[4096];
		ssize_t bytes;
		int fd = open("/proc/self/maps", O_RDONLY);

		if (fd == -1) {
			/* Diagnostic output only, so carry on without it */
			perror("open(/proc/self/maps)");
		} else {
			/*
			 * read() does not NUL-terminate, so write exactly the
			 * number of bytes returned instead of using "%s".
			 */
			while ((bytes = read(fd, buf, sizeof(buf))) > 0)
				fwrite(buf, 1, (size_t)bytes, stdout);
			close(fd);
		}
	}

	/* Fork NUM_SMALL_BUFFERS children, pausing briefly between forks */
	for (i = 0; i < NUM_SMALL_BUFFERS; i++) {
		rabbits();
		usleep(500);
	}

	/*
	 * sleep() returns non-zero when a signal cuts it short, so keep
	 * sleeping until a full 3 seconds pass without interruption.
	 */
	printf("Waiting for children\n");
	while (sleep(3) != 0)
		;

	/* Detach both segments and mark them for removal */
	if (shmdt(seg1adr) == -1)
		perror("shmdt(LARGE_SEGMENT)");
	if (shmdt(seg2adr) == -1)
		perror("shmdt(SMALL_SEGMENT)");
	if (shmctl(seg1id, IPC_RMID, NULL) == -1)
		perror("shmrm(LARGE_SEGMENT)");
	if (shmctl(seg2id, IPC_RMID, NULL) == -1)
		perror("shmrm(SMALL_SEGMENT)");

	printf("Done\n");
	return 0;
}

[-- Attachment #3: test-tcbm.sh --]
[-- Type: application/x-sh, Size: 603 bytes --]

next prev parent reply	other threads:[~2009-05-20 11:34 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-05-15 18:53 [Bugme-new] [Bug 13302] New: "bad pmd" on fork() of process with hugepage shared memory segments attached starlight
2009-05-20 11:35 ` Mel Gorman [this message]
2009-05-20 14:29   ` Mel Gorman
2009-05-20 14:53   ` Lee Schermerhorn
2009-05-20 15:05     ` Lee Schermerhorn
2009-05-20 15:41       ` Mel Gorman
2009-05-21  0:41         ` KOSAKI Motohiro
2009-05-22 16:41           ` Mel Gorman
2009-05-24 13:44             ` KOSAKI Motohiro
2009-05-25  8:51               ` Mel Gorman
2009-05-25 10:10                 ` Hugh Dickins
2009-05-25 13:17                   ` Mel Gorman
  -- strict thread matches above, loose matches on Subject: below --
2009-05-15 18:44 starlight
2009-05-18 16:36 ` Mel Gorman
2009-05-15  5:32 starlight
2009-05-15 14:55 ` Mel Gorman
2009-05-15 15:02   ` starlight
     [not found] <bug-13302-10286@http.bugzilla.kernel.org/>
2009-05-13 20:08 ` Andrew Morton
2009-05-14 10:53   ` Mel Gorman
2009-05-14 10:59     ` Mel Gorman
2009-05-14 17:20       ` starlight
2009-05-14 17:49         ` Mel Gorman
2009-05-14 18:42           ` starlight
2009-05-14 19:10           ` starlight
2009-05-14 17:16     ` starlight

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090520113525.GA4409@csn.ul.ie \
    --to=mel@csn.ul.ie \
    --cc=agl@us.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=bugme-daemon@bugzilla.kernel.org \
    --cc=bugzilla-daemon@bugzilla.kernel.org \
    --cc=ebmunson@us.ibm.com \
    --cc=lee.schermerhorn@hp.com \
    --cc=linux-mm@kvack.org \
    --cc=riel@redhat.com \
    --cc=starlight@binnacle.cx \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).