[Linux-ia64] [RFC] Multithreaded core dump for 2.5.10-ia64 kernel.

Linux IA64 platform development
 help / color / mirror / Atom feed

From: "Gross, Mark" <mark.gross@intel.com>
To: linux-ia64@vger.kernel.org
Subject: [Linux-ia64] [RFC] Multithreaded core dump for 2.5.10-ia64 kernel.
Date: Wed, 05 Jun 2002 21:33:12 +0000	[thread overview]
Message-ID: <marc-linux-ia64-105590701905644@msgid-missing> (raw)

[-- Attachment #1: Type: text/plain, Size: 2252 bytes --]

This is a version of the tcore multi-threaded core dump patch that supports
both i386 and ia64 architectures.  

Its been smoke tested on a 2 way IPF box, it doesn't seem to crash ;)

I'm new to the stack unwinding thing WRT ia64 and would love to get some
feed back on my tweaks to process.c that collect the register state data for
the thread processes in the crashing thread group.  

This multithread core dump is very similar to the patches I've been tossing
up to the LKML, only this one supports ia64 core dumps as well as i386.  

This patch applies cleanly to the 2.5.10+ia64 kernel.
It has a off by 2 line offset warnings in binfmt_elf.c for the stock 2.5.10
kernel due to some include lines added to binfmt_elf.c by the ia64 patch.  I
build this patch against the ia64 kernel.  Regardless, it works for both
architectures.

To use this patch you need to "echo 1 > /proc/sys/kernel/core_dumps_threads"
to enable the tcore code path.  Then when a multi-threaded application dumps
core, it will crate a core file with the data for all the processes in the
thread group.

This code works by "suspending" the other thread group processes, and then
adding extra note sections to the core file with the user register state
data.  GDB 5.x knows how to work with these core files with one caveat.  You
may need to "strip libpthread*" before loading up the core file and crashing
application into gdb.  GDB 5.x; x < 2,  seems to have a problem loading
symbols for libpthread in this type of use.

The process suspend design does a number of interesting things that are
worth pointing out.
1) it creates an extra run queue that has no cpu, the phantom cpu.
2) it uses process migration to migrate the processes of the thread group to
the phantom run queue.
3) it removes the down_write from elf_dump_core, to avoid any possibility of
deadlocking with one of the suspended thread processes.  This is safe to do
as the other thread process are suspended and cannot free pages while they
aren't getting any CPU time.

One last note, I would have made this patch against the 2.5.18-ia64, but I
was missing a header file (perfmon_itanium.h) and had some problems building
it. 

--mgross

(W) 503-712-8218
MS: JF1-05
2111 N.E. 25th Ave.
Hillsboro, OR 97124


[-- Attachment #2: tcore-2.5.10-ia64.pat --]
[-- Type: application/octet-stream, Size: 32651 bytes --]

diff -urN -X dontdiff linux2510.pure/arch/i386/kernel/i387.c linux-2.5.10/arch/i386/kernel/i387.c
--- linux2510.pure/arch/i386/kernel/i387.c	Wed Apr 24 00:15:23 2002
+++ linux-2.5.10/arch/i386/kernel/i387.c	Tue Jun  4 10:11:34 2002
@@ -528,3 +528,36 @@
 
 	return fpvalid;
 }
+
+int dump_task_fpu( struct task_struct *tsk, struct user_i387_struct *fpu )
+{
+	int fpvalid;
+
+	fpvalid = tsk->used_math;
+	if ( fpvalid ) {
+		if (tsk == current) unlazy_fpu( tsk );
+		if ( cpu_has_fxsr ) {
+			copy_fpu_fxsave( tsk, fpu );
+		} else {
+			copy_fpu_fsave( tsk, fpu );
+		}
+	}
+
+	return fpvalid;
+}
+
+int dump_task_extended_fpu( struct task_struct *tsk, struct user_fxsr_struct *fpu )
+{
+	int fpvalid;
+	
+	fpvalid = tsk->used_math && cpu_has_fxsr;
+	if ( fpvalid ) {
+		if (tsk == current) unlazy_fpu( tsk );
+		memcpy( fpu, &tsk->thread.i387.fxsave,
+		sizeof(struct user_fxsr_struct) );
+	}
+	
+	return fpvalid;
+}
+
+
diff -urN -X dontdiff linux2510.pure/arch/i386/kernel/process.c linux-2.5.10/arch/i386/kernel/process.c
--- linux2510.pure/arch/i386/kernel/process.c	Wed Apr 24 00:15:07 2002
+++ linux-2.5.10/arch/i386/kernel/process.c	Tue Jun  4 10:11:34 2002
@@ -652,6 +652,24 @@
 
 	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
 }
+/* 
+ * Capture the user space registers if the task is not running (in user space)
+ */
+#include <linux/elfcore.h>
+int dump_task_regs(struct task_struct *tsk,  elf_gregset_t *regs)
+{
+	struct pt_regs ptregs;
+	
+	ptregs = *(struct pt_regs *)((unsigned long)tsk->thread_info + THREAD_SIZE - sizeof(struct pt_regs));
+	ptregs.xcs &= 0xffff;
+	ptregs.xds &= 0xffff;
+	ptregs.xes &= 0xffff;
+	ptregs.xss &= 0xffff;
+
+	elf_core_copy_regs(regs, &ptregs);
+
+	return 1;
+}
 
 /*
  * This special macro can be used to load a debugging register
diff -urN -X dontdiff linux2510.pure/arch/ia64/kernel/process.c linux-2.5.10/arch/ia64/kernel/process.c
--- linux2510.pure/arch/ia64/kernel/process.c	Wed Apr 24 00:15:18 2002
+++ linux-2.5.10/arch/ia64/kernel/process.c	Wed Jun  5 13:03:26 2002
@@ -366,6 +366,8 @@
 void
 do_copy_regs (struct unw_frame_info *info, void *arg)
 {
+	do_copy_task_regs(current, info, arg);
+#if 0
 	unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm;
 	elf_greg_t *dst = arg;
 	struct pt_regs *pt;
@@ -431,11 +433,14 @@
 	dst[52] = pt->ar_pfs;	/* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */
 	unw_get_ar(info, UNW_AR_LC, &dst[53]);
 	unw_get_ar(info, UNW_AR_EC, &dst[54]);
+#endif
 }
 
 void
 do_dump_fpu (struct unw_frame_info *info, void *arg)
 {
+	do_dump_task_fpu(current, info, arg);
+#if 0
 	elf_fpreg_t *dst = arg;
 	int i;
 
@@ -452,6 +457,7 @@
 	ia64_flush_fph(current);
 	if ((current->thread.flags & IA64_THREAD_FPH_VALID) != 0)
 		memcpy(dst + 32, current->thread.fph, 96*16);
+#endif
 }
 
 void
@@ -467,6 +473,119 @@
 	return 1;	/* f0-f31 are always valid so we always return 1 */
 }
 
+static void
+do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg)
+{
+	unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm;
+	elf_greg_t *dst = arg;
+	struct pt_regs *pt;
+	char nat;
+	int i;
+
+	memset(dst, 0, sizeof(elf_gregset_t));	/* don't leak any kernel bits to user-level */
+
+	if (unw_unwind_to_user(info) < 0)
+		return;
+
+	unw_get_sp(info, &sp);
+	pt = (struct pt_regs *) (sp + 16);
+
+	urbs_end = ia64_get_user_rbs_end(task, pt, &cfm);
+
+	if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0)
+		return;
+
+	ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end),
+		  &ar_rnat);
+
+	/*
+	 * coredump format:
+	 *	r0-r31
+	 *	NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
+	 *	predicate registers (p0-p63)
+	 *	b0-b7
+	 *	ip cfm user-mask
+	 *	ar.rsc ar.bsp ar.bspstore ar.rnat
+	 *	ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec
+	 */
+
+	/* r0 is zero */
+	for (i = 1, mask = (1UL << i); i < 32; ++i) {
+		unw_get_gr(info, i, &dst[i], &nat);
+		if (nat)
+			nat_bits |= mask;
+		mask <<= 1;
+	}
+	dst[32] = nat_bits;
+	unw_get_pr(info, &dst[33]);
+
+	for (i = 0; i < 8; ++i)
+		unw_get_br(info, i, &dst[34 + i]);
+
+	unw_get_rp(info, &ip);
+	dst[42] = ip + ia64_psr(pt)->ri;
+	dst[43] = cfm;
+	dst[44] = pt->cr_ipsr & IA64_PSR_UM;
+
+	unw_get_ar(info, UNW_AR_RSC, &dst[45]);
+	/*
+	 * For bsp and bspstore, unw_get_ar() would return the kernel
+	 * addresses, but we need the user-level addresses instead:
+	 */
+	dst[46] = urbs_end;	/* note: by convention PT_AR_BSP points to the end of the urbs! */
+	dst[47] = pt->ar_bspstore;
+	dst[48] = ar_rnat;
+	unw_get_ar(info, UNW_AR_CCV, &dst[49]);
+	unw_get_ar(info, UNW_AR_UNAT, &dst[50]);
+	unw_get_ar(info, UNW_AR_FPSR, &dst[51]);
+	dst[52] = pt->ar_pfs;	/* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */
+	unw_get_ar(info, UNW_AR_LC, &dst[53]);
+	unw_get_ar(info, UNW_AR_EC, &dst[54]);
+}
+
+void
+do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg)
+{
+	elf_fpreg_t *dst = arg;
+	int i;
+
+	memset(dst, 0, sizeof(elf_fpregset_t));	/* don't leak any "random" bits */
+
+	if (unw_unwind_to_user(info) < 0)
+		return;
+
+	/* f0 is 0.0, f1 is 1.0 */
+
+	for (i = 2; i < 32; ++i)
+		unw_get_fr(info, i, dst + i);
+
+	ia64_flush_fph(task);
+	if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0)
+		memcpy(dst + 32, task->thread.fph, 96*16);
+}
+
+int dump_task_regs(struct task_struct *task, elf_gregset_t *regs)
+{
+	struct unw_frame_info tcore_info;
+
+	memset(&tcore_info, 0, sizeof(tcore_info));	
+	unw_init_from_blocked_task(&tcore_info, task);
+	do_copy_task_regs(task, &tcore_info, regs);
+	
+	return 1;
+}
+
+int dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst)
+{
+	struct unw_frame_info tcore_info;
+	
+	memset(&tcore_info, 0, sizeof(tcore_info));	
+	unw_init_from_blocked_task(&tcore_info, task);
+	do_dump_task_fpu(task, &tcore_info, dst);
+	
+	return 1; 
+}
+
 asmlinkage long
 sys_execve (char *filename, char **argv, char **envp, struct pt_regs *regs)
 {
Binary files linux2510.pure/arch/ia64/tools/print_offsets and linux-2.5.10/arch/ia64/tools/print_offsets differ
diff -urN -X dontdiff linux2510.pure/fs/binfmt_elf.c linux-2.5.10/fs/binfmt_elf.c
--- linux2510.pure/fs/binfmt_elf.c	Tue May 14 10:58:13 2002
+++ linux-2.5.10/fs/binfmt_elf.c	Wed Jun  5 13:03:28 2002
@@ -13,6 +13,7 @@
 
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/sched.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
@@ -30,6 +31,7 @@
 #include <linux/elfcore.h>
 #include <linux/init.h>
 #include <linux/highuid.h>
+#include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
@@ -956,7 +958,7 @@
 /* #define DEBUG */
 
 #ifdef DEBUG
-static void dump_regs(const char *str, elf_greg_t *r)
+static void dump_regs(const char *str, elf_gregset_t *r)
 {
 	int i;
 	static const char *regs[] = { "ebx", "ecx", "edx", "esi", "edi", "ebp",
@@ -1004,6 +1006,167 @@
 #define DUMP_SEEK(off)	\
 	if (!dump_seek(file, (off))) \
 		goto end_coredump;
+
+static inline void fill_elf_header(struct elfhdr *elf, int segs)
+{
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELF_DATA;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+
+	elf->e_type = ET_CORE;
+	elf->e_machine = ELF_ARCH;
+	elf->e_version = EV_CURRENT;
+	elf->e_entry = 0;
+	elf->e_phoff = sizeof(struct elfhdr);
+	elf->e_shoff = 0;
+	elf->e_flags = 0;
+	elf->e_ehsize = sizeof(struct elfhdr);
+	elf->e_phentsize = sizeof(struct elf_phdr);
+	elf->e_phnum = segs;
+	elf->e_shentsize = 0;
+	elf->e_shnum = 0;
+	elf->e_shstrndx = 0;
+	return;
+}
+
+static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, off_t offset)
+{
+	phdr->p_type = PT_NOTE;
+	phdr->p_offset = offset;
+	phdr->p_vaddr = 0;
+	phdr->p_paddr = 0;
+	phdr->p_filesz = sz;
+	phdr->p_memsz = 0;
+	phdr->p_flags = 0;
+	phdr->p_align = 0;
+	return;
+}
+
+static inline void fill_note(struct memelfnote *note, const char *name, int type, 
+		unsigned int sz, void *data)
+{
+	note->name = name;
+	note->type = type;
+	note->datasz = sz;
+	note->data = data;
+	return;
+}
+
+/*
+ * fill up all the fields in prstatus from the given task struct, except registers
+ * which need to be filled up seperately.
+ */
+static inline void fill_prstatus(struct elf_prstatus *prstatus, struct task_struct *p, long signr) 
+{
+	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
+	prstatus->pr_sigpend = p->pending.signal.sig[0];
+	prstatus->pr_sighold = p->blocked.sig[0];
+	prstatus->pr_pid = p->pid;
+	prstatus->pr_ppid = p->parent->pid;
+	prstatus->pr_pgrp = p->pgrp;
+	prstatus->pr_sid = p->session;
+	prstatus->pr_utime.tv_sec = CT_TO_SECS(p->times.tms_utime);
+	prstatus->pr_utime.tv_usec = CT_TO_USECS(p->times.tms_utime);
+	prstatus->pr_stime.tv_sec = CT_TO_SECS(p->times.tms_stime);
+	prstatus->pr_stime.tv_usec = CT_TO_USECS(p->times.tms_stime);
+	prstatus->pr_cutime.tv_sec = CT_TO_SECS(p->times.tms_cutime);
+	prstatus->pr_cutime.tv_usec = CT_TO_USECS(p->times.tms_cutime);
+	prstatus->pr_cstime.tv_sec = CT_TO_SECS(p->times.tms_cstime);
+	prstatus->pr_cstime.tv_usec = CT_TO_USECS(p->times.tms_cstime);
+	return;
+}
+
+static inline void fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p)
+{
+	int i;
+	
+	psinfo->pr_pid = p->pid;
+	psinfo->pr_ppid = p->parent->pid;
+	psinfo->pr_pgrp = p->pgrp;
+	psinfo->pr_sid = p->session;
+
+	i = p->state ? ffz(~p->state) + 1 : 0;
+	psinfo->pr_state = i;
+	psinfo->pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
+	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
+	psinfo->pr_nice =  task_nice(p);
+	psinfo->pr_flag = p->flags;
+	psinfo->pr_uid = NEW_TO_OLD_UID(p->uid);
+	psinfo->pr_gid = NEW_TO_OLD_GID(p->gid);
+	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+	return;
+}
+
+/*
+ * This is the variable that can be set in proc to determine if we want to
+ * dump a multithreaded core or not. A value of 1 means yes while any
+ * other value means no.
+ *
+ * It is located at /proc/sys/kernel/core_dumps_threads
+ */
+extern int core_dumps_threads;
+
+/* Here is the structure in which status of each thread is captured. */
+struct elf_thread_status
+{
+	struct list_head list;
+	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
+	elf_fpregset_t fpu;		/* NT_PRFPREG */
+#ifdef ELF_CORE_COPY_XFPREGS
+	elf_fpxregset_t xfpu;		/* NT_PRXFPREG */
+#endif
+	struct memelfnote notes[3];
+	int num_notes;
+};
+
+/*
+ * In order to add the specific thread information for the elf file format,
+ * we need to keep a linked list of every threads pr_status and then
+ * create a single section for them in the final core file.
+ */
+static int elf_dump_thread_status(long signr, struct task_struct * p, struct list_head * thread_list)
+{
+
+	struct elf_thread_status *t;
+	int sz = 0;
+
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t) {
+		printk(KERN_WARNING "Cannot allocate memory for thread status.\n");
+		return 0;
+	}
+
+	INIT_LIST_HEAD(&t->list);
+	t->num_notes = 0;
+
+	fill_prstatus(&t->prstatus, p, signr);
+	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);	
+	
+	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus));
+	t->num_notes++;
+	sz += notesize(&t->notes[0]);
+
+	if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, &t->fpu))) {
+		fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), &(t->fpu));
+		t->num_notes++;
+		sz += notesize(&t->notes[1]);
+	}
+
+#ifdef ELF_CORE_COPY_XFPREGS
+	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
+		fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu), &(t->xfpu));
+		t->num_notes++;
+		sz += notesize(&t->notes[2]);
+	}
+#endif	
+	list_add(&t->list, thread_list);
+	return sz;
+}
+
+
+
 /*
  * Actual dumper
  *
@@ -1022,12 +1185,27 @@
 	struct elfhdr elf;
 	off_t offset = 0, dataoff;
 	unsigned long limit = current->rlim[RLIMIT_CORE].rlim_cur;
-	int numnote = 4;
-	struct memelfnote notes[4];
+	int numnote = 5;
+	struct memelfnote notes[5];
 	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
-	elf_fpregset_t fpu;		/* NT_PRFPREG */
 	struct elf_prpsinfo psinfo;	/* NT_PRPSINFO */
+ 	struct task_struct *p;
+ 	LIST_HEAD(thread_list);
+ 	struct list_head *t;
+	elf_fpregset_t fpu;
+#ifdef ELF_CORE_COPY_XFPREGS
+	elf_fpxregset_t xfpu;
+#endif
+	int dump_threads = core_dumps_threads; /* this value should not change once the */
+					/* dumping starts */
+	int thread_status_size = 0;
+	
 
+	/* First pause all related threaded processes */
+	if (dump_threads)	{
+		tcore_suspend_threads();
+	}
+	
 	/* first copy the parameters from user space */
 	memset(&psinfo, 0, sizeof(psinfo));
 	{
@@ -1045,7 +1223,6 @@
 
 	}
 
-	memset(&prstatus, 0, sizeof(prstatus));
 	/*
 	 * This transfers the registers from regs into the standard
 	 * coredump arrangement, whatever that is.
@@ -1061,9 +1238,44 @@
 	else
 		*(struct pt_regs *)&prstatus.pr_reg = *regs;
 #endif
+ 
+	if (dump_threads) {
+		/* capture the status of all other threads */
+		if (signr) {
+			read_lock(&tasklist_lock);
+			for_each_task(p)
+				if (current->mm == p->mm && current != p) {
+					int sz = elf_dump_thread_status(signr, p, &thread_list);
+					if (!sz) {
+						read_unlock(&tasklist_lock);
+						goto cleanup;
+					}
+					else
+						thread_status_size += sz;
+				}
+			read_unlock(&tasklist_lock);
+		}
+	} /* End if(dump_threads) */
+	
+	memset(&prstatus, 0, sizeof(prstatus));
+	fill_prstatus(&prstatus, current, signr);
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	
+	/* We no longer stop all vm operations */
 
-	/* now stop all vm operations */
-	down_write(&current->mm->mmap_sem);
+	/* This because those proceses that could possibly 
+	 * change map_count or the mmap / vma pages are now suspended.
+	 *
+	 * Only ptrace can touch these memory address, but it cannot change
+	 * the map_count or the pages.  So no possibility of crashing exists while dumping
+	 * the mm->vm_next areas to the core file.
+	 *
+	 * Grabbing mmap_sem in this function is risky WRT the use of suspend_threads.
+	 * Although no locks ups have been induced, if one of the suspended threads was 
+	 * in line for the current->mmap_sem and if gets it while on the Phantom runque,
+	 * then we would dead lock in this function if we continue to attempt to down_write
+	 * in this function.
+	 */
 	segs = current->mm->map_count;
 
 #ifdef DEBUG
@@ -1071,25 +1283,7 @@
 #endif
 
 	/* Set up header */
-	memcpy(elf.e_ident, ELFMAG, SELFMAG);
-	elf.e_ident[EI_CLASS] = ELF_CLASS;
-	elf.e_ident[EI_DATA] = ELF_DATA;
-	elf.e_ident[EI_VERSION] = EV_CURRENT;
-	memset(elf.e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
-
-	elf.e_type = ET_CORE;
-	elf.e_machine = ELF_ARCH;
-	elf.e_version = EV_CURRENT;
-	elf.e_entry = 0;
-	elf.e_phoff = sizeof(elf);
-	elf.e_shoff = 0;
-	elf.e_flags = 0;
-	elf.e_ehsize = sizeof(elf);
-	elf.e_phentsize = sizeof(struct elf_phdr);
-	elf.e_phnum = segs+1;		/* Include notes */
-	elf.e_shentsize = 0;
-	elf.e_shnum = 0;
-	elf.e_shstrndx = 0;
+	fill_elf_header(&elf, segs+1); /* including notes section*/
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -1106,64 +1300,34 @@
 	 * with info from their /proc.
 	 */
 
-	notes[0].name = "CORE";
-	notes[0].type = NT_PRSTATUS;
-	notes[0].datasz = sizeof(prstatus);
-	notes[0].data = &prstatus;
-	prstatus.pr_info.si_signo = prstatus.pr_cursig = signr;
-	prstatus.pr_sigpend = current->pending.signal.sig[0];
-	prstatus.pr_sighold = current->blocked.sig[0];
-	psinfo.pr_pid = prstatus.pr_pid = current->pid;
-	psinfo.pr_ppid = prstatus.pr_ppid = current->parent->pid;
-	psinfo.pr_pgrp = prstatus.pr_pgrp = current->pgrp;
-	psinfo.pr_sid = prstatus.pr_sid = current->session;
-	prstatus.pr_utime.tv_sec = CT_TO_SECS(current->times.tms_utime);
-	prstatus.pr_utime.tv_usec = CT_TO_USECS(current->times.tms_utime);
-	prstatus.pr_stime.tv_sec = CT_TO_SECS(current->times.tms_stime);
-	prstatus.pr_stime.tv_usec = CT_TO_USECS(current->times.tms_stime);
-	prstatus.pr_cutime.tv_sec = CT_TO_SECS(current->times.tms_cutime);
-	prstatus.pr_cutime.tv_usec = CT_TO_USECS(current->times.tms_cutime);
-	prstatus.pr_cstime.tv_sec = CT_TO_SECS(current->times.tms_cstime);
-	prstatus.pr_cstime.tv_usec = CT_TO_USECS(current->times.tms_cstime);
+	fill_note(&notes[0], "CORE", NT_PRSTATUS, sizeof(prstatus), &prstatus);
+ 	
+	fill_psinfo(&psinfo, current);
+	fill_note(&notes[1], "CORE", NT_PRPSINFO, sizeof(psinfo), &psinfo);
+	
+	fill_note(&notes[2], "CORE", NT_TASKSTRUCT, sizeof(*current), current);
+  
+  	/* Try to dump the FPU. */
+	if ((prstatus.pr_fpvalid = elf_core_copy_task_fpregs(current, &fpu))) {
+		fill_note(&notes[3], "CORE", NT_PRFPREG, sizeof(fpu), &fpu);
+	} else {
+		--numnote;
+ 	}
+#ifdef ELF_CORE_COPY_XFPREGS
+	if (elf_core_copy_task_xfpregs(current, &xfpu)) {
+		fill_note(&notes[4], "LINUX", NT_PRXFPREG, sizeof(xfpu), &xfpu);
+	} else {
+		--numnote;
+	}
+#else
+	numnote --;
+#endif	
 
 #ifdef DEBUG
 	dump_regs("Passed in regs", (elf_greg_t *)regs);
 	dump_regs("prstatus regs", (elf_greg_t *)&prstatus.pr_reg);
 #endif
 
-	notes[1].name = "CORE";
-	notes[1].type = NT_PRPSINFO;
-	notes[1].datasz = sizeof(psinfo);
-	notes[1].data = &psinfo;
-	i = current->state ? ffz(~current->state) + 1 : 0;
-	psinfo.pr_state = i;
-	psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
-	psinfo.pr_zomb = psinfo.pr_sname == 'Z';
-	psinfo.pr_nice = task_nice(current);
-	psinfo.pr_flag = current->flags;
-	psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
-	psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
-	strncpy(psinfo.pr_fname, current->comm, sizeof(psinfo.pr_fname));
-
-	notes[2].name = "CORE";
-	notes[2].type = NT_TASKSTRUCT;
-	notes[2].datasz = sizeof(*current);
-	notes[2].data = current;
-
-	/* Try to dump the FPU. */
-	prstatus.pr_fpvalid = dump_fpu (regs, &fpu);
-	if (!prstatus.pr_fpvalid)
-	{
-		numnote--;
-	}
-	else
-	{
-		notes[3].name = "CORE";
-		notes[3].type = NT_PRFPREG;
-		notes[3].datasz = sizeof(fpu);
-		notes[3].data = &fpu;
-	}
-	
 	/* Write notes phdr entry */
 	{
 		struct elf_phdr phdr;
@@ -1171,17 +1335,12 @@
 
 		for(i = 0; i < numnote; i++)
 			sz += notesize(&notes[i]);
+		
+		if (dump_threads)
+			sz += thread_status_size;
 
-		phdr.p_type = PT_NOTE;
-		phdr.p_offset = offset;
-		phdr.p_vaddr = 0;
-		phdr.p_paddr = 0;
-		phdr.p_filesz = sz;
-		phdr.p_memsz = 0;
-		phdr.p_flags = 0;
-		phdr.p_align = 0;
-
-		offset += phdr.p_filesz;
+		fill_elf_note_phdr(&phdr, sz, offset);
+		offset += sz;
 		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
@@ -1210,10 +1369,21 @@
 		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
+ 	/* write out the notes section */
 	for(i = 0; i < numnote; i++)
 		if (!writenote(&notes[i], file))
 			goto end_coredump;
 
+	/* write out the thread status notes section */
+ 	if (dump_threads)  {
+		list_for_each(t, &thread_list) {
+			struct elf_thread_status *tmp = list_entry(t, struct elf_thread_status, list);
+			for (i = 0; i < tmp->num_notes; i++)
+				if (!writenote(&tmp->notes[i], file))
+					goto end_coredump;
+		}
+ 	}
+ 
 	DUMP_SEEK(dataoff);
 
 	for(vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
@@ -1257,11 +1427,23 @@
 		       (off_t) file->f_pos, offset);
 	}
 
- end_coredump:
+end_coredump:
 	set_fs(fs);
-	up_write(&current->mm->mmap_sem);
+
+cleanup:
+	if (dump_threads)  {
+		while(!list_empty(&thread_list)) {
+			struct list_head *tmp = thread_list.next;
+			list_del(tmp);
+			kfree(list_entry(tmp, struct elf_thread_status, list));
+		}
+
+		tcore_resume_threads();
+	}
+
 	return has_dumped;
 }
+
 #endif		/* USE_ELF_CORE_DUMP */
 
 static int __init init_elf_binfmt(void)
diff -urN -X dontdiff linux2510.pure/include/asm-i386/elf.h linux-2.5.10/include/asm-i386/elf.h
--- linux2510.pure/include/asm-i386/elf.h	Wed Apr 24 00:15:21 2002
+++ linux-2.5.10/include/asm-i386/elf.h	Tue Jun  4 10:11:34 2002
@@ -99,6 +99,16 @@
 
 #ifdef __KERNEL__
 #define SET_PERSONALITY(ex, ibcs2) set_personality((ibcs2)?PER_SVR4:PER_LINUX)
+
+
+extern int dump_task_regs (struct task_struct *, elf_gregset_t *);
+extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *);
+extern int dump_task_extended_fpu (struct task_struct *, struct user_fxsr_struct *);
+
+#define ELF_CORE_COPY_TASK_REGS(tsk, elf_regs) dump_task_regs(tsk, elf_regs)
+#define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
+#define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
+
 #endif
 
 #endif
diff -urN -X dontdiff linux2510.pure/include/asm-ia64/elf.h linux-2.5.10/include/asm-ia64/elf.h
--- linux2510.pure/include/asm-ia64/elf.h	Tue May 14 11:00:34 2002
+++ linux-2.5.10/include/asm-ia64/elf.h	Wed Jun  5 09:37:02 2002
@@ -65,12 +65,16 @@
 #define ELF_NGREG	128	/* we really need just 72 but let's leave some headroom... */
 #define ELF_NFPREG	128	/* f0 and f1 could be omitted, but so what... */
 
+typedef unsigned long elf_fpxregset_t;
+
 typedef unsigned long elf_greg_t;
 typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 
 typedef struct ia64_fpreg elf_fpreg_t;
 typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
 
+
+
 struct pt_regs;	/* forward declaration... */
 extern void ia64_elf_core_copy_regs (struct pt_regs *src, elf_gregset_t dst);
 #define ELF_CORE_COPY_REGS(_dest,_regs)	ia64_elf_core_copy_regs(_regs, _dest);
@@ -88,6 +92,14 @@
 struct elf64_hdr;
 extern void ia64_set_personality (struct elf64_hdr *elf_ex, int ibcs2_interpreter);
 #define SET_PERSONALITY(ex, ibcs2)	ia64_set_personality(&(ex), ibcs2)
+
+extern int dump_task_regs(struct task_struct *, elf_gregset_t *);
+extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *);
+
+#define ELF_CORE_COPY_TASK_REGS(tsk, elf_gregs) dump_task_regs(tsk, elf_gregs)
+#define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
+
+
 #endif
 
 #endif /* _ASM_IA64_ELF_H */
diff -urN -X dontdiff linux2510.pure/include/linux/elf.h linux-2.5.10/include/linux/elf.h
--- linux2510.pure/include/linux/elf.h	Tue May 14 11:00:34 2002
+++ linux-2.5.10/include/linux/elf.h	Wed Jun  5 13:03:25 2002
@@ -576,6 +576,9 @@
 #define NT_PRPSINFO	3
 #define NT_TASKSTRUCT	4
 #define NT_PRFPXREG	20
+#define NT_PRXFPREG     0x46e62b7f	/* note name must be "LINUX" as per GDB */
+					/* from gdb5.1/include/elf/common.h */
+
 
 /* Note header in a PT_NOTE section */
 typedef struct elf32_note {
@@ -607,5 +610,4 @@
 
 #endif
 
-
 #endif /* _LINUX_ELF_H */
diff -urN -X dontdiff linux2510.pure/include/linux/elfcore.h linux-2.5.10/include/linux/elfcore.h
--- linux2510.pure/include/linux/elfcore.h	Tue May 14 11:12:15 2002
+++ linux-2.5.10/include/linux/elfcore.h	Wed Jun  5 09:38:08 2002
@@ -86,4 +86,48 @@
 #define PRARGSZ ELF_PRARGSZ 
 #endif
 
+#ifdef __KERNEL__
+static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
+{
+#ifdef ELF_CORE_COPY_REGS
+	ELF_CORE_COPY_REGS((*elfregs), regs)
+#else
+	if (sizeof(elf_gregset_t) != sizeof(struct pt_regs)) {
+		printk("sizeof(elf_gregset_t) (%ld) != sizeof(struct pt_regs) (%ld)\n",
+			(long)sizeof(elf_gregset_t), (long)sizeof(struct pt_regs));
+	} else
+		*(struct pt_regs *)elfregs = *regs;
+#endif
+}
+
+static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
+{
+#ifdef ELF_CORE_COPY_TASK_REGS
+	
+	return ELF_CORE_COPY_TASK_REGS(t, elfregs);
+#endif
+	return 0;
+}
+
+extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
+
+static inline int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu)
+{
+#ifdef ELF_CORE_COPY_FPREGS
+	return ELF_CORE_COPY_FPREGS(t, fpu);
+#else
+	return dump_fpu(NULL, fpu);
+#endif
+}
+
+#ifdef ELF_CORE_COPY_XFPREGS
+static inline int elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
+{
+	return ELF_CORE_COPY_XFPREGS(t, xfpu);
+}
+#endif
+
+#endif /* __KERNEL__ */
+
+
 #endif /* _LINUX_ELFCORE_H */
diff -urN -X dontdiff linux2510.pure/include/linux/sched.h linux-2.5.10/include/linux/sched.h
--- linux2510.pure/include/linux/sched.h	Tue May 14 11:00:25 2002
+++ linux-2.5.10/include/linux/sched.h	Tue Jun  4 10:15:51 2002
@@ -130,6 +130,14 @@
 
 #include <linux/spinlock.h>
 
+
+/* functions for pausing and resumming functions 
+ * common mm's without using signals.  These are used
+ * by the multithreaded elf core dump code in binfmt_elf.c*/
+extern void tcore_suspend_threads( void );
+extern void tcore_resume_threads( void );
+
+
 /*
  * This serializes "schedule()" and also protects
  * the run-queue from deletions/modifications (but
diff -urN -X dontdiff linux2510.pure/include/linux/sysctl.h linux-2.5.10/include/linux/sysctl.h
--- linux2510.pure/include/linux/sysctl.h	Tue May 14 11:01:31 2002
+++ linux-2.5.10/include/linux/sysctl.h	Tue Jun  4 10:16:04 2002
@@ -87,6 +87,7 @@
 	KERN_CAP_BSET=14,	/* int: capability bounding set */
 	KERN_PANIC=15,		/* int: panic timeout */
 	KERN_REALROOTDEV=16,	/* real root device to mount after initrd */
+	KERN_CORE_DUMPS_THREADS=17, /* int: include status of others threads in dump */
 
 	KERN_SPARC_REBOOT=21,	/* reboot command on Sparc */
 	KERN_CTLALTDEL=22,	/* int: allow ctl-alt-del to reboot */
diff -urN -X dontdiff linux2510.pure/kernel/sched.c linux-2.5.10/kernel/sched.c
--- linux2510.pure/kernel/sched.c	Tue May 14 10:58:13 2002
+++ linux-2.5.10/kernel/sched.c	Tue Jun  4 10:11:34 2002
@@ -149,7 +149,8 @@
 	list_t migration_queue;
 } ____cacheline_aligned;
 
-static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
+static struct runqueue runqueues[NR_CPUS + 1] __cacheline_aligned;
+#define PHANTOM_CPU NR_CPUS
 
 #define cpu_rq(cpu)		(runqueues + (cpu))
 #define this_rq()		cpu_rq(smp_processor_id())
@@ -258,6 +259,9 @@
 #ifdef CONFIG_SMP
 	int need_resched, nrpolling;
 
+	if( unlikely(!p->cpus_allowed) )
+			return;
+			
 	preempt_disable();
 	/* minimise the chance of sending an interrupt to poll_idle() */
 	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
@@ -334,7 +338,7 @@
 	p->state = TASK_RUNNING;
 	if (!p->array) {
 		activate_task(p, rq);
-		if (p->prio < rq->curr->prio)
+		if (p->cpus_allowed && (p->prio < rq->curr->prio) )
 			resched_task(rq->curr);
 		success = 1;
 	}
@@ -991,6 +995,133 @@
 
 void scheduling_functions_end_here(void) { }
 
+/*
+ * needed for accurate core dumps of multi-threaded applications.
+ * see binfmt_elf.c for more information.
+ */
+static void reschedule_other_cpus(void)
+{
+#ifdef CONFIG_SMP
+	int i, cpu;
+	struct task_struct *p;
+
+	for(i=0; i< smp_num_cpus; i++) {
+		cpu = cpu_logical_map(i);
+		p = cpu_curr(cpu);
+		if (p->thread_info->cpu!= smp_processor_id()) {
+			set_tsk_need_resched(p);
+			smp_send_reschedule(p->thread_info->cpu);
+		}
+	}
+#endif	
+	return;
+}
+
+
+/* functions for pausing and resumming functions with out using signals */
+void tcore_suspend_threads(void)
+{
+	unsigned long flags;
+	runqueue_t *phantomQ;
+	task_t *threads[NR_CPUS], *p;
+	int i, OnCPUCount = 0;
+
+//
+// grab all the rq_locks.
+// current is the process dumping core
+//  
+
+	down_write(&current->mm->mmap_sem);
+	// avoid potential race on >2 way SMP if 3 or more thread procs
+	// dump core at the same time.
+	
+	local_irq_save(flags);
+
+	for(i=0; i< smp_num_cpus; i++) {
+		spin_lock(&cpu_rq(i)->lock);
+	}
+
+	current->cpus_allowed = 1UL << current->thread_info->cpu;
+	// prevent migraion of dumping process making life complicated.
+
+	phantomQ = cpu_rq(PHANTOM_CPU); 
+	spin_lock(&phantomQ->lock);
+	
+	reschedule_other_cpus();
+	// this is an optional IPI, but it makes for the most accurate core files possible.
+	
+	read_lock(&tasklist_lock);
+
+	for_each_task(p) {
+		if (current->mm == p->mm && current != p) {
+			if( p == task_rq(p)->curr ) {
+				//then remember it for later us of set_cpus_allowed
+				threads[OnCPUCount] = p;
+				p->cpus_allowed = 0;//prevent load balance from moving these guys.
+				OnCPUCount ++;
+			} else {
+				// we manualy move the process to the phantom run queue.
+
+				if (p->array) {
+					deactivate_task(p, task_rq(p));
+					activate_task(p, phantomQ);
+				}
+				p->thread_info->cpu = PHANTOM_CPU;
+				p->cpus_allowed = 0;//prevent load balance from moving these guys.
+			}
+		}
+	}
+	read_unlock(&tasklist_lock);
+
+	spin_unlock(&phantomQ->lock);
+	for(i=smp_num_cpus-1; 0<= i; i--) {
+		spin_unlock(&cpu_rq(i)->lock);
+	}
+
+	local_irq_restore(flags);
+
+	for( i = 0; i<OnCPUCount; i++) {
+		set_cpus_allowed(threads[i], 0);
+	}
+	
+	up_write(&current->mm->mmap_sem);
+}
+
+void tcore_resume_threads(void)
+{
+	unsigned long flags;
+	runqueue_t *phantomQ;
+	task_t *p;
+	int i;
+
+	local_irq_save(flags);
+	phantomQ = cpu_rq(PHANTOM_CPU);
+
+	spin_lock(&task_rq(current)->lock);
+	spin_lock(&phantomQ->lock);
+	
+	read_lock(&tasklist_lock);
+	for_each_task(p) {
+		if (current->mm == p->mm && current != p) {
+			p->cpus_allowed = 1UL << current->thread_info->cpu;
+			if (p->array) {
+				deactivate_task(p,phantomQ);
+				activate_task(p, task_rq(current));
+			}
+			p->thread_info->cpu = current->thread_info->cpu;
+		}
+	}
+
+	read_unlock(&tasklist_lock);
+
+	spin_unlock(&phantomQ->lock);
+	spin_unlock(&task_rq(current)->lock);
+
+	local_irq_restore(flags);
+}
+
+
+
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
@@ -1565,11 +1696,11 @@
 {
 	runqueue_t *rq;
 	int i, j, k;
+	prio_array_t *array;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		runqueue_t *rq = cpu_rq(i);
-		prio_array_t *array;
 
+	for (i = 0; i < NR_CPUS; i++) {
+		rq = cpu_rq(i);
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		spin_lock_init(&rq->lock);
@@ -1586,6 +1717,28 @@
 			__set_bit(MAX_PRIO, array->bitmap);
 		}
 	}
+
+ 
+	i = PHANTOM_CPU;
+	rq = cpu_rq(i);
+	rq->active = rq->arrays;
+	rq->expired = rq->arrays + 1;
+	rq->curr = NULL;
+	spin_lock_init(&rq->lock);
+	spin_lock_init(&rq->frozen);
+	INIT_LIST_HEAD(&rq->migration_queue);
+
+	for (j = 0; j < 2; j++) {
+		array = rq->arrays + j;
+		for (k = 0; k < MAX_PRIO; k++) {
+			INIT_LIST_HEAD(array->queue + k);
+			__clear_bit(k, array->bitmap);
+		}
+		// delimiter for bitsearch
+		__set_bit(MAX_PRIO, array->bitmap);
+	}
+
+
 	/*
 	 * We have to do a little magic to get the first
 	 * process right in SMP mode.
@@ -1645,9 +1798,11 @@
 	migration_req_t req;
 	runqueue_t *rq;
 
-	new_mask &= cpu_online_map;
-	if (!new_mask)
-		BUG();
+	if(new_mask){ // can be O for TCore process suspends
+		new_mask &= cpu_online_map;
+		if (!new_mask)
+			BUG();
+	}
 
 	preempt_disable();
 	rq = task_rq_lock(p, &flags);
@@ -1720,7 +1875,12 @@
 		spin_unlock_irqrestore(&rq->lock, flags);
 
 		p = req->task;
-		cpu_dest = __ffs(p->cpus_allowed);
+
+		if( p->cpus_allowed)
+			cpu_dest = __ffs(p->cpus_allowed);
+		else
+			cpu_dest = PHANTOM_CPU;
+
 		rq_dest = cpu_rq(cpu_dest);
 repeat:
 		cpu_src = p->thread_info->cpu;
diff -urN -X dontdiff linux2510.pure/kernel/sysctl.c linux-2.5.10/kernel/sysctl.c
--- linux2510.pure/kernel/sysctl.c	Wed Apr 24 00:15:10 2002
+++ linux-2.5.10/kernel/sysctl.c	Tue Jun  4 10:11:34 2002
@@ -38,6 +38,8 @@
 #include <linux/nfs_fs.h>
 #endif
 
+int core_dumps_threads = 1;
+
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
@@ -172,7 +174,9 @@
 	 0644, NULL, &proc_dointvec},
 	{KERN_TAINTED, "tainted", &tainted, sizeof(int),
 	 0644, NULL, &proc_dointvec},
-	{KERN_CAP_BSET, "cap-bound", &cap_bset, sizeof(kernel_cap_t),
+	{KERN_CORE_DUMPS_THREADS, "core_dumps_threads", &core_dumps_threads, sizeof(int),
+	 0644, NULL, &proc_dointvec},
+	 {KERN_CAP_BSET, "cap-bound", &cap_bset, sizeof(kernel_cap_t),
 	 0600, NULL, &proc_dointvec_bset},
 #ifdef CONFIG_BLK_DEV_INITRD
 	{KERN_REALROOTDEV, "real-root-dev", &real_root_dev, sizeof(int),

next             reply	other threads:[~2002-06-05 21:33 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-06-05 21:33 Gross, Mark [this message]
2002-06-06 16:13 ` [Linux-ia64] [RFC] Multithreaded core dump for 2.5.10-ia64 kernel David Mosberger
2002-06-06 16:28 ` [Linux-ia64] [RFC] Multithreaded core dump for 2.5.10-ia64 ke Gross, Mark
2002-06-06 16:35 ` David Mosberger

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=marc-linux-ia64-105590701905644@msgid-missing \
    --to=mark.gross@intel.com \
    --cc=linux-ia64@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox