LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* ppc32/kprobe: Fix a bug for kprobe stwu r1
From: Tiejun Chen @ 2011-12-12  8:50 UTC (permalink / raw)
  To: benh, linuxppc-dev

ppc32/kprobe: Fix a bug for kprobe stwu r1

There patches is used to fix that known kprobe bug,
[BUG?]3.0-rc4+ftrace+kprobe: set kprobe at instruction 'stwu' lead to system crash/freeze

https://lkml.org/lkml/2011/7/3/156

We withdraw the original way to provide a dedicated exception stack. Now we
implement this based on Ben's suggestion:

https://lkml.org/lkml/2011/11/30/327

Here I fix this bug only for ppc32 since Ben address another problem in ppc64
exception return codes. So I think I'd better send another patch to fix this
bug issued from ppc64 firstly. Then its convenient to merge this fix into ppc64.

Tiejun Chen (4):
      powerpc/kprobe: introduce a new thread flag
      ppc32/kprobe: introduce copy_exc_stack
      ppc32/kprobe: complete kprobe and migrate exception frame
      ppc32/kprobe: don't emulate store when kprobe stwu r1

 arch/powerpc/include/asm/page_32.h     |    1 +
 arch/powerpc/include/asm/thread_info.h |    2 ++
 arch/powerpc/kernel/entry_32.S         |   26 ++++++++++++++++++++++++++
 arch/powerpc/kernel/misc_32.S          |   16 +++++++++++++++-
 arch/powerpc/kernel/ppc_ksyms.c        |    1 +
 arch/powerpc/lib/sstep.c               |   19 +++++++++++++++++--
 6 files changed, 62 insertions(+), 3 deletions(-)

Tiejun

^ permalink raw reply

* [PATCH 2/4] ppc32/kprobe: introduce copy_exc_stack
From: Tiejun Chen @ 2011-12-12  8:50 UTC (permalink / raw)
  To: benh, linuxppc-dev
In-Reply-To: <1323679853-31751-1-git-send-email-tiejun.chen@windriver.com>

We need a copy mechanism to migrate exception stack. But looks copy_page()
already implement this well so we can complete copy_exc_stack() based on
that directly.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/include/asm/page_32.h |    1 +
 arch/powerpc/kernel/misc_32.S      |   16 +++++++++++++++-
 arch/powerpc/kernel/ppc_ksyms.c    |    1 +
 3 files changed, 17 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
index 68d73b2..2c1fd84 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -40,6 +40,7 @@ struct page;
 extern void clear_pages(void *page, int order);
 static inline void clear_page(void *page) { clear_pages(page, 0); }
 extern void copy_page(void *to, void *from);
+extern void copy_exc_stack(void *to, void *from);
 
 #include <asm-generic/getorder.h>
 
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 998a100..aa02545 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -527,7 +527,7 @@ _GLOBAL(clear_pages)
 	stw	r8,12(r3);	\
 	stwu	r9,16(r3)
 
-_GLOBAL(copy_page)
+ready_copy:
 	addi	r3,r3,-4
 	addi	r4,r4,-4
 
@@ -544,7 +544,21 @@ _GLOBAL(copy_page)
 	dcbt	r5,r4
 	li	r11,L1_CACHE_BYTES+4
 #endif /* MAX_COPY_PREFETCH */
+	blr
+
+_GLOBAL(copy_exc_stack)
+	mflr	r12
+	bl	ready_copy
+	mtlr	r12
+	li	r0,INT_FRAME_SIZE/L1_CACHE_BYTES - MAX_COPY_PREFETCH
+	b	go_copy
+
+_GLOBAL(copy_page)
+	mflr	r12
+	bl	ready_copy
+	mtlr	r12
 	li	r0,PAGE_SIZE/L1_CACHE_BYTES - MAX_COPY_PREFETCH
+go_copy:
 	crclr	4*cr0+eq
 2:
 	mtctr	r0
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index f5ae872..2223daf 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -88,6 +88,7 @@ EXPORT_SYMBOL(__clear_user);
 EXPORT_SYMBOL(__strncpy_from_user);
 EXPORT_SYMBOL(__strnlen_user);
 EXPORT_SYMBOL(copy_page);
+EXPORT_SYMBOL(copy_exc_stack);
 
 #if defined(CONFIG_PCI) && defined(CONFIG_PPC32)
 EXPORT_SYMBOL(isa_io_base);
-- 
1.5.6

^ permalink raw reply related

* [PATCH 1/4] powerpc/kprobe: introduce a new thread flag
From: Tiejun Chen @ 2011-12-12  8:50 UTC (permalink / raw)
  To: benh, linuxppc-dev
In-Reply-To: <1323679853-31751-1-git-send-email-tiejun.chen@windriver.com>

We need to add a new thread flag, TIF_KPROBE/_TIF_DELAYED_KPROBE,
for handling kprobe operation while exiting exception.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/include/asm/thread_info.h |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 836f231..3378734 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -112,6 +112,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_FREEZE		14	/* Freezing for suspend */
 #define TIF_SYSCALL_TRACEPOINT	15	/* syscall tracepoint instrumentation */
 #define TIF_RUNLATCH		16	/* Is the runlatch enabled? */
+#define TIF_KPROBE		17	/* Is the delayed kprobe operation? */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -130,6 +131,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_FREEZE		(1<<TIF_FREEZE)
 #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_RUNLATCH		(1<<TIF_RUNLATCH)
+#define _TIF_DELAYED_KPROBE	(1<<TIF_KPROBE)
 #define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 				 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT)
 
-- 
1.5.6

^ permalink raw reply related

* [PATCH 3/4] ppc32/kprobe: complete kprobe and migrate exception frame
From: Tiejun Chen @ 2011-12-12  8:50 UTC (permalink / raw)
  To: benh, linuxppc-dev
In-Reply-To: <1323679853-31751-1-git-send-email-tiejun.chen@windriver.com>

We can't emulate stwu since that may corrupt current exception stack.
So we will have to do real store operation in the exception return code.

Firstly we'll allocate a trampoline exception frame below the kprobed
function stack and copy the current exception frame to the trampoline.
Then we can do this real store operation to implement 'stwu', and reroute
the trampoline frame to r1 to complete this exception migration.

Signed-off-by: Tiejun Chen <tiejun.chen@windriver.com>
---
 arch/powerpc/kernel/entry_32.S |   26 ++++++++++++++++++++++++++
 1 files changed, 26 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 56212bc..d56e311 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -1185,6 +1185,8 @@ recheck:
 	bne-	do_resched
 	andi.	r0,r9,_TIF_USER_WORK_MASK
 	beq	restore_user
+	andis.	r0,r9,_TIF_DELAYED_KPROBE@h
+	bne-	restore_kprobe
 do_user_signal:			/* r10 contains MSR_KERNEL here */
 	ori	r10,r10,MSR_EE
 	SYNC
@@ -1202,6 +1204,30 @@ do_user_signal:			/* r10 contains MSR_KERNEL here */
 	REST_NVGPRS(r1)
 	b	recheck
 
+restore_kprobe:
+	lwz	r3,GPR1(r1)
+	subi    r3,r3,INT_FRAME_SIZE; /* Allocate a trampoline exception frame */
+	mr	r4,r1
+	bl	copy_exc_stack	/* Copy from the original to the trampoline */
+
+	/* Do real stw operation to complete stwu */
+	mr	r4,r1
+	addi	r4,r4,INT_FRAME_SIZE	/* Get kprobed entry */
+	lwz	r5,GPR1(r1)		/* Backup r1 */
+	stw	r4,GPR1(r1)		/* Now store that safely */
+
+	/* Reroute the trampoline frame to r1 */
+	subi    r5,r5,INT_FRAME_SIZE
+	mr	r1,r5
+
+	/* Clear _TIF_DELAYED_KPROBE flag */
+	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
+	lwz	r0,TI_FLAGS(r9)
+	rlwinm	r0,r0,0,_TIF_DELAYED_KPROBE
+	stw	r0,TI_FLAGS(r9)
+
+	b	restore
+
 /*
  * We come here when we are at the end of handling an exception
  * that occurred at a place where taking an exception will lose
-- 
1.5.6

^ permalink raw reply related

* [PATCH] block/swim3: Locking fixes
From: Benjamin Herrenschmidt @ 2011-12-12  4:57 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Jens Axboe

The old PowerMac swim3 driver has some "interesting" locking issues,
using a private lock and failing to lock the queue before completing
requests, which triggered WARN_ONs among others.

This rips out the private lock, makes everything operate under the
block queue lock, and generally makes things simpler.

We used to also share a queue between the two possible instances which
was problematic since we might pick the wrong controller in some cases,
so make the queue and the current request per-instance and use
queuedata to point to our private data which is a lot cleaner.

We still share the queue lock but then, it's nearly impossible to actually
use 2 swim3's simultaneously: one would need to have a Wallstreet
PowerBook, the only machine afaik with two of these on the motherboard,
and populate both hotswap bays with a floppy drive (the machine ships
only with one), so nobody cares...

While at it, add a little fix to clear up stale interrupts when loading
the driver or plugging a floppy drive in a bay.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 drivers/block/swim3.c |  362 +++++++++++++++++++++++++++++--------------------
 1 files changed, 215 insertions(+), 147 deletions(-)

diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index ae3e167..89ddab1 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -16,6 +16,8 @@
  * handle GCR disks
  */
 
+#undef DEBUG
+
 #include <linux/stddef.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -36,13 +38,11 @@
 #include <asm/machdep.h>
 #include <asm/pmac_feature.h>
 
-static DEFINE_MUTEX(swim3_mutex);
-static struct request_queue *swim3_queue;
-static struct gendisk *disks[2];
-static struct request *fd_req;
-
 #define MAX_FLOPPIES	2
 
+static DEFINE_MUTEX(swim3_mutex);
+static struct gendisk *disks[MAX_FLOPPIES];
+
 enum swim_state {
 	idle,
 	locating,
@@ -177,7 +177,6 @@ struct swim3 {
 
 struct floppy_state {
 	enum swim_state	state;
-	spinlock_t lock;
 	struct swim3 __iomem *swim3;	/* hardware registers */
 	struct dbdma_regs __iomem *dma;	/* DMA controller registers */
 	int	swim3_intr;	/* interrupt number for SWIM3 */
@@ -204,8 +203,20 @@ struct floppy_state {
 	int	wanted;
 	struct macio_dev *mdev;
 	char	dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)];
+	int	index;
+	struct request *cur_req;
 };
 
+#define swim3_err(fmt, arg...)	dev_err(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_warn(fmt, arg...)	dev_warn(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_info(fmt, arg...)	dev_info(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+
+#ifdef DEBUG
+#define swim3_dbg(fmt, arg...)	dev_dbg(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#else
+#define swim3_dbg(fmt, arg...)	do { } while(0)
+#endif
+
 static struct floppy_state floppy_states[MAX_FLOPPIES];
 static int floppy_count = 0;
 static DEFINE_SPINLOCK(swim3_lock);
@@ -224,17 +235,8 @@ static unsigned short write_postamble[] = {
 	0, 0, 0, 0, 0, 0
 };
 
-static void swim3_select(struct floppy_state *fs, int sel);
-static void swim3_action(struct floppy_state *fs, int action);
-static int swim3_readbit(struct floppy_state *fs, int bit);
-static void do_fd_request(struct request_queue * q);
-static void start_request(struct floppy_state *fs);
-static void set_timeout(struct floppy_state *fs, int nticks,
-			void (*proc)(unsigned long));
-static void scan_track(struct floppy_state *fs);
 static void seek_track(struct floppy_state *fs, int n);
 static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count);
-static void setup_transfer(struct floppy_state *fs);
 static void act(struct floppy_state *fs);
 static void scan_timeout(unsigned long data);
 static void seek_timeout(unsigned long data);
@@ -254,18 +256,21 @@ static unsigned int floppy_check_events(struct gendisk *disk,
 					unsigned int clearing);
 static int floppy_revalidate(struct gendisk *disk);
 
-static bool swim3_end_request(int err, unsigned int nr_bytes)
+static bool swim3_end_request(struct floppy_state *fs, int err, unsigned int nr_bytes)
 {
-	if (__blk_end_request(fd_req, err, nr_bytes))
-		return true;
+	struct request *req = fs->cur_req;
+	int rc;
 
-	fd_req = NULL;
-	return false;
-}
+	swim3_dbg("  end request, err=%d nr_bytes=%d, cur_req=%p\n",
+		  err, nr_bytes, req);
 
-static bool swim3_end_request_cur(int err)
-{
-	return swim3_end_request(err, blk_rq_cur_bytes(fd_req));
+	if (err)
+		nr_bytes = blk_rq_cur_bytes(req);
+	rc = __blk_end_request(req, err, nr_bytes);
+	if (rc)
+		return true;
+	fs->cur_req = NULL;
+	return false;
 }
 
 static void swim3_select(struct floppy_state *fs, int sel)
@@ -303,50 +308,53 @@ static int swim3_readbit(struct floppy_state *fs, int bit)
 	return (stat & DATA) == 0;
 }
 
-static void do_fd_request(struct request_queue * q)
-{
-	int i;
-
-	for(i=0; i<floppy_count; i++) {
-		struct floppy_state *fs = &floppy_states[i];
-		if (fs->mdev->media_bay &&
-		    check_media_bay(fs->mdev->media_bay) != MB_FD)
-			continue;
-		start_request(fs);
-	}
-}
-
 static void start_request(struct floppy_state *fs)
 {
 	struct request *req;
 	unsigned long x;
 
+	swim3_dbg("start request, initial state=%d\n", fs->state);
+
 	if (fs->state == idle && fs->wanted) {
 		fs->state = available;
 		wake_up(&fs->wait);
 		return;
 	}
 	while (fs->state == idle) {
-		if (!fd_req) {
-			fd_req = blk_fetch_request(swim3_queue);
-			if (!fd_req)
+		swim3_dbg("start request, idle loop, cur_req=%p\n", fs->cur_req);
+		if (!fs->cur_req) {
+			fs->cur_req = blk_fetch_request(disks[fs->index]->queue);
+			swim3_dbg("  fetched request %p\n", fs->cur_req);
+			if (!fs->cur_req)
 				break;
 		}
-		req = fd_req;
-#if 0
-		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
-		       req->rq_disk->disk_name, req->cmd,
-		       (long)blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
-		printk("           errors=%d current_nr_sectors=%u\n",
-		       req->errors, blk_rq_cur_sectors(req));
+		req = fs->cur_req;
+
+		if (fs->mdev->media_bay &&
+		    check_media_bay(fs->mdev->media_bay) != MB_FD) {
+			swim3_dbg("%s", "  media bay absent, dropping req\n");
+			swim3_end_request(fs, -ENODEV, 0);
+			continue;
+		}
+
+#if 0 /* This is really too verbose */
+		swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
+			  req->rq_disk->disk_name, req->cmd,
+			  (long)blk_rq_pos(req), blk_rq_sectors(req),
+			  req->buffer);
+		swim3_dbg("           errors=%d current_nr_sectors=%u\n",
+			  req->errors, blk_rq_cur_sectors(req));
 #endif
 
 		if (blk_rq_pos(req) >= fs->total_secs) {
-			swim3_end_request_cur(-EIO);
+			swim3_dbg("  pos out of bounds (%ld, max is %ld)\n",
+				  (long)blk_rq_pos(req), (long)fs->total_secs);
+			swim3_end_request(fs, -EIO, 0);
 			continue;
 		}
 		if (fs->ejected) {
-			swim3_end_request_cur(-EIO);
+			swim3_dbg("%s", "  disk ejected\n");
+			swim3_end_request(fs, -EIO, 0);
 			continue;
 		}
 
@@ -354,7 +362,8 @@ static void start_request(struct floppy_state *fs)
 			if (fs->write_prot < 0)
 				fs->write_prot = swim3_readbit(fs, WRITE_PROT);
 			if (fs->write_prot) {
-				swim3_end_request_cur(-EIO);
+				swim3_dbg("%s", "  try to write, disk write protected\n");
+				swim3_end_request(fs, -EIO, 0);
 				continue;
 			}
 		}
@@ -369,7 +378,6 @@ static void start_request(struct floppy_state *fs)
 		x = ((long)blk_rq_pos(req)) % fs->secpercyl;
 		fs->head = x / fs->secpertrack;
 		fs->req_sector = x % fs->secpertrack + 1;
-		fd_req = req;
 		fs->state = do_transfer;
 		fs->retries = 0;
 
@@ -377,12 +385,14 @@ static void start_request(struct floppy_state *fs)
 	}
 }
 
+static void do_fd_request(struct request_queue * q)
+{
+	start_request(q->queuedata);
+}
+
 static void set_timeout(struct floppy_state *fs, int nticks,
 			void (*proc)(unsigned long))
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&fs->lock, flags);
 	if (fs->timeout_pending)
 		del_timer(&fs->timeout);
 	fs->timeout.expires = jiffies + nticks;
@@ -390,7 +400,6 @@ static void set_timeout(struct floppy_state *fs, int nticks,
 	fs->timeout.data = (unsigned long) fs;
 	add_timer(&fs->timeout);
 	fs->timeout_pending = 1;
-	spin_unlock_irqrestore(&fs->lock, flags);
 }
 
 static inline void scan_track(struct floppy_state *fs)
@@ -442,40 +451,45 @@ static inline void setup_transfer(struct floppy_state *fs)
 	struct swim3 __iomem *sw = fs->swim3;
 	struct dbdma_cmd *cp = fs->dma_cmd;
 	struct dbdma_regs __iomem *dr = fs->dma;
+	struct request *req = fs->cur_req;
 
-	if (blk_rq_cur_sectors(fd_req) <= 0) {
-		printk(KERN_ERR "swim3: transfer 0 sectors?\n");
+	if (blk_rq_cur_sectors(req) <= 0) {
+		swim3_warn("%s", "Transfer 0 sectors ?\n");
 		return;
 	}
-	if (rq_data_dir(fd_req) == WRITE)
+	if (rq_data_dir(req) == WRITE)
 		n = 1;
 	else {
 		n = fs->secpertrack - fs->req_sector + 1;
-		if (n > blk_rq_cur_sectors(fd_req))
-			n = blk_rq_cur_sectors(fd_req);
+		if (n > blk_rq_cur_sectors(req))
+			n = blk_rq_cur_sectors(req);
 	}
+
+	swim3_dbg("  setup xfer at sect %d (of %d) head %d for %d\n",
+		  fs->req_sector, fs->secpertrack, fs->head, n);
+
 	fs->scount = n;
 	swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0);
 	out_8(&sw->sector, fs->req_sector);
 	out_8(&sw->nsect, n);
 	out_8(&sw->gap3, 0);
 	out_le32(&dr->cmdptr, virt_to_bus(cp));
-	if (rq_data_dir(fd_req) == WRITE) {
+	if (rq_data_dir(req) == WRITE) {
 		/* Set up 3 dma commands: write preamble, data, postamble */
 		init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
 		++cp;
-		init_dma(cp, OUTPUT_MORE, fd_req->buffer, 512);
+		init_dma(cp, OUTPUT_MORE, req->buffer, 512);
 		++cp;
 		init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
 	} else {
-		init_dma(cp, INPUT_LAST, fd_req->buffer, n * 512);
+		init_dma(cp, INPUT_LAST, req->buffer, n * 512);
 	}
 	++cp;
 	out_le16(&cp->command, DBDMA_STOP);
 	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
 	in_8(&sw->error);
 	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
-	if (rq_data_dir(fd_req) == WRITE)
+	if (rq_data_dir(req) == WRITE)
 		out_8(&sw->control_bis, WRITE_SECTORS);
 	in_8(&sw->intr);
 	out_le32(&dr->control, (RUN << 16) | RUN);
@@ -488,12 +502,16 @@ static inline void setup_transfer(struct floppy_state *fs)
 static void act(struct floppy_state *fs)
 {
 	for (;;) {
+		swim3_dbg("  act loop, state=%d, req_cyl=%d, cur_cyl=%d\n",
+			  fs->state, fs->req_cyl, fs->cur_cyl);
+
 		switch (fs->state) {
 		case idle:
 			return;		/* XXX shouldn't get here */
 
 		case locating:
 			if (swim3_readbit(fs, TRACK_ZERO)) {
+				swim3_dbg("%s", "    locate track 0\n");
 				fs->cur_cyl = 0;
 				if (fs->req_cyl == 0)
 					fs->state = do_transfer;
@@ -511,7 +529,7 @@ static void act(struct floppy_state *fs)
 				break;
 			}
 			if (fs->req_cyl == fs->cur_cyl) {
-				printk("whoops, seeking 0\n");
+				swim3_warn("%s", "Whoops, seeking 0\n");
 				fs->state = do_transfer;
 				break;
 			}
@@ -527,7 +545,9 @@ static void act(struct floppy_state *fs)
 		case do_transfer:
 			if (fs->cur_cyl != fs->req_cyl) {
 				if (fs->retries > 5) {
-					swim3_end_request_cur(-EIO);
+					swim3_err("Wrong cylinder in transfer, want: %d got %d\n",
+						  fs->req_cyl, fs->cur_cyl);
+					swim3_end_request(fs, -EIO, 0);
 					fs->state = idle;
 					return;
 				}
@@ -542,7 +562,7 @@ static void act(struct floppy_state *fs)
 			return;
 
 		default:
-			printk(KERN_ERR"swim3: unknown state %d\n", fs->state);
+			swim3_err("Unknown state %d\n", fs->state);
 			return;
 		}
 	}
@@ -552,59 +572,75 @@ static void scan_timeout(unsigned long data)
 {
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* scan timeout, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
 	fs->cur_cyl = -1;
 	if (fs->retries > 5) {
-		swim3_end_request_cur(-EIO);
+		swim3_end_request(fs, -EIO, 0);
 		fs->state = idle;
 		start_request(fs);
 	} else {
 		fs->state = jogging;
 		act(fs);
 	}
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static void seek_timeout(unsigned long data)
 {
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* seek timeout, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	out_8(&sw->control_bic, DO_SEEK);
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
-	printk(KERN_ERR "swim3: seek timeout\n");
-	swim3_end_request_cur(-EIO);
+	swim3_err("%s", "Seek timeout\n");
+	swim3_end_request(fs, -EIO, 0);
 	fs->state = idle;
 	start_request(fs);
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static void settle_timeout(unsigned long data)
 {
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* settle timeout, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	if (swim3_readbit(fs, SEEK_COMPLETE)) {
 		out_8(&sw->select, RELAX);
 		fs->state = locating;
 		act(fs);
-		return;
+		goto unlock;
 	}
 	out_8(&sw->select, RELAX);
 	if (fs->settle_time < 2*HZ) {
 		++fs->settle_time;
 		set_timeout(fs, 1, settle_timeout);
-		return;
+		goto unlock;
 	}
-	printk(KERN_ERR "swim3: seek settle timeout\n");
-	swim3_end_request_cur(-EIO);
+	swim3_err("%s", "Seek settle timeout\n");
+	swim3_end_request(fs, -EIO, 0);
 	fs->state = idle;
 	start_request(fs);
+ unlock:
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static void xfer_timeout(unsigned long data)
@@ -612,8 +648,12 @@ static void xfer_timeout(unsigned long data)
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
 	struct dbdma_regs __iomem *dr = fs->dma;
+	unsigned long flags;
 	int n;
 
+	swim3_dbg("* xfer timeout, state=%d\n", fs->state);
+
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->timeout_pending = 0;
 	out_le32(&dr->control, RUN << 16);
 	/* We must wait a bit for dbdma to stop */
@@ -622,12 +662,13 @@ static void xfer_timeout(unsigned long data)
 	out_8(&sw->intr_enable, 0);
 	out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
 	out_8(&sw->select, RELAX);
-	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
-	       (rq_data_dir(fd_req)==WRITE? "writ": "read"),
-	       (long)blk_rq_pos(fd_req));
-	swim3_end_request_cur(-EIO);
+	swim3_err("Timeout %sing sector %ld\n",
+	       (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"),
+	       (long)blk_rq_pos(fs->cur_req));
+	swim3_end_request(fs, -EIO, 0);
 	fs->state = idle;
 	start_request(fs);
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static irqreturn_t swim3_interrupt(int irq, void *dev_id)
@@ -638,12 +679,17 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 	int stat, resid;
 	struct dbdma_regs __iomem *dr;
 	struct dbdma_cmd *cp;
+	unsigned long flags;
+	struct request *req = fs->cur_req;
+
+	swim3_dbg("* interrupt, state=%d\n", fs->state);
 
+	spin_lock_irqsave(&swim3_lock, flags);
 	intr = in_8(&sw->intr);
 	err = (intr & ERROR_INTR)? in_8(&sw->error): 0;
 	if ((intr & ERROR_INTR) && fs->state != do_transfer)
-		printk(KERN_ERR "swim3_interrupt, state=%d, dir=%x, intr=%x, err=%x\n",
-		       fs->state, rq_data_dir(fd_req), intr, err);
+		swim3_err("Non-transfer error interrupt: state=%d, dir=%x, intr=%x, err=%x\n",
+			  fs->state, rq_data_dir(req), intr, err);
 	switch (fs->state) {
 	case locating:
 		if (intr & SEEN_SECTOR) {
@@ -653,10 +699,10 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			del_timer(&fs->timeout);
 			fs->timeout_pending = 0;
 			if (sw->ctrack == 0xff) {
-				printk(KERN_ERR "swim3: seen sector but cyl=ff?\n");
+				swim3_err("%s", "Seen sector but cyl=ff?\n");
 				fs->cur_cyl = -1;
 				if (fs->retries > 5) {
-					swim3_end_request_cur(-EIO);
+					swim3_end_request(fs, -EIO, 0);
 					fs->state = idle;
 					start_request(fs);
 				} else {
@@ -668,8 +714,8 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			fs->cur_cyl = sw->ctrack;
 			fs->cur_sector = sw->csect;
 			if (fs->expect_cyl != -1 && fs->expect_cyl != fs->cur_cyl)
-				printk(KERN_ERR "swim3: expected cyl %d, got %d\n",
-				       fs->expect_cyl, fs->cur_cyl);
+				swim3_err("Expected cyl %d, got %d\n",
+					  fs->expect_cyl, fs->cur_cyl);
 			fs->state = do_transfer;
 			act(fs);
 		}
@@ -704,7 +750,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 		fs->timeout_pending = 0;
 		dr = fs->dma;
 		cp = fs->dma_cmd;
-		if (rq_data_dir(fd_req) == WRITE)
+		if (rq_data_dir(req) == WRITE)
 			++cp;
 		/*
 		 * Check that the main data transfer has finished.
@@ -729,31 +775,32 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 		if (intr & ERROR_INTR) {
 			n = fs->scount - 1 - resid / 512;
 			if (n > 0) {
-				blk_update_request(fd_req, 0, n << 9);
+				blk_update_request(req, 0, n << 9);
 				fs->req_sector += n;
 			}
 			if (fs->retries < 5) {
 				++fs->retries;
 				act(fs);
 			} else {
-				printk("swim3: error %sing block %ld (err=%x)\n",
-				       rq_data_dir(fd_req) == WRITE? "writ": "read",
-				       (long)blk_rq_pos(fd_req), err);
-				swim3_end_request_cur(-EIO);
+				swim3_err("Error %sing block %ld (err=%x)\n",
+				       rq_data_dir(req) == WRITE? "writ": "read",
+				       (long)blk_rq_pos(req), err);
+				swim3_end_request(fs, -EIO, 0);
 				fs->state = idle;
 			}
 		} else {
 			if ((stat & ACTIVE) == 0 || resid != 0) {
 				/* musta been an error */
-				printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid);
-				printk(KERN_ERR "  state=%d, dir=%x, intr=%x, err=%x\n",
-				       fs->state, rq_data_dir(fd_req), intr, err);
-				swim3_end_request_cur(-EIO);
+				swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid);
+				swim3_err("  state=%d, dir=%x, intr=%x, err=%x\n",
+					  fs->state, rq_data_dir(req), intr, err);
+				swim3_end_request(fs, -EIO, 0);
 				fs->state = idle;
 				start_request(fs);
 				break;
 			}
-			if (swim3_end_request(0, fs->scount << 9)) {
+			fs->retries = 0;
+			if (swim3_end_request(fs, 0, fs->scount << 9)) {
 				fs->req_sector += fs->scount;
 				if (fs->req_sector > fs->secpertrack) {
 					fs->req_sector -= fs->secpertrack;
@@ -770,8 +817,9 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			start_request(fs);
 		break;
 	default:
-		printk(KERN_ERR "swim3: don't know what to do in state %d\n", fs->state);
+		swim3_err("Don't know what to do in state %d\n", fs->state);
 	}
+	spin_unlock_irqrestore(&swim3_lock, flags);
 	return IRQ_HANDLED;
 }
 
@@ -781,26 +829,31 @@ static void fd_dma_interrupt(int irq, void *dev_id)
 }
 */
 
+/* Called under the mutex to grab exclusive access to a drive */
 static int grab_drive(struct floppy_state *fs, enum swim_state state,
 		      int interruptible)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&fs->lock, flags);
-	if (fs->state != idle) {
+	swim3_dbg("%s", "-> grab drive\n");
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	if (fs->state != idle && fs->state != available) {
 		++fs->wanted;
 		while (fs->state != available) {
+			spin_unlock_irqrestore(&swim3_lock, flags);
 			if (interruptible && signal_pending(current)) {
 				--fs->wanted;
-				spin_unlock_irqrestore(&fs->lock, flags);
 				return -EINTR;
 			}
 			interruptible_sleep_on(&fs->wait);
+			spin_lock_irqsave(&swim3_lock, flags);
 		}
 		--fs->wanted;
 	}
 	fs->state = state;
-	spin_unlock_irqrestore(&fs->lock, flags);
+	spin_unlock_irqrestore(&swim3_lock, flags);
+
 	return 0;
 }
 
@@ -808,10 +861,12 @@ static void release_drive(struct floppy_state *fs)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&fs->lock, flags);
+	swim3_dbg("%s", "-> release drive\n");
+
+	spin_lock_irqsave(&swim3_lock, flags);
 	fs->state = idle;
 	start_request(fs);
-	spin_unlock_irqrestore(&fs->lock, flags);
+	spin_unlock_irqrestore(&swim3_lock, flags);
 }
 
 static int fd_eject(struct floppy_state *fs)
@@ -966,6 +1021,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 {
 	struct floppy_state *fs = disk->private_data;
 	struct swim3 __iomem *sw = fs->swim3;
+
 	mutex_lock(&swim3_mutex);
 	if (fs->ref_count > 0 && --fs->ref_count == 0) {
 		swim3_action(fs, MOTOR_OFF);
@@ -1031,30 +1087,48 @@ static const struct block_device_operations floppy_fops = {
 	.revalidate_disk= floppy_revalidate,
 };
 
+static void swim3_mb_event(struct macio_dev* mdev, int mb_state)
+{
+	struct floppy_state *fs = macio_get_drvdata(mdev);
+	struct swim3 __iomem *sw = fs->swim3;
+
+	if (!fs)
+		return;
+	if (mb_state != MB_FD)
+		return;
+
+	/* Clear state */
+	out_8(&sw->intr_enable, 0);
+	in_8(&sw->intr);
+	in_8(&sw->error);
+}
+
 static int swim3_add_device(struct macio_dev *mdev, int index)
 {
 	struct device_node *swim = mdev->ofdev.dev.of_node;
 	struct floppy_state *fs = &floppy_states[index];
 	int rc = -EBUSY;
 
+	/* Do this first for message macros */
+	memset(fs, 0, sizeof(*fs));
+	fs->mdev = mdev;
+	fs->index = index;
+
 	/* Check & Request resources */
 	if (macio_resource_count(mdev) < 2) {
-		printk(KERN_WARNING "ifd%d: no address for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "No address in device-tree\n");
 		return -ENXIO;
 	}
-	if (macio_irq_count(mdev) < 2) {
-		printk(KERN_WARNING "fd%d: no intrs for device %s\n",
-			index, swim->full_name);
+	if (macio_irq_count(mdev) < 1) {
+		swim3_err("%s", "No interrupt in device-tree\n");
+		return -ENXIO;
 	}
 	if (macio_request_resource(mdev, 0, "swim3 (mmio)")) {
-		printk(KERN_ERR "fd%d: can't request mmio resource for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Can't request mmio resource\n");
 		return -EBUSY;
 	}
 	if (macio_request_resource(mdev, 1, "swim3 (dma)")) {
-		printk(KERN_ERR "fd%d: can't request dma resource for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Can't request dma resource\n");
 		macio_release_resource(mdev, 0);
 		return -EBUSY;
 	}
@@ -1063,22 +1137,18 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 	if (mdev->media_bay == NULL)
 		pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1);
 	
-	memset(fs, 0, sizeof(*fs));
-	spin_lock_init(&fs->lock);
 	fs->state = idle;
 	fs->swim3 = (struct swim3 __iomem *)
 		ioremap(macio_resource_start(mdev, 0), 0x200);
 	if (fs->swim3 == NULL) {
-		printk("fd%d: couldn't map registers for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Couldn't map mmio registers\n");
 		rc = -ENOMEM;
 		goto out_release;
 	}
 	fs->dma = (struct dbdma_regs __iomem *)
 		ioremap(macio_resource_start(mdev, 1), 0x200);
 	if (fs->dma == NULL) {
-		printk("fd%d: couldn't map DMA for %s\n",
-		       index, swim->full_name);
+		swim3_err("%s", "Couldn't map dma registers\n");
 		iounmap(fs->swim3);
 		rc = -ENOMEM;
 		goto out_release;
@@ -1090,31 +1160,25 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 	fs->secpercyl = 36;
 	fs->secpertrack = 18;
 	fs->total_secs = 2880;
-	fs->mdev = mdev;
 	init_waitqueue_head(&fs->wait);
 
 	fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space);
 	memset(fs->dma_cmd, 0, 2 * sizeof(struct dbdma_cmd));
 	st_le16(&fs->dma_cmd[1].command, DBDMA_STOP);
 
+	if (mdev->media_bay == NULL || check_media_bay(mdev->media_bay) == MB_FD)
+		swim3_mb_event(mdev, MB_FD);
+
 	if (request_irq(fs->swim3_intr, swim3_interrupt, 0, "SWIM3", fs)) {
-		printk(KERN_ERR "fd%d: couldn't request irq %d for %s\n",
-		       index, fs->swim3_intr, swim->full_name);
+		swim3_err("%s", "Couldn't request interrupt\n");
 		pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 0);
 		goto out_unmap;
 		return -EBUSY;
 	}
-/*
-	if (request_irq(fs->dma_intr, fd_dma_interrupt, 0, "SWIM3-dma", fs)) {
-		printk(KERN_ERR "Couldn't get irq %d for SWIM3 DMA",
-		       fs->dma_intr);
-		return -EBUSY;
-	}
-*/
 
 	init_timer(&fs->timeout);
 
-	printk(KERN_INFO "fd%d: SWIM3 floppy controller %s\n", floppy_count,
+	swim3_info("SWIM3 floppy controller %s\n",
 		mdev->media_bay ? "in media bay" : "");
 
 	return 0;
@@ -1132,41 +1196,42 @@ static int swim3_add_device(struct macio_dev *mdev, int index)
 
 static int __devinit swim3_attach(struct macio_dev *mdev, const struct of_device_id *match)
 {
-	int i, rc;
 	struct gendisk *disk;
+	int index, rc;
+
+	index = floppy_count++;
+	if (index >= MAX_FLOPPIES)
+		return -ENXIO;
 
 	/* Add the drive */
-	rc = swim3_add_device(mdev, floppy_count);
+	rc = swim3_add_device(mdev, index);
 	if (rc)
 		return rc;
+	/* Now register that disk. Same comment about failure handling */
+	disk = disks[index] = alloc_disk(1);
+	if (disk == NULL)
+		return -ENOMEM;
+	disk->queue = blk_init_queue(do_fd_request, &swim3_lock);
+	if (disk->queue == NULL) {
+		put_disk(disk);
+		return -ENOMEM;
+	}
+	disk->queue->queuedata = &floppy_states[index];
 
-	/* Now create the queue if not there yet */
-	if (swim3_queue == NULL) {
+	if (index == 0) {
 		/* If we failed, there isn't much we can do as the driver is still
 		 * too dumb to remove the device, just bail out
 		 */
 		if (register_blkdev(FLOPPY_MAJOR, "fd"))
 			return 0;
-		swim3_queue = blk_init_queue(do_fd_request, &swim3_lock);
-		if (swim3_queue == NULL) {
-			unregister_blkdev(FLOPPY_MAJOR, "fd");
-			return 0;
-		}
 	}
 
-	/* Now register that disk. Same comment about failure handling */
-	i = floppy_count++;
-	disk = disks[i] = alloc_disk(1);
-	if (disk == NULL)
-		return 0;
-
 	disk->major = FLOPPY_MAJOR;
-	disk->first_minor = i;
+	disk->first_minor = index;
 	disk->fops = &floppy_fops;
-	disk->private_data = &floppy_states[i];
-	disk->queue = swim3_queue;
+	disk->private_data = &floppy_states[index];
 	disk->flags |= GENHD_FL_REMOVABLE;
-	sprintf(disk->disk_name, "fd%d", i);
+	sprintf(disk->disk_name, "fd%d", index);
 	set_capacity(disk, 2880);
 	add_disk(disk);
 
@@ -1194,6 +1259,9 @@ static struct macio_driver swim3_driver =
 		.of_match_table	= swim3_match,
 	},
 	.probe		= swim3_attach,
+#ifdef CONFIG_PMAC_MEDIABAY
+	.mediabay_event	= swim3_mb_event,
+#endif
 #if 0
 	.suspend	= swim3_suspend,
 	.resume		= swim3_resume,

^ permalink raw reply related

* Re: linux-next bad Kconfig for drivers/hid
From: Tony Breeds @ 2011-12-12  0:31 UTC (permalink / raw)
  To: Jiri Kosina; +Cc: Jeremy Fitzhardinge, LinuxPPC-dev, Linux Kernel ML
In-Reply-To: <alpine.LNX.2.00.1112120020330.21970@pobox.suse.cz>

[-- Attachment #1: Type: text/plain, Size: 844 bytes --]

On Mon, Dec 12, 2011 at 12:21:16AM +0100, Jiri Kosina wrote:
> On Thu, 8 Dec 2011, Jeremy Fitzhardinge wrote:
> > 
> > Hm.  How about making it "depends on HID && POWER_SUPPLY"?  I think that
> > would needlessly disable it if HID is also modular, but I'm not sure how
> > to fix that.  "depends on HID && POWER_SUPPLY && HID == POWER_SUPPLY"?

That would work, but I think technically I think you could end up with
HID=m and POWER_SUPPLY=m which would still allow HID_BATTERY_STRENGTH=y
which is the same problem.

I don't know what kind of .config contortions you'd need to do to get
there.
 
> How about making it 'default POWER_SUPPLY' instead?

By itself that wont help as POWER_SUPPLY=m statisfies.

So it looks like we have Jeremy's:
	HID && POWER_SUPPLY && HID == POWER_SUPPLY
or my:
	POWER_SUPPLY=y

Yours Tony

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH 01/16 v3] pmac_zilog: fix unexpected irq
From: Benjamin Herrenschmidt @ 2011-12-11 23:55 UTC (permalink / raw)
  To: Finn Thain; +Cc: linuxppc-dev, linux-m68k, Geert Uytterhoeven, linux-serial
In-Reply-To: <1323647315.19891.10.camel@pasglop>

On Mon, 2011-12-12 at 10:48 +1100, Benjamin Herrenschmidt wrote:
> Any chance you can test this patch ? I would not be surprised if it
> broke m68k since I had to do some of the changes in there "blind",
> so let me know... with this, I can again suspend/resume properly on
> a Pismo while using the internal modem among other things.

 ..../....

Forgot to commit a fix before sending, but it's a trivial one:

> -#ifdef DEBUG_HARD
> +#ifde DEBUG_HARD
>  	pmz_debug("irq, r3: %x\n", r3);
>  #endif

Remove that hunk :-)

Cheers,
Ben.

^ permalink raw reply

* [PATCH] powerpc/pmac: Simplify old pmac PIC interrupt handling
From: Benjamin Herrenschmidt @ 2011-12-11 23:50 UTC (permalink / raw)
  To: linuxppc-dev

In the old days, we treated all interrupts from the legacy Apple home made
interrupt controllers as level, with a trick reading the "level" register
along with the "event" register to work arounds bugs where it would
occasionally fail to latch some events.

Doing so appeared to work fine for both level and edge interrupts.

Later on, we discovered in Darwin source the magic masks that define which
interrupts are actually level and which are edge, and implemented a
different algorithm, more similar to what Apple does, that treats those
differently.

I recently discovered however that this caused problems (including loss
of interrupts) with an old Wallstreet PowerBook when trying to use the
internal modem (connected to a cascaded controller).

It looks like some interrupts are treated as edge while they are really
level and I'm starting to seriously doubt the correctness of the Darwin
code (which has other obvious bugs when you read it, so ...)

This patch reverts to our original behaviour of treating everything as
a level interrupt. It appears to solve the problems with the modem on
the Wallstreet and everything else seems to be working properly as well.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/platforms/powermac/pic.c |   34 +++++---------------------------
 1 files changed, 6 insertions(+), 28 deletions(-)

Please test on anything pre-MPIC: all oldworld and very few newworld
such as the Lombard PowerBook (101) or the original iMac.

diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index d8aedf1..7761aab 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -52,13 +52,8 @@ struct device_node *of_irq_dflt_pic;
 /* Default addresses */
 static volatile struct pmac_irq_hw __iomem *pmac_irq_hw[4];
 
-#define GC_LEVEL_MASK		0x3ff00000
-#define OHARE_LEVEL_MASK	0x1ff00000
-#define HEATHROW_LEVEL_MASK	0x1ff00000
-
 static int max_irqs;
 static int max_real_irqs;
-static u32 level_mask[4];
 
 static DEFINE_RAW_SPINLOCK(pmac_pic_lock);
 
@@ -217,8 +212,7 @@ static irqreturn_t gatwick_action(int cpl, void *dev_id)
 	for (irq = max_irqs; (irq -= 32) >= max_real_irqs; ) {
 		int i = irq >> 5;
 		bits = in_le32(&pmac_irq_hw[i]->event) | ppc_lost_interrupts[i];
-		/* We must read level interrupts from the level register */
-		bits |= (in_le32(&pmac_irq_hw[i]->level) & level_mask[i]);
+		bits |= in_le32(&pmac_irq_hw[i]->level);
 		bits &= ppc_cached_irq_mask[i];
 		if (bits == 0)
 			continue;
@@ -248,8 +242,7 @@ static unsigned int pmac_pic_get_irq(void)
 	for (irq = max_real_irqs; (irq -= 32) >= 0; ) {
 		int i = irq >> 5;
 		bits = in_le32(&pmac_irq_hw[i]->event) | ppc_lost_interrupts[i];
-		/* We must read level interrupts from the level register */
-		bits |= (in_le32(&pmac_irq_hw[i]->level) & level_mask[i]);
+		bits |= in_le32(&pmac_irq_hw[i]->level);
 		bits &= ppc_cached_irq_mask[i];
 		if (bits == 0)
 			continue;
@@ -284,19 +277,14 @@ static int pmac_pic_host_match(struct irq_host *h, struct device_node *node)
 static int pmac_pic_host_map(struct irq_host *h, unsigned int virq,
 			     irq_hw_number_t hw)
 {
-	int level;
-
 	if (hw >= max_irqs)
 		return -EINVAL;
 
 	/* Mark level interrupts, set delayed disable for edge ones and set
 	 * handlers
 	 */
-	level = !!(level_mask[hw >> 5] & (1UL << (hw & 0x1f)));
-	if (level)
-		irq_set_status_flags(virq, IRQ_LEVEL);
-	irq_set_chip_and_handler(virq, &pmac_pic,
-				 level ? handle_level_irq : handle_edge_irq);
+	irq_set_status_flags(virq, IRQ_LEVEL);
+	irq_set_chip_and_handler(virq, &pmac_pic, handle_level_irq);
 	return 0;
 }
 
@@ -334,21 +322,14 @@ static void __init pmac_pic_probe_oldstyle(void)
 
 	if ((master = of_find_node_by_name(NULL, "gc")) != NULL) {
 		max_irqs = max_real_irqs = 32;
-		level_mask[0] = GC_LEVEL_MASK;
 	} else if ((master = of_find_node_by_name(NULL, "ohare")) != NULL) {
 		max_irqs = max_real_irqs = 32;
-		level_mask[0] = OHARE_LEVEL_MASK;
-
 		/* We might have a second cascaded ohare */
 		slave = of_find_node_by_name(NULL, "pci106b,7");
-		if (slave) {
+		if (slave)
 			max_irqs = 64;
-			level_mask[1] = OHARE_LEVEL_MASK;
-		}
 	} else if ((master = of_find_node_by_name(NULL, "mac-io")) != NULL) {
 		max_irqs = max_real_irqs = 64;
-		level_mask[0] = HEATHROW_LEVEL_MASK;
-		level_mask[1] = 0;
 
 		/* We might have a second cascaded heathrow */
 		slave = of_find_node_by_name(master, "mac-io");
@@ -363,11 +344,8 @@ static void __init pmac_pic_probe_oldstyle(void)
 		}
 
 		/* We found a slave */
-		if (slave) {
+		if (slave)
 			max_irqs = 128;
-			level_mask[2] = HEATHROW_LEVEL_MASK;
-			level_mask[3] = 0;
-		}
 	}
 	BUG_ON(master == NULL);
 

^ permalink raw reply related

* Re: [PATCH 01/16 v3] pmac_zilog: fix unexpected irq
From: Benjamin Herrenschmidt @ 2011-12-11 23:48 UTC (permalink / raw)
  To: Finn Thain; +Cc: linuxppc-dev, linux-m68k, Geert Uytterhoeven, linux-serial
In-Reply-To: <alpine.LNX.2.00.1112082128330.2357@nippy.intranet>

Any chance you can test this patch ? I would not be surprised if it
broke m68k since I had to do some of the changes in there "blind",
so let me know... with this, I can again suspend/resume properly on
a Pismo while using the internal modem among other things.

>From c2dbe7117bb94c59a4b2a215fc87fe7eabb7658d Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 12 Dec 2011 10:44:08 +1100
Subject: [PATCH 2/2] tty/serial/pmac_zilog: Fix suspend & resume

This patch reworks & simplifies pmac_zilog handling of suspend/resume,
essentially removing all the specific code in there and using the
generic uart helpers.

This required properly registering the tty as a child of the macio (or platform)
device, so I had to delay the registration a bit (we used to register the ports
very very early). We still register the kernel console early though.

I removed a couple of unused or useless flags as well, relying on the
core to not call us when asleep. I also removed the essentially useless
interrupt mutex, simplifying the locking a bit.

I removed some code for handling unexpected interrupt which should never
be hit and could potentially be harmful (causing us to access a register
on a powered off SCC). We diable port interrupts on close always so there
should be no need to drain data on a closed port.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 drivers/tty/serial/pmac_zilog.c |  350 ++++++++++-----------------------------
 drivers/tty/serial/pmac_zilog.h |   18 +-
 2 files changed, 99 insertions(+), 269 deletions(-)

diff --git a/drivers/tty/serial/pmac_zilog.c b/drivers/tty/serial/pmac_zilog.c
index 51941f0..46cb39b 100644
--- a/drivers/tty/serial/pmac_zilog.c
+++ b/drivers/tty/serial/pmac_zilog.c
@@ -106,7 +106,6 @@ MODULE_LICENSE("GPL");
  */
 static struct uart_pmac_port	pmz_ports[MAX_ZS_PORTS];
 static int			pmz_ports_count;
-static DEFINE_MUTEX(pmz_irq_mutex);
 
 static struct uart_driver pmz_uart_reg = {
 	.owner		=	THIS_MODULE,
@@ -126,9 +125,6 @@ static void pmz_load_zsregs(struct uart_pmac_port *uap, u8 *regs)
 {
 	int i;
 
-	if (ZS_IS_ASLEEP(uap))
-		return;
-
 	/* Let pending transmits finish.  */
 	for (i = 0; i < 1000; i++) {
 		unsigned char stat = read_zsreg(uap, R1);
@@ -234,26 +230,6 @@ static struct tty_struct *pmz_receive_chars(struct uart_pmac_port *uap)
 	unsigned char ch, r1, drop, error, flag;
 	int loops = 0;
 
-	/* The interrupt can be enabled when the port isn't open, typically
-	 * that happens when using one port is open and the other closed (stale
-	 * interrupt) or when one port is used as a console.
-	 */
-	if (!ZS_IS_OPEN(uap)) {
-		pmz_debug("pmz: draining input\n");
-		/* Port is closed, drain input data */
-		for (;;) {
-			if ((++loops) > 1000)
-				goto flood;
-			(void)read_zsreg(uap, R1);
-			write_zsreg(uap, R0, ERR_RES);
-			(void)read_zsdata(uap);
-			ch = read_zsreg(uap, R0);
-			if (!(ch & Rx_CH_AV))
-				break;
-		}
-		return NULL;
-	}
-
 	/* Sanity check, make sure the old bug is no longer happening */
 	if (uap->port.state == NULL || uap->port.state->port.tty == NULL) {
 		WARN_ON(1);
@@ -393,8 +369,6 @@ static void pmz_transmit_chars(struct uart_pmac_port *uap)
 {
 	struct circ_buf *xmit;
 
-	if (ZS_IS_ASLEEP(uap))
-		return;
 	if (ZS_IS_CONS(uap)) {
 		unsigned char status = read_zsreg(uap, R0);
 
@@ -485,12 +459,16 @@ static irqreturn_t pmz_interrupt(int irq, void *dev_id)
 	spin_lock(&uap_a->port.lock);
 	r3 = read_zsreg(uap_a, R3);
 
-#ifdef DEBUG_HARD
+#ifde DEBUG_HARD
 	pmz_debug("irq, r3: %x\n", r3);
 #endif
 	/* Channel A */
 	tty = NULL;
 	if (r3 & (CHAEXT | CHATxIP | CHARxIP)) {
+		if (!ZS_IS_OPEN(uap_a)) {
+			pmz_debug("ChanA interrupt while open !\n");
+			goto skip_a;
+		}
 		write_zsreg(uap_a, R0, RES_H_IUS);
 		zssync(uap_a);		
 		if (r3 & CHAEXT)
@@ -501,16 +479,21 @@ static irqreturn_t pmz_interrupt(int irq, void *dev_id)
 			pmz_transmit_chars(uap_a);
 		rc = IRQ_HANDLED;
 	}
+ skip_a:
 	spin_unlock(&uap_a->port.lock);
 	if (tty != NULL)
 		tty_flip_buffer_push(tty);
 
-	if (uap_b->node == NULL)
+	if (!uap_b)
 		goto out;
 
 	spin_lock(&uap_b->port.lock);
 	tty = NULL;
 	if (r3 & (CHBEXT | CHBTxIP | CHBRxIP)) {
+		if (!ZS_IS_OPEN(uap_a)) {
+			pmz_debug("ChanB interrupt while open !\n");
+			goto skip_b;
+		}
 		write_zsreg(uap_b, R0, RES_H_IUS);
 		zssync(uap_b);
 		if (r3 & CHBEXT)
@@ -521,14 +504,12 @@ static irqreturn_t pmz_interrupt(int irq, void *dev_id)
 			pmz_transmit_chars(uap_b);
 		rc = IRQ_HANDLED;
 	}
+ skip_b:
 	spin_unlock(&uap_b->port.lock);
 	if (tty != NULL)
 		tty_flip_buffer_push(tty);
 
  out:
-#ifdef DEBUG_HARD
-	pmz_debug("irq done.\n");
-#endif
 	return rc;
 }
 
@@ -553,12 +534,8 @@ static inline u8 pmz_peek_status(struct uart_pmac_port *uap)
  */
 static unsigned int pmz_tx_empty(struct uart_port *port)
 {
-	struct uart_pmac_port *uap = to_pmz(port);
 	unsigned char status;
 
-	if (ZS_IS_ASLEEP(uap) || uap->node == NULL)
-		return TIOCSER_TEMT;
-
 	status = pmz_peek_status(to_pmz(port));
 	if (status & Tx_BUF_EMP)
 		return TIOCSER_TEMT;
@@ -580,8 +557,7 @@ static void pmz_set_mctrl(struct uart_port *port, unsigned int mctrl)
 	if (ZS_IS_IRDA(uap))
 		return;
 	/* We get called during boot with a port not up yet */
-	if (ZS_IS_ASLEEP(uap) ||
-	    !(ZS_IS_OPEN(uap) || ZS_IS_CONS(uap)))
+	if (!(ZS_IS_OPEN(uap) || ZS_IS_CONS(uap)))
 		return;
 
 	set_bits = clear_bits = 0;
@@ -600,8 +576,7 @@ static void pmz_set_mctrl(struct uart_port *port, unsigned int mctrl)
 	/* NOTE: Not subject to 'transmitter active' rule.  */ 
 	uap->curregs[R5] |= set_bits;
 	uap->curregs[R5] &= ~clear_bits;
-	if (ZS_IS_ASLEEP(uap))
-		return;
+
 	write_zsreg(uap, R5, uap->curregs[R5]);
 	pmz_debug("pmz_set_mctrl: set bits: %x, clear bits: %x -> %x\n",
 		  set_bits, clear_bits, uap->curregs[R5]);
@@ -619,9 +594,6 @@ static unsigned int pmz_get_mctrl(struct uart_port *port)
 	unsigned char status;
 	unsigned int ret;
 
-	if (ZS_IS_ASLEEP(uap) || uap->node == NULL)
-		return 0;
-
 	status = read_zsreg(uap, R0);
 
 	ret = 0;
@@ -659,9 +631,6 @@ static void pmz_start_tx(struct uart_port *port)
 	uap->flags |= PMACZILOG_FLAG_TX_ACTIVE;
 	uap->flags &= ~PMACZILOG_FLAG_TX_STOPPED;
 
-	if (ZS_IS_ASLEEP(uap) || uap->node == NULL)
-		return;
-
 	status = read_zsreg(uap, R0);
 
 	/* TX busy?  Just wait for the TX done interrupt.  */
@@ -700,9 +669,6 @@ static void pmz_stop_rx(struct uart_port *port)
 {
 	struct uart_pmac_port *uap = to_pmz(port);
 
-	if (ZS_IS_ASLEEP(uap) || uap->node == NULL)
-		return;
-
 	pmz_debug("pmz: stop_rx()()\n");
 
 	/* Disable all RX interrupts.  */
@@ -721,14 +687,12 @@ static void pmz_enable_ms(struct uart_port *port)
 	struct uart_pmac_port *uap = to_pmz(port);
 	unsigned char new_reg;
 
-	if (ZS_IS_IRDA(uap) || uap->node == NULL)
+	if (ZS_IS_IRDA(uap))
 		return;
 	new_reg = uap->curregs[R15] | (DCDIE | SYNCIE | CTSIE);
 	if (new_reg != uap->curregs[R15]) {
 		uap->curregs[R15] = new_reg;
 
-		if (ZS_IS_ASLEEP(uap))
-			return;
 		/* NOTE: Not subject to 'transmitter active' rule. */
 		write_zsreg(uap, R15, uap->curregs[R15]);
 	}
@@ -744,8 +708,6 @@ static void pmz_break_ctl(struct uart_port *port, int break_state)
 	unsigned char set_bits, clear_bits, new_reg;
 	unsigned long flags;
 
-	if (uap->node == NULL)
-		return;
 	set_bits = clear_bits = 0;
 
 	if (break_state)
@@ -758,12 +720,6 @@ static void pmz_break_ctl(struct uart_port *port, int break_state)
 	new_reg = (uap->curregs[R5] | set_bits) & ~clear_bits;
 	if (new_reg != uap->curregs[R5]) {
 		uap->curregs[R5] = new_reg;
-
-		/* NOTE: Not subject to 'transmitter active' rule. */
-		if (ZS_IS_ASLEEP(uap)) {
-			spin_unlock_irqrestore(&port->lock, flags);
-			return;
-		}
 		write_zsreg(uap, R5, uap->curregs[R5]);
 	}
 
@@ -937,14 +893,21 @@ static int __pmz_startup(struct uart_pmac_port *uap)
 
 static void pmz_irda_reset(struct uart_pmac_port *uap)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&uap->port.lock, flags);
 	uap->curregs[R5] |= DTR;
 	write_zsreg(uap, R5, uap->curregs[R5]);
 	zssync(uap);
-	mdelay(110);
+	spin_unlock_irqrestore(&uap->port.lock, flags);
+	msleep(110);
+
+	spin_lock_irqsave(&uap->port.lock, flags);
 	uap->curregs[R5] &= ~DTR;
 	write_zsreg(uap, R5, uap->curregs[R5]);
 	zssync(uap);
-	mdelay(10);
+	spin_unlock_irqrestore(&uap->port.lock, flags);
+	msleep(10);
 }
 
 /*
@@ -959,13 +922,6 @@ static int pmz_startup(struct uart_port *port)
 
 	pmz_debug("pmz: startup()\n");
 
-	if (ZS_IS_ASLEEP(uap))
-		return -EAGAIN;
-	if (uap->node == NULL)
-		return -ENODEV;
-
-	mutex_lock(&pmz_irq_mutex);
-
 	uap->flags |= PMACZILOG_FLAG_IS_OPEN;
 
 	/* A console is never powered down. Else, power up and
@@ -976,18 +932,14 @@ static int pmz_startup(struct uart_port *port)
 		pwr_delay = __pmz_startup(uap);
 		spin_unlock_irqrestore(&port->lock, flags);
 	}	
-
-	pmz_get_port_A(uap)->flags |= PMACZILOG_FLAG_IS_IRQ_ON;
+	sprintf(uap->irq_name, PMACZILOG_NAME"%d", uap->port.line);
 	if (request_irq(uap->port.irq, pmz_interrupt, IRQF_SHARED,
-			"SCC", uap)) {
+			uap->irq_name, uap)) {
 		pmz_error("Unable to register zs interrupt handler.\n");
 		pmz_set_scc_power(uap, 0);
-		mutex_unlock(&pmz_irq_mutex);
 		return -ENXIO;
 	}
 
-	mutex_unlock(&pmz_irq_mutex);
-
 	/* Right now, we deal with delay by blocking here, I'll be
 	 * smarter later on
 	 */
@@ -1017,26 +969,19 @@ static void pmz_shutdown(struct uart_port *port)
 
 	pmz_debug("pmz: shutdown()\n");
 
-	if (uap->node == NULL)
-		return;
-
-	mutex_lock(&pmz_irq_mutex);
-
 	spin_lock_irqsave(&port->lock, flags);
 
-	if (!ZS_IS_ASLEEP(uap)) {
-		/* Disable interrupt requests for the channel */
-		pmz_interrupt_control(uap, 0);
+	/* Disable interrupt requests for the channel */
+	pmz_interrupt_control(uap, 0);
 
-		if (!ZS_IS_CONS(uap)) {
-			/* Disable receiver and transmitter */
-			uap->curregs[R3] &= ~RxENABLE;
-			uap->curregs[R5] &= ~TxENABLE;
+	if (!ZS_IS_CONS(uap)) {
+		/* Disable receiver and transmitter */
+		uap->curregs[R3] &= ~RxENABLE;
+		uap->curregs[R5] &= ~TxENABLE;
 
-			/* Disable break assertion */
-			uap->curregs[R5] &= ~SND_BRK;
-			pmz_maybe_update_regs(uap);
-		}
+		/* Disable break assertion */
+		uap->curregs[R5] &= ~SND_BRK;
+		pmz_maybe_update_regs(uap);
 	}
 
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -1048,16 +993,11 @@ static void pmz_shutdown(struct uart_port *port)
 
 	uap->flags &= ~PMACZILOG_FLAG_IS_OPEN;
 
-	if (!ZS_IS_OPEN(uap->mate))
-		pmz_get_port_A(uap)->flags &= ~PMACZILOG_FLAG_IS_IRQ_ON;
-
-	if (!ZS_IS_ASLEEP(uap) && !ZS_IS_CONS(uap))
+	if (!ZS_IS_CONS(uap))
 		pmz_set_scc_power(uap, 0);	/* Shut the chip down */
 
 	spin_unlock_irqrestore(&port->lock, flags);
 
-	mutex_unlock(&pmz_irq_mutex);
-
 	pmz_debug("pmz: shutdown() done.\n");
 }
 
@@ -1305,9 +1245,6 @@ static void __pmz_set_termios(struct uart_port *port, struct ktermios *termios,
 
 	pmz_debug("pmz: set_termios()\n");
 
-	if (ZS_IS_ASLEEP(uap))
-		return;
-
 	memcpy(&uap->termios_cache, termios, sizeof(struct ktermios));
 
 	/* XXX Check which revs of machines actually allow 1 and 4Mb speeds
@@ -1605,25 +1542,34 @@ static void pmz_dispose_port(struct uart_pmac_port *uap)
  */
 static int pmz_attach(struct macio_dev *mdev, const struct of_device_id *match)
 {
+	struct uart_pmac_port *uap;
 	int i;
 	
 	/* Iterate the pmz_ports array to find a matching entry
 	 */
 	for (i = 0; i < MAX_ZS_PORTS; i++)
-		if (pmz_ports[i].node == mdev->ofdev.dev.of_node) {
-			struct uart_pmac_port *uap = &pmz_ports[i];
-
-			uap->dev = mdev;
-			dev_set_drvdata(&mdev->ofdev.dev, uap);
-			if (macio_request_resources(uap->dev, "pmac_zilog"))
-				printk(KERN_WARNING "%s: Failed to request resource"
-				       ", port still active\n",
-				       uap->node->name);
-			else
-				uap->flags |= PMACZILOG_FLAG_RSRC_REQUESTED;				
-			return 0;
-		}
-	return -ENODEV;
+		if (pmz_ports[i].node == mdev->ofdev.dev.of_node)
+			break;
+	if (i >= MAX_ZS_PORTS)
+		return -ENODEV;
+
+
+	uap = &pmz_ports[i];
+	uap->dev = mdev;
+	uap->port.dev = &mdev->ofdev.dev;
+	dev_set_drvdata(&mdev->ofdev.dev, uap);
+
+	/* We still activate the port even when failing to request resources
+	 * to work around bugs in ancient Apple device-trees
+	 */
+	if (macio_request_resources(uap->dev, "pmac_zilog"))
+		printk(KERN_WARNING "%s: Failed to request resource"
+		       ", port still active\n",
+		       uap->node->name);
+	else
+		uap->flags |= PMACZILOG_FLAG_RSRC_REQUESTED;				
+
+	return uart_add_one_port(&pmz_uart_reg, &uap->port);
 }
 
 /*
@@ -1637,12 +1583,15 @@ static int pmz_detach(struct macio_dev *mdev)
 	if (!uap)
 		return -ENODEV;
 
+	uart_remove_one_port(&pmz_uart_reg, &uap->port);
+
 	if (uap->flags & PMACZILOG_FLAG_RSRC_REQUESTED) {
 		macio_release_resources(uap->dev);
 		uap->flags &= ~PMACZILOG_FLAG_RSRC_REQUESTED;
 	}
 	dev_set_drvdata(&mdev->ofdev.dev, NULL);
 	uap->dev = NULL;
+	uap->port.dev = NULL;
 	
 	return 0;
 }
@@ -1651,62 +1600,13 @@ static int pmz_detach(struct macio_dev *mdev)
 static int pmz_suspend(struct macio_dev *mdev, pm_message_t pm_state)
 {
 	struct uart_pmac_port *uap = dev_get_drvdata(&mdev->ofdev.dev);
-	struct uart_state *state;
-	unsigned long flags;
 
 	if (uap == NULL) {
 		printk("HRM... pmz_suspend with NULL uap\n");
 		return 0;
 	}
 
-	if (pm_state.event == mdev->ofdev.dev.power.power_state.event)
-		return 0;
-
-	pmz_debug("suspend, switching to state %d\n", pm_state.event);
-
-	state = pmz_uart_reg.state + uap->port.line;
-
-	mutex_lock(&pmz_irq_mutex);
-	mutex_lock(&state->port.mutex);
-
-	spin_lock_irqsave(&uap->port.lock, flags);
-
-	if (ZS_IS_OPEN(uap) || ZS_IS_CONS(uap)) {
-		/* Disable interrupt requests for the channel */
-		pmz_interrupt_control(uap, 0);
-
-		/* Disable receiver and transmitter */
-		uap->curregs[R3] &= ~RxENABLE;
-		uap->curregs[R5] &= ~TxENABLE;
-
-		/* Disable break assertion */
-		uap->curregs[R5] &= ~SND_BRK;
-		pmz_load_zsregs(uap, uap->curregs);
-
-		uap->flags |= PMACZILOG_FLAG_IS_ASLEEP;
-		mb();
-	}
-
-	spin_unlock_irqrestore(&uap->port.lock, flags);
-
-	if (ZS_IS_OPEN(uap) || ZS_IS_OPEN(uap->mate))
-		if (ZS_IS_ASLEEP(uap->mate) && ZS_IS_IRQ_ON(pmz_get_port_A(uap))) {
-			pmz_get_port_A(uap)->flags &= ~PMACZILOG_FLAG_IS_IRQ_ON;
-			disable_irq(uap->port.irq);
-		}
-
-	if (ZS_IS_CONS(uap))
-		uap->port.cons->flags &= ~CON_ENABLED;
-
-	/* Shut the chip down */
-	pmz_set_scc_power(uap, 0);
-
-	mutex_unlock(&state->port.mutex);
-	mutex_unlock(&pmz_irq_mutex);
-
-	pmz_debug("suspend, switching complete\n");
-
-	mdev->ofdev.dev.power.power_state = pm_state;
+	uart_suspend_port(&pmz_uart_reg, &uap->port);
 
 	return 0;
 }
@@ -1715,74 +1615,20 @@ static int pmz_suspend(struct macio_dev *mdev, pm_message_t pm_state)
 static int pmz_resume(struct macio_dev *mdev)
 {
 	struct uart_pmac_port *uap = dev_get_drvdata(&mdev->ofdev.dev);
-	struct uart_state *state;
-	unsigned long flags;
-	int pwr_delay = 0;
 
 	if (uap == NULL)
 		return 0;
 
-	if (mdev->ofdev.dev.power.power_state.event == PM_EVENT_ON)
-		return 0;
-	
-	pmz_debug("resume, switching to state 0\n");
-
-	state = pmz_uart_reg.state + uap->port.line;
-
-	mutex_lock(&pmz_irq_mutex);
-	mutex_lock(&state->port.mutex);
-
-	spin_lock_irqsave(&uap->port.lock, flags);
-	if (!ZS_IS_OPEN(uap) && !ZS_IS_CONS(uap)) {
-		spin_unlock_irqrestore(&uap->port.lock, flags);
-		goto bail;
-	}
-	pwr_delay = __pmz_startup(uap);
-
-	/* Take care of config that may have changed while asleep */
-	__pmz_set_termios(&uap->port, &uap->termios_cache, NULL);
-
-	spin_unlock_irqrestore(&uap->port.lock, flags);
-
-	if (ZS_IS_CONS(uap))
-		uap->port.cons->flags |= CON_ENABLED;
-
-	/* Re-enable IRQ on the controller */
-	if (ZS_IS_OPEN(uap) && !ZS_IS_IRQ_ON(pmz_get_port_A(uap))) {
-		pmz_get_port_A(uap)->flags |= PMACZILOG_FLAG_IS_IRQ_ON;
-		enable_irq(uap->port.irq);
-	}
-
-	if (ZS_IS_OPEN(uap)) {
-		spin_lock_irqsave(&uap->port.lock, flags);
-		pmz_interrupt_control(uap, 1);
-		spin_unlock_irqrestore(&uap->port.lock, flags);
-	}
-
- bail:
-	mutex_unlock(&state->port.mutex);
-	mutex_unlock(&pmz_irq_mutex);
-
-	/* Right now, we deal with delay by blocking here, I'll be
-	 * smarter later on
-	 */
-	if (pwr_delay != 0) {
-		pmz_debug("pmz: delaying %d ms\n", pwr_delay);
-		msleep(pwr_delay);
-	}
-
-	pmz_debug("resume, switching complete\n");
-
-	mdev->ofdev.dev.power.power_state.event = PM_EVENT_ON;
+	uart_resume_port(&pmz_uart_reg, &uap->port);
 
 	return 0;
 }
 
 /*
  * Probe all ports in the system and build the ports array, we register
- * with the serial layer at this point, the macio-type probing is only
- * used later to "attach" to the sysfs tree so we get power management
- * events
+ * with the serial layer later, so we get a proper struct device which
+ * allows the tty to attach properly. This is later than it used to be
+ * but the tty layer really wants it that way.
  */
 static int __init pmz_probe(void)
 {
@@ -1818,8 +1664,10 @@ static int __init pmz_probe(void)
 		/*
 		 * Fill basic fields in the port structures
 		 */
-		pmz_ports[count].mate		= &pmz_ports[count+1];
-		pmz_ports[count+1].mate		= &pmz_ports[count];
+		if (node_b != NULL) {
+			pmz_ports[count].mate		= &pmz_ports[count+1];
+			pmz_ports[count+1].mate		= &pmz_ports[count];
+		}
 		pmz_ports[count].flags		= PMACZILOG_FLAG_IS_CHANNEL_A;
 		pmz_ports[count].node		= node_a;
 		pmz_ports[count+1].node		= node_b;
@@ -1887,19 +1735,19 @@ static int __init pmz_probe(void)
 
 	pmz_ports_count = 0;
 
-	pmz_ports[0].mate      = &pmz_ports[1];
 	pmz_ports[0].port.line = 0;
 	pmz_ports[0].flags     = PMACZILOG_FLAG_IS_CHANNEL_A;
-	pmz_ports[0].node      = &scc_a_pdev;
+	pmz_ports[0].pdev      = &scc_a_pdev;
 	err = pmz_init_port(&pmz_ports[0]);
 	if (err)
 		return err;
 	pmz_ports_count++;
 
+	pmz_ports[0].mate      = &pmz_ports[1];
 	pmz_ports[1].mate      = &pmz_ports[0];
 	pmz_ports[1].port.line = 1;
 	pmz_ports[1].flags     = 0;
-	pmz_ports[1].node      = &scc_b_pdev;
+	pmz_ports[1].pdev      = &scc_b_pdev;
 	err = pmz_init_port(&pmz_ports[1]);
 	if (err)
 		return err;
@@ -1918,13 +1766,22 @@ static int __init pmz_attach(struct platform_device *pdev)
 	int i;
 
 	for (i = 0; i < pmz_ports_count; i++)
-		if (pmz_ports[i].node == pdev)
-			return 0;
-	return -ENODEV;
+		if (pmz_ports[i].pdev == pdev)
+			break;
+	if (i >= pmz_ports_count)
+		return -ENODEV;
+
+	uap = &pmz_ports[i];
+	uap->port.dev = &pdev->dev;
+	dev_set_drvdata(&mdev->ofdev.dev, uap);
+
+			return uart_add_one_port(&pmz_uart_reg,
+						 &pmz_ports[i]->port);
 }
 
 static int __exit pmz_detach(struct platform_device *pdev)
 {
+	uart_remove_one_port(&pmz_uart_reg, &uap->port);
 	return 0;
 }
 
@@ -1956,38 +1813,13 @@ static struct console pmz_console = {
  */
 static int __init pmz_register(void)
 {
-	int i, rc;
-	
 	pmz_uart_reg.nr = pmz_ports_count;
 	pmz_uart_reg.cons = PMACZILOG_CONSOLE;
 
 	/*
 	 * Register this driver with the serial core
 	 */
-	rc = uart_register_driver(&pmz_uart_reg);
-	if (rc)
-		return rc;
-
-	/*
-	 * Register each port with the serial core
-	 */
-	for (i = 0; i < pmz_ports_count; i++) {
-		struct uart_pmac_port *uport = &pmz_ports[i];
-		/* NULL node may happen on wallstreet */
-		if (uport->node != NULL)
-			rc = uart_add_one_port(&pmz_uart_reg, &uport->port);
-		if (rc)
-			goto err_out;
-	}
-
-	return 0;
-err_out:
-	while (i-- > 0) {
-		struct uart_pmac_port *uport = &pmz_ports[i];
-		uart_remove_one_port(&pmz_uart_reg, &uport->port);
-	}
-	uart_unregister_driver(&pmz_uart_reg);
-	return rc;
+	return uart_register_driver(&pmz_uart_reg);
 }
 
 #ifdef CONFIG_PPC_PMAC
@@ -2086,10 +1918,8 @@ static void __exit exit_pmz(void)
 
 	for (i = 0; i < pmz_ports_count; i++) {
 		struct uart_pmac_port *uport = &pmz_ports[i];
-		if (uport->node != NULL) {
-			uart_remove_one_port(&pmz_uart_reg, &uport->port);
+		if (uport->node != NULL)
 			pmz_dispose_port(uport);
-		}
 	}
 	/* Unregister UART driver */
 	uart_unregister_driver(&pmz_uart_reg);
@@ -2116,8 +1946,6 @@ static void pmz_console_write(struct console *con, const char *s, unsigned int c
 	struct uart_pmac_port *uap = &pmz_ports[con->index];
 	unsigned long flags;
 
-	if (ZS_IS_ASLEEP(uap))
-		return;
 	spin_lock_irqsave(&uap->port.lock, flags);
 
 	/* Turn of interrupts and enable the transmitter. */
@@ -2162,8 +1990,10 @@ static int __init pmz_console_setup(struct console *co, char *options)
 	if (co->index >= pmz_ports_count)
 		co->index = 0;
 	uap = &pmz_ports[co->index];
+#ifdef CONFIG_PPC_PMAC
 	if (uap->node == NULL)
 		return -ENODEV;
+#endif
 	port = &uap->port;
 
 	/*
diff --git a/drivers/tty/serial/pmac_zilog.h b/drivers/tty/serial/pmac_zilog.h
index cbc34fb..9ae4556 100644
--- a/drivers/tty/serial/pmac_zilog.h
+++ b/drivers/tty/serial/pmac_zilog.h
@@ -2,9 +2,12 @@
 #define __PMAC_ZILOG_H__
 
 #ifdef CONFIG_PPC_PMAC
-#define pmz_debug(fmt, arg...)	dev_dbg(&uap->dev->ofdev.dev, fmt, ## arg)
-#define pmz_error(fmt, arg...)	dev_err(&uap->dev->ofdev.dev, fmt, ## arg)
-#define pmz_info(fmt, arg...)	dev_info(&uap->dev->ofdev.dev, fmt, ## arg)
+/* We cannot use dev_* because this can be called early, way before
+ * we are matched with a device (when using it as a kernel console)
+ */
+#define pmz_debug(fmt, arg...)	pr_debug("ttyPZ%d: " fmt, uap->port.line, ## arg)
+#define pmz_error(fmt, arg...)	pr_err("ttyPZ%d: " fmt, uap->port.line, ## arg)
+#define pmz_info(fmt, arg...)	pr_info("ttyPZ%d: " fmt, uap->port.line, ## arg)
 #else
 #define pmz_debug(fmt, arg...)	dev_dbg(&uap->node->dev, fmt, ## arg)
 #define pmz_error(fmt, arg...)	dev_err(&uap->node->dev, fmt, ## arg)
@@ -35,7 +38,7 @@ struct uart_pmac_port {
 	 */
 	struct device_node		*node;
 #else
-	struct platform_device		*node;
+	struct platform_device		*pdev;
 #endif
 
 	/* Port type as obtained from device tree (IRDA, modem, ...) */
@@ -50,14 +53,11 @@ struct uart_pmac_port {
 #define PMACZILOG_FLAG_REGS_HELD	0x00000010
 #define PMACZILOG_FLAG_TX_STOPPED	0x00000020
 #define PMACZILOG_FLAG_TX_ACTIVE	0x00000040
-#define PMACZILOG_FLAG_ENABLED          0x00000080
 #define PMACZILOG_FLAG_IS_IRDA		0x00000100
 #define PMACZILOG_FLAG_IS_INTMODEM	0x00000200
 #define PMACZILOG_FLAG_HAS_DMA		0x00000400
 #define PMACZILOG_FLAG_RSRC_REQUESTED	0x00000800
-#define PMACZILOG_FLAG_IS_ASLEEP	0x00001000
 #define PMACZILOG_FLAG_IS_OPEN		0x00002000
-#define PMACZILOG_FLAG_IS_IRQ_ON	0x00004000
 #define PMACZILOG_FLAG_IS_EXTCLK	0x00008000
 #define PMACZILOG_FLAG_BREAK		0x00010000
 
@@ -74,6 +74,8 @@ struct uart_pmac_port {
 	volatile struct dbdma_regs	__iomem *rx_dma_regs;
 #endif
 
+	unsigned char			irq_name[8];
+
 	struct ktermios			termios_cache;
 };
 
@@ -388,9 +390,7 @@ static inline void zssync(struct uart_pmac_port *port)
 #define ZS_IS_IRDA(UP)			((UP)->flags & PMACZILOG_FLAG_IS_IRDA)
 #define ZS_IS_INTMODEM(UP)		((UP)->flags & PMACZILOG_FLAG_IS_INTMODEM)
 #define ZS_HAS_DMA(UP)			((UP)->flags & PMACZILOG_FLAG_HAS_DMA)
-#define ZS_IS_ASLEEP(UP)		((UP)->flags & PMACZILOG_FLAG_IS_ASLEEP)
 #define ZS_IS_OPEN(UP)			((UP)->flags & PMACZILOG_FLAG_IS_OPEN)
-#define ZS_IS_IRQ_ON(UP)		((UP)->flags & PMACZILOG_FLAG_IS_IRQ_ON)
 #define ZS_IS_EXTCLK(UP)		((UP)->flags & PMACZILOG_FLAG_IS_EXTCLK)
 
 #endif /* __PMAC_ZILOG_H__ */
-- 
1.7.7.3

^ permalink raw reply related

* Re: linux-next bad Kconfig for drivers/hid
From: Jiri Kosina @ 2011-12-11 23:21 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: LinuxPPC-dev, Linux Kernel ML
In-Reply-To: <4EE19DA0.5010705@goop.org>

On Thu, 8 Dec 2011, Jeremy Fitzhardinge wrote:

> > Commit 4f5ca836bef3 (HID: hid-input: add support for HID devices
> > reporting Battery Strength) went into linux-next on Dec 1st since then a
> > ppc6xx_defconfig has been failing with:
> >
> > ---
> > drivers/built-in.o: In function `hidinput_cleanup_battery':
> > /scratch/tony/working/drivers/hid/hid-input.c:351: undefined reference to `power_supply_unregister'
> > drivers/built-in.o: In function `hidinput_setup_battery':
> > /scratch/tony/working/drivers/hid/hid-input.c:338: undefined reference to `power_supply_register'
> > make[1]: *** [.tmp_vmlinux1] Error 1
> > ---
> >
> > http://kisskb.ellerman.id.au/kisskb/buildresult/5012563/
> > vs
> > http://kisskb.ellerman.id.au/kisskb/buildresult/5017366/
> >
> > The defconfig in question doens't mention either option
> > (CONFIG_POWER_SUPPLY or CONFIG_HID_BATTERY_STRENGTH) and kbuild is
> > genertaing
> > CONFIG_HID_BATTERY_STRENGTH=y
> > CONFIG_POWER_SUPPLY=m
> > which clearly isn't going to work.
> >
> > The following change to HID_BATTERY_STRENGTH Kconfig "works" but seems a
> > little gross.
> >
> > diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig
> > index 5ed64f6..d2a94e6 100644
> > --- a/drivers/hid/Kconfig
> > +++ b/drivers/hid/Kconfig
> > @@ -33,7 +33,7 @@ config HID
> >  
> >  config HID_BATTERY_STRENGTH
> >         bool
> > -       depends on POWER_SUPPLY
> > +       depends on POWER_SUPPLY=y
> >         default y
> >  
> >  config HIDRAW
> >
> > Any chance we can get a fix into linux-next?
> 
> Hm.  How about making it "depends on HID && POWER_SUPPLY"?  I think that
> would needlessly disable it if HID is also modular, but I'm not sure how
> to fix that.  "depends on HID && POWER_SUPPLY && HID == POWER_SUPPLY"?

How about making it 'default POWER_SUPPLY' instead?

I somehow feel that we'd rather avoid too much 'default y's.

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply

* [PATCH] powerpc: fix compile error with 85xx/p1022_ds.c
From: Michael Neuling @ 2011-12-11 22:49 UTC (permalink / raw)
  To: Kyle Moffett, Benjamin Herrenschmidt; +Cc: sfr, linux-next, linuxppc-dev

Current linux-next compiled with mpc85xx_defconfig causes this:
 arch/powerpc/platforms/85xx/p1022_ds.c:341:14: error: 'udbg_progress' undeclared here (not in a function)

Add include to fix this.

Signed-off-by: Michael Neuling <mikey@neuling.org>

diff --git a/arch/powerpc/platforms/85xx/p1022_ds.c b/arch/powerpc/platforms/85xx/p1022_ds.c
index bb3d84f..b0984ad 100644
--- a/arch/powerpc/platforms/85xx/p1022_ds.c
+++ b/arch/powerpc/platforms/85xx/p1022_ds.c
@@ -25,6 +25,7 @@
 
 #include <sysdev/fsl_soc.h>
 #include <sysdev/fsl_pci.h>
+#include <asm/udbg.h>
 #include <asm/fsl_guts.h>
 #include "smp.h"
 

^ permalink raw reply related

* Re: [PATCH 00/14] Backport 8xx TLB to 2.4
From: Willy Tarreau @ 2011-12-11 17:33 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: Scott Wood, linuxppc-dev, Dan Malek
In-Reply-To: <OF6B6562C0.926C4B3C-ONC1257963.005F34B1-C1257963.005F34B2@transmode.se>

Hi Joakim,

On Sun, Dec 11, 2011 at 06:19:54PM +0100, Joakim Tjernlund wrote:
> > To: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
> > From: Willy Tarreau <w@1wt.eu>
> > 
> > Hi Joakim,  On Mon, Oct 10, 2011 at 01:30:06PM +0200, Joakim Tjernlund wrote: > This is a
> > backport from 2.6 which I did to overcome 8xx CPU > bugs. 8xx does not update the DAR register
> > when taking a TLB > error caused by dcbX and icbi insns which makes it very > tricky to use these
> > insns. Also the dcbst wrongly sets the > the store bit when faulting into DTLB error. > A few
> > more bugs very found during development. >  > I know 2.4 is in strict maintenance mode and 8xx is
> > obsolete > but as it is still in use I wanted 8xx to age with grace.  Thank you. I must admit I
> > was hoping those patches would come in for a last release before the end of the year :-)  Unless
> > there is any objection from anyone, I'll merge them when kernel.org is back online.  Cheers,
> > Willy
> 
> Did this go anywhere?

Not yet, I just need to find some time to release another 2.4 with these
patches.

Best regards,
Willy

^ permalink raw reply

* Re: [PATCH 00/14] Backport 8xx TLB to 2.4
From: Joakim Tjernlund @ 2011-12-11 17:19 UTC (permalink / raw)
  To: Willy Tarreau; +Cc: Scott Wood, linuxppc-dev, Dan Malek
In-Reply-To: <20111010123020.GA21699@1wt.eu>

> To: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
> From: Willy Tarreau <w@1wt.eu>
>=20
> Hi Joakim,  On Mon, Oct 10, 2011 at 01:30:06PM +0200, Joakim Tjernlund wr=
ote: > This is a
> backport from 2.6 which I did to overcome 8xx CPU > bugs. 8xx does not up=
date the DAR register
> when taking a TLB > error caused by dcbX and icbi insns which makes it ve=
ry > tricky to use these
> insns. Also the dcbst wrongly sets the > the store bit when faulting into=
 DTLB error. > A few
> more bugs very found during development. >  > I know 2.4 is in strict mai=
ntenance mode and 8xx is
> obsolete > but as it is still in use I wanted 8xx to age with grace.  Tha=
nk you. I must admit I
> was hoping those patches would come in for a last release before the end =
of the year :-)  Unless
> there is any objection from anyone, I'll merge them when kernel.org is ba=
ck online.  Cheers,
> Willy

Did this go anywhere?

 Jocke  =
=

^ permalink raw reply

* Re: [UPDATED] [PATCH v4 3/7] [ppc] Process dynamic relocations for kernel
From: Segher Boessenkool @ 2011-12-10 20:02 UTC (permalink / raw)
  To: Suzuki K. Poulose; +Cc: Scott Wood, linux ppc dev, Josh Poimboeuf
In-Reply-To: <20111210023646.19516.1453.stgit@suzukikp.in.ibm.com>

Hi Suzuki,

Looks quite good, a few comments...

> +get_type:
> +	/* r4 holds the relocation type */
> +	extrwi	r4, r4, 8, 24	/* r4 = ((char*)r4)[3] */

This comment is confusing (only makes sense together with the
lwz a long way up).

> +nxtrela:
> +	/*
> +	 * We have to flush the modified instructions to the
> +	 * main storage from the d-cache. And also, invalidate the
> +	 * cached instructions in i-cache which has been modified.
> +	 *
> +	 * We delay the msync / isync operation till the end, since
> +	 * we won't be executing the modified instructions until
> +	 * we return from here.
> +	 */
> +	dcbst	r4,r7
> +	icbi	r4,r7

You still need a sync between these two.  Without it, the icbi can
complete before the dcbst for the same address does, which leaves
room for an instruction fetch from that address to get old data.

> +	cmpwi	r8, 0		/* relasz = 0 ? */
> +	ble	done
> +	add	r9, r9, r6	/* move to next entry in the .rela table */
> +	subf	r8, r6, r8	/* relasz -= relaent */
> +	b	applyrela
> +
> +done:
> +	msync			/* Wait for the flush to finish */

The instruction is called "sync".  msync is a BookE thing.

>  	next if (/R_PPC64_RELATIVE/ or /R_PPC64_NONE/ or
>  	         /R_PPC64_ADDR64\s+mach_/);
> +	next if (/R_PPC_ADDR16_LO/ or /R_PPC_ADDR16_HI/ or
> +		 /R_PPC_ADDR16_HA/ or /R_PPC_RELATIVE/);

Nothing new, but these should probably have \b or \s or just
a space on each side.


Segher

^ permalink raw reply

* [RFC PATCH v6 08/10] fadump: Invalidate registration and release reserved memory for general use.
From: Mahesh J Salgaonkar @ 2011-12-10  6:51 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

This patch introduces an sysfs interface '/sys/kernel/fadump_release_mem' to
invalidate the last fadump registration, invalidate '/proc/vmcore', release
the reserved memory for general use and re-register for future kernel dump.
Once the dump is copied to the disk, the userspace tool will echo 1 to
'/sys/kernel/fadump_release_mem'.

Release the reserved memory region excluding the size of the memory required
for future kernel dump registration.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/fadump.h |    3 +
 arch/powerpc/kernel/fadump.c      |  158 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 157 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 6768195..88dbf96 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -208,6 +208,9 @@ extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
 extern void crash_fadump(struct pt_regs *, const char *);
+extern void fadump_cleanup(void);
+
+extern void vmcore_cleanup(void);
 #else	/* CONFIG_FA_DUMP */
 static inline int is_fadump_active(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 5689954..8874d6b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -33,6 +33,8 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/crash_dump.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
 
 #include <asm/page.h>
 #include <asm/prom.h>
@@ -984,6 +986,132 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
 	return 0;
 }
 
+static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
+{
+	int rc = 0;
+	unsigned int wait_time;
+
+	pr_debug("Invalidating firmware-assisted dump registration\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_INVALIDATE, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		printk(KERN_ERR "Failed to invalidate firmware-assisted dump "
+			"rgistration. unexpected error(%d).\n", rc);
+		return rc;
+	}
+	fw_dump.dump_active = 0;
+	fdm_active = NULL;
+	return 0;
+}
+
+void fadump_cleanup(void)
+{
+	/* Invalidate the registration only if dump is active. */
+	if (fw_dump.dump_active) {
+		init_fadump_mem_struct(&fdm,
+			fdm_active->cpu_state_data.destination_address);
+		fadump_invalidate_dump(&fdm);
+	}
+}
+
+/*
+ * Release the memory that was reserved in early boot to preserve the memory
+ * contents. The released memory will be available for general use.
+ */
+static void fadump_release_memory(unsigned long begin, unsigned long end)
+{
+	unsigned long addr;
+	unsigned long ra_start, ra_end;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	for (addr = begin; addr < end; addr += PAGE_SIZE) {
+		/*
+		 * exclude the dump reserve area. Will reuse it for next
+		 * fadump registration.
+		 */
+		if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start))
+			continue;
+
+		ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
+		init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
+		free_page((unsigned long)__va(addr));
+		totalram_pages++;
+	}
+}
+
+static void fadump_invalidate_release_mem(void)
+{
+	unsigned long reserved_area_start, reserved_area_end;
+	unsigned long destination_address;
+
+	mutex_lock(&fadump_mutex);
+	if (!fw_dump.dump_active) {
+		mutex_unlock(&fadump_mutex);
+		return;
+	}
+
+	destination_address = fdm_active->cpu_state_data.destination_address;
+	fadump_cleanup();
+	mutex_unlock(&fadump_mutex);
+
+	/*
+	 * Save the current reserved memory bounds we will require them
+	 * later for releasing the memory for general use.
+	 */
+	reserved_area_start = fw_dump.reserve_dump_area_start;
+	reserved_area_end = reserved_area_start +
+			fw_dump.reserve_dump_area_size;
+	/*
+	 * Setup reserve_dump_area_start and its size so that we can
+	 * reuse this reserved memory for Re-registration.
+	 */
+	fw_dump.reserve_dump_area_start = destination_address;
+	fw_dump.reserve_dump_area_size = get_fadump_area_size();
+
+	fadump_release_memory(reserved_area_start, reserved_area_end);
+	if (fw_dump.cpu_notes_buf) {
+		fadump_cpu_notes_buf_free(
+				(unsigned long)__va(fw_dump.cpu_notes_buf),
+				fw_dump.cpu_notes_buf_size);
+		fw_dump.cpu_notes_buf = 0;
+		fw_dump.cpu_notes_buf_size = 0;
+	}
+	/* Initialize the kernel dump memory structure for FAD registration. */
+	init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+}
+
+static ssize_t fadump_release_memory_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t count)
+{
+	if (!fw_dump.dump_active)
+		return -EPERM;
+
+	if (buf[0] == '1') {
+		/*
+		 * Take away the '/proc/vmcore'. We are releasing the dump
+		 * memory, hence it will not be valid anymore.
+		 */
+		vmcore_cleanup();
+		fadump_invalidate_release_mem();
+
+	} else
+		return -EINVAL;
+	return count;
+}
+
 static ssize_t fadump_enabled_show(struct kobject *kobj,
 					struct kobj_attribute *attr,
 					char *buf)
@@ -1043,10 +1171,13 @@ static int fadump_region_show(struct seq_file *m, void *private)
 	if (!fw_dump.fadump_enabled)
 		return 0;
 
+	mutex_lock(&fadump_mutex);
 	if (fdm_active)
 		fdm_ptr = fdm_active;
-	else
+	else {
+		mutex_unlock(&fadump_mutex);
 		fdm_ptr = &fdm;
+	}
 
 	seq_printf(m,
 			"CPU : [%#016llx-%#016llx] %#llx bytes, "
@@ -1076,7 +1207,7 @@ static int fadump_region_show(struct seq_file *m, void *private)
 	if (!fdm_active ||
 		(fw_dump.reserve_dump_area_start ==
 		fdm_ptr->cpu_state_data.destination_address))
-		return 0;
+		goto out;
 
 	/* Dump is active. Show reserved memory region. */
 	seq_printf(m,
@@ -1088,9 +1219,15 @@ static int fadump_region_show(struct seq_file *m, void *private)
 			fw_dump.reserve_dump_area_start,
 			fdm_ptr->cpu_state_data.destination_address -
 			fw_dump.reserve_dump_area_start);
+out:
+	if (fdm_active)
+		mutex_unlock(&fadump_mutex);
 	return 0;
 }
 
+static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem,
+						0200, NULL,
+						fadump_release_memory_store);
 static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled,
 						0444, fadump_enabled_show,
 						NULL);
@@ -1131,6 +1268,13 @@ static void fadump_init_files(void)
 	if (!debugfs_file)
 		printk(KERN_ERR "fadump: unable to create debugfs file"
 				" fadump_region\n");
+
+	if (fw_dump.dump_active) {
+		rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr);
+		if (rc)
+			printk(KERN_ERR "fadump: unable to create sysfs file"
+				" fadump_release_mem (%d)\n", rc);
+	}
 	return;
 }
 
@@ -1150,8 +1294,14 @@ int __init setup_fadump(void)
 	 * If dump data is available then see if it is valid and prepare for
 	 * saving it to the disk.
 	 */
-	if (fw_dump.dump_active)
-		process_fadump(fdm_active);
+	if (fw_dump.dump_active) {
+		/*
+		 * if dump process fails then invalidate the registration
+		 * and release memory before proceeding for re-registration.
+		 */
+		if (process_fadump(fdm_active) < 0)
+			fadump_invalidate_release_mem();
+	}
 	/* Initialize the kernel dump memory structure for FAD registration. */
 	else if (fw_dump.reserve_dump_area_size)
 		init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);

^ permalink raw reply related

* [RFC PATCH v6 10/10] fadump: Remove the phyp assisted dump code.
From: Mahesh J Salgaonkar @ 2011-12-10  6:51 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Remove the phyp assisted dump implementation which is not is use.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 Documentation/powerpc/phyp-assisted-dump.txt |  127 ------
 arch/powerpc/Kconfig                         |   10 -
 arch/powerpc/include/asm/phyp_dump.h         |   47 --
 arch/powerpc/kernel/prom.c                   |   87 ----
 arch/powerpc/platforms/pseries/Makefile      |    1 
 arch/powerpc/platforms/pseries/phyp_dump.c   |  513 --------------------------
 6 files changed, 0 insertions(+), 785 deletions(-)
 delete mode 100644 Documentation/powerpc/phyp-assisted-dump.txt
 delete mode 100644 arch/powerpc/include/asm/phyp_dump.h
 delete mode 100644 arch/powerpc/platforms/pseries/phyp_dump.c

diff --git a/Documentation/powerpc/phyp-assisted-dump.txt b/Documentation/powerpc/phyp-assisted-dump.txt
deleted file mode 100644
index ad34020..0000000
--- a/Documentation/powerpc/phyp-assisted-dump.txt
+++ /dev/null
@@ -1,127 +0,0 @@
-
-                   Hypervisor-Assisted Dump
-                   ------------------------
-                       November 2007
-
-The goal of hypervisor-assisted dump is to enable the dump of
-a crashed system, and to do so from a fully-reset system, and
-to minimize the total elapsed time until the system is back
-in production use.
-
-As compared to kdump or other strategies, hypervisor-assisted
-dump offers several strong, practical advantages:
-
--- Unlike kdump, the system has been reset, and loaded
-   with a fresh copy of the kernel.  In particular,
-   PCI and I/O devices have been reinitialized and are
-   in a clean, consistent state.
--- As the dump is performed, the dumped memory becomes
-   immediately available to the system for normal use.
--- After the dump is completed, no further reboots are
-   required; the system will be fully usable, and running
-   in its normal, production mode on its normal kernel.
-
-The above can only be accomplished by coordination with,
-and assistance from the hypervisor. The procedure is
-as follows:
-
--- When a system crashes, the hypervisor will save
-   the low 256MB of RAM to a previously registered
-   save region. It will also save system state, system
-   registers, and hardware PTE's.
-
--- After the low 256MB area has been saved, the
-   hypervisor will reset PCI and other hardware state.
-   It will *not* clear RAM. It will then launch the
-   bootloader, as normal.
-
--- The freshly booted kernel will notice that there
-   is a new node (ibm,dump-kernel) in the device tree,
-   indicating that there is crash data available from
-   a previous boot. It will boot into only 256MB of RAM,
-   reserving the rest of system memory.
-
--- Userspace tools will parse /sys/kernel/release_region
-   and read /proc/vmcore to obtain the contents of memory,
-   which holds the previous crashed kernel. The userspace
-   tools may copy this info to disk, or network, nas, san,
-   iscsi, etc. as desired.
-
-   For Example: the values in /sys/kernel/release-region
-   would look something like this (address-range pairs).
-   CPU:0x177fee000-0x10000: HPTE:0x177ffe020-0x1000: /
-   DUMP:0x177fff020-0x10000000, 0x10000000-0x16F1D370A
-
--- As the userspace tools complete saving a portion of
-   dump, they echo an offset and size to
-   /sys/kernel/release_region to release the reserved
-   memory back to general use.
-
-   An example of this is:
-     "echo 0x40000000 0x10000000 > /sys/kernel/release_region"
-   which will release 256MB at the 1GB boundary.
-
-Please note that the hypervisor-assisted dump feature
-is only available on Power6-based systems with recent
-firmware versions.
-
-Implementation details:
-----------------------
-
-During boot, a check is made to see if firmware supports
-this feature on this particular machine. If it does, then
-we check to see if a active dump is waiting for us. If yes
-then everything but 256 MB of RAM is reserved during early
-boot. This area is released once we collect a dump from user
-land scripts that are run. If there is dump data, then
-the /sys/kernel/release_region file is created, and
-the reserved memory is held.
-
-If there is no waiting dump data, then only the highest
-256MB of the ram is reserved as a scratch area. This area
-is *not* released: this region will be kept permanently
-reserved, so that it can act as a receptacle for a copy
-of the low 256MB in the case a crash does occur. See,
-however, "open issues" below, as to whether
-such a reserved region is really needed.
-
-Currently the dump will be copied from /proc/vmcore to a
-a new file upon user intervention. The starting address
-to be read and the range for each data point in provided
-in /sys/kernel/release_region.
-
-The tools to examine the dump will be same as the ones
-used for kdump.
-
-General notes:
---------------
-Security: please note that there are potential security issues
-with any sort of dump mechanism. In particular, plaintext
-(unencrypted) data, and possibly passwords, may be present in
-the dump data. Userspace tools must take adequate precautions to
-preserve security.
-
-Open issues/ToDo:
-------------
- o The various code paths that tell the hypervisor that a crash
-   occurred, vs. it simply being a normal reboot, should be
-   reviewed, and possibly clarified/fixed.
-
- o Instead of using /sys/kernel, should there be a /sys/dump
-   instead? There is a dump_subsys being created by the s390 code,
-   perhaps the pseries code should use a similar layout as well.
-
- o Is reserving a 256MB region really required? The goal of
-   reserving a 256MB scratch area is to make sure that no
-   important crash data is clobbered when the hypervisor
-   save low mem to the scratch area. But, if one could assure
-   that nothing important is located in some 256MB area, then
-   it would not need to be reserved. Something that can be
-   improved in subsequent versions.
-
- o Still working the kdump team to integrate this with kdump,
-   some work remains but this would not affect the current
-   patches.
-
- o Still need to write a shell script, to copy the dump away.
-   Currently I am parsing it manually.
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5a61aaa..e298cf0 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -369,16 +369,6 @@ config CRASH_DUMP
 	  The same kernel binary can be used as production kernel and dump
 	  capture kernel.
 
-config PHYP_DUMP
-	bool "Hypervisor-assisted dump (EXPERIMENTAL)"
-	depends on PPC_PSERIES && EXPERIMENTAL
-	help
-	  Hypervisor-assisted dump is meant to be a kdump replacement
-	  offering robustness and speed not possible without system
-	  hypervisor assistance.
-
-	  If unsure, say "N"
-
 config FA_DUMP
 	bool "Firmware-assisted dump"
 	depends on PPC64 && PPC_RTAS && CRASH_DUMP
diff --git a/arch/powerpc/include/asm/phyp_dump.h b/arch/powerpc/include/asm/phyp_dump.h
deleted file mode 100644
index fa74c6c..0000000
--- a/arch/powerpc/include/asm/phyp_dump.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Hypervisor-assisted dump
- *
- * Linas Vepstas, Manish Ahuja 2008
- * Copyright 2008 IBM Corp.
- *
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- */
-
-#ifndef _PPC64_PHYP_DUMP_H
-#define _PPC64_PHYP_DUMP_H
-
-#ifdef CONFIG_PHYP_DUMP
-
-/* The RMR region will be saved for later dumping
- * whenever the kernel crashes. Set this to 256MB. */
-#define PHYP_DUMP_RMR_START 0x0
-#define PHYP_DUMP_RMR_END   (1UL<<28)
-
-struct phyp_dump {
-	/* Memory that is reserved during very early boot. */
-	unsigned long init_reserve_start;
-	unsigned long init_reserve_size;
-	/* cmd line options during boot */
-	unsigned long reserve_bootvar;
-	unsigned long phyp_dump_at_boot;
-	/* Check status during boot if dump supported, active & present*/
-	unsigned long phyp_dump_configured;
-	unsigned long phyp_dump_is_active;
-	/* store cpu & hpte size */
-	unsigned long cpu_state_size;
-	unsigned long hpte_region_size;
-	/* previous scratch area values */
-	unsigned long reserved_scratch_addr;
-	unsigned long reserved_scratch_size;
-};
-
-extern struct phyp_dump *phyp_dump_info;
-
-int early_init_dt_scan_phyp_dump(unsigned long node,
-		const char *uname, int depth, void *data);
-
-#endif /* CONFIG_PHYP_DUMP */
-#endif /* _PPC64_PHYP_DUMP_H */
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 42ddf4b..b931558 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -52,7 +52,6 @@
 #include <asm/machdep.h>
 #include <asm/pSeries_reconfig.h>
 #include <asm/pci-bridge.h>
-#include <asm/phyp_dump.h>
 #include <asm/kexec.h>
 #include <asm/opal.h>
 #include <asm/fadump.h>
@@ -616,86 +615,6 @@ static void __init early_reserve_mem(void)
 	}
 }
 
-#ifdef CONFIG_PHYP_DUMP
-/**
- * phyp_dump_calculate_reserve_size() - reserve variable boot area 5% or arg
- *
- * Function to find the largest size we need to reserve
- * during early boot process.
- *
- * It either looks for boot param and returns that OR
- * returns larger of 256 or 5% rounded down to multiples of 256MB.
- *
- */
-static inline unsigned long phyp_dump_calculate_reserve_size(void)
-{
-	unsigned long tmp;
-
-	if (phyp_dump_info->reserve_bootvar)
-		return phyp_dump_info->reserve_bootvar;
-
-	/* divide by 20 to get 5% of value */
-	tmp = memblock_end_of_DRAM();
-	do_div(tmp, 20);
-
-	/* round it down in multiples of 256 */
-	tmp = tmp & ~0x0FFFFFFFUL;
-
-	return (tmp > PHYP_DUMP_RMR_END ? tmp : PHYP_DUMP_RMR_END);
-}
-
-/**
- * phyp_dump_reserve_mem() - reserve all not-yet-dumped mmemory
- *
- * This routine may reserve memory regions in the kernel only
- * if the system is supported and a dump was taken in last
- * boot instance or if the hardware is supported and the
- * scratch area needs to be setup. In other instances it returns
- * without reserving anything. The memory in case of dump being
- * active is freed when the dump is collected (by userland tools).
- */
-static void __init phyp_dump_reserve_mem(void)
-{
-	unsigned long base, size;
-	unsigned long variable_reserve_size;
-
-	if (!phyp_dump_info->phyp_dump_configured) {
-		printk(KERN_ERR "Phyp-dump not supported on this hardware\n");
-		return;
-	}
-
-	if (!phyp_dump_info->phyp_dump_at_boot) {
-		printk(KERN_INFO "Phyp-dump disabled at boot time\n");
-		return;
-	}
-
-	variable_reserve_size = phyp_dump_calculate_reserve_size();
-
-	if (phyp_dump_info->phyp_dump_is_active) {
-		/* Reserve *everything* above RMR.Area freed by userland tools*/
-		base = variable_reserve_size;
-		size = memblock_end_of_DRAM() - base;
-
-		/* XXX crashed_ram_end is wrong, since it may be beyond
-		 * the memory_limit, it will need to be adjusted. */
-		memblock_reserve(base, size);
-
-		phyp_dump_info->init_reserve_start = base;
-		phyp_dump_info->init_reserve_size = size;
-	} else {
-		size = phyp_dump_info->cpu_state_size +
-			phyp_dump_info->hpte_region_size +
-			variable_reserve_size;
-		base = memblock_end_of_DRAM() - size;
-		memblock_reserve(base, size);
-		phyp_dump_info->init_reserve_start = base;
-		phyp_dump_info->init_reserve_size = size;
-	}
-}
-#else
-static inline void __init phyp_dump_reserve_mem(void) {}
-#endif /* CONFIG_PHYP_DUMP  && CONFIG_PPC_RTAS */
-
 void __init early_init_devtree(void *params)
 {
 	phys_addr_t limit;
@@ -715,11 +634,6 @@ void __init early_init_devtree(void *params)
 	of_scan_flat_dt(early_init_dt_scan_opal, NULL);
 #endif
 
-#ifdef CONFIG_PHYP_DUMP
-	/* scan tree to see if dump occurred during last boot */
-	of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL);
-#endif
-
 #ifdef CONFIG_FA_DUMP
 	/* scan tree to see if dump is active during last boot */
 	of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL);
@@ -767,7 +681,6 @@ void __init early_init_devtree(void *params)
 #endif
 		reserve_crashkernel();
 	early_reserve_mem();
-	phyp_dump_reserve_mem();
 
 	limit = memory_limit;
 	if (! limit) {
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 3556e40..b00620e 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_MEMORY_HOTPLUG)	+= hotplug-memory.o
 obj-$(CONFIG_HVC_CONSOLE)	+= hvconsole.o
 obj-$(CONFIG_HVCS)		+= hvcserver.o
 obj-$(CONFIG_HCALL_STATS)	+= hvCall_inst.o
-obj-$(CONFIG_PHYP_DUMP)		+= phyp_dump.o
 obj-$(CONFIG_CMM)		+= cmm.o
 obj-$(CONFIG_DTL)		+= dtl.o
 obj-$(CONFIG_IO_EVENT_IRQ)	+= io_event_irq.o
diff --git a/arch/powerpc/platforms/pseries/phyp_dump.c b/arch/powerpc/platforms/pseries/phyp_dump.c
deleted file mode 100644
index 6e7742d..0000000
--- a/arch/powerpc/platforms/pseries/phyp_dump.c
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
- * Hypervisor-assisted dump
- *
- * Linas Vepstas, Manish Ahuja 2008
- * Copyright 2008 IBM Corp.
- *
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/kobject.h>
-#include <linux/mm.h>
-#include <linux/of.h>
-#include <linux/pfn.h>
-#include <linux/swap.h>
-#include <linux/sysfs.h>
-
-#include <asm/page.h>
-#include <asm/phyp_dump.h>
-#include <asm/machdep.h>
-#include <asm/prom.h>
-#include <asm/rtas.h>
-
-/* Variables, used to communicate data between early boot and late boot */
-static struct phyp_dump phyp_dump_vars;
-struct phyp_dump *phyp_dump_info = &phyp_dump_vars;
-
-static int ibm_configure_kernel_dump;
-/* ------------------------------------------------- */
-/* RTAS interfaces to declare the dump regions */
-
-struct dump_section {
-	u32 dump_flags;
-	u16 source_type;
-	u16 error_flags;
-	u64 source_address;
-	u64 source_length;
-	u64 length_copied;
-	u64 destination_address;
-};
-
-struct phyp_dump_header {
-	u32 version;
-	u16 num_of_sections;
-	u16 status;
-
-	u32 first_offset_section;
-	u32 dump_disk_section;
-	u64 block_num_dd;
-	u64 num_of_blocks_dd;
-	u32 offset_dd;
-	u32 maxtime_to_auto;
-	/* No dump disk path string used */
-
-	struct dump_section cpu_data;
-	struct dump_section hpte_data;
-	struct dump_section kernel_data;
-};
-
-/* The dump header *must be* in low memory, so .bss it */
-static struct phyp_dump_header phdr;
-
-#define NUM_DUMP_SECTIONS	3
-#define DUMP_HEADER_VERSION	0x1
-#define DUMP_REQUEST_FLAG	0x1
-#define DUMP_SOURCE_CPU		0x0001
-#define DUMP_SOURCE_HPTE	0x0002
-#define DUMP_SOURCE_RMO		0x0011
-#define DUMP_ERROR_FLAG		0x2000
-#define DUMP_TRIGGERED		0x4000
-#define DUMP_PERFORMED		0x8000
-
-
-/**
- * init_dump_header() - initialize the header declaring a dump
- * Returns: length of dump save area.
- *
- * When the hypervisor saves crashed state, it needs to put
- * it somewhere. The dump header tells the hypervisor where
- * the data can be saved.
- */
-static unsigned long init_dump_header(struct phyp_dump_header *ph)
-{
-	unsigned long addr_offset = 0;
-
-	/* Set up the dump header */
-	ph->version = DUMP_HEADER_VERSION;
-	ph->num_of_sections = NUM_DUMP_SECTIONS;
-	ph->status = 0;
-
-	ph->first_offset_section =
-		(u32)offsetof(struct phyp_dump_header, cpu_data);
-	ph->dump_disk_section = 0;
-	ph->block_num_dd = 0;
-	ph->num_of_blocks_dd = 0;
-	ph->offset_dd = 0;
-
-	ph->maxtime_to_auto = 0; /* disabled */
-
-	/* The first two sections are mandatory */
-	ph->cpu_data.dump_flags = DUMP_REQUEST_FLAG;
-	ph->cpu_data.source_type = DUMP_SOURCE_CPU;
-	ph->cpu_data.source_address = 0;
-	ph->cpu_data.source_length = phyp_dump_info->cpu_state_size;
-	ph->cpu_data.destination_address = addr_offset;
-	addr_offset += phyp_dump_info->cpu_state_size;
-
-	ph->hpte_data.dump_flags = DUMP_REQUEST_FLAG;
-	ph->hpte_data.source_type = DUMP_SOURCE_HPTE;
-	ph->hpte_data.source_address = 0;
-	ph->hpte_data.source_length = phyp_dump_info->hpte_region_size;
-	ph->hpte_data.destination_address = addr_offset;
-	addr_offset += phyp_dump_info->hpte_region_size;
-
-	/* This section describes the low kernel region */
-	ph->kernel_data.dump_flags = DUMP_REQUEST_FLAG;
-	ph->kernel_data.source_type = DUMP_SOURCE_RMO;
-	ph->kernel_data.source_address = PHYP_DUMP_RMR_START;
-	ph->kernel_data.source_length = PHYP_DUMP_RMR_END;
-	ph->kernel_data.destination_address = addr_offset;
-	addr_offset += ph->kernel_data.source_length;
-
-	return addr_offset;
-}
-
-static void print_dump_header(const struct phyp_dump_header *ph)
-{
-#ifdef DEBUG
-	if (ph == NULL)
-		return;
-
-	printk(KERN_INFO "dump header:\n");
-	/* setup some ph->sections required */
-	printk(KERN_INFO "version = %d\n", ph->version);
-	printk(KERN_INFO "Sections = %d\n", ph->num_of_sections);
-	printk(KERN_INFO "Status = 0x%x\n", ph->status);
-
-	/* No ph->disk, so all should be set to 0 */
-	printk(KERN_INFO "Offset to first section 0x%x\n",
-		ph->first_offset_section);
-	printk(KERN_INFO "dump disk sections should be zero\n");
-	printk(KERN_INFO "dump disk section = %d\n", ph->dump_disk_section);
-	printk(KERN_INFO "block num = %lld\n", ph->block_num_dd);
-	printk(KERN_INFO "number of blocks = %lld\n", ph->num_of_blocks_dd);
-	printk(KERN_INFO "dump disk offset = %d\n", ph->offset_dd);
-	printk(KERN_INFO "Max auto time= %d\n", ph->maxtime_to_auto);
-
-	/*set cpu state and hpte states as well scratch pad area */
-	printk(KERN_INFO " CPU AREA\n");
-	printk(KERN_INFO "cpu dump_flags =%d\n", ph->cpu_data.dump_flags);
-	printk(KERN_INFO "cpu source_type =%d\n", ph->cpu_data.source_type);
-	printk(KERN_INFO "cpu error_flags =%d\n", ph->cpu_data.error_flags);
-	printk(KERN_INFO "cpu source_address =%llx\n",
-		ph->cpu_data.source_address);
-	printk(KERN_INFO "cpu source_length =%llx\n",
-		ph->cpu_data.source_length);
-	printk(KERN_INFO "cpu length_copied =%llx\n",
-		ph->cpu_data.length_copied);
-
-	printk(KERN_INFO " HPTE AREA\n");
-	printk(KERN_INFO "HPTE dump_flags =%d\n", ph->hpte_data.dump_flags);
-	printk(KERN_INFO "HPTE source_type =%d\n", ph->hpte_data.source_type);
-	printk(KERN_INFO "HPTE error_flags =%d\n", ph->hpte_data.error_flags);
-	printk(KERN_INFO "HPTE source_address =%llx\n",
-		ph->hpte_data.source_address);
-	printk(KERN_INFO "HPTE source_length =%llx\n",
-		ph->hpte_data.source_length);
-	printk(KERN_INFO "HPTE length_copied =%llx\n",
-		ph->hpte_data.length_copied);
-
-	printk(KERN_INFO " SRSD AREA\n");
-	printk(KERN_INFO "SRSD dump_flags =%d\n", ph->kernel_data.dump_flags);
-	printk(KERN_INFO "SRSD source_type =%d\n", ph->kernel_data.source_type);
-	printk(KERN_INFO "SRSD error_flags =%d\n", ph->kernel_data.error_flags);
-	printk(KERN_INFO "SRSD source_address =%llx\n",
-		ph->kernel_data.source_address);
-	printk(KERN_INFO "SRSD source_length =%llx\n",
-		ph->kernel_data.source_length);
-	printk(KERN_INFO "SRSD length_copied =%llx\n",
-		ph->kernel_data.length_copied);
-#endif
-}
-
-static ssize_t show_phyp_dump_active(struct kobject *kobj,
-			struct kobj_attribute *attr, char *buf)
-{
-
-	/* create filesystem entry so kdump is phyp-dump aware */
-	return sprintf(buf, "%lx\n", phyp_dump_info->phyp_dump_at_boot);
-}
-
-static struct kobj_attribute pdl = __ATTR(phyp_dump_active, 0600,
-					show_phyp_dump_active,
-					NULL);
-
-static void register_dump_area(struct phyp_dump_header *ph, unsigned long addr)
-{
-	int rc;
-
-	/* Add addr value if not initialized before */
-	if (ph->cpu_data.destination_address == 0) {
-		ph->cpu_data.destination_address += addr;
-		ph->hpte_data.destination_address += addr;
-		ph->kernel_data.destination_address += addr;
-	}
-
-	/* ToDo Invalidate kdump and free memory range. */
-
-	do {
-		rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL,
-				1, ph, sizeof(struct phyp_dump_header));
-	} while (rtas_busy_delay(rc));
-
-	if (rc) {
-		printk(KERN_ERR "phyp-dump: unexpected error (%d) on "
-						"register\n", rc);
-		print_dump_header(ph);
-		return;
-	}
-
-	rc = sysfs_create_file(kernel_kobj, &pdl.attr);
-	if (rc)
-		printk(KERN_ERR "phyp-dump: unable to create sysfs"
-				" file (%d)\n", rc);
-}
-
-static
-void invalidate_last_dump(struct phyp_dump_header *ph, unsigned long addr)
-{
-	int rc;
-
-	/* Add addr value if not initialized before */
-	if (ph->cpu_data.destination_address == 0) {
-		ph->cpu_data.destination_address += addr;
-		ph->hpte_data.destination_address += addr;
-		ph->kernel_data.destination_address += addr;
-	}
-
-	do {
-		rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL,
-				2, ph, sizeof(struct phyp_dump_header));
-	} while (rtas_busy_delay(rc));
-
-	if (rc) {
-		printk(KERN_ERR "phyp-dump: unexpected error (%d) "
-						"on invalidate\n", rc);
-		print_dump_header(ph);
-	}
-}
-
-/* ------------------------------------------------- */
-/**
- * release_memory_range -- release memory previously memblock_reserved
- * @start_pfn: starting physical frame number
- * @nr_pages: number of pages to free.
- *
- * This routine will release memory that had been previously
- * memblock_reserved in early boot. The released memory becomes
- * available for genreal use.
- */
-static void release_memory_range(unsigned long start_pfn,
-			unsigned long nr_pages)
-{
-	struct page *rpage;
-	unsigned long end_pfn;
-	long i;
-
-	end_pfn = start_pfn + nr_pages;
-
-	for (i = start_pfn; i <= end_pfn; i++) {
-		rpage = pfn_to_page(i);
-		if (PageReserved(rpage)) {
-			ClearPageReserved(rpage);
-			init_page_count(rpage);
-			__free_page(rpage);
-			totalram_pages++;
-		}
-	}
-}
-
-/**
- * track_freed_range -- Counts the range being freed.
- * Once the counter goes to zero, it re-registers dump for
- * future use.
- */
-static void
-track_freed_range(unsigned long addr, unsigned long length)
-{
-	static unsigned long scratch_area_size, reserved_area_size;
-
-	if (addr < phyp_dump_info->init_reserve_start)
-		return;
-
-	if ((addr >= phyp_dump_info->init_reserve_start) &&
-	    (addr <= phyp_dump_info->init_reserve_start +
-	     phyp_dump_info->init_reserve_size))
-		reserved_area_size += length;
-
-	if ((addr >= phyp_dump_info->reserved_scratch_addr) &&
-	    (addr <= phyp_dump_info->reserved_scratch_addr +
-	     phyp_dump_info->reserved_scratch_size))
-		scratch_area_size += length;
-
-	if ((reserved_area_size == phyp_dump_info->init_reserve_size) &&
-	    (scratch_area_size == phyp_dump_info->reserved_scratch_size)) {
-
-		invalidate_last_dump(&phdr,
-				phyp_dump_info->reserved_scratch_addr);
-		register_dump_area(&phdr,
-				phyp_dump_info->reserved_scratch_addr);
-	}
-}
-
-/* ------------------------------------------------- */
-/**
- * sysfs_release_region -- sysfs interface to release memory range.
- *
- * Usage:
- *   "echo <start addr> <length> > /sys/kernel/release_region"
- *
- * Example:
- *   "echo 0x40000000 0x10000000 > /sys/kernel/release_region"
- *
- * will release 256MB starting at 1GB.
- */
-static ssize_t store_release_region(struct kobject *kobj,
-				struct kobj_attribute *attr,
-				const char *buf, size_t count)
-{
-	unsigned long start_addr, length, end_addr;
-	unsigned long start_pfn, nr_pages;
-	ssize_t ret;
-
-	ret = sscanf(buf, "%lx %lx", &start_addr, &length);
-	if (ret != 2)
-		return -EINVAL;
-
-	track_freed_range(start_addr, length);
-
-	/* Range-check - don't free any reserved memory that
-	 * wasn't reserved for phyp-dump */
-	if (start_addr < phyp_dump_info->init_reserve_start)
-		start_addr = phyp_dump_info->init_reserve_start;
-
-	end_addr = phyp_dump_info->init_reserve_start +
-			phyp_dump_info->init_reserve_size;
-	if (start_addr+length > end_addr)
-		length = end_addr - start_addr;
-
-	/* Release the region of memory assed in by user */
-	start_pfn = PFN_DOWN(start_addr);
-	nr_pages = PFN_DOWN(length);
-	release_memory_range(start_pfn, nr_pages);
-
-	return count;
-}
-
-static ssize_t show_release_region(struct kobject *kobj,
-			struct kobj_attribute *attr, char *buf)
-{
-	u64 second_addr_range;
-
-	/* total reserved size - start of scratch area */
-	second_addr_range = phyp_dump_info->init_reserve_size -
-				phyp_dump_info->reserved_scratch_size;
-	return sprintf(buf, "CPU:0x%llx-0x%llx: HPTE:0x%llx-0x%llx:"
-			    " DUMP:0x%llx-0x%llx, 0x%lx-0x%llx:\n",
-		phdr.cpu_data.destination_address,
-		phdr.cpu_data.length_copied,
-		phdr.hpte_data.destination_address,
-		phdr.hpte_data.length_copied,
-		phdr.kernel_data.destination_address,
-		phdr.kernel_data.length_copied,
-		phyp_dump_info->init_reserve_start,
-		second_addr_range);
-}
-
-static struct kobj_attribute rr = __ATTR(release_region, 0600,
-					show_release_region,
-					store_release_region);
-
-static int __init phyp_dump_setup(void)
-{
-	struct device_node *rtas;
-	const struct phyp_dump_header *dump_header = NULL;
-	unsigned long dump_area_start;
-	unsigned long dump_area_length;
-	int header_len = 0;
-	int rc;
-
-	/* If no memory was reserved in early boot, there is nothing to do */
-	if (phyp_dump_info->init_reserve_size == 0)
-		return 0;
-
-	/* Return if phyp dump not supported */
-	if (!phyp_dump_info->phyp_dump_configured)
-		return -ENOSYS;
-
-	/* Is there dump data waiting for us? If there isn't,
-	 * then register a new dump area, and release all of
-	 * the rest of the reserved ram.
-	 *
-	 * The /rtas/ibm,kernel-dump rtas node is present only
-	 * if there is dump data waiting for us.
-	 */
-	rtas = of_find_node_by_path("/rtas");
-	if (rtas) {
-		dump_header = of_get_property(rtas, "ibm,kernel-dump",
-						&header_len);
-		of_node_put(rtas);
-	}
-
-	ibm_configure_kernel_dump = rtas_token("ibm,configure-kernel-dump");
-
-	print_dump_header(dump_header);
-	dump_area_length = init_dump_header(&phdr);
-	/* align down */
-	dump_area_start = phyp_dump_info->init_reserve_start & PAGE_MASK;
-
-	if (dump_header == NULL) {
-		register_dump_area(&phdr, dump_area_start);
-		return 0;
-	}
-
-	/* re-register the dump area, if old dump was invalid */
-	if ((dump_header) && (dump_header->status & DUMP_ERROR_FLAG)) {
-		invalidate_last_dump(&phdr, dump_area_start);
-		register_dump_area(&phdr, dump_area_start);
-		return 0;
-	}
-
-	if (dump_header) {
-		phyp_dump_info->reserved_scratch_addr =
-				dump_header->cpu_data.destination_address;
-		phyp_dump_info->reserved_scratch_size =
-				dump_header->cpu_data.source_length +
-				dump_header->hpte_data.source_length +
-				dump_header->kernel_data.source_length;
-	}
-
-	/* Should we create a dump_subsys, analogous to s390/ipl.c ? */
-	rc = sysfs_create_file(kernel_kobj, &rr.attr);
-	if (rc)
-		printk(KERN_ERR "phyp-dump: unable to create sysfs file (%d)\n",
-									rc);
-
-	/* ToDo: re-register the dump area, for next time. */
-	return 0;
-}
-machine_subsys_initcall(pseries, phyp_dump_setup);
-
-int __init early_init_dt_scan_phyp_dump(unsigned long node,
-		const char *uname, int depth, void *data)
-{
-	const unsigned int *sizes;
-
-	phyp_dump_info->phyp_dump_configured = 0;
-	phyp_dump_info->phyp_dump_is_active = 0;
-
-	if (depth != 1 || strcmp(uname, "rtas") != 0)
-		return 0;
-
-	if (of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL))
-		phyp_dump_info->phyp_dump_configured++;
-
-	if (of_get_flat_dt_prop(node, "ibm,dump-kernel", NULL))
-		phyp_dump_info->phyp_dump_is_active++;
-
-	sizes = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
-				    NULL);
-	if (!sizes)
-		return 0;
-
-	if (sizes[0] == 1)
-		phyp_dump_info->cpu_state_size = *((unsigned long *)&sizes[1]);
-
-	if (sizes[3] == 2)
-		phyp_dump_info->hpte_region_size =
-						*((unsigned long *)&sizes[4]);
-	return 1;
-}
-
-/* Look for phyp_dump= cmdline option */
-static int __init early_phyp_dump_enabled(char *p)
-{
-	phyp_dump_info->phyp_dump_at_boot = 1;
-
-        if (!p)
-                return 0;
-
-        if (strncmp(p, "1", 1) == 0)
-		phyp_dump_info->phyp_dump_at_boot = 1;
-        else if (strncmp(p, "0", 1) == 0)
-		phyp_dump_info->phyp_dump_at_boot = 0;
-
-        return 0;
-}
-early_param("phyp_dump", early_phyp_dump_enabled);
-
-/* Look for phyp_dump_reserve_size= cmdline option */
-static int __init early_phyp_dump_reserve_size(char *p)
-{
-        if (p)
-		phyp_dump_info->reserve_bootvar = memparse(p, &p);
-
-        return 0;
-}
-early_param("phyp_dump_reserve_size", early_phyp_dump_reserve_size);

^ permalink raw reply related

* [RFC PATCH v6 09/10] fadump: Invalidate the fadump registration during machine shutdown.
From: Mahesh J Salgaonkar @ 2011-12-10  6:51 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

If dump is active during system reboot, shutdown or halt then invalidate
the fadump registration as it does not get invalidated automatically.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/setup-common.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 4e62a56..b0ebdea 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -110,6 +110,14 @@ EXPORT_SYMBOL(ppc_do_canonicalize_irqs);
 /* also used by kexec */
 void machine_shutdown(void)
 {
+#ifdef CONFIG_FA_DUMP
+	/*
+	 * if fadump is active, cleanup the fadump registration before we
+	 * shutdown.
+	 */
+	fadump_cleanup();
+#endif
+
 	if (ppc_md.machine_shutdown)
 		ppc_md.machine_shutdown();
 }

^ permalink raw reply related

* [RFC PATCH v6 05/10] fadump: Convert firmware-assisted cpu state dump data into elf notes.
From: Mahesh J Salgaonkar @ 2011-12-10  6:50 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

When registered for firmware assisted dump on powerpc, firmware preserves
the registers for the active CPUs during a system crash. This patch reads
the cpu register data stored in Firmware-assisted dump format (except for
crashing cpu) and converts it into elf notes and updates the PT_NOTE program
header accordingly. The exact register state for crashing cpu is saved to
fadump crash info structure in scratch area during crash_fadump() and read
during second kernel boot.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/fadump.h  |   44 +++++
 arch/powerpc/kernel/fadump.c       |  314 ++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/setup-common.c |    6 +
 arch/powerpc/kernel/traps.c        |    3 
 4 files changed, 365 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 9e172a5..6768195 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -65,6 +65,18 @@
 /* Dump status flag */
 #define FADUMP_ERROR_FLAG	0x2000
 
+#define FADUMP_CPU_ID_MASK	((1UL << 32) - 1)
+
+#define CPU_UNKNOWN		(~((u32)0))
+
+/* Utility macros */
+#define SKIP_TO_NEXT_CPU(reg_entry)			\
+({							\
+	while (reg_entry->reg_id != REG_ID("CPUEND"))	\
+		reg_entry++;				\
+	reg_entry++;					\
+})
+
 /* Kernel Dump section info */
 struct fadump_section {
 	u32	request_flag;
@@ -119,6 +131,9 @@ struct fw_dump {
 	unsigned long	reserve_bootvar;
 
 	unsigned long	fadumphdr_addr;
+	unsigned long	cpu_notes_buf;
+	unsigned long	cpu_notes_buf_size;
+
 	int		ibm_configure_kernel_dump;
 
 	unsigned long	fadump_enabled:1;
@@ -143,13 +158,40 @@ static inline u64 str_to_u64(const char *str)
 	return val;
 }
 #define STR_TO_HEX(x)	str_to_u64(x)
+#define REG_ID(x)	str_to_u64(x)
 
 #define FADUMP_CRASH_INFO_MAGIC		STR_TO_HEX("FADMPINF")
+#define REGSAVE_AREA_MAGIC		STR_TO_HEX("REGSAVE")
+
+/* The firmware-assisted dump format.
+ *
+ * The register save area is an area in the partition's memory used to preserve
+ * the register contents (CPU state data) for the active CPUs during a firmware
+ * assisted dump. The dump format contains register save area header followed
+ * by register entries. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND".
+ */
+
+/* Register save area header. */
+struct fadump_reg_save_area_header {
+	u64		magic_number;
+	u32		version;
+	u32		num_cpu_offset;
+};
+
+/* Register entry. */
+struct fadump_reg_entry {
+	u64		reg_id;
+	u64		reg_value;
+};
 
 /* fadump crash info structure */
 struct fadump_crash_info_header {
 	u64		magic_number;
 	u64		elfcorehdr_addr;
+	u32		crashing_cpu;
+	struct pt_regs	regs;
+	struct cpumask	cpu_online_mask;
 };
 
 /* Crash memory ranges */
@@ -165,7 +207,9 @@ extern int early_init_dt_scan_fw_dump(unsigned long node,
 extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
+extern void crash_fadump(struct pt_regs *, const char *);
 #else	/* CONFIG_FA_DUMP */
 static inline int is_fadump_active(void) { return 0; }
+static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 #endif
 #endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index f85a63f..cab56e7 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -240,6 +240,7 @@ static unsigned long get_fadump_area_size(void)
 	size += fw_dump.boot_memory_size;
 	size += sizeof(struct fadump_crash_info_header);
 	size += sizeof(struct elfhdr); /* ELF core header.*/
+	size += sizeof(struct elf_phdr); /* place holder for cpu notes */
 	/* Program headers for crash memory regions. */
 	size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
 
@@ -393,6 +394,285 @@ static void register_fw_dump(struct fadump_mem_struct *fdm)
 	}
 }
 
+void crash_fadump(struct pt_regs *regs, const char *str)
+{
+	struct fadump_crash_info_header *fdh = NULL;
+
+	if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
+		return;
+
+	fdh = __va(fw_dump.fadumphdr_addr);
+	crashing_cpu = smp_processor_id();
+	fdh->crashing_cpu = crashing_cpu;
+	crash_save_vmcoreinfo();
+
+	if (regs)
+		fdh->regs = *regs;
+	else
+		ppc_save_regs(&fdh->regs);
+
+	fdh->cpu_online_mask = *cpu_online_mask;
+
+	/* Call ibm,os-term rtas call to trigger firmware assisted dump */
+	rtas_os_term((char *)str);
+}
+
+#define GPR_MASK	0xffffff0000000000
+static inline int fadump_gpr_index(u64 id)
+{
+	int i = -1;
+	char str[3];
+
+	if ((id & GPR_MASK) == REG_ID("GPR")) {
+		/* get the digits at the end */
+		id &= ~GPR_MASK;
+		id >>= 24;
+		str[2] = '\0';
+		str[1] = id & 0xff;
+		str[0] = (id >> 8) & 0xff;
+		sscanf(str, "%d", &i);
+		if (i > 31)
+			i = -1;
+	}
+	return i;
+}
+
+static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id,
+								u64 reg_val)
+{
+	int i;
+
+	i = fadump_gpr_index(reg_id);
+	if (i >= 0)
+		regs->gpr[i] = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("NIA"))
+		regs->nip = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("MSR"))
+		regs->msr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("CTR"))
+		regs->ctr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("LR"))
+		regs->link = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("XER"))
+		regs->xer = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("CR"))
+		regs->ccr = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("DAR"))
+		regs->dar = (unsigned long)reg_val;
+	else if (reg_id == REG_ID("DSISR"))
+		regs->dsisr = (unsigned long)reg_val;
+}
+
+static struct fadump_reg_entry*
+fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs)
+{
+	memset(regs, 0, sizeof(struct pt_regs));
+
+	while (reg_entry->reg_id != REG_ID("CPUEND")) {
+		fadump_set_regval(regs, reg_entry->reg_id,
+					reg_entry->reg_value);
+		reg_entry++;
+	}
+	reg_entry++;
+	return reg_entry;
+}
+
+static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type,
+						void *data, size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void fadump_final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
+{
+	struct elf_prstatus prstatus;
+
+	memset(&prstatus, 0, sizeof(prstatus));
+	/*
+	 * FIXME: How do i get PID? Do I really need it?
+	 * prstatus.pr_pid = ????
+	 */
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+	buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+				&prstatus, sizeof(prstatus));
+	return buf;
+}
+
+static void fadump_update_elfcore_header(char *bufp)
+{
+	struct elfhdr *elf;
+	struct elf_phdr *phdr;
+
+	elf = (struct elfhdr *)bufp;
+	bufp += sizeof(struct elfhdr);
+
+	/* First note is a place holder for cpu notes info. */
+	phdr = (struct elf_phdr *)bufp;
+
+	if (phdr->p_type == PT_NOTE) {
+		phdr->p_paddr = fw_dump.cpu_notes_buf;
+		phdr->p_offset	= phdr->p_paddr;
+		phdr->p_filesz	= fw_dump.cpu_notes_buf_size;
+		phdr->p_memsz = fw_dump.cpu_notes_buf_size;
+	}
+	return;
+}
+
+static void *fadump_cpu_notes_buf_alloc(unsigned long size)
+{
+	void *vaddr;
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+	if (!vaddr)
+		return NULL;
+
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		SetPageReserved(page + i);
+	return vaddr;
+}
+
+static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size)
+{
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		ClearPageReserved(page + i);
+	__free_pages(page, order);
+}
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
+ * used to access the data to allow for additional fields to be added without
+ * affecting compatibility. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
+ * 8 Byte ASCII identifier and 8 Byte register value. The register entry
+ * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
+ * of register value. For more details refer to PAPR document.
+ *
+ * Only for the crashing cpu we ignore the CPU dump data and get exact
+ * state from fadump crash info structure populated by first kernel at the
+ * time of crash.
+ */
+static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
+{
+	struct fadump_reg_save_area_header *reg_header;
+	struct fadump_reg_entry *reg_entry;
+	struct fadump_crash_info_header *fdh = NULL;
+	void *vaddr;
+	unsigned long addr;
+	u32 num_cpus, *note_buf;
+	struct pt_regs regs;
+	int i, rc = 0, cpu = 0;
+
+	if (!fdm->cpu_state_data.bytes_dumped)
+		return -EINVAL;
+
+	addr = fdm->cpu_state_data.destination_address;
+	vaddr = __va(addr);
+
+	reg_header = vaddr;
+	if (reg_header->magic_number != REGSAVE_AREA_MAGIC) {
+		printk(KERN_ERR "Unable to read register save area.\n");
+		return -ENOENT;
+	}
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("Magic Number: %llx\n", reg_header->magic_number);
+	pr_debug("NumCpuOffset: %x\n", reg_header->num_cpu_offset);
+
+	vaddr += reg_header->num_cpu_offset;
+	num_cpus = *((u32 *)(vaddr));
+	pr_debug("NumCpus     : %u\n", num_cpus);
+	vaddr += sizeof(u32);
+	reg_entry = (struct fadump_reg_entry *)vaddr;
+
+	/* Allocate buffer to hold cpu crash notes. */
+	fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
+	fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
+	note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size);
+	if (!note_buf) {
+		printk(KERN_ERR "Failed to allocate 0x%lx bytes for "
+			"cpu notes buffer\n", fw_dump.cpu_notes_buf_size);
+		return -ENOMEM;
+	}
+	fw_dump.cpu_notes_buf = __pa(note_buf);
+
+	pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
+			(num_cpus * sizeof(note_buf_t)), note_buf);
+
+	if (fw_dump.fadumphdr_addr)
+		fdh = __va(fw_dump.fadumphdr_addr);
+
+	for (i = 0; i < num_cpus; i++) {
+		if (reg_entry->reg_id != REG_ID("CPUSTRT")) {
+			printk(KERN_ERR "Unable to read CPU state data\n");
+			rc = -ENOENT;
+			goto error_out;
+		}
+		/* Lower 4 bytes of reg_value contains logical cpu id */
+		cpu = reg_entry->reg_value & FADUMP_CPU_ID_MASK;
+		if (!cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) {
+			SKIP_TO_NEXT_CPU(reg_entry);
+			continue;
+		}
+		pr_debug("Reading register data for cpu %d...\n", cpu);
+		if (fdh && fdh->crashing_cpu == cpu) {
+			regs = fdh->regs;
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+			SKIP_TO_NEXT_CPU(reg_entry);
+		} else {
+			reg_entry++;
+			reg_entry = fadump_read_registers(reg_entry, &regs);
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		}
+	}
+	fadump_final_note(note_buf);
+
+	pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+							fdh->elfcorehdr_addr);
+	fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr));
+	return 0;
+
+error_out:
+	fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf),
+					fw_dump.cpu_notes_buf_size);
+	fw_dump.cpu_notes_buf = 0;
+	fw_dump.cpu_notes_buf_size = 0;
+	return rc;
+
+}
+
 /*
  * Validate and process the dump data stored by firmware before exporting
  * it through '/proc/vmcore'.
@@ -400,18 +680,21 @@ static void register_fw_dump(struct fadump_mem_struct *fdm)
 static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
 {
 	struct fadump_crash_info_header *fdh;
+	int rc = 0;
 
 	if (!fdm_active || !fw_dump.fadumphdr_addr)
 		return -EINVAL;
 
 	/* Check if the dump data is valid. */
 	if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) ||
+			(fdm_active->cpu_state_data.error_flags != 0) ||
 			(fdm_active->rmr_region.error_flags != 0)) {
 		printk(KERN_ERR "Dump taken by platform is not valid\n");
 		return -EINVAL;
 	}
-	if (fdm_active->rmr_region.bytes_dumped !=
-			fdm_active->rmr_region.source_len) {
+	if ((fdm_active->rmr_region.bytes_dumped !=
+			fdm_active->rmr_region.source_len) ||
+			!fdm_active->cpu_state_data.bytes_dumped) {
 		printk(KERN_ERR "Dump taken by platform is incomplete\n");
 		return -EINVAL;
 	}
@@ -423,6 +706,10 @@ static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
 		return -EINVAL;
 	}
 
+	rc = fadump_build_cpu_notes(fdm_active);
+	if (rc)
+		return rc;
+
 	/*
 	 * We are done validating dump info and elfcore header is now ready
 	 * to be exported. set elfcorehdr_addr so that vmcore module will
@@ -537,6 +824,27 @@ static int fadump_create_elfcore_headers(char *bufp)
 	elf = (struct elfhdr *)bufp;
 	bufp += sizeof(struct elfhdr);
 
+	/*
+	 * setup ELF PT_NOTE, place holder for cpu notes info. The notes info
+	 * will be populated during second kernel boot after crash. Hence
+	 * this PT_NOTE will always be the first elf note.
+	 *
+	 * NOTE: Any new ELF note addition should be placed after this note.
+	 */
+	phdr = (struct elf_phdr *)bufp;
+	bufp += sizeof(struct elf_phdr);
+	phdr->p_type = PT_NOTE;
+	phdr->p_flags = 0;
+	phdr->p_vaddr = 0;
+	phdr->p_align = 0;
+
+	phdr->p_offset = 0;
+	phdr->p_paddr = 0;
+	phdr->p_filesz = 0;
+	phdr->p_memsz = 0;
+
+	(elf->e_phnum)++;
+
 	/* setup PT_LOAD sections. */
 
 	for (i = 0; i < crash_mem_ranges; i++) {
@@ -588,6 +896,8 @@ static unsigned long init_fadump_header(unsigned long addr)
 	memset(fdh, 0, sizeof(struct fadump_crash_info_header));
 	fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
 	fdh->elfcorehdr_addr = addr;
+	/* We will set the crashing cpu id in crash_fadump() during crash. */
+	fdh->crashing_cpu = CPU_UNKNOWN;
 
 	return addr;
 }
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 77bb77d..4e62a56 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -61,6 +61,7 @@
 #include <asm/xmon.h>
 #include <asm/cputhreads.h>
 #include <mm/mmu_decl.h>
+#include <asm/fadump.h>
 
 #include "setup.h"
 
@@ -639,6 +640,11 @@ EXPORT_SYMBOL(check_legacy_ioport);
 static int ppc_panic_event(struct notifier_block *this,
                              unsigned long event, void *ptr)
 {
+	/*
+	 * If firmware-assisted dump has been registered then trigger
+	 * firmware-assisted dump and let firmware handle everything else.
+	 */
+	crash_fadump(NULL, ptr);
 	ppc_md.panic(ptr);  /* May not return */
 	return NOTIFY_DONE;
 }
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 5459d14..feca775 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -57,6 +57,7 @@
 #include <asm/kexec.h>
 #include <asm/ppc-opcode.h>
 #include <asm/rio.h>
+#include <asm/fadump.h>
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
 int (*__debugger)(struct pt_regs *regs) __read_mostly;
@@ -160,6 +161,8 @@ int die(const char *str, struct pt_regs *regs, long err)
 	add_taint(TAINT_DIE);
 	raw_spin_unlock_irqrestore(&die.lock, flags);
 
+	crash_fadump(regs, str);
+
 	if (kexec_should_crash(current) ||
 		kexec_sr_activated(smp_processor_id()))
 		crash_kexec(regs);

^ permalink raw reply related

* [RFC PATCH v6 07/10] fadump: Introduce cleanup routine to invalidate /proc/vmcore.
From: Mahesh J Salgaonkar @ 2011-12-10  6:51 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

With the firmware-assisted dump support we don't require a reboot when we
are in second kernel after crash. The second kernel after crash is a normal
kernel boot and has knowledge about entire system RAM with the page tables
initialized for entire system RAM. Hence once the dump is saved to disk, we
can just release the reserved memory area for general use and continue
with second kernel as production kernel.

Hence when we release the reserved memory that contains dump data, the
'/proc/vmcore' will not be valid anymore. Hence this patch introduces
a cleanup routine that invalidates and removes the /proc/vmcore file. This
routine will be invoked before we release the reserved dump memory area.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 fs/proc/vmcore.c |   23 +++++++++++++++++++++++
 1 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index b0f450a..0d5071d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -700,3 +700,26 @@ static int __init vmcore_init(void)
 	return 0;
 }
 module_init(vmcore_init)
+
+/* Cleanup function for vmcore module. */
+void vmcore_cleanup(void)
+{
+	struct list_head *pos, *next;
+
+	if (proc_vmcore) {
+		remove_proc_entry(proc_vmcore->name, proc_vmcore->parent);
+		proc_vmcore = NULL;
+	}
+
+	/* clear the vmcore list. */
+	list_for_each_safe(pos, next, &vmcore_list) {
+		struct vmcore *m;
+
+		m = list_entry(pos, struct vmcore, list);
+		list_del(&m->list);
+		kfree(m);
+	}
+	kfree(elfcorebuf);
+	elfcorebuf = NULL;
+}
+EXPORT_SYMBOL_GPL(vmcore_cleanup);

^ permalink raw reply related

* [RFC PATCH v6 06/10] fadump: Add PT_NOTE program header for vmcoreinfo
From: Mahesh J Salgaonkar @ 2011-12-10  6:50 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Introduce a PT_NOTE program header that points to physical address of
vmcoreinfo_note buffer declared in kernel/kexec.c. The vmcoreinfo
note buffer is populated during crash_fadump() at the time of system
crash.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/fadump.c |   29 +++++++++++++++++++++++++++++
 1 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index cab56e7..5689954 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -814,6 +814,19 @@ static void fadump_setup_crash_memory_ranges(void)
 	}
 }
 
+/*
+ * If the given physical address falls within the boot memory region then
+ * return the relocated address that points to the dump region reserved
+ * for saving initial boot memory contents.
+ */
+static inline unsigned long fadump_relocate(unsigned long paddr)
+{
+	if (paddr > RMA_START && paddr < fw_dump.boot_memory_size)
+		return fdm.rmr_region.destination_address + paddr;
+	else
+		return paddr;
+}
+
 static int fadump_create_elfcore_headers(char *bufp)
 {
 	struct elfhdr *elf;
@@ -845,6 +858,22 @@ static int fadump_create_elfcore_headers(char *bufp)
 
 	(elf->e_phnum)++;
 
+	/* setup ELF PT_NOTE for vmcoreinfo */
+	phdr = (struct elf_phdr *)bufp;
+	bufp += sizeof(struct elf_phdr);
+	phdr->p_type	= PT_NOTE;
+	phdr->p_flags	= 0;
+	phdr->p_vaddr	= 0;
+	phdr->p_align	= 0;
+
+	phdr->p_paddr	= fadump_relocate(paddr_vmcoreinfo_note());
+	phdr->p_offset	= phdr->p_paddr;
+	phdr->p_memsz	= vmcoreinfo_max_size;
+	phdr->p_filesz	= vmcoreinfo_max_size;
+
+	/* Increment number of program headers. */
+	(elf->e_phnum)++;
+
 	/* setup PT_LOAD sections. */
 
 	for (i = 0; i < crash_mem_ranges; i++) {

^ permalink raw reply related

* [RFC PATCH v6 02/10] fadump: Reserve the memory for firmware assisted dump.
From: Mahesh J Salgaonkar @ 2011-12-10  6:50 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Reserve the memory during early boot to preserve CPU state data, HPTE region
and RMA (real mode area) region data in case of kernel crash. At the time of
crash, powerpc firmware will store CPU state data, HPTE region data and move
RMA region data to the reserved memory area.

If the firmware-assisted dump fails to reserve the memory, then fallback
to existing kexec-based kdump.

The most of the code implementation to reserve memory has been
adapted from phyp assisted dump implementation written by Linas Vepstas
and Manish Ahuja

This patch also introduces a config option CONFIG_FA_DUMP for firmware
assisted dump feature on Powerpc (ppc64) architecture.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/Kconfig              |   13 ++
 arch/powerpc/include/asm/fadump.h |   71 +++++++++++
 arch/powerpc/kernel/Makefile      |    1 
 arch/powerpc/kernel/fadump.c      |  246 +++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/prom.c        |   15 ++
 5 files changed, 345 insertions(+), 1 deletions(-)
 create mode 100644 arch/powerpc/include/asm/fadump.h
 create mode 100644 arch/powerpc/kernel/fadump.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 951e18f..5a61aaa 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -379,6 +379,19 @@ config PHYP_DUMP
 
 	  If unsure, say "N"
 
+config FA_DUMP
+	bool "Firmware-assisted dump"
+	depends on PPC64 && PPC_RTAS && CRASH_DUMP
+	help
+	  A robust mechanism to get reliable kernel crash dump with
+	  assistance from firmware. This approach does not use kexec,
+	  instead firmware assists in booting the kdump kernel
+	  while preserving memory contents. Firmware-assisted dump
+	  is meant to be a kdump replacement offering robustness and
+	  speed not possible without system firmware assistance.
+
+	  If unsure, say "N"
+
 config IRQ_ALL_CPUS
 	bool "Distribute interrupts on all CPUs by default"
 	depends on SMP && !MV64360
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
new file mode 100644
index 0000000..7be25d3
--- /dev/null
+++ b/arch/powerpc/include/asm/fadump.h
@@ -0,0 +1,71 @@
+/*
+ * Firmware Assisted dump header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#ifndef __PPC64_FA_DUMP_H__
+#define __PPC64_FA_DUMP_H__
+
+#ifdef CONFIG_FA_DUMP
+
+/*
+ * The RMA region will be saved for later dumping when kernel crashes.
+ * RMA is Real Mode Area, the first block of logical memory address owned
+ * by logical partition, containing the storage that may be accessed with
+ * translate off.
+ */
+#define RMA_START	0x0
+#define RMA_END		(ppc64_rma_size)
+
+/*
+ * On some Power systems where RMO is 128MB, it still requires minimum of
+ * 256MB for kernel to boot successfully. When kdump infrastructure is
+ * configured to save vmcore over network, we run into OOM issue while
+ * loading modules related to network setup. Hence we need aditional 64M
+ * of memory to avoid OOM issue.
+ */
+#define MIN_BOOT_MEM	(((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
+			+ (0x1UL << 26))
+
+/* Firmware provided dump sections */
+#define FADUMP_CPU_STATE_DATA	0x0001
+#define FADUMP_HPTE_REGION	0x0002
+#define FADUMP_REAL_MODE_REGION	0x0011
+
+struct fw_dump {
+	unsigned long	cpu_state_data_size;
+	unsigned long	hpte_region_size;
+	unsigned long	boot_memory_size;
+	unsigned long	reserve_dump_area_start;
+	unsigned long	reserve_dump_area_size;
+	/* cmd line option during boot */
+	unsigned long	reserve_bootvar;
+
+	int		ibm_configure_kernel_dump;
+
+	unsigned long	fadump_enabled:1;
+	unsigned long	fadump_supported:1;
+	unsigned long	dump_active:1;
+};
+
+extern int early_init_dt_scan_fw_dump(unsigned long node,
+		const char *uname, int depth, void *data);
+extern int fadump_reserve_mem(void);
+#endif
+#endif
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index ce4f7f1..59b549c 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_IBMVIO)		+= vio.o
 obj-$(CONFIG_IBMEBUS)           += ibmebus.o
 obj-$(CONFIG_GENERIC_TBSYNC)	+= smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
+obj-$(CONFIG_FA_DUMP)		+= fadump.o
 ifeq ($(CONFIG_PPC32),y)
 obj-$(CONFIG_E500)		+= idle_e500.o
 endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
new file mode 100644
index 0000000..deb276a
--- /dev/null
+++ b/arch/powerpc/kernel/fadump.c
@@ -0,0 +1,246 @@
+/*
+ * Firmware Assisted dump: A robust mechanism to get reliable kernel crash
+ * dump with assistance from firmware. This approach does not use kexec,
+ * instead firmware assists in booting the kdump kernel while preserving
+ * memory contents. The most of the code implementation has been adapted
+ * from phyp assisted dump implementation written by Linas Vepstas and
+ * Manish Ahuja
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/memblock.h>
+
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/fadump.h>
+
+static struct fw_dump fw_dump;
+
+/* Scan the Firmware Assisted dump configuration details. */
+int __init early_init_dt_scan_fw_dump(unsigned long node,
+			const char *uname, int depth, void *data)
+{
+	__be32 *sections;
+	int i, num_sections;
+	unsigned long size;
+	const int *token;
+
+	if (depth != 1 || strcmp(uname, "rtas") != 0)
+		return 0;
+
+	/*
+	 * Check if Firmware Assisted dump is supported. if yes, check
+	 * if dump has been initiated on last reboot.
+	 */
+	token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+	if (!token)
+		return 0;
+
+	fw_dump.fadump_supported = 1;
+	fw_dump.ibm_configure_kernel_dump = *token;
+
+	/*
+	 * The 'ibm,kernel-dump' rtas node is present only if there is
+	 * dump data waiting for us.
+	 */
+	if (of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL))
+		fw_dump.dump_active = 1;
+
+	/* Get the sizes required to store dump data for the firmware provided
+	 * dump sections.
+	 * For each dump section type supported, a 32bit cell which defines
+	 * the ID of a supported section followed by two 32 bit cells which
+	 * gives teh size of the section in bytes.
+	 */
+	sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+					&size);
+
+	if (!sections)
+		return 0;
+
+	num_sections = size / (3 * sizeof(u32));
+
+	for (i = 0; i < num_sections; i++, sections += 3) {
+		u32 type = (u32)of_read_number(sections, 1);
+
+		switch (type) {
+		case FADUMP_CPU_STATE_DATA:
+			fw_dump.cpu_state_data_size =
+					of_read_ulong(&sections[1], 2);
+			break;
+		case FADUMP_HPTE_REGION:
+			fw_dump.hpte_region_size =
+					of_read_ulong(&sections[1], 2);
+			break;
+		}
+	}
+	return 1;
+}
+
+/**
+ * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM
+ *
+ * Function to find the largest memory size we need to reserve during early
+ * boot process. This will be the size of the memory that is required for a
+ * kernel to boot successfully.
+ *
+ * This function has been taken from phyp-assisted dump feature implementation.
+ *
+ * returns larger of 256MB or 5% rounded down to multiples of 256MB.
+ *
+ * TODO: Come up with better approach to find out more accurate memory size
+ * that is required for a kernel to boot successfully.
+ *
+ */
+static inline unsigned long fadump_calculate_reserve_size(void)
+{
+	unsigned long size;
+
+	/*
+	 * Check if the size is specified through fadump_reserve_mem= cmdline
+	 * option. If yes, then use that.
+	 */
+	if (fw_dump.reserve_bootvar)
+		return fw_dump.reserve_bootvar;
+
+	/* divide by 20 to get 5% of value */
+	size = memblock_end_of_DRAM() / 20;
+
+	/* round it down in multiples of 256 */
+	size = size & ~0x0FFFFFFFUL;
+
+	/* Truncate to memory_limit. We don't want to over reserve the memory.*/
+	if (memory_limit && size > memory_limit)
+		size = memory_limit;
+
+	return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM);
+}
+
+/*
+ * Calculate the total memory size required to be reserved for
+ * firmware-assisted dump registration.
+ */
+static unsigned long get_fadump_area_size(void)
+{
+	unsigned long size = 0;
+
+	size += fw_dump.cpu_state_data_size;
+	size += fw_dump.hpte_region_size;
+	size += fw_dump.boot_memory_size;
+
+	size = PAGE_ALIGN(size);
+	return size;
+}
+
+int __init fadump_reserve_mem(void)
+{
+	unsigned long base, size, memory_boundary;
+
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	if (!fw_dump.fadump_supported) {
+		printk(KERN_INFO "Firmware-assisted dump is not supported on"
+				" this hardware\n");
+		fw_dump.fadump_enabled = 0;
+		return 0;
+	}
+	/* Initialize boot memory size */
+	fw_dump.boot_memory_size = fadump_calculate_reserve_size();
+
+	/*
+	 * Calculate the memory boundary.
+	 * If memory_limit is less than actual memory boundary then reserve
+	 * the memory for fadump beyond the memory_limit and adjust the
+	 * memory_limit accordingly, so that the running kernel can run with
+	 * specified memory_limit.
+	 */
+	if (memory_limit && memory_limit < memblock_end_of_DRAM()) {
+		size = get_fadump_area_size();
+		if ((memory_limit + size) < memblock_end_of_DRAM())
+			memory_limit += size;
+		else
+			memory_limit = memblock_end_of_DRAM();
+		printk(KERN_INFO "Adjusted memory_limit for firmware-assisted"
+				" dump, now %#016llx\n",
+				(unsigned long long)memory_limit);
+	}
+	if (memory_limit)
+		memory_boundary = memory_limit;
+	else
+		memory_boundary = memblock_end_of_DRAM();
+
+	if (fw_dump.dump_active) {
+		printk(KERN_INFO "Firmware-assisted dump is active.\n");
+		/*
+		 * If last boot has crashed then reserve all the memory
+		 * above boot_memory_size so that we don't touch it until
+		 * dump is written to disk by userspace tool. This memory
+		 * will be released for general use once the dump is saved.
+		 */
+		base = fw_dump.boot_memory_size;
+		size = memory_boundary - base;
+		memblock_reserve(base, size);
+		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+				"for saving crash dump\n",
+				(unsigned long)(size >> 20),
+				(unsigned long)(base >> 20));
+	} else {
+		/* Reserve the memory at the top of memory. */
+		size = get_fadump_area_size();
+		base = memory_boundary - size;
+		memblock_reserve(base, size);
+		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+				"for firmware-assisted dump\n",
+				(unsigned long)(size >> 20),
+				(unsigned long)(base >> 20));
+	}
+	fw_dump.reserve_dump_area_start = base;
+	fw_dump.reserve_dump_area_size = size;
+	return 1;
+}
+
+/* Look for fadump= cmdline option. */
+static int __init early_fadump_param(char *p)
+{
+	if (!p)
+		return 1;
+
+	if (strncmp(p, "on", 2) == 0)
+		fw_dump.fadump_enabled = 1;
+	else if (strncmp(p, "off", 3) == 0)
+		fw_dump.fadump_enabled = 0;
+
+	return 0;
+}
+early_param("fadump", early_fadump_param);
+
+/* Look for fadump_reserve_mem= cmdline option */
+static int __init early_fadump_reserve_mem(char *p)
+{
+	if (p)
+		fw_dump.reserve_bootvar = memparse(p, &p);
+	return 0;
+}
+early_param("fadump_reserve_mem", early_fadump_reserve_mem);
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index fa1235b..42ddf4b 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -55,6 +55,7 @@
 #include <asm/phyp_dump.h>
 #include <asm/kexec.h>
 #include <asm/opal.h>
+#include <asm/fadump.h>
 
 #include <mm/mmu_decl.h>
 
@@ -719,6 +720,11 @@ void __init early_init_devtree(void *params)
 	of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL);
 #endif
 
+#ifdef CONFIG_FA_DUMP
+	/* scan tree to see if dump is active during last boot */
+	of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL);
+#endif
+
 	/* Pre-initialize the cmd_line with the content of boot_commmand_line,
 	 * which will be empty except when the content of the variable has
 	 * been overriden by a bootloading mechanism. This happens typically
@@ -752,7 +758,14 @@ void __init early_init_devtree(void *params)
 	if (PHYSICAL_START > MEMORY_START)
 		memblock_reserve(MEMORY_START, 0x8000);
 	reserve_kdump_trampoline();
-	reserve_crashkernel();
+#ifdef CONFIG_FA_DUMP
+	/*
+	 * If we fail to reserve memory for firmware-assisted dump then
+	 * fallback to kexec based kdump.
+	 */
+	if (fadump_reserve_mem() == 0)
+#endif
+		reserve_crashkernel();
 	early_reserve_mem();
 	phyp_dump_reserve_mem();
 

^ permalink raw reply related

* [RFC PATCH v6 04/10] fadump: Initialize elfcore header and add PT_LOAD program headers.
From: Mahesh J Salgaonkar @ 2011-12-10  6:50 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Build the crash memory range list by traversing through system memory during
the first kernel before we register for firmware-assisted dump. After the
successful dump registration, initialize the elfcore header and populate
PT_LOAD program headers with crash memory ranges. The elfcore header is
saved in the scratch area within the reserved memory. The scratch area starts
at the end of the memory reserved for saving RMR region contents. The
scratch area contains fadump crash info structure that contains magic number
for fadump validation and physical address where the eflcore header can be
found. This structure will also be used to pass some important crash info
data to the second kernel which will help second kernel to populate ELF core
header with correct data before it gets exported through /proc/vmcore. Since
the firmware preserves the entire partition memory at the time of crash the
contents of the scratch area will be preserved till second kernel boot.

NOTE: The current design implementation does not address a possibility of
introducing additional fields (in future) to this structure without affecting
compatibility. It's on TODO list to come up with better approach to
address this.

Reserved dump area start => +-------------------------------------+
                            |  CPU state dump data                |
                            +-------------------------------------+
                            |  HPTE region data                   |
                            +-------------------------------------+
                            |  RMR region data                    |
Scratch area start       => +-------------------------------------+
                            |  fadump crash info structure {      |
                            |     magic nummber                   |
                     +------|---- elfcorehdr_addr                 |
                     |      |  }                                  |
                     +----> +-------------------------------------+
                            |  ELF core header                    |
Reserved dump area end   => +-------------------------------------+

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/fadump.h |   43 +++++++
 arch/powerpc/kernel/fadump.c      |  233 +++++++++++++++++++++++++++++++++++++
 2 files changed, 275 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index bbaf278..9e172a5 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -43,6 +43,12 @@
 #define MIN_BOOT_MEM	(((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
 			+ (0x1UL << 26))
 
+#define memblock_num_regions(memblock_type)	(memblock.memblock_type.cnt)
+
+#ifndef ELF_CORE_EFLAGS
+#define ELF_CORE_EFLAGS 0
+#endif
+
 /* Firmware provided dump sections */
 #define FADUMP_CPU_STATE_DATA	0x0001
 #define FADUMP_HPTE_REGION	0x0002
@@ -56,6 +62,9 @@
 #define FADUMP_UNREGISTER	2
 #define FADUMP_INVALIDATE	3
 
+/* Dump status flag */
+#define FADUMP_ERROR_FLAG	0x2000
+
 /* Kernel Dump section info */
 struct fadump_section {
 	u32	request_flag;
@@ -109,6 +118,7 @@ struct fw_dump {
 	/* cmd line option during boot */
 	unsigned long	reserve_bootvar;
 
+	unsigned long	fadumphdr_addr;
 	int		ibm_configure_kernel_dump;
 
 	unsigned long	fadump_enabled:1;
@@ -117,6 +127,39 @@ struct fw_dump {
 	unsigned long	dump_registered:1;
 };
 
+/*
+ * Copy the ascii values for first 8 characters from a string into u64
+ * variable at their respective indexes.
+ * e.g.
+ *  The string "FADMPINF" will be converted into 0x4641444d50494e46
+ */
+static inline u64 str_to_u64(const char *str)
+{
+	u64 val = 0;
+	int i;
+
+	for (i = 0; i < sizeof(val); i++)
+		val = (*str) ? (val << 8) | *str++ : val << 8;
+	return val;
+}
+#define STR_TO_HEX(x)	str_to_u64(x)
+
+#define FADUMP_CRASH_INFO_MAGIC		STR_TO_HEX("FADMPINF")
+
+/* fadump crash info structure */
+struct fadump_crash_info_header {
+	u64		magic_number;
+	u64		elfcorehdr_addr;
+};
+
+/* Crash memory ranges */
+#define INIT_CRASHMEM_RANGES	(INIT_MEMBLOCK_REGIONS + 2)
+
+struct fad_crash_memory_ranges {
+	unsigned long long	base;
+	unsigned long long	size;
+};
+
 extern int early_init_dt_scan_fw_dump(unsigned long node,
 		const char *uname, int depth, void *data);
 extern int fadump_reserve_mem(void);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 13c22cd..f85a63f 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -32,6 +32,7 @@
 #include <linux/delay.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/crash_dump.h>
 
 #include <asm/page.h>
 #include <asm/prom.h>
@@ -43,6 +44,8 @@ static struct fadump_mem_struct fdm;
 static const struct fadump_mem_struct *fdm_active;
 
 static DEFINE_MUTEX(fadump_mutex);
+struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES];
+int crash_mem_ranges;
 
 /* Scan the Firmware Assisted dump configuration details. */
 int __init early_init_dt_scan_fw_dump(unsigned long node,
@@ -235,6 +238,10 @@ static unsigned long get_fadump_area_size(void)
 	size += fw_dump.cpu_state_data_size;
 	size += fw_dump.hpte_region_size;
 	size += fw_dump.boot_memory_size;
+	size += sizeof(struct fadump_crash_info_header);
+	size += sizeof(struct elfhdr); /* ELF core header.*/
+	/* Program headers for crash memory regions. */
+	size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
 
 	size = PAGE_ALIGN(size);
 	return size;
@@ -300,6 +307,12 @@ int __init fadump_reserve_mem(void)
 				"for saving crash dump\n",
 				(unsigned long)(size >> 20),
 				(unsigned long)(base >> 20));
+
+		fw_dump.fadumphdr_addr =
+				fdm_active->rmr_region.destination_address +
+				fdm_active->rmr_region.source_len;
+		pr_debug("fadumphdr_addr = %p\n",
+				(void *) fw_dump.fadumphdr_addr);
 	} else {
 		/* Reserve the memory at the top of memory. */
 		size = get_fadump_area_size();
@@ -380,8 +393,210 @@ static void register_fw_dump(struct fadump_mem_struct *fdm)
 	}
 }
 
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
+{
+	struct fadump_crash_info_header *fdh;
+
+	if (!fdm_active || !fw_dump.fadumphdr_addr)
+		return -EINVAL;
+
+	/* Check if the dump data is valid. */
+	if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) ||
+			(fdm_active->rmr_region.error_flags != 0)) {
+		printk(KERN_ERR "Dump taken by platform is not valid\n");
+		return -EINVAL;
+	}
+	if (fdm_active->rmr_region.bytes_dumped !=
+			fdm_active->rmr_region.source_len) {
+		printk(KERN_ERR "Dump taken by platform is incomplete\n");
+		return -EINVAL;
+	}
+
+	/* Validate the fadump crash info header */
+	fdh = __va(fw_dump.fadumphdr_addr);
+	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+		printk(KERN_ERR "Crash info header is not valid.\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * We are done validating dump info and elfcore header is now ready
+	 * to be exported. set elfcorehdr_addr so that vmcore module will
+	 * export the elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+	return 0;
+}
+
+static inline void fadump_add_crash_memory(unsigned long long base,
+					unsigned long long end)
+{
+	if (base == end)
+		return;
+
+	pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
+		crash_mem_ranges, base, end - 1, (end - base));
+	crash_memory_ranges[crash_mem_ranges].base = base;
+	crash_memory_ranges[crash_mem_ranges].size = end - base;
+	crash_mem_ranges++;
+}
+
+static void fadump_exclude_reserved_area(unsigned long long start,
+					unsigned long long end)
+{
+	unsigned long long ra_start, ra_end;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	if ((ra_start < end) && (ra_end > start)) {
+		if ((start < ra_start) && (end > ra_end)) {
+			fadump_add_crash_memory(start, ra_start);
+			fadump_add_crash_memory(ra_end, end);
+		} else if (start < ra_start) {
+			fadump_add_crash_memory(start, ra_start);
+		} else if (ra_end < end) {
+			fadump_add_crash_memory(ra_end, end);
+		}
+	} else
+		fadump_add_crash_memory(start, end);
+}
+
+static int fadump_init_elfcore_header(char *bufp)
+{
+	struct elfhdr *elf;
+
+	elf = (struct elfhdr *) bufp;
+	bufp += sizeof(struct elfhdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELF_DATA;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type = ET_CORE;
+	elf->e_machine = ELF_ARCH;
+	elf->e_version = EV_CURRENT;
+	elf->e_entry = 0;
+	elf->e_phoff = sizeof(struct elfhdr);
+	elf->e_shoff = 0;
+	elf->e_flags = ELF_CORE_EFLAGS;
+	elf->e_ehsize = sizeof(struct elfhdr);
+	elf->e_phentsize = sizeof(struct elf_phdr);
+	elf->e_phnum = 0;
+	elf->e_shentsize = 0;
+	elf->e_shnum = 0;
+	elf->e_shstrndx = 0;
+
+	return 0;
+}
+
+/*
+ * Traverse through memblock structure and setup crash memory ranges. These
+ * ranges will be used create PT_LOAD program headers in elfcore header.
+ */
+static void fadump_setup_crash_memory_ranges(void)
+{
+	struct memblock_region *reg;
+	unsigned long long start, end;
+
+	pr_debug("Setup crash memory ranges.\n");
+	crash_mem_ranges = 0;
+	/*
+	 * add the first memory chunk (RMA_START through boot_memory_size) as
+	 * a separate memory chunk. The reason is, at the time crash firmware
+	 * will move the content of this memory chunk to different location
+	 * specified during fadump registration. We need to create a separate
+	 * program header for this chunk with the correct offset.
+	 */
+	fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size);
+
+	for_each_memblock(memory, reg) {
+		start = (unsigned long long)reg->base;
+		end = start + (unsigned long long)reg->size;
+		if (start == RMA_START && end >= fw_dump.boot_memory_size)
+			start = fw_dump.boot_memory_size;
+
+		/* add this range excluding the reserved dump area. */
+		fadump_exclude_reserved_area(start, end);
+	}
+}
+
+static int fadump_create_elfcore_headers(char *bufp)
+{
+	struct elfhdr *elf;
+	struct elf_phdr *phdr;
+	int i;
+
+	fadump_init_elfcore_header(bufp);
+	elf = (struct elfhdr *)bufp;
+	bufp += sizeof(struct elfhdr);
+
+	/* setup PT_LOAD sections. */
+
+	for (i = 0; i < crash_mem_ranges; i++) {
+		unsigned long long mbase, msize;
+		mbase = crash_memory_ranges[i].base;
+		msize = crash_memory_ranges[i].size;
+
+		if (!msize)
+			continue;
+
+		phdr = (struct elf_phdr *)bufp;
+		bufp += sizeof(struct elf_phdr);
+		phdr->p_type	= PT_LOAD;
+		phdr->p_flags	= PF_R|PF_W|PF_X;
+		phdr->p_offset	= mbase;
+
+		if (mbase == RMA_START) {
+			/*
+			 * The entire RMA region will be moved by firmware
+			 * to the specified destination_address. Hence set
+			 * the correct offset.
+			 */
+			phdr->p_offset = fdm.rmr_region.destination_address;
+		}
+
+		phdr->p_paddr = mbase;
+		phdr->p_vaddr = (unsigned long)__va(mbase);
+		phdr->p_filesz = msize;
+		phdr->p_memsz = msize;
+		phdr->p_align = 0;
+
+		/* Increment number of program headers. */
+		(elf->e_phnum)++;
+	}
+	return 0;
+}
+
+static unsigned long init_fadump_header(unsigned long addr)
+{
+	struct fadump_crash_info_header *fdh;
+
+	if (!addr)
+		return 0;
+
+	fw_dump.fadumphdr_addr = addr;
+	fdh = __va(addr);
+	addr += sizeof(struct fadump_crash_info_header);
+
+	memset(fdh, 0, sizeof(struct fadump_crash_info_header));
+	fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
+	fdh->elfcorehdr_addr = addr;
+
+	return addr;
+}
+
 static void register_fadump(void)
 {
+	unsigned long addr;
+	void *vaddr;
+
 	/*
 	 * If no memory is reserved then we can not register for firmware-
 	 * assisted dump.
@@ -389,6 +604,16 @@ static void register_fadump(void)
 	if (!fw_dump.reserve_dump_area_size)
 		return;
 
+	fadump_setup_crash_memory_ranges();
+
+	addr = fdm.rmr_region.destination_address + fdm.rmr_region.source_len;
+	/* Initialize fadump crash info header. */
+	addr = init_fadump_header(addr);
+	vaddr = __va(addr);
+
+	pr_debug("Creating ELF core headers at %#016lx\n", addr);
+	fadump_create_elfcore_headers(vaddr);
+
 	/* register the future kernel dump with firmware. */
 	register_fw_dump(&fdm);
 }
@@ -582,8 +807,14 @@ int __init setup_fadump(void)
 	}
 
 	fadump_show_config();
+	/*
+	 * If dump data is available then see if it is valid and prepare for
+	 * saving it to the disk.
+	 */
+	if (fw_dump.dump_active)
+		process_fadump(fdm_active);
 	/* Initialize the kernel dump memory structure for FAD registration. */
-	if (fw_dump.reserve_dump_area_size)
+	else if (fw_dump.reserve_dump_area_size)
 		init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
 	fadump_init_files();
 

^ permalink raw reply related

* [RFC PATCH v6 03/10] fadump: Register for firmware assisted dump.
From: Mahesh J Salgaonkar @ 2011-12-10  6:50 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

This patch registers for firmware-assisted dump using rtas token
ibm,configure-kernel-dump. During registration firmware is informed about
the reserved area where it saves the CPU state data, HPTE table and contents
of RMR region at the time of kernel crash. Apart from this, firmware also
preserves the contents of entire partition memory even if it is not specified
during registration.

This patch also populates sysfs files under /sys/kernel to display
fadump status and reserved memory regions.

This patch introduces fadump_registered sysfs control file which will be
used by kdump init scripts to start/stop firmware assisted dump. echo 1 to
/sys/kernel/fadump_registered file for fadump registration and
echo 0 to /sys/kernel/fadump_registered file for fadump un-registration.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/fadump.h |   57 ++++++
 arch/powerpc/kernel/fadump.c      |  352 +++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/iommu.c       |    8 +
 arch/powerpc/mm/hash_utils_64.c   |   11 +
 4 files changed, 424 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 7be25d3..bbaf278 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -48,6 +48,58 @@
 #define FADUMP_HPTE_REGION	0x0002
 #define FADUMP_REAL_MODE_REGION	0x0011
 
+/* Dump request flag */
+#define FADUMP_REQUEST_FLAG	0x00000001
+
+/* FAD commands */
+#define FADUMP_REGISTER		1
+#define FADUMP_UNREGISTER	2
+#define FADUMP_INVALIDATE	3
+
+/* Kernel Dump section info */
+struct fadump_section {
+	u32	request_flag;
+	u16	source_data_type;
+	u16	error_flags;
+	u64	source_address;
+	u64	source_len;
+	u64	bytes_dumped;
+	u64	destination_address;
+};
+
+/* ibm,configure-kernel-dump header. */
+struct fadump_section_header {
+	u32	dump_format_version;
+	u16	dump_num_sections;
+	u16	dump_status_flag;
+	u32	offset_first_dump_section;
+
+	/* Fields for disk dump option. */
+	u32	dd_block_size;
+	u64	dd_block_offset;
+	u64	dd_num_blocks;
+	u32	dd_offset_disk_path;
+
+	/* Maximum time allowed to prevent an automatic dump-reboot. */
+	u32	max_time_auto;
+};
+
+/*
+ * Firmware Assisted dump memory structure. This structure is required for
+ * registering future kernel dump with power firmware through rtas call.
+ *
+ * No disk dump option. Hence disk dump path string section is not included.
+ */
+struct fadump_mem_struct {
+	struct fadump_section_header	header;
+
+	/* Kernel dump sections */
+	struct fadump_section		cpu_state_data;
+	struct fadump_section		hpte_region;
+	struct fadump_section		rmr_region;
+};
+
+/* Firmware-assisted dump configuration details. */
 struct fw_dump {
 	unsigned long	cpu_state_data_size;
 	unsigned long	hpte_region_size;
@@ -62,10 +114,15 @@ struct fw_dump {
 	unsigned long	fadump_enabled:1;
 	unsigned long	fadump_supported:1;
 	unsigned long	dump_active:1;
+	unsigned long	dump_registered:1;
 };
 
 extern int early_init_dt_scan_fw_dump(unsigned long node,
 		const char *uname, int depth, void *data);
 extern int fadump_reserve_mem(void);
+extern int setup_fadump(void);
+extern int is_fadump_active(void);
+#else	/* CONFIG_FA_DUMP */
+static inline int is_fadump_active(void) { return 0; }
 #endif
 #endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index deb276a..13c22cd 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -29,6 +29,9 @@
 
 #include <linux/string.h>
 #include <linux/memblock.h>
+#include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 
 #include <asm/page.h>
 #include <asm/prom.h>
@@ -36,6 +39,10 @@
 #include <asm/fadump.h>
 
 static struct fw_dump fw_dump;
+static struct fadump_mem_struct fdm;
+static const struct fadump_mem_struct *fdm_active;
+
+static DEFINE_MUTEX(fadump_mutex);
 
 /* Scan the Firmware Assisted dump configuration details. */
 int __init early_init_dt_scan_fw_dump(unsigned long node,
@@ -64,7 +71,8 @@ int __init early_init_dt_scan_fw_dump(unsigned long node,
 	 * The 'ibm,kernel-dump' rtas node is present only if there is
 	 * dump data waiting for us.
 	 */
-	if (of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL))
+	fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
+	if (fdm_active)
 		fw_dump.dump_active = 1;
 
 	/* Get the sizes required to store dump data for the firmware provided
@@ -98,6 +106,85 @@ int __init early_init_dt_scan_fw_dump(unsigned long node,
 	return 1;
 }
 
+int is_fadump_active(void)
+{
+	return fw_dump.dump_active;
+}
+
+/* Print firmware assisted dump configurations for debugging purpose. */
+static void fadump_show_config(void)
+{
+	pr_debug("Support for firmware-assisted dump (fadump): %s\n",
+			(fw_dump.fadump_supported ? "present" : "no support"));
+
+	if (!fw_dump.fadump_supported)
+		return;
+
+	pr_debug("Fadump enabled    : %s\n",
+				(fw_dump.fadump_enabled ? "yes" : "no"));
+	pr_debug("Dump Active       : %s\n",
+				(fw_dump.dump_active ? "yes" : "no"));
+	pr_debug("Dump section sizes:\n");
+	pr_debug("    CPU state data size: %lx\n", fw_dump.cpu_state_data_size);
+	pr_debug("    HPTE region size   : %lx\n", fw_dump.hpte_region_size);
+	pr_debug("Boot memory size  : %lx\n", fw_dump.boot_memory_size);
+}
+
+static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm,
+				unsigned long addr)
+{
+	if (!fdm)
+		return 0;
+
+	memset(fdm, 0, sizeof(struct fadump_mem_struct));
+	addr = addr & PAGE_MASK;
+
+	fdm->header.dump_format_version = 0x00000001;
+	fdm->header.dump_num_sections = 3;
+	fdm->header.dump_status_flag = 0;
+	fdm->header.offset_first_dump_section =
+		(u32)offsetof(struct fadump_mem_struct, cpu_state_data);
+
+	/*
+	 * Fields for disk dump option.
+	 * We are not using disk dump option, hence set these fields to 0.
+	 */
+	fdm->header.dd_block_size = 0;
+	fdm->header.dd_block_offset = 0;
+	fdm->header.dd_num_blocks = 0;
+	fdm->header.dd_offset_disk_path = 0;
+
+	/* set 0 to disable an automatic dump-reboot. */
+	fdm->header.max_time_auto = 0;
+
+	/* Kernel dump sections */
+	/* cpu state data section. */
+	fdm->cpu_state_data.request_flag = FADUMP_REQUEST_FLAG;
+	fdm->cpu_state_data.source_data_type = FADUMP_CPU_STATE_DATA;
+	fdm->cpu_state_data.source_address = 0;
+	fdm->cpu_state_data.source_len = fw_dump.cpu_state_data_size;
+	fdm->cpu_state_data.destination_address = addr;
+	addr += fw_dump.cpu_state_data_size;
+
+	/* hpte region section */
+	fdm->hpte_region.request_flag = FADUMP_REQUEST_FLAG;
+	fdm->hpte_region.source_data_type = FADUMP_HPTE_REGION;
+	fdm->hpte_region.source_address = 0;
+	fdm->hpte_region.source_len = fw_dump.hpte_region_size;
+	fdm->hpte_region.destination_address = addr;
+	addr += fw_dump.hpte_region_size;
+
+	/* RMA region section */
+	fdm->rmr_region.request_flag = FADUMP_REQUEST_FLAG;
+	fdm->rmr_region.source_data_type = FADUMP_REAL_MODE_REGION;
+	fdm->rmr_region.source_address = RMA_START;
+	fdm->rmr_region.source_len = fw_dump.boot_memory_size;
+	fdm->rmr_region.destination_address = addr;
+	addr += fw_dump.boot_memory_size;
+
+	return addr;
+}
+
 /**
  * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM
  *
@@ -166,8 +253,15 @@ int __init fadump_reserve_mem(void)
 		fw_dump.fadump_enabled = 0;
 		return 0;
 	}
-	/* Initialize boot memory size */
-	fw_dump.boot_memory_size = fadump_calculate_reserve_size();
+	/*
+	 * Initialize boot memory size
+	 * If dump is active then we have already calculated the size during
+	 * first kernel.
+	 */
+	if (fdm_active)
+		fw_dump.boot_memory_size = fdm_active->rmr_region.source_len;
+	else
+		fw_dump.boot_memory_size = fadump_calculate_reserve_size();
 
 	/*
 	 * Calculate the memory boundary.
@@ -244,3 +338,255 @@ static int __init early_fadump_reserve_mem(char *p)
 	return 0;
 }
 early_param("fadump_reserve_mem", early_fadump_reserve_mem);
+
+static void register_fw_dump(struct fadump_mem_struct *fdm)
+{
+	int rc;
+	unsigned int wait_time;
+
+	pr_debug("Registering for firmware-assisted kernel dump...\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_REGISTER, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+
+	} while (wait_time);
+
+	switch (rc) {
+	case -1:
+		printk(KERN_ERR "Failed to register firmware-assisted kernel"
+			" dump. Hardware Error(%d).\n", rc);
+		break;
+	case -3:
+		printk(KERN_ERR "Failed to register firmware-assisted kernel"
+			" dump. Parameter Error(%d).\n", rc);
+		break;
+	case -9:
+		printk(KERN_ERR "firmware-assisted kernel dump is already "
+			" registered.");
+		fw_dump.dump_registered = 1;
+		break;
+	case 0:
+		printk(KERN_INFO "firmware-assisted kernel dump registration"
+			" is successful\n");
+		fw_dump.dump_registered = 1;
+		break;
+	}
+}
+
+static void register_fadump(void)
+{
+	/*
+	 * If no memory is reserved then we can not register for firmware-
+	 * assisted dump.
+	 */
+	if (!fw_dump.reserve_dump_area_size)
+		return;
+
+	/* register the future kernel dump with firmware. */
+	register_fw_dump(&fdm);
+}
+
+static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
+{
+	int rc = 0;
+	unsigned int wait_time;
+
+	pr_debug("Un-register firmware-assisted dump\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_UNREGISTER, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		printk(KERN_ERR "Failed to un-register firmware-assisted dump."
+			" unexpected error(%d).\n", rc);
+		return rc;
+	}
+	fw_dump.dump_registered = 0;
+	return 0;
+}
+
+static ssize_t fadump_enabled_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	return sprintf(buf, "%d\n", fw_dump.fadump_enabled);
+}
+
+static ssize_t fadump_register_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	return sprintf(buf, "%d\n", fw_dump.dump_registered);
+}
+
+static ssize_t fadump_register_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t count)
+{
+	int ret = 0;
+
+	if (!fw_dump.fadump_enabled || fdm_active)
+		return -EPERM;
+
+	mutex_lock(&fadump_mutex);
+
+	switch (buf[0]) {
+	case '0':
+		if (fw_dump.dump_registered == 0) {
+			ret = -EINVAL;
+			goto unlock_out;
+		}
+		/* Un-register Firmware-assisted dump */
+		fadump_unregister_dump(&fdm);
+		break;
+	case '1':
+		if (fw_dump.dump_registered == 1) {
+			ret = -EINVAL;
+			goto unlock_out;
+		}
+		/* Register Firmware-assisted dump */
+		register_fadump();
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+unlock_out:
+	mutex_unlock(&fadump_mutex);
+	return ret < 0 ? ret : count;
+}
+
+static int fadump_region_show(struct seq_file *m, void *private)
+{
+	const struct fadump_mem_struct *fdm_ptr;
+
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	if (fdm_active)
+		fdm_ptr = fdm_active;
+	else
+		fdm_ptr = &fdm;
+
+	seq_printf(m,
+			"CPU : [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			fdm_ptr->cpu_state_data.destination_address,
+			fdm_ptr->cpu_state_data.destination_address +
+			fdm_ptr->cpu_state_data.source_len - 1,
+			fdm_ptr->cpu_state_data.source_len,
+			fdm_ptr->cpu_state_data.bytes_dumped);
+	seq_printf(m,
+			"HPTE: [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			fdm_ptr->hpte_region.destination_address,
+			fdm_ptr->hpte_region.destination_address +
+			fdm_ptr->hpte_region.source_len - 1,
+			fdm_ptr->hpte_region.source_len,
+			fdm_ptr->hpte_region.bytes_dumped);
+	seq_printf(m,
+			"DUMP: [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			fdm_ptr->rmr_region.destination_address,
+			fdm_ptr->rmr_region.destination_address +
+			fdm_ptr->rmr_region.source_len - 1,
+			fdm_ptr->rmr_region.source_len,
+			fdm_ptr->rmr_region.bytes_dumped);
+
+	if (!fdm_active ||
+		(fw_dump.reserve_dump_area_start ==
+		fdm_ptr->cpu_state_data.destination_address))
+		return 0;
+
+	/* Dump is active. Show reserved memory region. */
+	seq_printf(m,
+			"    : [%#016llx-%#016llx] %#llx bytes, "
+			"Dumped: %#llx\n",
+			(unsigned long long)fw_dump.reserve_dump_area_start,
+			fdm_ptr->cpu_state_data.destination_address - 1,
+			fdm_ptr->cpu_state_data.destination_address -
+			fw_dump.reserve_dump_area_start,
+			fdm_ptr->cpu_state_data.destination_address -
+			fw_dump.reserve_dump_area_start);
+	return 0;
+}
+
+static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled,
+						0444, fadump_enabled_show,
+						NULL);
+static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered,
+						0644, fadump_register_show,
+						fadump_register_store);
+
+static int fadump_region_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, fadump_region_show, inode->i_private);
+}
+
+static const struct file_operations fadump_region_fops = {
+	.open    = fadump_region_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+static void fadump_init_files(void)
+{
+	struct dentry *debugfs_file;
+	int rc = 0;
+
+	rc = sysfs_create_file(kernel_kobj, &fadump_attr.attr);
+	if (rc)
+		printk(KERN_ERR "fadump: unable to create sysfs file"
+			" fadump_enabled (%d)\n", rc);
+
+	rc = sysfs_create_file(kernel_kobj, &fadump_register_attr.attr);
+	if (rc)
+		printk(KERN_ERR "fadump: unable to create sysfs file"
+			" fadump_registered (%d)\n", rc);
+
+	debugfs_file = debugfs_create_file("fadump_region", 0444,
+					powerpc_debugfs_root, NULL,
+					&fadump_region_fops);
+	if (!debugfs_file)
+		printk(KERN_ERR "fadump: unable to create debugfs file"
+				" fadump_region\n");
+	return;
+}
+
+/*
+ * Prepare for firmware-assisted dump.
+ */
+int __init setup_fadump(void)
+{
+	if (!fw_dump.fadump_supported) {
+		printk(KERN_ERR "Firmware-assisted dump is not supported on"
+			" this hardware\n");
+		return 0;
+	}
+
+	fadump_show_config();
+	/* Initialize the kernel dump memory structure for FAD registration. */
+	if (fw_dump.reserve_dump_area_size)
+		init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+	fadump_init_files();
+
+	return 1;
+}
+subsys_initcall(setup_fadump);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0cfcf98..359f078 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -39,6 +39,7 @@
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
 #include <asm/kdump.h>
+#include <asm/fadump.h>
 
 #define DBG(...)
 
@@ -445,7 +446,12 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 
 static void iommu_table_clear(struct iommu_table *tbl)
 {
-	if (!is_kdump_kernel()) {
+	/*
+	 * In case of firmware assisted dump system goes through clean
+	 * reboot process at the time of system crash. Hence it's safe to
+	 * clear the TCE entries if firmware assisted dump is active.
+	 */
+	if (!is_kdump_kernel() || is_fadump_active()) {
 		/* Clear the table in case firmware left allocations in it */
 		ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
 		return;
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 2d28218..b534bba 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -55,6 +55,7 @@
 #include <asm/spu.h>
 #include <asm/udbg.h>
 #include <asm/code-patching.h>
+#include <asm/fadump.h>
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
@@ -625,6 +626,16 @@ static void __init htab_initialize(void)
 		/* Using a hypervisor which owns the htab */
 		htab_address = NULL;
 		_SDR1 = 0; 
+#ifdef CONFIG_FA_DUMP
+		/*
+		 * If firmware assisted dump is active firmware preserves
+		 * the contents of htab along with entire partition memory.
+		 * Clear the htab if firmware assisted dump is active so
+		 * that we dont end up using old mappings.
+		 */
+		if (is_fadump_active() && ppc_md.hpte_clear_all)
+			ppc_md.hpte_clear_all();
+#endif
 	} else {
 		/* Find storage for the HPT.  Must be contiguous in
 		 * the absolute address space. On cell we want it to be

^ permalink raw reply related

* [RFC PATCH v6 01/10] fadump: Add documentation for firmware-assisted dump.
From: Mahesh J Salgaonkar @ 2011-12-10  6:50 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal
In-Reply-To: <20111210064301.10195.3344.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Documentation for firmware-assisted dump. This document is based on the
original documentation written for phyp assisted dump by Linas Vepstas
and Manish Ahuja, with few changes to reflect the current implementation.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 Documentation/powerpc/firmware-assisted-dump.txt |  243 ++++++++++++++++++++++
 1 files changed, 243 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/powerpc/firmware-assisted-dump.txt

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt
new file mode 100644
index 0000000..f77a44e
--- /dev/null
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -0,0 +1,243 @@
+
+                   Firmware-Assisted Dump
+                   ------------------------
+                       July 2011
+
+The goal of firmware-assisted dump is to enable the dump of
+a crashed system, and to do so from a fully-reset system, and
+to minimize the total elapsed time until the system is back
+in production use.
+
+Comparing with kdump or other strategies, firmware-assisted
+dump offers several strong, practical advantages:
+
+-- Unlike kdump, the system has been reset, and loaded
+   with a fresh copy of the kernel.  In particular,
+   PCI and I/O devices have been reinitialized and are
+   in a clean, consistent state.
+-- Once the dump is copied out, the memory that held the dump
+   is immediately available to the running kernel. A further
+   reboot isn't required.
+
+The above can only be accomplished by coordination with,
+and assistance from the Power firmware. The procedure is
+as follows:
+
+-- The first kernel registers the sections of memory with the
+   Power firmware for dump preservation during OS initialization.
+   These registered sections of memory are reserved by the first
+   kernel during early boot.
+
+-- When a system crashes, the Power firmware will save
+   the low memory (boot memory of size larger of 5% of system RAM
+   or 256MB) of RAM to the previous registered region. It will
+   also save system registers, and hardware PTE's.
+
+   NOTE: The term 'boot memory' means size of the low memory chunk
+         that is required for a kernel to boot successfully when
+         booted with restricted memory. By default, the boot memory
+         size will be the larger of 5% of system RAM or 256MB.
+         Alternatively, user can also specify boot memory size
+         through boot parameter 'fadump_reserve_mem=' which will
+         override the default calculated size.
+
+-- After the low memory (boot memory) area has been saved, the
+   firmware will reset PCI and other hardware state.  It will
+   *not* clear the RAM. It will then launch the bootloader, as
+   normal.
+
+-- The freshly booted kernel will notice that there is a new
+   node (ibm,dump-kernel) in the device tree, indicating that
+   there is crash data available from a previous boot. During
+   the early boot OS will reserve rest of the memory above
+   boot memory size effectively booting with restricted memory
+   size. This will make sure that the second kernel will not
+   touch any of the dump memory area.
+
+-- User-space tools will read /proc/vmcore to obtain the contents
+   of memory, which holds the previous crashed kernel dump in ELF
+   format. The userspace tools may copy this info to disk, or
+   network, nas, san, iscsi, etc. as desired.
+
+-- Once the userspace tool is done saving dump, it will echo
+   '1' to /sys/kernel/fadump_release_mem to release the reserved
+   memory back to general use, except the memory required for
+   next firmware-assisted dump registration.
+
+   e.g.
+     # echo 1 > /sys/kernel/fadump_release_mem
+
+Please note that the firmware-assisted dump feature
+is only available on Power6 and above systems with recent
+firmware versions.
+
+Implementation details:
+----------------------
+
+During boot, a check is made to see if firmware supports
+this feature on that particular machine. If it does, then
+we check to see if an active dump is waiting for us. If yes
+then everything but boot memory size of RAM is reserved during
+early boot (See Fig. 2). This area is released once we finish
+collecting the dump from user land scripts (e.g. kdump scripts)
+that are run. If there is dump data, then the
+/sys/kernel/fadump_release_mem file is created, and the reserved
+memory is held.
+
+If there is no waiting dump data, then only the memory required
+to hold CPU state, HPTE region, boot memory dump and elfcore
+header, is reserved at the top of memory (see Fig. 1). This area
+is *not* released: this region will be kept permanently reserved,
+so that it can act as a receptacle for a copy of the boot memory
+content in addition to CPU state and HPTE region, in the case a
+crash does occur.
+
+  o Memory Reservation during first kernel
+
+  Low memory                                        Top of memory
+  0      boot memory size                                       |
+  |           |                       |<--Reserved dump area -->|
+  V           V                       |   Permanent Reservation V
+  +-----------+----------/ /----------+---+----+-----------+----+
+  |           |                       |CPU|HPTE|  DUMP     |ELF |
+  +-----------+----------/ /----------+---+----+-----------+----+
+        |                                           ^
+        |                                           |
+        \                                           /
+         -------------------------------------------
+          Boot memory content gets transferred to
+          reserved area by firmware at the time of
+          crash
+                   Fig. 1
+
+  o Memory Reservation during second kernel after crash
+
+  Low memory                                        Top of memory
+  0      boot memory size                                       |
+  |           |<------------- Reserved dump area ----------- -->|
+  V           V                                                 V
+  +-----------+----------/ /----------+---+----+-----------+----+
+  |           |                       |CPU|HPTE|  DUMP     |ELF |
+  +-----------+----------/ /----------+---+----+-----------+----+
+        |                                                    |
+        V                                                    V
+   Used by second                                    /proc/vmcore
+   kernel to boot
+                   Fig. 2
+
+Currently the dump will be copied from /proc/vmcore to a
+a new file upon user intervention. The dump data available through
+/proc/vmcore will be in ELF format. Hence the existing kdump
+infrastructure (kdump scripts) to save the dump works fine with
+minor modifications.
+
+The tools to examine the dump will be same as the ones
+used for kdump.
+
+How to enable firmware-assisted dump (fadump):
+-------------------------------------
+
+1. Set config option CONFIG_FA_DUMP=y and build kernel.
+2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
+3. Optionally, user can also set 'fadump_reserve_mem=' kernel cmdline
+   to specify size of the memory to reserve for boot memory dump
+   preservation.
+
+NOTE: If firmware-assisted dump fails to reserve memory then it will
+   fallback to existing kdump mechanism if 'crashkernel=' option
+   is set at kernel cmdline.
+
+Sysfs/debugfs files:
+------------
+
+Firmware-assisted dump feature uses sysfs file system to hold
+the control files and debugfs file to display memory reserved region.
+
+Here is the list of files under kernel sysfs:
+
+ /sys/kernel/fadump_enabled
+
+    This is used to display the fadump status.
+    0 = fadump is disabled
+    1 = fadump is enabled
+
+ /sys/kernel/fadump_registered
+
+    This is used to display the fadump registration status as well
+    as to control (start/stop) the fadump registration.
+    0 = fadump is not registered.
+    1 = fadump is registered and ready to handle system crash.
+
+    To register fadump echo 1 > /sys/kernel/fadump_registered and
+    echo 0 > /sys/kernel/fadump_registered for un-register and stop the
+    fadump. Once the fadump is un-registered, the system crash will not
+    be handled and vmcore will not be captured.
+
+ /sys/kernel/fadump_release_mem
+
+    This file is available only when fadump is active during
+    second kernel. This is used to release the reserved memory
+    region that are held for saving crash dump. To release the
+    reserved memory echo 1 to it:
+
+    echo 1  > /sys/kernel/fadump_release_mem
+
+    After echo 1, the content of the /sys/kernel/debug/powerpc/fadump_region
+    file will change to reflect the new memory reservations.
+
+Here is the list of files under powerpc debugfs:
+(Assuming debugfs is mounted on /sys/kernel/debug directory.)
+
+ /sys/kernel/debug/powerpc/fadump_region
+
+    This file shows the reserved memory regions if fadump is
+    enabled otherwise this file is empty. The output format
+    is:
+    <region>: [<start>-<end>] <reserved-size> bytes, Dumped: <dump-size>
+
+    e.g.
+    Contents when fadump is registered during first kernel
+
+    # cat /sys/kernel/debug/powerpc/fadump_region
+    CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0
+    HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0
+    DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0
+
+    Contents when fadump is active during second kernel
+
+    # cat /sys/kernel/debug/powerpc/fadump_region
+    CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020
+    HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x1000
+    DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000
+        : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000
+
+NOTE: Please refer to Documentation/filesystems/debugfs.txt on
+      how to mount the debugfs filesystem.
+
+
+TODO:
+-----
+ o Need to come up with the better approach to find out more
+   accurate boot memory size that is required for a kernel to
+   boot successfully when booted with restricted memory.
+ o The fadump implementation introduces a fadump crash info structure
+   in the scratch area before the ELF core header. The idea of introducing
+   this structure is to pass some important crash info data to the second
+   kernel which will help second kernel to populate ELF core header with
+   correct data before it gets exported through /proc/vmcore. The current
+   design implementation does not address a possibility of introducing
+   additional fields (in future) to this structure without affecting
+   compatibility. Need to come up with the better approach to address this.
+   The possible approaches are:
+	1. Introduce version field for version tracking, bump up the version
+	whenever a new field is added to the structure in future. The version
+	field can be used to find out what fields are valid for the current
+	version of the structure.
+	2. Reserve the area of predefined size (say PAGE_SIZE) for this
+	structure and have unused area as reserved (initialized to zero)
+	for future field additions.
+   The advantage of approach 1 over 2 is we don't need to reserve extra space.
+---
+Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+This document is based on the original documentation written for phyp
+assisted dump by Linas Vepstas and Manish Ahuja.

^ permalink raw reply related

* [RFC PATCH v6 00/10] fadump: Firmware-assisted dump support for Powerpc.
From: Mahesh J Salgaonkar @ 2011-12-10  6:49 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Anton Blanchard, Amerigo Wang, Kexec-ml, Milton Miller,
	Randy Dunlap, Eric W. Biederman, Vivek Goyal

Hi All,

Please find the version 6 of the patchset that implements firmware-assisted
dump mechanism to capture kernel crash dump for Powerpc architecture. The
firmware-assisted dump is a robust mechanism to get reliable kernel crash
dump with assistance from firmware. This approach does not use kexec, instead
firmware assists in booting the kdump kernel while preserving memory contents.

These patches cleanly apply on commit dc47ce90c3 in linux-2.6 git tree.

Change in v6:
-------------
- Use of_read_number and of_read_ulong while reading the dump sizes
  from rtas node ibm,configure-kernel-kdump-sizes and few minor changes.
- Kernel command line option 'fadump' now uses on/off values to
  enable/disable fadump.
- Added the last patch in this series 10/10 to remove phyp dump code.

Change in v5:
-------------
- Added 'fadump_' prefix to all static functions defined.

patch 02/10:
- Merged patch 10/10 which introduces a config option CONFIG_FA_DUMP
  for firmware assisted dump feature on Powerpc (ppc64) architecture.
- Increased MIN_BOOT_MEM by 64M to avoid OOM issue during network
  dump capture. When kdump infrastructure is configured to save vmcore
  over network, we run into OOM issue while loading modules related to
  network setup.

Changes in v4:
--------------
patch 04/10:
- Move the init_elfcore_header() function and 'memblock_num_regions' macro
 from generic code to power specific code as these are used only by
 firmware assisted dump implementation which is power specific feature.

patch 05/10:
- Fixes a issue where memblock_free() is invoked from build_cpu_notes()
  function during error_out path. Invoke cpu_notes_buf_free() in error_out
  path instead of memblock_free().

Changes in v3:
-------------
- Re-factored the implementation to work with kdump service start/stop.
  Introduce fadump_registered sysfs control file which will be used by
  kdump init scripts to start/stop firmware assisted dump. echo 1 to
  /sys/kernel/fadump_registered file for fadump registration and
  echo 0 to /sys/kernel/fadump_registered file for fadump un-registration.
- Introduced the locking mechanism to handle simultaneous writes to
  sysfs control files fadump_registered and fadump_release_mem

  Affected patches are: 01/10, 03/10, 08/10.

Changes in v2:
-------------
patch 01/10:
- Modified the documentation to reflect the change of fadump_region
  file under debugfs filesystem.

patch 02/10:
- Modified to use standard pr_debug() macro.
- Modified early_init_dt_scan_fw_dump() to get the size of
  "ibm,configure-kernel-dump-sizes" property and use it to iterate through
  an array of dump sections.
- Introduced boot option 'fadump_reserve_mem=' to let user specify the
  fadump boot memory to be reserved.

patch 03/10:
- Removed few debug print statements.
- Moved the setup_fadump() call from setup_system() and now calling it
  subsys_initcall.
- Moved fadump_region attribute under debugfs.
- Clear the TCE entries if firmware assisted dump is active.

patch 05/10:
- Moved the crash_fadump() invocation from generic code to panic notifier.
- Introduced cpu_notes_buf_alloc() function to allocate cpu notes buffer
  using get_free_pages().

patch 08/10:
- Introduced cpu_notes_buf_free() function to free memory allocated for
  cpu notes buffer.

The most of the code implementation has been adapted from phyp assisted dump
implementation written by Linas Vepstas and Manish Ahuja.

The first patch is a documentation that talks about firmware-assisted dump
mechanism, implementation details and TODO list.

I have tested the patches on following system configuration:
1. LPAR on Power6 with 4GB RAM and 8 CPUs
2. LPAR on Power7 with 2GB RAM and 20 CPUs
3. LPAR on Power7 with 1TB RAM and 896 CPUs

Please review the patchset and let me know your comments.

Thanks,
-Mahesh.
---

Mahesh Salgaonkar (10):
      fadump: Add documentation for firmware-assisted dump.
      fadump: Reserve the memory for firmware assisted dump.
      fadump: Register for firmware assisted dump.
      fadump: Initialize elfcore header and add PT_LOAD program headers.
      fadump: Convert firmware-assisted cpu state dump data into elf notes.
      fadump: Add PT_NOTE program header for vmcoreinfo
      fadump: Introduce cleanup routine to invalidate /proc/vmcore.
      fadump: Invalidate registration and release reserved memory for general use.
      fadump: Invalidate the fadump registration during machine shutdown.
      fadump: Remove the phyp assisted dump code.


 Documentation/powerpc/firmware-assisted-dump.txt |  243 ++++
 Documentation/powerpc/phyp-assisted-dump.txt     |  127 --
 arch/powerpc/Kconfig                             |   17 
 arch/powerpc/include/asm/fadump.h                |  218 ++++
 arch/powerpc/include/asm/phyp_dump.h             |   47 -
 arch/powerpc/kernel/Makefile                     |    1 
 arch/powerpc/kernel/fadump.c                     | 1312 ++++++++++++++++++++++
 arch/powerpc/kernel/iommu.c                      |    8 
 arch/powerpc/kernel/prom.c                       |   98 --
 arch/powerpc/kernel/setup-common.c               |   14 
 arch/powerpc/kernel/traps.c                      |    3 
 arch/powerpc/mm/hash_utils_64.c                  |   11 
 arch/powerpc/platforms/pseries/Makefile          |    1 
 arch/powerpc/platforms/pseries/phyp_dump.c       |  513 ---------
 fs/proc/vmcore.c                                 |   23 
 15 files changed, 1854 insertions(+), 782 deletions(-)
 create mode 100644 Documentation/powerpc/firmware-assisted-dump.txt
 delete mode 100644 Documentation/powerpc/phyp-assisted-dump.txt
 create mode 100644 arch/powerpc/include/asm/fadump.h
 delete mode 100644 arch/powerpc/include/asm/phyp_dump.h
 create mode 100644 arch/powerpc/kernel/fadump.c
 delete mode 100644 arch/powerpc/platforms/pseries/phyp_dump.c

--
-Mahesh

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox