public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Andrea Arcangeli <andrea@suse.de>
To: Ben LaHaise <bcrl@redhat.com>
Cc: Linus Torvalds <torvalds@transmeta.com>,
	Alan Cox <alan@lxorguk.ukuu.org.uk>,
	Rik van Riel <riel@conectiva.com.br>,
	linux-kernel@vger.kernel.org
Subject: Re: [with-PATCH-really] highmem deadlock removal, balancing & cleanup
Date: Sat, 26 May 2001 02:42:30 +0200	[thread overview]
Message-ID: <20010526024230.K9634@athlon.random> (raw)
In-Reply-To: <Pine.LNX.4.31.0105251700350.15549-100000@penguin.transmeta.com> <Pine.LNX.4.33.0105252007020.3806-100000@toomuch.toronto.redhat.com>
In-Reply-To: <Pine.LNX.4.33.0105252007020.3806-100000@toomuch.toronto.redhat.com>; from bcrl@redhat.com on Fri, May 25, 2001 at 08:29:38PM -0400

On Fri, May 25, 2001 at 08:29:38PM -0400, Ben LaHaise wrote:
> amount of bounce buffers to guarantee progress while submitting io.  The
> -ac kernels have a patch from Ingo that provides private pools for bounce
> buffers and buffer_heads.  I went a step further and have a memory
> reservation patch that provides for memory pools being reserved against a
> particular zone.  This is needed to prevent the starvation that irq
> allocations can cause.
> 
> Some of these cleanups are 2.5 fodder, but we really need something in 2.4
> right now, so...

Please merge this one in 2.4 for now (originally from Ingo, I only
improved it). This is a real, definitive fix, and there's no nicer way to
handle this unless you want to generalize an API that lets people create
private anti-deadlock ("make sure to always make progress") memory
pools:

diff -urN 2.4.4/mm/highmem.c highmem-deadlock/mm/highmem.c
--- 2.4.4/mm/highmem.c	Sat Apr 28 05:24:48 2001
+++ highmem-deadlock/mm/highmem.c	Sat Apr 28 18:21:24 2001
@@ -159,6 +159,19 @@
 	spin_unlock(&kmap_lock);
 }
 
+#define POOL_SIZE 32
+
+/*
+ * This lock gets no contention at all, normally.
+ */
+static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED;
+
+int nr_emergency_pages;
+static LIST_HEAD(emergency_pages);
+
+int nr_emergency_bhs;
+static LIST_HEAD(emergency_bhs);
+
 /*
  * Simple bounce buffer support for highmem pages.
  * This will be moved to the block layer in 2.5.
@@ -203,17 +216,72 @@
 
 static inline void bounce_end_io (struct buffer_head *bh, int uptodate)
 {
+	struct page *page;
 	struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
+	unsigned long flags;
 
 	bh_orig->b_end_io(bh_orig, uptodate);
-	__free_page(bh->b_page);
+
+	page = bh->b_page;
+
+	spin_lock_irqsave(&emergency_lock, flags);
+	if (nr_emergency_pages >= POOL_SIZE)
+		__free_page(page);
+	else {
+		/*
+		 * We are abusing page->list to manage
+		 * the highmem emergency pool:
+		 */
+		list_add(&page->list, &emergency_pages);
+		nr_emergency_pages++;
+	}
+	
+	if (nr_emergency_bhs >= POOL_SIZE) {
 #ifdef HIGHMEM_DEBUG
-	/* Don't clobber the constructed slab cache */
-	init_waitqueue_head(&bh->b_wait);
+		/* Don't clobber the constructed slab cache */
+		init_waitqueue_head(&bh->b_wait);
 #endif
-	kmem_cache_free(bh_cachep, bh);
+		kmem_cache_free(bh_cachep, bh);
+	} else {
+		/*
+		 * Ditto in the bh case, here we abuse b_inode_buffers:
+		 */
+		list_add(&bh->b_inode_buffers, &emergency_bhs);
+		nr_emergency_bhs++;
+	}
+	spin_unlock_irqrestore(&emergency_lock, flags);
 }
 
+static __init int init_emergency_pool(void)
+{
+	spin_lock_irq(&emergency_lock);
+	while (nr_emergency_pages < POOL_SIZE) {
+		struct page * page = alloc_page(GFP_ATOMIC);
+		if (!page) {
+			printk("couldn't refill highmem emergency pages");
+			break;
+		}
+		list_add(&page->list, &emergency_pages);
+		nr_emergency_pages++;
+	}
+	while (nr_emergency_bhs < POOL_SIZE) {
+		struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
+		if (!bh) {
+			printk("couldn't refill highmem emergency bhs");
+			break;
+		}
+		list_add(&bh->b_inode_buffers, &emergency_bhs);
+		nr_emergency_bhs++;
+	}
+	spin_unlock_irq(&emergency_lock);
+	printk("allocated %d pages and %d bhs reserved for the highmem bounces\n",
+	       nr_emergency_pages, nr_emergency_bhs);
+
+	return 0;
+}
+
+__initcall(init_emergency_pool);
+
 static void bounce_end_io_write (struct buffer_head *bh, int uptodate)
 {
 	bounce_end_io(bh, uptodate);
@@ -228,6 +296,82 @@
 	bounce_end_io(bh, uptodate);
 }
 
+struct page *alloc_bounce_page (void)
+{
+	struct list_head *tmp;
+	struct page *page;
+
+repeat_alloc:
+	page = alloc_page(GFP_BUFFER);
+	if (page)
+		return page;
+	/*
+	 * No luck. First, kick the VM so it doesnt idle around while
+	 * we are using up our emergency rations.
+	 */
+	wakeup_bdflush(0);
+
+	/*
+	 * Try to allocate from the emergency pool.
+	 */
+	tmp = &emergency_pages;
+	spin_lock_irq(&emergency_lock);
+	if (!list_empty(tmp)) {
+		page = list_entry(tmp->next, struct page, list);
+		list_del(tmp->next);
+		nr_emergency_pages--;
+	}
+	spin_unlock_irq(&emergency_lock);
+	if (page)
+		return page;
+
+	/* we need to wait I/O completion */
+	run_task_queue(&tq_disk);
+
+	current->policy |= SCHED_YIELD;
+	__set_current_state(TASK_RUNNING);
+	schedule();
+	goto repeat_alloc;
+}
+
+struct buffer_head *alloc_bounce_bh (void)
+{
+	struct list_head *tmp;
+	struct buffer_head *bh;
+
+repeat_alloc:
+	bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
+	if (bh)
+		return bh;
+	/*
+	 * No luck. First, kick the VM so it doesnt idle around while
+	 * we are using up our emergency rations.
+	 */
+	wakeup_bdflush(0);
+
+	/*
+	 * Try to allocate from the emergency pool.
+	 */
+	tmp = &emergency_bhs;
+	spin_lock_irq(&emergency_lock);
+	if (!list_empty(tmp)) {
+		bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
+		list_del(tmp->next);
+		nr_emergency_bhs--;
+	}
+	spin_unlock_irq(&emergency_lock);
+	if (bh)
+		return bh;
+
+	/* we need to wait I/O completion */
+	run_task_queue(&tq_disk);
+
+	current->policy |= SCHED_YIELD;
+	__set_current_state(TASK_RUNNING);
+	schedule();
+	goto repeat_alloc;
+}
+
 struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
 {
 	struct page *page;
@@ -236,24 +380,15 @@
 	if (!PageHighMem(bh_orig->b_page))
 		return bh_orig;
 
-repeat_bh:
-	bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
-	if (!bh) {
-		wakeup_bdflush(1);  /* Sets task->state to TASK_RUNNING */
-		goto repeat_bh;
-	}
+	bh = alloc_bounce_bh();
 	/*
 	 * This is wasteful for 1k buffers, but this is a stopgap measure
 	 * and we are being ineffective anyway. This approach simplifies
 	 * things immensly. On boxes with more than 4GB RAM this should
 	 * not be an issue anyway.
 	 */
-repeat_page:
-	page = alloc_page(GFP_BUFFER);
-	if (!page) {
-		wakeup_bdflush(1);  /* Sets task->state to TASK_RUNNING */
-		goto repeat_page;
-	}
+	page = alloc_bounce_page();
+
 	set_bh_page(bh, page, 0);
 
 	bh->b_next = NULL;


And this one as well, to avoid tight loops in getblk without reschedules
in between when the normal zone is empty:

diff -urN 2.4.4pre1/fs/buffer.c 2.4.4pre1-blkdev/fs/buffer.c
--- 2.4.4pre1/fs/buffer.c	Sun Apr  1 01:17:30 2001
+++ 2.4.4pre1-blkdev/fs/buffer.c	Mon Apr  9 15:37:20 2001
@@ -628,7 +622,7 @@
    to do in order to release the ramdisk memory is to destroy dirty buffers.
 
    These are two special cases. Normal usage imply the device driver
-   to issue a sync on the device (without waiting I/O completation) and
+   to issue a sync on the device (without waiting I/O completion) and
    then an invalidate_buffers call that doesn't trash dirty buffers. */
 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 {
@@ -762,7 +756,12 @@
 	balance_dirty(NODEV);
 	if (free_shortage())
 		page_launder(GFP_BUFFER, 0);
-	grow_buffers(size);
+	if (!grow_buffers(size)) {
+		wakeup_bdflush(1);
+		current->policy |= SCHED_YIELD;
+		__set_current_state(TASK_RUNNING);
+		schedule();
+	}
 }
 
 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
@@ -1027,12 +1026,13 @@
 	write_unlock(&hash_table_lock);
 	spin_unlock(&lru_list_lock);
 	refill_freelist(size);
+	/* FIXME: getblk should fail if there's no enough memory */
 	goto repeat;
 }
 
 /* -1 -> no need to flush
     0 -> async flush
-    1 -> sync flush (wait for I/O completation) */
+    1 -> sync flush (wait for I/O completion) */
 int balance_dirty_state(kdev_t dev)
 {
 	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
@@ -1431,6 +1431,7 @@
 {
 	struct buffer_head *bh, *head, *tail;
 
+	/* FIXME: create_buffers should fail if there's no enough memory */
 	head = create_buffers(page, blocksize, 1);
 	if (page->buffers)
 		BUG();
@@ -2367,11 +2368,9 @@
 	spin_lock(&free_list[index].lock);
 	tmp = bh;
 	do {
-		struct buffer_head *p = tmp;
-
-		tmp = tmp->b_this_page;
-		if (buffer_busy(p))
+		if (buffer_busy(tmp))
 			goto busy_buffer_page;
+		tmp = tmp->b_this_page;
 	} while (tmp != bh);
 
 	spin_lock(&unused_list_lock);

Andrea

  parent reply	other threads:[~2001-05-26  0:43 UTC|newest]

Thread overview: 64+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2001-05-25 20:00 [with-PATCH-really] highmem deadlock removal, balancing & cleanup Rik van Riel
2001-05-25 21:12 ` Linus Torvalds
2001-05-25 22:20   ` Rik van Riel
2001-05-25 23:05     ` Linus Torvalds
2001-05-25 23:13     ` Alan Cox
2001-05-25 23:19       ` Rik van Riel
2001-05-26  0:02       ` Linus Torvalds
2001-05-26  0:07         ` Rik van Riel
2001-05-26  0:16           ` Linus Torvalds
2001-05-26  0:23           ` Linus Torvalds
2001-05-26  0:26             ` Rik van Riel
2001-05-26  0:30               ` Linus Torvalds
2001-05-26  0:29         ` Ben LaHaise
2001-05-26  0:34           ` Linus Torvalds
2001-05-26  0:38             ` Rik van Riel
2001-05-26  1:28             ` Linux-2.4.5 Linus Torvalds
2001-05-26  1:35               ` Linux-2.4.5 Rik van Riel
2001-05-26  1:39               ` Linux-2.4.5 Ben LaHaise
2001-05-26  1:59                 ` Linux-2.4.5 Andrea Arcangeli
2001-05-26  2:11                   ` Linux-2.4.5 Ben LaHaise
2001-05-26  2:38                     ` Linux-2.4.5 Andrea Arcangeli
2001-05-26  2:49                       ` Linux-2.4.5 Ben LaHaise
2001-05-26  3:11                         ` Linux-2.4.5 Andrea Arcangeli
2001-05-26  4:22                           ` Linux-2.4.5 Linus Torvalds
2001-05-26  4:31                             ` Linux-2.4.5 Rik van Riel
2001-05-26  8:10                               ` Linux-2.4.5 Linus Torvalds
2001-05-26  9:01                                 ` Linux-2.4.5 Linus Torvalds
2001-05-26  9:18                             ` Linux-2.4.5 arjan
2001-05-26 14:18                             ` Linux-2.4.5 Andrea Arcangeli
2001-05-26 14:21                               ` Linux-2.4.5 Rik van Riel
2001-05-26 14:38                                 ` Linux-2.4.5 Andrea Arcangeli
2001-05-26 14:40                                   ` Linux-2.4.5 Rik van Riel
2001-05-26 15:17                                     ` Linux-2.4.5 Linus Torvalds
2001-05-26 15:28                                       ` Linux-2.4.5 Rik van Riel
2001-05-26 15:59                                         ` Linux-2.4.5 Linus Torvalds
2001-05-26 22:12                                           ` Linux-2.4.5 Marcelo Tosatti
2001-05-27  6:53                                             ` Linux-2.4.5 Marcelo Tosatti
2001-06-03 23:32                                               ` Linux-2.4.5 Linus Torvalds
2001-06-05  2:21                                                 ` Linux-2.4.5 Marcelo Tosatti
2001-05-26 15:09                                 ` Linux-2.4.5 Linus Torvalds
2001-05-26 15:18                                   ` Linux-2.4.5 Rik van Riel
2001-05-26 15:24                                     ` Linux-2.4.5 Andrea Arcangeli
2001-05-26 15:26                                       ` Linux-2.4.5 Rik van Riel
2001-05-26 15:40                                         ` Linux-2.4.5 Andrea Arcangeli
2001-05-26  4:45                           ` Linux-2.4.5 Rik van Riel
2001-05-26  4:47                             ` Linux-2.4.5 Rik van Riel
2001-05-26  6:07                               ` Linux-2.4.5 Ben LaHaise
2001-05-26 14:32                             ` Linux-2.4.5 Andrea Arcangeli
2001-05-26 14:36                               ` Linux-2.4.5 Rik van Riel
2001-05-26 15:03                                 ` Linux-2.4.5 Andrea Arcangeli
2001-05-26 15:08                                   ` Linux-2.4.5 Rik van Riel
2001-05-26 15:20                                     ` Linux-2.4.5 Andrea Arcangeli
2001-05-26 15:41                     ` Linux-2.4.5 Rik van Riel
2001-05-26  0:42           ` Andrea Arcangeli [this message]
2001-05-26  0:52             ` [with-PATCH-really] highmem deadlock removal, balancing & cleanup Ben LaHaise
2001-05-26  1:27               ` Andrea Arcangeli
2001-05-26  1:38                 ` Ben LaHaise
2001-05-26  1:49                   ` Andrea Arcangeli
2001-05-26  2:01                     ` Ben LaHaise
2001-05-26  2:26                       ` Andrea Arcangeli
2001-05-26  2:40                         ` Ben LaHaise
2001-05-26  1:43             ` Rik van Riel
2001-05-25 22:35   ` Rik van Riel
2001-05-25 23:07     ` Linus Torvalds

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20010526024230.K9634@athlon.random \
    --to=andrea@suse.de \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=bcrl@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=riel@conectiva.com.br \
    --cc=torvalds@transmeta.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox