Index: linux-2.6/drivers/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/drivers/block/ll_rw_blk.c	2005-04-11 22:18:49.000000000 +1000
+++ linux-2.6/drivers/block/ll_rw_blk.c	2005-04-11 22:38:10.000000000 +1000
@@ -1450,7 +1450,7 @@ EXPORT_SYMBOL(blk_remove_plug);
  */
 void __generic_unplug_device(request_queue_t *q)
 {
-	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
+	if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)))
 		return;
 
 	if (!blk_remove_plug(q))
@@ -1828,7 +1828,6 @@ static void __freed_request(request_queu
 	clear_queue_congested(q, rw);
 
 	if (rl->count[rw] + 1 <= q->nr_requests) {
-		smp_mb();
 		if (waitqueue_active(&rl->wait[rw]))
 			wake_up(&rl->wait[rw]);
 
@@ -1860,18 +1859,20 @@ static void freed_request(request_queue_
 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
 
 /*
- * Get a free request, queue_lock must not be held
+ * Get a free request, queue_lock must be held.
+ * Returns NULL on failure, with queue_lock held.
+ * Returns !NULL on success, with queue_lock *not held*.
  */
 static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
 {
+	int batching;
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
-	struct io_context *ioc = get_io_context(gfp_mask);
+	struct io_context *ioc = get_io_context(GFP_ATOMIC);
 
 	if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)))
 		goto out;
 
-	spin_lock_irq(q->queue_lock);
 	if (rl->count[rw]+1 >= q->nr_requests) {
 		/*
 		 * The queue will fill after this allocation, so set it as
@@ -1884,6 +1885,8 @@ static struct request *get_request(reque
 			blk_set_queue_full(q, rw);
 		}
 	}
+
+	batching = ioc_batching(q, ioc);
 
 	switch (elv_may_queue(q, rw)) {
 		case ELV_MQUEUE_NO:
@@ -1894,12 +1897,11 @@ static struct request *get_request(reque
 			goto get_rq;
 	}
 
-	if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) {
+	if (blk_queue_full(q, rw) && !batching) {
 		/*
 		 * The queue is full and the allocating process is not a
 		 * "batcher", and not exempted by the IO scheduler
 		 */
-		spin_unlock_irq(q->queue_lock);
 		goto out;
 	}
 
@@ -1933,11 +1935,10 @@ rq_starved:
 		if (unlikely(rl->count[rw] == 0))
 			rl->starved[rw] = 1;
 
-		spin_unlock_irq(q->queue_lock);
 		goto out;
 	}
 
-	if (ioc_batching(q, ioc))
+	if (batching)
 		ioc->nr_batch_requests--;
 
 	rq_init(q, rq);
@@ -1950,13 +1951,14 @@ out:
 
 /*
  * No available requests for this queue, unplug the device and wait for some
  * requests to become available.
+ *
+ * Called with q->queue_lock held, and returns with it unlocked.
  */
 static struct request *get_request_wait(request_queue_t *q, int rw)
 {
 	DEFINE_WAIT(wait);
 	struct request *rq;
 
-	generic_unplug_device(q);
 	do {
 		struct request_list *rl = &q->rq;
@@ -1968,6 +1970,8 @@ static struct request *get_request_wait(
 		if (!rq) {
 			struct io_context *ioc;
 
+			__generic_unplug_device(q);
+			spin_unlock_irq(q->queue_lock);
 			io_schedule();
 
 			/*
@@ -1979,6 +1983,8 @@ static struct request *get_request_wait(
 			ioc = get_io_context(GFP_NOIO);
 			ioc_set_batching(q, ioc);
 			put_io_context(ioc);
+
+			spin_lock_irq(q->queue_lock);
 		}
 		finish_wait(&rl->wait[rw], &wait);
 	} while (!rq);
@@ -1992,10 +1998,15 @@ struct request *blk_get_request(request_
 
 	BUG_ON(rw != READ && rw != WRITE);
 
+	spin_lock_irq(q->queue_lock);
 	if (gfp_mask & __GFP_WAIT)
 		rq = get_request_wait(q, rw);
-	else
+	else {
 		rq = get_request(q, rw, gfp_mask);
+		if (!rq)
+			spin_unlock_irq(q->queue_lock);
+	}
+	/* q->queue_lock is unlocked at this point */
 
 	return rq;
 }
@@ -2558,7 +2569,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge);
 
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
-	struct request *req, *freereq = NULL;
+	struct request *req;
 	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err;
 	sector_t sector;
 
@@ -2578,19 +2589,14 @@ static int __make_request(request_queue_
 	spin_lock_prefetch(q->queue_lock);
 
 	barrier = bio_barrier(bio);
-	if (barrier && (q->ordered == QUEUE_ORDERED_NONE)) {
+	if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) {
 		err = -EOPNOTSUPP;
 		goto end_io;
 	}
 
-again:
 	spin_lock_irq(q->queue_lock);
 
-	if (elv_queue_empty(q)) {
-		blk_plug_device(q);
-		goto get_rq;
-	}
-	if (barrier)
+	if (elv_queue_empty(q) || unlikely(barrier))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -2633,40 +2639,24 @@ again:
 			elv_merged_request(q, req);
 			goto out;
 
-		/*
-		 * elevator says don't/can't merge. get new request
-		 */
-		case ELEVATOR_NO_MERGE:
-			break;
-
+		/* ELV_NO_MERGE: elevator says don't/can't merge. */
 		default:
-			printk("elevator returned crap (%d)\n", el_ret);
-			BUG();
+			;
 	}
 
+get_rq:
 	/*
-	 * Grab a free request from the freelist - if that is empty, check
-	 * if we are doing read ahead and abort instead of blocking for
-	 * a free slot.
+	 * Grab a free request. This might sleep but cannot fail.
+	 * Returns with the queue unlocked.
+	 */
+	req = get_request_wait(q, rw);
+
+	/*
+	 * After dropping the lock and possibly sleeping here, our request
+	 * may now be mergeable after it had proven unmergeable (above).
+	 * We don't worry about that case for efficiency. It won't happen
+	 * often, and the elevators are able to handle it.
 	 */
-get_rq:
-	if (freereq) {
-		req = freereq;
-		freereq = NULL;
-	} else {
-		spin_unlock_irq(q->queue_lock);
-		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
-			/*
-			 * READA bit set
-			 */
-			err = -EWOULDBLOCK;
-			if (bio_rw_ahead(bio))
-				goto end_io;
-
-			freereq = get_request_wait(q, rw);
-		}
-		goto again;
-	}
 
 	req->flags |= REQ_CMD;
 
@@ -2679,7 +2669,7 @@ get_rq:
 	/*
 	 * REQ_BARRIER implies no merging, but lets make it explicit
 	 */
-	if (barrier)
+	if (unlikely(barrier))
 		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 
 	req->errors = 0;
@@ -2694,10 +2684,11 @@ get_rq:
 	req->rq_disk = bio->bi_bdev->bd_disk;
 	req->start_time = jiffies;
 
+	spin_lock_irq(q->queue_lock);
+	if (elv_queue_empty(q))
+		blk_plug_device(q);
 	add_request(q, req);
 out:
-	if (freereq)
-		__blk_put_request(q, freereq);
 	if (bio_sync(bio))
 		__generic_unplug_device(q);
 
@@ -2803,7 +2794,7 @@ static inline void block_wait_queue_runn
 {
 	DEFINE_WAIT(wait);
 
-	while (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)) {
+	while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
 		struct request_list *rl = &q->rq;
 
 		prepare_to_wait_exclusive(&rl->drain, &wait,
@@ -2912,7 +2903,7 @@ end_io:
 		goto end_io;
 	}
 
-	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
 		goto end_io;
 
 	block_wait_queue_running(q);
Index: linux-2.6/drivers/md/dm-crypt.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-crypt.c	2005-04-11 22:18:50.000000000 +1000
+++ linux-2.6/drivers/md/dm-crypt.c	2005-04-11 22:38:08.000000000 +1000
@@ -331,25 +331,14 @@ crypt_alloc_buffer(struct crypt_config *
 	struct bio *bio;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	int gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
-	unsigned long flags = current->flags;
 	unsigned int i;
 
-	/*
-	 * Tell VM to act less aggressively and fail earlier.
-	 * This is not necessary but increases throughput.
-	 * FIXME: Is this really intelligent?
-	 */
-	current->flags &= ~PF_MEMALLOC;
-
 	if (base_bio)
 		bio = bio_clone(base_bio, GFP_NOIO);
 	else
 		bio = bio_alloc(GFP_NOIO, nr_iovecs);
-	if (!bio) {
-		if (flags & PF_MEMALLOC)
-			current->flags |= PF_MEMALLOC;
+	if (!bio)
 		return NULL;
-	}
 
 	/* if the last bio was not complete, continue where that one ended */
 	bio->bi_idx = *bio_vec_idx;
@@ -386,9 +375,6 @@ crypt_alloc_buffer(struct crypt_config *
 		size -= bv->bv_len;
 	}
 
-	if (flags & PF_MEMALLOC)
-		current->flags |= PF_MEMALLOC;
-
 	if (!bio->bi_size) {
 		bio_put(bio);
 		return NULL;
Index: linux-2.6/fs/mpage.c
===================================================================
--- linux-2.6.orig/fs/mpage.c	2005-04-11 22:18:50.000000000 +1000
+++ linux-2.6/fs/mpage.c	2005-04-11 22:38:08.000000000 +1000
@@ -105,11 +105,6 @@ mpage_alloc(struct block_device *bdev,
 
 	bio = bio_alloc(gfp_flags, nr_vecs);
 
-	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-		while (!bio && (nr_vecs /= 2))
-			bio = bio_alloc(gfp_flags, nr_vecs);
-	}
-
 	if (bio) {
 		bio->bi_bdev = bdev;
 		bio->bi_sector = first_sector;
Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h	2005-04-11 22:18:51.000000000 +1000
+++ linux-2.6/include/linux/gfp.h	2005-04-11 22:38:07.000000000 +1000
@@ -38,14 +38,16 @@ struct vm_area_struct;
 #define __GFP_NO_GROW	0x2000u	/* Slab internal usage */
 #define __GFP_COMP	0x4000u	/* Add compound page metadata */
 #define __GFP_ZERO	0x8000u	/* Return zeroed page on success */
+#define __GFP_MEMPOOL	0x10000u/* Mempool allocation */
 
-#define __GFP_BITS_SHIFT 16	/* Room for 16 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 17	/* Room for 17 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
 
 /* if you forget to add the bitmask here kernel will crash, period */
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
-			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
-			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP)
+			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL| \
+			__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP|__GFP_ZERO| \
+			__GFP_MEMPOOL)
 
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
Index: linux-2.6/mm/mempool.c
===================================================================
--- linux-2.6.orig/mm/mempool.c	2005-04-11 22:18:51.000000000 +1000
+++ linux-2.6/mm/mempool.c	2005-04-11 22:38:08.000000000 +1000
@@ -198,31 +198,21 @@ void * mempool_alloc(mempool_t *pool, un
 	void *element;
 	unsigned long flags;
 	DEFINE_WAIT(wait);
-	int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
-
+	int gfp_temp;
+
 	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	gfp_mask |= __GFP_MEMPOOL;
+	gfp_mask |= __GFP_NOWARN;	/* failures are OK */
+
+	gfp_temp = gfp_mask & ~__GFP_WAIT;
+
 repeat_alloc:
-	element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data);
+
+	element = pool->alloc(gfp_temp, pool->pool_data);
 	if (likely(element != NULL))
 		return element;
 
-	/*
-	 * If the pool is less than 50% full and we can perform effective
-	 * page reclaim then try harder to allocate an element.
-	 */
-	mb();
-	if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) &&
-			(pool->curr_nr <= pool->min_nr/2)) {
-		element = pool->alloc(gfp_mask, pool->pool_data);
-		if (likely(element != NULL))
-			return element;
-	}
-
-	/*
-	 * Kick the VM at this point.
-	 */
-	wakeup_bdflush(0);
-
 	spin_lock_irqsave(&pool->lock, flags);
 	if (likely(pool->curr_nr)) {
 		element = remove_element(pool);
@@ -235,6 +225,8 @@ repeat_alloc:
 	if (!(gfp_mask & __GFP_WAIT))
 		return NULL;
 
+	/* Now start performing page reclaim */
+	gfp_temp = gfp_mask;
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
 	mb();
 	if (!pool->curr_nr)
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2005-04-11 22:18:51.000000000 +1000
+++ linux-2.6/mm/page_alloc.c	2005-04-11 22:38:07.000000000 +1000
@@ -799,14 +799,18 @@ __alloc_pages(unsigned int __nocast gfp_
 	}
 
 	/* This allocation should allow future memory freeing. */
-	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
-		/* go through the zonelist yet again, ignoring mins */
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!cpuset_zone_allowed(z))
-				continue;
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
+
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+			&& !in_interrupt()) {
+		if (!(gfp_mask & __GFP_MEMPOOL)) {
+			/* go through the zonelist yet again, ignoring mins */
+			for (i = 0; (z = zones[i]) != NULL; i++) {
+				if (!cpuset_zone_allowed(z))
+					continue;
+				page = buffered_rmqueue(z, order, gfp_mask);
+				if (page)
+					goto got_pg;
+			}
 		}
 		goto nopage;
 	}
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c	2005-04-11 22:18:50.000000000 +1000
+++ linux-2.6/mm/swap_state.c	2005-04-11 22:38:08.000000000 +1000
@@ -143,7 +143,6 @@ void __delete_from_swap_cache(struct pag
 int add_to_swap(struct page * page)
 {
 	swp_entry_t entry;
-	int pf_flags;
 	int err;
 
 	if (!PageLocked(page))
@@ -154,30 +153,11 @@ int add_to_swap(struct page * page)
 	if (!entry.val)
 		return 0;
 
-	/* Radix-tree node allocations are performing
-	 * GFP_ATOMIC allocations under PF_MEMALLOC.
-	 * They can completely exhaust the page allocator.
-	 *
-	 * So PF_MEMALLOC is dropped here. This causes the slab
-	 * allocations to fail earlier, so radix-tree nodes will
-	 * then be allocated from the mempool reserves.
-	 *
-	 * We're still using __GFP_HIGH for radix-tree node
-	 * allocations, so some of the emergency pools are available,
-	 * just not all of them.
-	 */
-
-	pf_flags = current->flags;
-	current->flags &= ~PF_MEMALLOC;
-
 	/*
 	 * Add it to the swap cache and mark it dirty
 	 */
 	err = __add_to_swap_cache(page, entry, GFP_ATOMIC|__GFP_NOWARN);
 
-	if (pf_flags & PF_MEMALLOC)
-		current->flags |= PF_MEMALLOC;
-
 	switch (err) {
 	case 0:				/* Success */
 		SetPageUptodate(page);