[PATCH 08/16] readahead: state based method

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Wu Fengguang <wfg@mail.ustc.edu.cn>
To: linux-kernel@vger.kernel.org
Cc: Andrew Morton <akpm@osdl.org>, Wu Fengguang <wfg@mail.ustc.edu.cn>
Subject: [PATCH 08/16] readahead: state based method
Date: Wed, 09 Nov 2005 21:49:46 +0800	[thread overview]
Message-ID: <20051109141510.844000000@localhost.localdomain> (raw)
In-Reply-To: 20051109134938.757187000@localhost.localdomain

[-- Attachment #1: readahead-method-stateful.patch --]
[-- Type: text/plain, Size: 12855 bytes --]

This is the fast code path.

Major steps:
        - estimate a thrashing safe ra_size;
        - assemble the next read-ahead request in file_ra_state;
        - submit it.

Signed-off-by: Wu Fengguang <wfg@mail.ustc.edu.cn>
---

 include/linux/fs.h |    8 +
 mm/readahead.c     |  340 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/swap.c          |    3 
 mm/vmscan.c        |    5 
 4 files changed, 355 insertions(+), 1 deletion(-)

--- linux-2.6.14-mm1.orig/include/linux/fs.h
+++ linux-2.6.14-mm1/include/linux/fs.h
@@ -569,13 +569,19 @@ struct file_ra_state {
 	unsigned long start;		/* Current window */
 	unsigned long size;
 	unsigned long flags;		/* ra flags RA_FLAG_xxx*/
-	unsigned long cache_hit;	/* cache hit count*/
+	uint64_t      cache_hit;	/* cache hit count*/
 	unsigned long prev_page;	/* Cache last read() position */
 	unsigned long ahead_start;	/* Ahead window */
 	unsigned long ahead_size;
 	unsigned long ra_pages;		/* Maximum readahead window */
 	unsigned long mmap_hit;		/* Cache hit stat for mmap accesses */
 	unsigned long mmap_miss;	/* Cache miss stat for mmap accesses */
+
+	unsigned long age;
+	pgoff_t la_index;
+	pgoff_t ra_index;
+	pgoff_t lookahead_index;
+	pgoff_t readahead_index;
 };
 #define RA_FLAG_MISS 0x01	/* a cache miss occured against this file */
 #define RA_FLAG_INCACHE 0x02	/* file is already in cache */
--- linux-2.6.14-mm1.orig/mm/vmscan.c
+++ linux-2.6.14-mm1/mm/vmscan.c
@@ -406,6 +406,8 @@ cannot_free:
 	return 0;
 }
 
+DECLARE_PER_CPU(unsigned long, smooth_aging);
+
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
@@ -453,6 +455,8 @@ static int shrink_list(struct list_head 
 		/* In active use or really unfreeable?  Activate it. */
 		if (referenced && page_mapping_inuse(page))
 			goto activate_locked;
+		if (!referenced)
+			__get_cpu_var(smooth_aging)++;
 
 #ifdef CONFIG_SWAP
 		/*
@@ -991,6 +995,7 @@ refill_inactive_zone(struct zone *zone, 
 				list_add(&page->lru, &l_active);
 				continue;
 			}
+			__get_cpu_var(smooth_aging)++;
 		}
 		list_add(&page->lru, &l_inactive);
 	}
--- linux-2.6.14-mm1.orig/mm/swap.c
+++ linux-2.6.14-mm1/mm/swap.c
@@ -112,6 +112,8 @@ void fastcall activate_page(struct page 
 	spin_unlock_irq(&zone->lru_lock);
 }
 
+DECLARE_PER_CPU(unsigned long, smooth_aging);
+
 /*
  * Mark a page as having seen activity.
  *
@@ -126,6 +128,7 @@ void fastcall mark_page_accessed(struct 
 		ClearPageReferenced(page);
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
+		__get_cpu_var(smooth_aging)++;
 	}
 }
 
--- linux-2.6.14-mm1.orig/mm/readahead.c
+++ linux-2.6.14-mm1/mm/readahead.c
@@ -38,6 +38,12 @@ EXPORT_SYMBOL(readahead_hit_rate);
 int readahead_live_chunk = 2 * MAX_RA_PAGES;
 EXPORT_SYMBOL(readahead_live_chunk);
 
+/* Analog to zone->nr_page_aging.
+ * But mainly increased on fresh page references, so is much more smoother.
+ */
+DEFINE_PER_CPU(unsigned long, smooth_aging);
+EXPORT_PER_CPU_SYMBOL(smooth_aging);
+
 /* Detailed classification of read-ahead behaviors. */
 #define RA_CLASS_SHIFT 3
 #define RA_CLASS_MASK  ((1 << RA_CLASS_SHIFT) - 1)
@@ -789,6 +795,340 @@ out:
 }
 
 /*
+ * State based calculation of read-ahead request.
+ *
+ * This figure shows the meaning of file_ra_state members:
+ *
+ *             chunk A                            chunk B
+ *  +---------------------------+-------------------------------------------+
+ *  |             #             |                   #                       |
+ *  +---------------------------+-------------------------------------------+
+ *                ^             ^                   ^                       ^
+ *              la_index      ra_index     lookahead_index         readahead_index
+ */
+
+/*
+ * The global effective length of the inactive_list(s).
+ */
+static unsigned long nr_free_inactive(void)
+{
+	unsigned int i;
+	unsigned long sum = 0;
+	struct zone *zones = NODE_DATA(numa_node_id())->node_zones;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		sum += zones[i].nr_inactive +
+			zones[i].free_pages - zones[i].pages_low;
+
+	return sum;
+}
+
+/*
+ * A much smoother analog to nr_page_aging.
+ */
+static unsigned long nr_smooth_aging(void)
+{
+	unsigned long cpu;
+	unsigned long sum = 0;
+	cpumask_t mask = node_to_cpumask(numa_node_id());
+
+	for_each_cpu_mask(cpu, mask)
+		sum += per_cpu(smooth_aging, cpu);
+
+	return sum;
+}
+
+/*
+ * Set class of read-ahead
+ */
+static inline void set_ra_class(struct file_ra_state *ra,
+				enum ra_class ra_class)
+{
+	unsigned long FLAGS_MASK;
+	unsigned long flags;
+	unsigned long old_ra_class;
+
+	FLAGS_MASK = ~(RA_CLASS_MASK | (RA_CLASS_MASK << RA_CLASS_SHIFT));
+	flags = ra->flags & FLAGS_MASK;
+
+	old_ra_class = (ra->flags & RA_CLASS_MASK) << RA_CLASS_SHIFT;
+
+	ra->flags = flags | old_ra_class | ra_class;
+}
+
+/*
+ * The 64bit cache_hit stores three accumulated value and one counter value.
+ * MSB                                                                   LSB
+ * 3333333333333333 : 2222222222222222 : 1111111111111111 : 0000000000000000
+ */
+static inline int ra_cache_hit(struct file_ra_state *ra, int nr)
+{
+	return (ra->cache_hit >> (nr * 16)) & 0xFFFF;
+}
+
+/*
+ * Something like:
+ * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0);
+ * ra_cache_hit(ra, 0) = 0;
+ */
+static inline void ra_addup_cache_hit(struct file_ra_state *ra)
+{
+	int n;
+
+	n = ra_cache_hit(ra, 0);
+	ra->cache_hit -= n;
+	n <<= 16;
+	ra->cache_hit += n;
+}
+
+/*
+ * The read-ahead is deemed success if cache-hit-rate > 50%.
+ */
+static inline int ra_cache_hit_ok(struct file_ra_state *ra)
+{
+	return ra_cache_hit(ra, 0) * readahead_hit_rate >=
+					(ra->lookahead_index - ra->la_index);
+}
+
+/*
+ * Check if @index falls in the ra request.
+ */
+static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
+{
+	if (index < ra->la_index || index >= ra->readahead_index)
+		return 0;
+
+	if (index >= ra->ra_index)
+		return 1;
+	else
+		return -1;
+}
+
+/*
+ * Prepare file_ra_state for a new read-ahead sequence.
+ */
+static inline void ra_state_init(struct file_ra_state *ra,
+				pgoff_t la_index, pgoff_t ra_index)
+{
+	ra_addup_cache_hit(ra);
+	ra->cache_hit <<= 16;
+	ra->lookahead_index = la_index;
+	ra->readahead_index = ra_index;
+}
+
+/*
+ * Take down a new read-ahead request in file_ra_state.
+ */
+static inline void ra_state_update(struct file_ra_state *ra,
+				unsigned long ra_size, unsigned long la_size)
+{
+#ifdef DEBUG_READAHEAD
+	unsigned long old_ra = ra->readahead_index - ra->ra_index;
+	if (ra_size < old_ra && ra_cache_hit(ra, 0))
+		ra_account(ra, RA_EVENT_READAHEAD_SHRINK, old_ra - ra_size);
+#endif
+	ra_addup_cache_hit(ra);
+	ra->ra_index = ra->readahead_index;
+	ra->la_index = ra->lookahead_index;
+	ra->readahead_index += ra_size;
+	ra->lookahead_index = ra->readahead_index - la_size;
+	ra->age = nr_smooth_aging();
+}
+
+/*
+ * Adjust the read-ahead request in file_ra_state.
+ */
+static inline void ra_state_adjust(struct file_ra_state *ra,
+				unsigned long ra_size, unsigned long la_size)
+{
+	ra->readahead_index = ra->ra_index + ra_size;
+	ra->lookahead_index = ra->readahead_index - la_size;
+}
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static int ra_dispatch(struct file_ra_state *ra,
+			struct address_space *mapping, struct file *filp)
+{
+	pgoff_t eof_index;
+	unsigned long ra_size;
+	unsigned long la_size;
+	int actual;
+	enum ra_class ra_class;
+
+	ra_class = (ra->flags & RA_CLASS_MASK);
+	BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END);
+
+	eof_index = ((i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT) + 1;
+	ra_size = ra->readahead_index - ra->ra_index;
+	la_size = ra->readahead_index - ra->lookahead_index;
+
+	/* Snap to EOF. */
+	if (unlikely(ra->ra_index >= eof_index))
+		return 0;
+	if (ra->readahead_index + ra_size / 2 > eof_index) {
+		if (ra_class == RA_CLASS_CONTEXT_ACCELERATED &&
+				eof_index > ra->lookahead_index + 1)
+			la_size = eof_index - ra->lookahead_index;
+		else
+			la_size = 0;
+		ra_size = eof_index - ra->ra_index;
+		ra_state_adjust(ra, ra_size, la_size);
+	}
+
+	actual = __do_page_cache_readahead(mapping, filp,
+					ra->ra_index, ra_size, la_size);
+
+#ifdef READAHEAD_STREAMING
+	if (actual < ra_size) {
+		struct page *page = find_page(mapping, ra->ra_index + actual);
+		if (page)
+			rescue_pages(page, ra_size);
+	}
+#endif
+
+	if (ra->readahead_index == eof_index)
+		ra_account(ra, RA_EVENT_READAHEAD_EOF, actual);
+	if (la_size)
+		ra_account(ra, RA_EVENT_LOOKAHEAD, la_size);
+	ra_account(ra, RA_EVENT_READAHEAD, actual);
+
+	dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n",
+			ra_class_name[ra_class],
+			mapping->host->i_ino, ra->la_index,
+			ra->ra_index, ra_size, la_size, actual);
+
+	return actual;
+}
+
+/*
+ * Determine the request parameters from primitive values.
+ *
+ * It applies the following rules:
+ *   - Substract ra_size by the old look-ahead to get real safe read-ahead;
+ *   - Set new la_size according to the (still large) ra_size;
+ *   - Apply upper limits;
+ *   - Make sure stream_shift is not too small.
+ *     (So that the next global_shift will not be too small.)
+ *
+ * Input:
+ * ra_size stores the estimated thrashing-threshold.
+ * la_size stores the look-ahead size of previous request.
+ */
+static inline int adjust_rala(unsigned long ra_max,
+				unsigned long *ra_size, unsigned long *la_size)
+{
+	unsigned long stream_shift = *la_size;
+
+	if (*ra_size > *la_size)
+		*ra_size -= *la_size;
+	else
+		return 0;
+
+	*la_size = *ra_size / LOOKAHEAD_RATIO;
+
+	if (*ra_size > ra_max)
+		*ra_size = ra_max;
+	if (*la_size > *ra_size)
+		*la_size = *ra_size;
+
+	stream_shift += (*ra_size - *la_size);
+	if (stream_shift < *ra_size / 4)
+		*la_size -= (*ra_size / 4 - stream_shift);
+
+	return 1;
+}
+
+/*
+ * The function estimates two values:
+ * 1. thrashing-threshold for the current stream
+ *    It is returned to make the next read-ahead request.
+ * 2. the remained space for the current chunk
+ *    It will be checked to ensure that the current chunk is safe.
+ *
+ * The computation will be pretty accurate under heavy load, and will change
+ * vastly with light load(small global_shift), so the grow speed of ra_size
+ * must be limited, and a moderate large stream_shift must be insured.
+ *
+ * This figure illustrates the formula:
+ * While the stream reads stream_shift pages inside the chunks,
+ * the chunks are shifted global_shift pages inside inactive_list.
+ *
+ *      chunk A                    chunk B
+ *                          |<=============== global_shift ================|
+ *  +-------------+         +-------------------+                          |
+ *  |       #     |         |           #       |            inactive_list |
+ *  +-------------+         +-------------------+                     head |
+ *          |---->|         |---------->|
+ *             |                  |
+ *             +-- stream_shift --+
+ */
+static inline unsigned long compute_thrashing_threshold(
+						struct file_ra_state *ra,
+						unsigned long *remain)
+{
+	unsigned long global_size;
+	unsigned long global_shift;
+	unsigned long stream_shift;
+	unsigned long ra_size;
+
+	global_size = nr_free_inactive();
+	global_shift = nr_smooth_aging() - ra->age;
+	stream_shift = ra_cache_hit(ra, 0);
+
+	ra_size = stream_shift *
+			global_size * readahead_ratio / (100 * global_shift);
+
+	if (global_size > global_shift)
+		*remain = stream_shift *
+				(global_size - global_shift) / global_shift;
+	else
+		*remain = 0;
+
+	ddprintk("compute_thrashing_threshold: "
+			"ra=%lu=%lu*%lu/%lu, remain %lu for %lu\n",
+			ra_size, stream_shift, global_size, global_shift,
+			*remain, ra->readahead_index - ra->lookahead_index);
+
+	return ra_size;
+}
+
+/*
+ * Main function for file_ra_state based read-ahead.
+ */
+static inline unsigned long
+state_based_readahead(struct address_space *mapping, struct file *filp,
+			struct file_ra_state *ra, struct page *page,
+			unsigned long ra_max)
+{
+	unsigned long ra_old;
+	unsigned long ra_size;
+	unsigned long la_size;
+	unsigned long remain_space;
+
+	la_size = ra->readahead_index - ra->lookahead_index;
+	ra_old = ra->readahead_index - ra->ra_index;
+	ra_size = compute_thrashing_threshold(ra, &remain_space);
+
+	if (readahead_ratio < VM_READAHEAD_PROTECT_RATIO &&
+			remain_space <= la_size && la_size > 1) {
+		rescue_pages(page, la_size);
+		return 0;
+	}
+
+	if (!adjust_rala(min(ra_max, 2 * ra_old + (ra_max - ra_old) / 16),
+				&ra_size, &la_size))
+		return 0;
+
+	set_ra_class(ra, RA_CLASS_STATE);
+	ra_state_update(ra, ra_size, la_size);
+
+	return ra_dispatch(ra, mapping, filp);
+}
+
+
+/*
  * ra_size is mainly determined by:
  * 1. sequential-start: min(MIN_RA_PAGES + (pages>>14), KB(128))
  * 2. sequential-max:	min(ra->ra_pages, 0xFFFF)

--

next prev parent reply	other threads:[~2005-11-09 14:14 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-11-09 13:49 [PATCH 00/16] Adaptive read-ahead V7 Wu Fengguang
2005-11-09 13:49 ` [PATCH 01/16] mm: delayed page activation Wu Fengguang
2005-11-10  0:21   ` Nick Piggin
2005-11-10  3:15     ` Wu Fengguang
2005-11-10  9:17   ` Peter Zijlstra
2005-11-10 10:30     ` Wu Fengguang
2005-11-09 13:49 ` [PATCH 02/16] mm: balance page aging between zones Wu Fengguang
2005-11-09 13:49 ` [PATCH 03/16] radixtree: sync with mainline Wu Fengguang
2005-11-09 13:49 ` [PATCH 04/16] radix-tree: look-aside cache Wu Fengguang
2005-11-09 23:31   ` Nick Piggin
2005-11-10  5:25     ` Wu Fengguang
2005-11-10  6:50       ` Nick Piggin
2005-11-10  8:30         ` Wu Fengguang
2005-11-18 11:25         ` Wu Fengguang
2005-11-18 12:12           ` Wu Fengguang
2005-11-09 13:49 ` [PATCH 05/16] readahead: some preparation Wu Fengguang
2005-11-18  7:46   ` 2.6.15-rc1-mm2 Andrew Morton
2005-11-18  8:56     ` 2.6.15-rc1-mm2 Benoit Boissinot
2005-11-18  9:04       ` 2.6.15-rc1-mm2 Andrew Morton
2005-11-18  9:13         ` 2.6.15-rc1-mm2 Benoit Boissinot
2005-11-18 13:43         ` 2.6.15-rc1-mm2 Rafael J. Wysocki
2005-11-18 10:10     ` 2.6.15-rc1-mm2 Mauro Carvalho Chehab
2005-11-18 10:55     ` 2.6.15-rc1-mm2 Wu Fengguang
2005-11-18 11:29     ` 2.6.15-rc1-mm2 Andy Whitcroft
2005-11-18 16:29     ` 2.6.15-rc1-mm2 Michael Krufky
2005-11-20  0:23     ` 2.6.15-rc1-mm2 Michal Piotrowski
2005-11-20  8:04       ` 2.6.15-rc1-mm2 Hugh Dickins
2005-11-20 12:53         ` 2.6.15-rc1-mm2 Michal Piotrowski
2005-11-09 13:49 ` [PATCH 06/16] readahead: call scheme Wu Fengguang
2005-11-09 13:49 ` [PATCH 07/16] readahead: tunable parameters Wu Fengguang
2005-11-09 13:49 ` Wu Fengguang [this message]
2005-11-09 13:49 ` [PATCH 09/16] readahead: context based method Wu Fengguang
2005-11-09 13:49 ` [PATCH 10/16] readahead: other methods Wu Fengguang
2005-11-09 13:49 ` [PATCH 11/16] readahead: mandatory thrashing protection Wu Fengguang
2005-11-09 13:49 ` [PATCH 12/16] readahead: events accounting Wu Fengguang
2005-11-09 13:49 ` [PATCH 13/16] readahead: page aging accounting Wu Fengguang
2005-11-09 13:49 ` [PATCH 14/16] readahead: laptop mode support Wu Fengguang
2005-11-09 13:49 ` [PATCH 15/16] readahead: disable look-ahead for loopback file Wu Fengguang
2005-11-09 13:49 ` [PATCH 16/16] io: reduce lantency Wu Fengguang
2005-11-09 20:39 ` [PATCH 00/16] Adaptive read-ahead V7 Christoph Lameter
2005-11-10 10:19   ` Wu Fengguang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20051109141510.844000000@localhost.localdomain \
    --to=wfg@mail.ustc.edu.cn \
    --cc=akpm@osdl.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.