From: Wu Fengguang <wfg@mail.ustc.edu.cn>
To: Andrew Morton <akpm@osdl.org>
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH 14/23] readahead: state based method
Date: Sun, 19 Mar 2006 10:34:27 +0800 [thread overview]
Message-ID: <20060319023455.526787000@localhost.localdomain> (raw)
In-Reply-To: 20060319023413.305977000@localhost.localdomain
[-- Attachment #1: readahead-method-stateful.patch --]
[-- Type: text/plain, Size: 12309 bytes --]
This is the fast code path of adaptive read-ahead.
MAJOR STEPS
- estimate a thrashing safe ra_size;
- assemble the next read-ahead request in file_ra_state;
- submit it.
THE REFERENCE MODEL
1. inactive list has constant length and page flow speed
2. the observed stream receives a steady flow of read requests
3. no page activation, so that the inactive list forms a pipe
With that we get the picture showed below.
|<------------------------- constant length ------------------------->|
<<<<<<<<<<<<<<<<<<<<<<<<< steady flow of pages <<<<<<<<<<<<<<<<<<<<<<<<
+---------------------------------------------------------------------+
|tail inactive list head|
| ======= ==========---- |
| chunk A(stale pages) chunk B(stale + fresh pages) |
+---------------------------------------------------------------------+
REAL WORLD ISSUES
Real world workloads will always have fluctuations (violation of assumption
1 and 2). To counteract it, a tunable parameter readahead_ratio is introduced
to make the estimation conservative enough. Violation of assumption 3 will
not lead to thrashing, it is there just for simplicity of discussion.
Signed-off-by: Wu Fengguang <wfg@mail.ustc.edu.cn>
---
include/linux/fs.h | 41 +++++--
mm/readahead.c | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 317 insertions(+), 10 deletions(-)
--- linux-2.6.16-rc6-mm2.orig/include/linux/fs.h
+++ linux-2.6.16-rc6-mm2/include/linux/fs.h
@@ -611,19 +611,40 @@ struct fown_struct {
* Track a single file's readahead state
*/
struct file_ra_state {
- unsigned long start; /* Current window */
- unsigned long size;
- unsigned long flags; /* ra flags RA_FLAG_xxx*/
- unsigned long cache_hit; /* cache hit count*/
- unsigned long prev_page; /* Cache last read() position */
- unsigned long ahead_start; /* Ahead window */
- unsigned long ahead_size;
- unsigned long ra_pages; /* Maximum readahead window */
- unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
- unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
+ union {
+ struct { /* conventional read-ahead */
+ unsigned long start; /* Current window */
+ unsigned long size;
+ unsigned long ahead_start; /* Ahead window */
+ unsigned long ahead_size;
+ unsigned long cache_hit; /* cache hit count */
+ };
+#ifdef CONFIG_ADAPTIVE_READAHEAD
+ struct { /* adaptive read-ahead */
+ pgoff_t la_index;
+ pgoff_t ra_index;
+ pgoff_t lookahead_index;
+ pgoff_t readahead_index;
+ unsigned long age;
+ uint64_t cache_hits;
+ };
+#endif
+ };
+
+ /* mmap read-around */
+ unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
+ unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
+
+ /* common ones */
+ unsigned long flags; /* ra flags RA_FLAG_xxx*/
+ unsigned long prev_page; /* Cache last read() position */
+ unsigned long ra_pages; /* Maximum readahead window */
};
#define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */
#define RA_FLAG_INCACHE 0x02 /* file is already in cache */
+#define RA_FLAG_MMAP (1UL<<31) /* mmaped page access */
+#define RA_FLAG_NO_LOOKAHEAD (1UL<<30) /* disable look-ahead */
+#define RA_FLAG_NFSD (1UL<<29) /* request from nfsd */
struct file {
/*
--- linux-2.6.16-rc6-mm2.orig/mm/readahead.c
+++ linux-2.6.16-rc6-mm2/mm/readahead.c
@@ -16,6 +16,7 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/nfsd/const.h>
+#include <asm/div64.h>
/* The default max/min read-ahead pages. */
#define KB(size) (((size)*1024 + PAGE_CACHE_SIZE-1) / PAGE_CACHE_SIZE)
@@ -1060,6 +1061,291 @@ static unsigned long node_readahead_agin
}
/*
+ * The 64bit cache_hits stores three accumulated values and a counter value.
+ * MSB LSB
+ * 3333333333333333 : 2222222222222222 : 1111111111111111 : 0000000000000000
+ */
+static inline int ra_cache_hit(struct file_ra_state *ra, int nr)
+{
+ return (ra->cache_hits >> (nr * 16)) & 0xFFFF;
+}
+
+/*
+ * Conceptual code:
+ * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0);
+ * ra_cache_hit(ra, 0) = 0;
+ */
+static inline void ra_addup_cache_hit(struct file_ra_state *ra)
+{
+ int n;
+
+ n = ra_cache_hit(ra, 0);
+ ra->cache_hits -= n;
+ n <<= 16;
+ ra->cache_hits += n;
+}
+
+/*
+ * The read-ahead is deemed success if cache-hit-rate >= 1/readahead_hit_rate.
+ */
+static inline int ra_cache_hit_ok(struct file_ra_state *ra)
+{
+ return ra_cache_hit(ra, 0) * readahead_hit_rate >=
+ (ra->lookahead_index - ra->la_index);
+}
+
+/*
+ * Check if @index falls in the @ra request.
+ */
+static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
+{
+ if (index < ra->la_index || index >= ra->readahead_index)
+ return 0;
+
+ if (index >= ra->ra_index)
+ return 1;
+ else
+ return -1;
+}
+
+/*
+ * Which method is issuing this read-ahead?
+ */
+static inline void ra_set_class(struct file_ra_state *ra,
+ enum ra_class ra_class)
+{
+ unsigned long flags_mask;
+ unsigned long flags;
+ unsigned long old_ra_class;
+
+ flags_mask = ~(RA_CLASS_MASK | (RA_CLASS_MASK << RA_CLASS_SHIFT));
+ flags = ra->flags & flags_mask;
+
+ old_ra_class = (ra->flags & RA_CLASS_MASK) << RA_CLASS_SHIFT;
+
+ ra->flags = flags | old_ra_class | ra_class;
+
+ ra_addup_cache_hit(ra);
+ if (ra_class != RA_CLASS_STATE)
+ ra->cache_hits <<= 16;
+
+ ra->age = node_readahead_aging();
+}
+
+/*
+ * Where is the old read-ahead and look-ahead?
+ */
+static inline void ra_set_index(struct file_ra_state *ra,
+ pgoff_t la_index, pgoff_t ra_index)
+{
+ ra->la_index = la_index;
+ ra->ra_index = ra_index;
+}
+
+/*
+ * Where is the new read-ahead and look-ahead?
+ */
+static inline void ra_set_size(struct file_ra_state *ra,
+ unsigned long ra_size, unsigned long la_size)
+{
+ /* Disable look-ahead for loopback file. */
+ if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD))
+ la_size = 0;
+
+ ra->readahead_index = ra->ra_index + ra_size;
+ ra->lookahead_index = ra->readahead_index - la_size;
+}
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static int ra_dispatch(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
+{
+ pgoff_t eof_index;
+ unsigned long ra_size;
+ unsigned long la_size;
+ int actual;
+ enum ra_class ra_class;
+
+ ra_class = (ra->flags & RA_CLASS_MASK);
+ BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END);
+
+ eof_index = ((i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT) + 1;
+ ra_size = ra->readahead_index - ra->ra_index;
+ la_size = ra->readahead_index - ra->lookahead_index;
+
+ /* Snap to EOF. */
+ if (unlikely(ra->ra_index >= eof_index))
+ return 0;
+ if (ra->readahead_index + ra_size / 2 > eof_index) {
+ if (ra_class == RA_CLASS_CONTEXT_AGGRESSIVE &&
+ eof_index > ra->lookahead_index + 1)
+ la_size = eof_index - ra->lookahead_index;
+ else
+ la_size = 0;
+ ra_size = eof_index - ra->ra_index;
+ ra_set_size(ra, ra_size, la_size);
+ }
+
+ actual = __do_page_cache_readahead(mapping, filp,
+ ra->ra_index, ra_size, la_size);
+
+#ifdef CONFIG_DEBUG_READAHEAD
+ if (ra->flags & RA_FLAG_MMAP)
+ ra_account(ra, RA_EVENT_READAHEAD_MMAP, actual);
+ if (ra->readahead_index == eof_index)
+ ra_account(ra, RA_EVENT_READAHEAD_EOF, actual);
+ if (la_size)
+ ra_account(ra, RA_EVENT_LOOKAHEAD, la_size);
+ if (ra_size > actual)
+ ra_account(ra, RA_EVENT_IO_CACHE_HIT, ra_size - actual);
+ ra_account(ra, RA_EVENT_READAHEAD, actual);
+
+ dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n",
+ ra_class_name[ra_class],
+ mapping->host->i_ino, ra->la_index,
+ ra->ra_index, ra_size, la_size, actual);
+#endif /* CONFIG_DEBUG_READAHEAD */
+
+ return actual;
+}
+
+/*
+ * Determine the ra request from primitive values.
+ *
+ * It applies the following rules:
+ * - Substract ra_size by the old look-ahead to get real safe read-ahead;
+ * - Set new la_size according to the (still large) ra_size;
+ * - Apply upper limits;
+ * - Make sure stream_shift is not too small.
+ * (So that the next global_shift will not be too small.)
+ *
+ * Input:
+ * ra_size stores the estimated thrashing-threshold.
+ * la_size stores the look-ahead size of previous request.
+ */
+static inline int adjust_rala(unsigned long ra_max,
+ unsigned long *ra_size, unsigned long *la_size)
+{
+ unsigned long stream_shift = *la_size;
+
+ if (*ra_size > *la_size)
+ *ra_size -= *la_size;
+ else {
+ ra_account(NULL, RA_EVENT_READAHEAD_SHRINK, *ra_size);
+ return 0;
+ }
+
+ *la_size = *ra_size / LOOKAHEAD_RATIO;
+
+ if (*ra_size > ra_max)
+ *ra_size = ra_max;
+ if (*la_size > *ra_size)
+ *la_size = *ra_size;
+
+ stream_shift += (*ra_size - *la_size);
+ if (stream_shift < *ra_size / 4)
+ *la_size -= (*ra_size / 4 - stream_shift);
+
+ return 1;
+}
+
+/*
+ * The function estimates two values:
+ * 1. thrashing-threshold for the current stream
+ * It is returned to make the next read-ahead request.
+ * 2. the remained safe space for the current chunk
+ * It will be checked to ensure that the current chunk is safe.
+ *
+ * The computation will be pretty accurate under heavy load, and will vibrate
+ * more on light load(with small global_shift), so the grow speed of ra_size
+ * must be limited, and a moderate large stream_shift must be insured.
+ *
+ * This figure illustrates the formula used in the function:
+ * While the stream reads stream_shift pages inside the chunks,
+ * the chunks are shifted global_shift pages inside inactive_list.
+ *
+ * chunk A chunk B
+ * |<=============== global_shift ================|
+ * +-------------+ +-------------------+ |
+ * | # | | # | inactive_list |
+ * +-------------+ +-------------------+ head |
+ * |---->| |---------->|
+ * | |
+ * +-- stream_shift --+
+ */
+static inline unsigned long compute_thrashing_threshold(
+ struct file_ra_state *ra,
+ unsigned long *remain)
+{
+ unsigned long global_size;
+ unsigned long global_shift;
+ unsigned long stream_shift;
+ unsigned long ra_size;
+ uint64_t ll;
+
+ global_size = node_free_and_cold_pages();
+ global_shift = node_readahead_aging() - ra->age;
+ global_shift |= 1UL;
+ stream_shift = ra_cache_hit(ra, 0);
+
+ ll = (uint64_t) stream_shift * (global_size >> 9) * readahead_ratio * 5;
+ do_div(ll, global_shift);
+ ra_size = ll;
+
+ if (global_size > global_shift) {
+ ll = (uint64_t) stream_shift * (global_size - global_shift);
+ do_div(ll, global_shift);
+ *remain = ll;
+ } else
+ *remain = 0;
+
+ ddprintk("compute_thrashing_threshold: "
+ "at %lu ra %lu=%lu*%lu/%lu, remain %lu for %lu\n",
+ ra->readahead_index, ra_size,
+ stream_shift, global_size, global_shift,
+ *remain, ra->readahead_index - ra->lookahead_index);
+
+ return ra_size;
+}
+
+/*
+ * Main function for file_ra_state based read-ahead.
+ */
+static inline unsigned long
+state_based_readahead(struct address_space *mapping, struct file *filp,
+ struct file_ra_state *ra,
+ struct page *page, pgoff_t index,
+ unsigned long ra_size, unsigned long ra_max)
+{
+ unsigned long ra_old;
+ unsigned long la_size;
+ unsigned long remain_space;
+ unsigned long growth_limit;
+
+ la_size = ra->readahead_index - index;
+ ra_old = ra->readahead_index - ra->ra_index;
+ growth_limit = ra_size + ra_max / 16 +
+ (2 + readahead_ratio / 64) * ra_old;
+ ra_size = compute_thrashing_threshold(ra, &remain_space);
+
+ if (page && remain_space <= la_size && la_size > 1) {
+ rescue_pages(page, la_size);
+ return 0;
+ }
+
+ if (!adjust_rala(min(ra_max, growth_limit), &ra_size, &la_size))
+ return 0;
+
+ ra_set_class(ra, RA_CLASS_STATE);
+ ra_set_index(ra, index, ra->readahead_index);
+ ra_set_size(ra, ra_size, la_size);
+
+ return ra_dispatch(ra, mapping, filp);
+}
+
+/*
* ra_min is mainly determined by the size of cache memory.
* Table of concrete numbers for 4KB page size:
* inactive + free (MB): 4 8 16 32 64 128 256 512 1024
--
next prev parent reply other threads:[~2006-03-19 2:36 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-03-19 2:34 [PATCH 00/23] Adaptive read-ahead V11 Wu Fengguang
2006-03-19 2:34 ` [PATCH 01/23] readahead: kconfig options Wu Fengguang
2006-03-19 2:34 ` [PATCH 02/23] radixtree: look-aside cache Wu Fengguang
2006-03-20 16:01 ` Christoph Lameter
2006-03-21 2:19 ` Wu Fengguang
2006-03-19 2:34 ` [PATCH 03/23] radixtree: hole scanning functions Wu Fengguang
2006-03-19 2:34 ` [PATCH 04/23] readahead: page flag PG_readahead Wu Fengguang
2006-03-19 2:34 ` [PATCH 05/23] readahead: refactor do_generic_mapping_read() Wu Fengguang
2006-03-19 2:34 ` [PATCH 06/23] readahead: refactor __do_page_cache_readahead() Wu Fengguang
2006-03-19 2:34 ` [PATCH 07/23] readahead: insert cond_resched() calls Wu Fengguang
2006-03-19 3:50 ` Lee Revell
2006-03-19 5:32 ` Wu Fengguang
2006-03-20 13:31 ` Wu Fengguang
2006-03-19 2:34 ` [PATCH 08/23] readahead: common macros Wu Fengguang
2006-03-19 2:34 ` [PATCH 09/23] readahead: events accounting Wu Fengguang
2006-03-19 2:34 ` [PATCH 10/23] readahead: support functions Wu Fengguang
2006-03-19 2:34 ` [PATCH 11/23] readahead: sysctl parameters Wu Fengguang
2006-03-19 2:34 ` [PATCH 12/23] readahead: min/max sizes Wu Fengguang
2006-03-19 2:34 ` [PATCH 13/23] readahead: page cache aging accounting Wu Fengguang
2006-03-19 2:34 ` Wu Fengguang [this message]
2006-03-19 2:34 ` [PATCH 15/23] readahead: context based method Wu Fengguang
2006-03-19 2:34 ` [PATCH 16/23] readahead: other methods Wu Fengguang
2006-03-19 2:34 ` [PATCH 17/23] readahead: call scheme Wu Fengguang
2006-03-19 2:34 ` [PATCH 18/23] readahead: laptop mode Wu Fengguang
2006-03-19 2:34 ` [PATCH 19/23] readahead: loop case Wu Fengguang
2006-03-19 2:34 ` [PATCH 20/23] readahead: nfsd case Wu Fengguang
2006-03-19 2:34 ` [PATCH 21/23] readahead: debug radix tree new functions Wu Fengguang
2006-03-19 2:34 ` [PATCH 22/23] readahead: debug traces showing accessed file names Wu Fengguang
2006-03-19 2:34 ` [PATCH 23/23] readahead: debug traces showing read patterns Wu Fengguang
2006-03-19 3:10 ` [PATCH 00/23] Adaptive read-ahead V11 Jon Smirl
2006-03-19 3:47 ` Wu Fengguang
2006-03-19 4:10 ` Jon Smirl
2006-03-19 5:09 ` Wu Fengguang
2006-03-19 15:53 ` Jon Smirl
2006-03-20 13:54 ` Wu Fengguang
2006-03-27 21:38 ` Matt Heler
2006-03-28 3:44 ` Wu Fengguang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20060319023455.526787000@localhost.localdomain \
--to=wfg@mail.ustc.edu.cn \
--cc=akpm@osdl.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox