From: Wu Fengguang <wfg@mail.ustc.edu.cn>
To: Andrew Morton <akpm@osdl.org>
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH 14/23] readahead: state based method
Date: Sun, 19 Mar 2006 10:34:27 +0800 [thread overview]
Message-ID: <20060319023455.526787000@localhost.localdomain> (raw)
In-Reply-To: 20060319023413.305977000@localhost.localdomain
[-- Attachment #1: readahead-method-stateful.patch --]
[-- Type: text/plain, Size: 12309 bytes --]
This is the fast code path of adaptive read-ahead.
MAJOR STEPS
- estimate a thrashing safe ra_size;
- assemble the next read-ahead request in file_ra_state;
- submit it.
THE REFERENCE MODEL
1. inactive list has constant length and page flow speed
2. the observed stream receives a steady flow of read requests
3. no page activation, so that the inactive list forms a pipe
With that we get the picture showed below.
|<------------------------- constant length ------------------------->|
<<<<<<<<<<<<<<<<<<<<<<<<< steady flow of pages <<<<<<<<<<<<<<<<<<<<<<<<
+---------------------------------------------------------------------+
|tail inactive list head|
| ======= ==========---- |
| chunk A(stale pages) chunk B(stale + fresh pages) |
+---------------------------------------------------------------------+
REAL WORLD ISSUES
Real world workloads will always have fluctuations (violation of assumption
1 and 2). To counteract it, a tunable parameter readahead_ratio is introduced
to make the estimation conservative enough. Violation of assumption 3 will
not lead to thrashing, it is there just for simplicity of discussion.
Signed-off-by: Wu Fengguang <wfg@mail.ustc.edu.cn>
---
include/linux/fs.h | 41 +++++--
mm/readahead.c | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 317 insertions(+), 10 deletions(-)
--- linux-2.6.16-rc6-mm2.orig/include/linux/fs.h
+++ linux-2.6.16-rc6-mm2/include/linux/fs.h
@@ -611,19 +611,40 @@ struct fown_struct {
* Track a single file's readahead state
*/
struct file_ra_state {
- unsigned long start; /* Current window */
- unsigned long size;
- unsigned long flags; /* ra flags RA_FLAG_xxx*/
- unsigned long cache_hit; /* cache hit count*/
- unsigned long prev_page; /* Cache last read() position */
- unsigned long ahead_start; /* Ahead window */
- unsigned long ahead_size;
- unsigned long ra_pages; /* Maximum readahead window */
- unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
- unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
+ union {
+ struct { /* conventional read-ahead */
+ unsigned long start; /* Current window */
+ unsigned long size;
+ unsigned long ahead_start; /* Ahead window */
+ unsigned long ahead_size;
+ unsigned long cache_hit; /* cache hit count */
+ };
+#ifdef CONFIG_ADAPTIVE_READAHEAD
+ struct { /* adaptive read-ahead */
+ pgoff_t la_index;
+ pgoff_t ra_index;
+ pgoff_t lookahead_index;
+ pgoff_t readahead_index;
+ unsigned long age;
+ uint64_t cache_hits;
+ };
+#endif
+ };
+
+ /* mmap read-around */
+ unsigned long mmap_hit; /* Cache hit stat for mmap accesses */
+ unsigned long mmap_miss; /* Cache miss stat for mmap accesses */
+
+ /* common ones */
+ unsigned long flags; /* ra flags RA_FLAG_xxx*/
+ unsigned long prev_page; /* Cache last read() position */
+ unsigned long ra_pages; /* Maximum readahead window */
};
#define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */
#define RA_FLAG_INCACHE 0x02 /* file is already in cache */
+#define RA_FLAG_MMAP (1UL<<31) /* mmaped page access */
+#define RA_FLAG_NO_LOOKAHEAD (1UL<<30) /* disable look-ahead */
+#define RA_FLAG_NFSD (1UL<<29) /* request from nfsd */
struct file {
/*
--- linux-2.6.16-rc6-mm2.orig/mm/readahead.c
+++ linux-2.6.16-rc6-mm2/mm/readahead.c
@@ -16,6 +16,7 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/nfsd/const.h>
+#include <asm/div64.h>
/* The default max/min read-ahead pages. */
#define KB(size) (((size)*1024 + PAGE_CACHE_SIZE-1) / PAGE_CACHE_SIZE)
@@ -1060,6 +1061,291 @@ static unsigned long node_readahead_agin
}
/*
+ * The 64bit cache_hits stores three accumulated values and a counter value.
+ * MSB LSB
+ * 3333333333333333 : 2222222222222222 : 1111111111111111 : 0000000000000000
+ */
+static inline int ra_cache_hit(struct file_ra_state *ra, int nr)
+{
+ return (ra->cache_hits >> (nr * 16)) & 0xFFFF;
+}
+
+/*
+ * Conceptual code:
+ * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0);
+ * ra_cache_hit(ra, 0) = 0;
+ */
+static inline void ra_addup_cache_hit(struct file_ra_state *ra)
+{
+ int n;
+
+ n = ra_cache_hit(ra, 0);
+ ra->cache_hits -= n;
+ n <<= 16;
+ ra->cache_hits += n;
+}
+
+/*
+ * The read-ahead is deemed success if cache-hit-rate >= 1/readahead_hit_rate.
+ */
+static inline int ra_cache_hit_ok(struct file_ra_state *ra)
+{
+ return ra_cache_hit(ra, 0) * readahead_hit_rate >=
+ (ra->lookahead_index - ra->la_index);
+}
+
+/*
+ * Check if @index falls in the @ra request.
+ */
+static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
+{
+ if (index < ra->la_index || index >= ra->readahead_index)
+ return 0;
+
+ if (index >= ra->ra_index)
+ return 1;
+ else
+ return -1;
+}
+
+/*
+ * Which method is issuing this read-ahead?
+ */
+static inline void ra_set_class(struct file_ra_state *ra,
+ enum ra_class ra_class)
+{
+ unsigned long flags_mask;
+ unsigned long flags;
+ unsigned long old_ra_class;
+
+ flags_mask = ~(RA_CLASS_MASK | (RA_CLASS_MASK << RA_CLASS_SHIFT));
+ flags = ra->flags & flags_mask;
+
+ old_ra_class = (ra->flags & RA_CLASS_MASK) << RA_CLASS_SHIFT;
+
+ ra->flags = flags | old_ra_class | ra_class;
+
+ ra_addup_cache_hit(ra);
+ if (ra_class != RA_CLASS_STATE)
+ ra->cache_hits <<= 16;
+
+ ra->age = node_readahead_aging();
+}
+
+/*
+ * Where is the old read-ahead and look-ahead?
+ */
+static inline void ra_set_index(struct file_ra_state *ra,
+ pgoff_t la_index, pgoff_t ra_index)
+{
+ ra->la_index = la_index;
+ ra->ra_index = ra_index;
+}
+
+/*
+ * Where is the new read-ahead and look-ahead?
+ */
+static inline void ra_set_size(struct file_ra_state *ra,
+ unsigned long ra_size, unsigned long la_size)
+{
+ /* Disable look-ahead for loopback file. */
+ if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD))
+ la_size = 0;
+
+ ra->readahead_index = ra->ra_index + ra_size;
+ ra->lookahead_index = ra->readahead_index - la_size;
+}
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static int ra_dispatch(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
+{
+ pgoff_t eof_index;
+ unsigned long ra_size;
+ unsigned long la_size;
+ int actual;
+ enum ra_class ra_class;
+
+ ra_class = (ra->flags & RA_CLASS_MASK);
+ BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END);
+
+ eof_index = ((i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT) + 1;
+ ra_size = ra->readahead_index - ra->ra_index;
+ la_size = ra->readahead_index - ra->lookahead_index;
+
+ /* Snap to EOF. */
+ if (unlikely(ra->ra_index >= eof_index))
+ return 0;
+ if (ra->readahead_index + ra_size / 2 > eof_index) {
+ if (ra_class == RA_CLASS_CONTEXT_AGGRESSIVE &&
+ eof_index > ra->lookahead_index + 1)
+ la_size = eof_index - ra->lookahead_index;
+ else
+ la_size = 0;
+ ra_size = eof_index - ra->ra_index;
+ ra_set_size(ra, ra_size, la_size);
+ }
+
+ actual = __do_page_cache_readahead(mapping, filp,
+ ra->ra_index, ra_size, la_size);
+
+#ifdef CONFIG_DEBUG_READAHEAD
+ if (ra->flags & RA_FLAG_MMAP)
+ ra_account(ra, RA_EVENT_READAHEAD_MMAP, actual);
+ if (ra->readahead_index == eof_index)
+ ra_account(ra, RA_EVENT_READAHEAD_EOF, actual);
+ if (la_size)
+ ra_account(ra, RA_EVENT_LOOKAHEAD, la_size);
+ if (ra_size > actual)
+ ra_account(ra, RA_EVENT_IO_CACHE_HIT, ra_size - actual);
+ ra_account(ra, RA_EVENT_READAHEAD, actual);
+
+ dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n",
+ ra_class_name[ra_class],
+ mapping->host->i_ino, ra->la_index,
+ ra->ra_index, ra_size, la_size, actual);
+#endif /* CONFIG_DEBUG_READAHEAD */
+
+ return actual;
+}
+
+/*
+ * Determine the ra request from primitive values.
+ *
+ * It applies the following rules:
+ * - Substract ra_size by the old look-ahead to get real safe read-ahead;
+ * - Set new la_size according to the (still large) ra_size;
+ * - Apply upper limits;
+ * - Make sure stream_shift is not too small.
+ * (So that the next global_shift will not be too small.)
+ *
+ * Input:
+ * ra_size stores the estimated thrashing-threshold.
+ * la_size stores the look-ahead size of previous request.
+ */
+static inline int adjust_rala(unsigned long ra_max,
+ unsigned long *ra_size, unsigned long *la_size)
+{
+ unsigned long stream_shift = *la_size;
+
+ if (*ra_size > *la_size)
+ *ra_size -= *la_size;
+ else {
+ ra_account(NULL, RA_EVENT_READAHEAD_SHRINK, *ra_size);
+ return 0;
+ }
+
+ *la_size = *ra_size / LOOKAHEAD_RATIO;
+
+ if (*ra_size > ra_max)
+ *ra_size = ra_max;
+ if (*la_size > *ra_size)
+ *la_size = *ra_size;
+
+ stream_shift += (*ra_size - *la_size);
+ if (stream_shift < *ra_size / 4)
+ *la_size -= (*ra_size / 4 - stream_shift);
+
+ return 1;
+}
+
+/*
+ * The function estimates two values:
+ * 1. thrashing-threshold for the current stream
+ * It is returned to make the next read-ahead request.
+ * 2. the remained safe space for the current chunk
+ * It will be checked to ensure that the current chunk is safe.
+ *
+ * The computation will be pretty accurate under heavy load, and will vibrate
+ * more on light load(with small global_shift), so the grow speed of ra_size
+ * must be limited, and a moderate large stream_shift must be insured.
+ *
+ * This figure illustrates the formula used in the function:
+ * While the stream reads stream_shift pages inside the chunks,
+ * the chunks are shifted global_shift pages inside inactive_list.
+ *
+ * chunk A chunk B
+ * |<=============== global_shift ================|
+ * +-------------+ +-------------------+ |
+ * | # | | # | inactive_list |
+ * +-------------+ +-------------------+ head |
+ * |---->| |---------->|
+ * | |
+ * +-- stream_shift --+
+ */
+static inline unsigned long compute_thrashing_threshold(
+ struct file_ra_state *ra,
+ unsigned long *remain)
+{
+ unsigned long global_size;
+ unsigned long global_shift;
+ unsigned long stream_shift;
+ unsigned long ra_size;
+ uint64_t ll;
+
+ global_size = node_free_and_cold_pages();
+ global_shift = node_readahead_aging() - ra->age;
+ global_shift |= 1UL;
+ stream_shift = ra_cache_hit(ra, 0);
+
+ ll = (uint64_t) stream_shift * (global_size >> 9) * readahead_ratio * 5;
+ do_div(ll, global_shift);
+ ra_size = ll;
+
+ if (global_size > global_shift) {
+ ll = (uint64_t) stream_shift * (global_size - global_shift);
+ do_div(ll, global_shift);
+ *remain = ll;
+ } else
+ *remain = 0;
+
+ ddprintk("compute_thrashing_threshold: "
+ "at %lu ra %lu=%lu*%lu/%lu, remain %lu for %lu\n",
+ ra->readahead_index, ra_size,
+ stream_shift, global_size, global_shift,
+ *remain, ra->readahead_index - ra->lookahead_index);
+
+ return ra_size;
+}
+
+/*
+ * Main function for file_ra_state based read-ahead.
+ */
+static inline unsigned long
+state_based_readahead(struct address_space *mapping, struct file *filp,
+ struct file_ra_state *ra,
+ struct page *page, pgoff_t index,
+ unsigned long ra_size, unsigned long ra_max)
+{
+ unsigned long ra_old;
+ unsigned long la_size;
+ unsigned long remain_space;
+ unsigned long growth_limit;
+
+ la_size = ra->readahead_index - index;
+ ra_old = ra->readahead_index - ra->ra_index;
+ growth_limit = ra_size + ra_max / 16 +
+ (2 + readahead_ratio / 64) * ra_old;
+ ra_size = compute_thrashing_threshold(ra, &remain_space);
+
+ if (page && remain_space <= la_size && la_size > 1) {
+ rescue_pages(page, la_size);
+ return 0;
+ }
+
+ if (!adjust_rala(min(ra_max, growth_limit), &ra_size, &la_size))
+ return 0;
+
+ ra_set_class(ra, RA_CLASS_STATE);
+ ra_set_index(ra, index, ra->readahead_index);
+ ra_set_size(ra, ra_size, la_size);
+
+ return ra_dispatch(ra, mapping, filp);
+}
+
+/*
* ra_min is mainly determined by the size of cache memory.
* Table of concrete numbers for 4KB page size:
* inactive + free (MB): 4 8 16 32 64 128 256 512 1024
--
next prev parent reply other threads:[~2006-03-19 2:36 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-03-19 2:34 [PATCH 00/23] Adaptive read-ahead V11 Wu Fengguang
2006-03-19 2:34 ` [PATCH 01/23] readahead: kconfig options Wu Fengguang
2006-03-19 2:34 ` [PATCH 02/23] radixtree: look-aside cache Wu Fengguang
2006-03-20 16:01 ` Christoph Lameter
2006-03-21 2:19 ` Wu Fengguang
2006-03-19 2:34 ` [PATCH 03/23] radixtree: hole scanning functions Wu Fengguang
2006-03-19 2:34 ` [PATCH 04/23] readahead: page flag PG_readahead Wu Fengguang
2006-03-19 2:34 ` [PATCH 05/23] readahead: refactor do_generic_mapping_read() Wu Fengguang
2006-03-19 2:34 ` [PATCH 06/23] readahead: refactor __do_page_cache_readahead() Wu Fengguang
2006-03-19 2:34 ` [PATCH 07/23] readahead: insert cond_resched() calls Wu Fengguang
2006-03-19 3:50 ` Lee Revell
2006-03-19 5:32 ` Wu Fengguang
2006-03-20 13:31 ` Wu Fengguang
2006-03-19 2:34 ` [PATCH 08/23] readahead: common macros Wu Fengguang
2006-03-19 2:34 ` [PATCH 09/23] readahead: events accounting Wu Fengguang
2006-03-19 2:34 ` [PATCH 10/23] readahead: support functions Wu Fengguang
2006-03-19 2:34 ` [PATCH 11/23] readahead: sysctl parameters Wu Fengguang
2006-03-19 2:34 ` [PATCH 12/23] readahead: min/max sizes Wu Fengguang
2006-03-19 2:34 ` [PATCH 13/23] readahead: page cache aging accounting Wu Fengguang
2006-03-19 2:34 ` Wu Fengguang [this message]
2006-03-19 2:34 ` [PATCH 15/23] readahead: context based method Wu Fengguang
2006-03-19 2:34 ` [PATCH 16/23] readahead: other methods Wu Fengguang
2006-03-19 2:34 ` [PATCH 17/23] readahead: call scheme Wu Fengguang
2006-03-19 2:34 ` [PATCH 18/23] readahead: laptop mode Wu Fengguang
2006-03-19 2:34 ` [PATCH 19/23] readahead: loop case Wu Fengguang
2006-03-19 2:34 ` [PATCH 20/23] readahead: nfsd case Wu Fengguang
2006-03-19 2:34 ` [PATCH 21/23] readahead: debug radix tree new functions Wu Fengguang
2006-03-19 2:34 ` [PATCH 22/23] readahead: debug traces showing accessed file names Wu Fengguang
2006-03-19 2:34 ` [PATCH 23/23] readahead: debug traces showing read patterns Wu Fengguang
2006-03-19 3:10 ` [PATCH 00/23] Adaptive read-ahead V11 Jon Smirl
2006-03-19 3:47 ` Wu Fengguang
2006-03-19 4:10 ` Jon Smirl
2006-03-19 5:09 ` Wu Fengguang
2006-03-19 15:53 ` Jon Smirl
2006-03-20 13:54 ` Wu Fengguang
2006-03-27 21:38 ` Matt Heler
2006-03-28 3:44 ` Wu Fengguang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20060319023455.526787000@localhost.localdomain \
--to=wfg@mail.ustc.edu.cn \
--cc=akpm@osdl.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.