From: Wu Fengguang <wfg@mail.ustc.edu.cn>
To: linux-kernel@vger.kernel.org
Cc: Andrew Morton <akpm@osdl.org>, Wu Fengguang <wfg@mail.ustc.edu.cn>
Subject: [PATCH 08/16] readahead: context based method
Date: Sat, 03 Dec 2005 15:14:52 +0800 [thread overview]
Message-ID: <20051203071759.668193000@localhost.localdomain> (raw)
In-Reply-To: 20051203071444.260068000@localhost.localdomain
[-- Attachment #1: readahead-method-context.patch --]
[-- Type: text/plain, Size: 10377 bytes --]
This is the slow code path.
No valid state info is available, so the page cache is queried to abtain the
required position/timing infomation.
Major steps:
- look back/forward to find the ra_index;
- look back to estimate a thrashing safe ra_size;
- assemble the next read-ahead request in file_ra_state;
- submit it.
Signed-off-by: Wu Fengguang <wfg@mail.ustc.edu.cn>
---
mm/readahead.c | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 345 insertions(+)
--- linux.orig/mm/readahead.c
+++ linux/mm/readahead.c
@@ -1170,6 +1170,351 @@ state_based_readahead(struct address_spa
return ra_dispatch(ra, mapping, filp);
}
+/*
+ * Page cache context based estimation of read-ahead/look-ahead size/index.
+ *
+ * The logic first looks around to find the start point of next read-ahead,
+ * and then, if necessary, looks backward in the inactive_list to get an
+ * estimation of the thrashing-threshold.
+ *
+ * The estimation theory can be illustrated with figure:
+ *
+ * chunk A chunk B chunk C head
+ *
+ * l01 l11 l12 l21 l22
+ *| |-->|-->| |------>|-->| |------>|
+ *| +-------+ +-----------+ +-------------+ |
+ *| | # | | # | | # | |
+ *| +-------+ +-----------+ +-------------+ |
+ *| |<==============|<===========================|<============================|
+ * L0 L1 L2
+ *
+ * Let f(l) = L be a map from
+ * l: the number of pages read by the stream
+ * to
+ * L: the number of pages pushed into inactive_list in the mean time
+ * then
+ * f(l01) <= L0
+ * f(l11 + l12) = L1
+ * f(l21 + l22) = L2
+ * ...
+ * f(l01 + l11 + ...) <= Sum(L0 + L1 + ...)
+ * <= Length(inactive_list) = f(thrashing-threshold)
+ *
+ * So the count of countinuous history pages left in the inactive_list is always
+ * a lower estimation of the true thrashing-threshold.
+ */
+
+/*
+ * STATUS REFERENCE COUNT TYPE
+ * A__ 0 not in inactive list
+ * ___ 0 fresh
+ * __R PAGE_REFCNT_1 stale
+ * _a_ PAGE_REFCNT_2 disturbed once
+ * _aR PAGE_REFCNT_3 disturbed twice
+ *
+ * A/a/R: Active / aCTIVATE / Referenced
+ */
+static inline unsigned long cold_page_refcnt(struct page *page)
+{
+ if (!page || PageActive(page))
+ return 0;
+
+ return page_refcnt(page);
+}
+
+static inline char page_refcnt_symbol(struct page *page)
+{
+ if (!page)
+ return 'X';
+ if (PageActive(page))
+ return 'A';
+ switch (page_refcnt(page)) {
+ case 0:
+ return '_';
+ case PAGE_REFCNT_1:
+ return '-';
+ case PAGE_REFCNT_2:
+ return '=';
+ case PAGE_REFCNT_3:
+ return '#';
+ }
+ return '?';
+}
+
+/*
+ * Count/estimate cache hits in range [first_index, last_index].
+ * The estimation is simple and optimistic.
+ */
+static int count_cache_hit(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t last_index)
+{
+ struct page *page;
+ int size = last_index - first_index + 1;
+ int count = 0;
+ int i;
+
+ read_lock_irq(&mapping->tree_lock);
+
+ /*
+ * The first page may well is chunk head and has been accessed,
+ * so it is index 0 that makes the estimation optimistic. This
+ * behavior guarantees a readahead when (size < ra_max) and
+ * (readahead_hit_rate >= 16).
+ */
+ for (i = 0; i < 16;) {
+ page = __find_page(mapping, first_index +
+ size * ((i++ * 29) & 15) / 16);
+ if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2)
+ break;
+ }
+
+ read_unlock_irq(&mapping->tree_lock);
+
+ return size * count / i;
+}
+
+/*
+ * Look back and check history pages to estimate thrashing-threshold.
+ */
+static int query_page_cache(struct address_space *mapping,
+ struct file_ra_state *ra,
+ unsigned long *remain, pgoff_t offset,
+ unsigned long ra_min, unsigned long ra_max)
+{
+ int count;
+ pgoff_t index;
+ unsigned long nr_lookback;
+ struct radix_tree_cache cache;
+
+ /*
+ * Scan backward and check the near @ra_max pages.
+ * The count here determines ra_size.
+ */
+ read_lock_irq(&mapping->tree_lock);
+ index = radix_tree_lookup_head(&mapping->page_tree, offset, ra_max);
+ read_unlock_irq(&mapping->tree_lock);
+#ifdef DEBUG_READAHEAD_RADIXTREE
+ if (index <= offset) {
+ WARN_ON(!find_page(mapping, index));
+ if (index + ra_max > offset)
+ WARN_ON(find_page(mapping, index - 1));
+ } else {
+ BUG_ON(index > offset + 1);
+ WARN_ON(find_page(mapping, offset));
+ }
+#endif
+
+ *remain = offset - index + 1;
+
+ if (unlikely(*remain <= ra_min))
+ return ra_min;
+
+ if (!index)
+ return *remain;
+
+ if (offset + 1 == ra->readahead_index && ra_cache_hit_ok(ra))
+ count = *remain;
+ else if (count_cache_hit(mapping, index, offset) *
+ readahead_hit_rate >= *remain)
+ count = *remain;
+ else
+ return ra_min;
+
+ if (count < ra_max)
+ goto out;
+
+ /*
+ * Check the far pages coarsely.
+ * The big count here helps increase la_size.
+ */
+ nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) *
+ 100 / (readahead_ratio + 1);
+ if (nr_lookback > offset)
+ nr_lookback = offset;
+
+ radix_tree_cache_init(&cache);
+ read_lock_irq(&mapping->tree_lock);
+ for (count += ra_max; count < nr_lookback; count += ra_max) {
+ struct radix_tree_node *node;
+ node = radix_tree_cache_lookup_node(&mapping->page_tree,
+ &cache, offset - count, 1);
+ if (!node)
+ break;
+#ifdef DEBUG_READAHEAD_RADIXTREE
+ if (node != radix_tree_lookup_node(&mapping->page_tree,
+ offset - count, 1)) {
+ read_unlock_irq(&mapping->tree_lock);
+ printk(KERN_ERR "check radix_tree_cache_lookup_node!\n");
+ return 1;
+ }
+#endif
+ }
+ read_unlock_irq(&mapping->tree_lock);
+
+ /*
+ * For sequential read that extends from index 0, the counted value
+ * may well be far under the true threshold, so return it unmodified
+ * for further process in adjust_rala_accelerated().
+ */
+ if (count >= offset)
+ return offset + 1;
+
+out:
+ count = count * readahead_ratio / 100;
+ return count;
+}
+
+/*
+ * Scan backward in the file for the first non-present page.
+ */
+static inline pgoff_t first_absent_page_bw(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ struct radix_tree_cache cache;
+ struct page *page;
+ pgoff_t origin;
+
+ origin = index;
+ if (max_scan > index)
+ max_scan = index;
+
+ radix_tree_cache_init(&cache);
+ read_lock_irq(&mapping->tree_lock);
+ for (; origin - index <= max_scan;) {
+ page = radix_tree_cache_lookup(&mapping->page_tree,
+ &cache, --index);
+ if (page) {
+ index++;
+ break;
+ }
+ }
+ read_unlock_irq(&mapping->tree_lock);
+
+ return index;
+}
+
+/*
+ * Scan forward in the file for the first non-present page.
+ */
+static inline pgoff_t first_absent_page(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ pgoff_t ra_index;
+
+ read_lock_irq(&mapping->tree_lock);
+ ra_index = radix_tree_lookup_tail(&mapping->page_tree,
+ index + 1, max_scan);
+ read_unlock_irq(&mapping->tree_lock);
+
+#ifdef DEBUG_READAHEAD_RADIXTREE
+ BUG_ON(ra_index <= index);
+ if (index + max_scan > index) {
+ if (ra_index <= index + max_scan)
+ WARN_ON(find_page(mapping, ra_index));
+ WARN_ON(!find_page(mapping, ra_index - 1));
+ }
+#endif
+
+ if (ra_index <= index + max_scan)
+ return ra_index;
+ else
+ return 0;
+}
+
+/*
+ * Determine the request parameters for context based read-ahead that extends
+ * from start of file.
+ *
+ * The major weakness of stateless method is perhaps the slow grow up speed of
+ * ra_size. The logic tries to make up for this in the important case of
+ * sequential reads that extend from start of file. In this case, the ra_size
+ * is not choosed to make the whole next chunk safe(as in normal ones). Only
+ * half of which is safe. The added 'unsafe' half is the look-ahead part. It
+ * is expected to be safeguarded by rescue_pages() when the previous chunks are
+ * lost.
+ */
+static inline int adjust_rala_accelerated(unsigned long ra_max,
+ unsigned long *ra_size, unsigned long *la_size)
+{
+ pgoff_t index = *ra_size;
+
+ *ra_size -= min(*ra_size, *la_size);
+ *ra_size = *ra_size * readahead_ratio / 100;
+ *la_size = index * readahead_ratio / 100;
+ *ra_size += *la_size;
+
+ if (*ra_size > ra_max)
+ *ra_size = ra_max;
+ if (*la_size > *ra_size)
+ *la_size = *ra_size;
+
+ return 1;
+}
+
+/*
+ * Main function for page context based read-ahead.
+ */
+static inline int
+try_context_based_readahead(struct address_space *mapping,
+ struct file_ra_state *ra,
+ struct page *prev_page, struct page *page,
+ pgoff_t index, unsigned long ra_size,
+ unsigned long ra_min, unsigned long ra_max)
+{
+ pgoff_t ra_index;
+ unsigned long la_size;
+ unsigned long remain_pages;
+
+ /* Where to start read-ahead?
+ * NFSv3 daemons may process adjecent requests in parallel,
+ * leading to many locally disordered, globally sequential reads.
+ * So do not require nearby history pages to be present or accessed.
+ */
+ if (page) {
+ ra_index = first_absent_page(mapping, index, ra_max * 5 / 4);
+ if (unlikely(!ra_index))
+ return -1;
+ } else if (!prev_page) {
+ ra_index = first_absent_page_bw(mapping, index,
+ readahead_hit_rate + ra_min);
+ if (index - ra_index > readahead_hit_rate + ra_min)
+ return 0;
+ ra_min += 2 * (index - ra_index);
+ index = ra_index;
+ } else {
+ ra_index = index;
+ if (ra_has_index(ra, index))
+ ra_account(ra, RA_EVENT_READAHEAD_MUTILATE,
+ ra->readahead_index - index);
+ }
+
+ ra_size = query_page_cache(mapping, ra, &remain_pages,
+ index - 1, ra_min, ra_max);
+
+ la_size = ra_index - index;
+ if (!readahead_live_chunk &&
+ remain_pages <= la_size && la_size > 1) {
+ rescue_pages(page, la_size);
+ return -1;
+ }
+
+ if (ra_size == index) {
+ if (!adjust_rala_accelerated(ra_max, &ra_size, &la_size))
+ return -1;
+ set_ra_class(ra, RA_CLASS_CONTEXT_ACCELERATED);
+ } else {
+ if (!adjust_rala(ra_max, &ra_size, &la_size))
+ return -1;
+ set_ra_class(ra, RA_CLASS_CONTEXT);
+ }
+
+ ra_state_init(ra, index, ra_index);
+ ra_state_update(ra, ra_size, la_size);
+
+ return 1;
+}
+
/*
* ra_size is mainly determined by:
--
next prev parent reply other threads:[~2005-12-03 7:12 UTC|newest]
Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-12-03 7:14 [PATCH 00/16] Adaptive read-ahead V9 Wu Fengguang
2005-12-03 7:14 ` [PATCH 01/16] mm: delayed page activation Wu Fengguang
2005-12-04 12:11 ` Nikita Danilov
2005-12-04 13:48 ` Wu Fengguang
2005-12-04 15:03 ` Nikita Danilov
2005-12-04 15:37 ` Help!Unable to handle kernel NULL pointer tony
2005-12-04 19:10 ` [PATCH 01/16] mm: delayed page activation Peter Zijlstra
2005-12-05 1:48 ` Wu Fengguang
2005-12-06 17:55 ` Nikita Danilov
2005-12-07 1:42 ` Wu Fengguang
2005-12-07 9:46 ` Andrew Morton
2005-12-07 10:36 ` Wu Fengguang
2005-12-07 12:44 ` Nikita Danilov
2005-12-07 13:53 ` Wu Fengguang
2005-12-03 7:14 ` [PATCH 02/16] radixtree: sync with mainline Wu Fengguang
2005-12-04 23:57 ` Andrew Morton
2005-12-05 1:43 ` Wu Fengguang
2005-12-05 4:05 ` Wu Fengguang
2005-12-05 17:22 ` Christoph Lameter
2005-12-05 10:44 ` Wu Fengguang
2005-12-05 17:24 ` Christoph Lameter
2005-12-06 2:23 ` Wu Fengguang
2005-12-03 7:14 ` [PATCH 03/16] radixtree: look-aside cache Wu Fengguang
2005-12-03 7:14 ` [PATCH 04/16] readahead: some preparation Wu Fengguang
2005-12-03 7:14 ` [PATCH 05/16] readahead: call scheme Wu Fengguang
2005-12-03 7:14 ` [PATCH 06/16] readahead: parameters Wu Fengguang
2005-12-03 7:14 ` [PATCH 07/16] readahead: state based method Wu Fengguang
2005-12-03 7:14 ` Wu Fengguang [this message]
2005-12-03 7:14 ` [PATCH 09/16] readahead: read-around method for mmap file Wu Fengguang
2005-12-03 7:14 ` [PATCH 10/16] readahead: other methods Wu Fengguang
2005-12-03 7:14 ` [PATCH 11/16] readahead: detect and rescue live pages Wu Fengguang
2005-12-03 7:14 ` [PATCH 12/16] readahead: events accounting Wu Fengguang
2005-12-03 7:14 ` [PATCH 13/16] readahead: laptop mode support Wu Fengguang
2005-12-03 7:14 ` [PATCH 14/16] readahead: disable look-ahead for loopback file Wu Fengguang
2005-12-03 7:14 ` [PATCH 15/16] readahead: nfsd support Wu Fengguang
2005-12-03 7:15 ` [PATCH 16/16] io: prevent too much latency in the read-ahead code Wu Fengguang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20051203071759.668193000@localhost.localdomain \
--to=wfg@mail.ustc.edu.cn \
--cc=akpm@osdl.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.