From: KUROSAWA Takahiro <kurosawa@valinux.co.jp>
To: ckrm-tech@lists.sourceforge.net
Cc: linux-mm@kvack.org, KUROSAWA Takahiro <kurosawa@valinux.co.jp>
Subject: [PATCH 6/8] Add the pzone_destroy() function
Date: Tue, 31 Jan 2006 11:30:30 +0900 (JST) [thread overview]
Message-ID: <20060131023030.7915.57560.sendpatchset@debian> (raw)
In-Reply-To: <20060131023000.7915.71955.sendpatchset@debian>
This patch implements destruction of pzones. Pages in a destroyed
pzone are returned to the parent zone (the zone from which the pzone
was created).
Signed-off-by: KUROSAWA Takahiro <kurosawa@valinux.co.jp>
---
include/linux/mmzone.h | 1
include/linux/swap.h | 2
mm/page_alloc.c | 287 +++++++++++++++++++++++++++++++++++++++++++++++++
mm/vmscan.c | 4
4 files changed, 292 insertions(+), 2 deletions(-)
diff -urNp a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h 2006-01-30 14:33:44.000000000 +0900
+++ b/include/linux/mmzone.h 2006-01-30 14:34:39.000000000 +0900
@@ -362,6 +362,7 @@ struct pzone_table {
extern struct pzone_table pzone_table[];
struct zone *pzone_create(struct zone *z, char *name, int npages);
+void pzone_destroy(struct zone *z);
static inline void zone_init_pzone_link(struct zone *z)
{
diff -urNp a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h 2006-01-03 12:21:10.000000000 +0900
+++ b/include/linux/swap.h 2006-01-30 11:23:03.000000000 +0900
@@ -171,6 +171,8 @@ extern int rotate_reclaimable_page(struc
extern void swap_setup(void);
/* linux/mm/vmscan.c */
+extern int isolate_lru_pages(int, struct list_head *, struct list_head *,
+ int *);
extern int try_to_free_pages(struct zone **, gfp_t);
extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
extern int shrink_all_memory(int);
diff -urNp a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c 2006-01-30 14:33:44.000000000 +0900
+++ b/mm/page_alloc.c 2006-01-30 14:34:39.000000000 +0900
@@ -2727,6 +2727,9 @@ EXPORT_SYMBOL(pzone_table);
static struct list_head pzone_freelist = LIST_HEAD_INIT(pzone_freelist);
+static struct workqueue_struct *pzone_drain_wq;
+static DEFINE_PER_CPU(struct work_struct, pzone_drain_work);
+
static int pzone_table_register(struct zone *z)
{
struct pzone_table *t;
@@ -2747,6 +2750,18 @@ static int pzone_table_register(struct z
return 0;
}
+static void pzone_table_unregister(struct zone *z)
+{
+ struct pzone_table *t;
+ unsigned long flags;
+ /* Clear z's pzone_table slot and put the slot back on pzone_freelist. */
+ write_lock_nr_zones(&flags);
+ t = &pzone_table[z->pzone_idx];
+ t->zone = NULL;
+ list_add(&t->list, &pzone_freelist);
+ write_unlock_nr_zones(&flags);
+}
+
static void pzone_parent_register(struct zone *z, struct zone *parent)
{
unsigned long flags;
@@ -2756,6 +2771,15 @@ static void pzone_parent_register(struct
write_unlock_nr_zones(&flags);
}
+static void pzone_parent_unregister(struct zone *z)
+{
+ unsigned long flags;
+ /* Unlink z->sibling from its list while holding write_lock_nr_zones(). */
+ write_lock_nr_zones(&flags);
+ list_del(&z->sibling);
+ write_unlock_nr_zones(&flags);
+}
+
/*
* pzone alloc/free routines
*/
@@ -2847,6 +2871,194 @@ static inline void pzone_restore_page_fl
page->flags &= ~(1UL << PZONE_BIT_PGSHIFT);
}
+/*
+ * pzone_bad_range(): debugging replacement for bad_range().  Each failed
+ * check has its own BUG() so the crash site identifies it; returns 0 otherwise.
+ */
+static int pzone_bad_range(struct zone *zone, struct page *page)
+{
+ if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
+ BUG();
+ if (page_to_pfn(page) < zone->zone_start_pfn)
+ BUG();
+#ifdef CONFIG_HOLES_IN_ZONE
+ if (!pfn_valid(page_to_pfn(page)))
+ BUG();
+#endif
+ if (zone != page_zone(page))
+ BUG();
+ return 0;
+}
+
+static void pzone_drain(void *arg)
+{
+ lru_add_drain(); /* work handler: queued on pzone_drain_wq by pzone_punt_drain() */
+}
+
+static void pzone_punt_drain(void *arg)
+{
+ struct work_struct *wp;
+ /* Queue this CPU's pzone_drain_work on pzone_drain_wq to run pzone_drain(). */
+ wp = &get_cpu_var(pzone_drain_work);
+ PREPARE_WORK(wp, pzone_drain, arg);
+ /* queue_work() itself checks whether the work item is already in use. */
+ queue_work(pzone_drain_wq, wp);
+ put_cpu_var(pzone_drain_work);
+}
+
+static void pzone_flush_percpu(void *arg)
+{
+ struct zone *z = arg;
+ unsigned long flags;
+ int cpu;
+
+ /*
+ * lru_add_drain() must not run in interrupt context (LRU pagevecs are
+ * interrupt unsafe), so it is punted to the workqueue via pzone_punt_drain().
+ */
+ /* With IRQs off: queue the LRU drain, then __drain_zone_pages() z on this CPU. */
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ pzone_punt_drain(arg);
+ __drain_zone_pages(z, cpu);
+ local_irq_restore(flags);
+}
+
+static int pzone_flush_lru(struct zone *z, struct zone *parent,
+ struct list_head *clist, unsigned long *cnr,
+ int block)
+{
+ unsigned long flags;
+ struct page *page;
+ struct list_head list;
+ int n, moved, scan;
+ /* Move pages from clist (an LRU list of z) onto parent's LRU; returns #moved. */
+ INIT_LIST_HEAD(&list);
+ /* Isolate up to *cnr pages from clist onto the private list. */
+ spin_lock_irqsave(&z->lru_lock, flags);
+ n = isolate_lru_pages(*cnr, clist, &list, &scan);
+ *cnr -= n;
+ spin_unlock_irqrestore(&z->lru_lock, flags);
+
+ moved = 0;
+ while (!list_empty(&list) && n-- > 0) {
+ page = list_entry(list.prev, struct page, lru);
+ list_del(&page->lru);
+ /* block != 0: wait for the page lock and writeback; else skip busy pages. */
+ if (block) {
+ lock_page(page);
+ wait_on_page_writeback(page);
+ } else {
+ if (TestSetPageLocked(page))
+ goto goaround;
+
+ /* Skip pages still under writeback; they go back on clist. */
+ if (PageWriteback(page))
+ goto goaround_pagelocked;
+ }
+
+ /* Page is locked and not under writeback: safe to modify page->flags. */
+ pzone_restore_page_flags(parent, page);
+ unlock_page(page);
+ /* Put the page on parent's active or inactive list per PageActive(). */
+ spin_lock_irqsave(&parent->lru_lock, flags);
+ if (TestSetPageLRU(page))
+ BUG();
+
+ __put_page(page);
+ if (PageActive(page))
+ add_page_to_active_list(parent, page);
+ else
+ add_page_to_inactive_list(parent, page);
+ spin_unlock_irqrestore(&parent->lru_lock, flags);
+
+ moved++;
+ continue;
+ /* Skip paths: return the page to clist on z and count it in *cnr again. */
+goaround_pagelocked:
+ unlock_page(page);
+goaround:
+ spin_lock_irqsave(&z->lru_lock, flags);
+ __put_page(page);
+ if (TestSetPageLRU(page))
+ BUG();
+ list_add(&page->lru, clist);
+ ++*cnr;
+ spin_unlock_irqrestore(&z->lru_lock, flags);
+ }
+
+ return moved;
+}
+
+static void pzone_flush_free_area(struct zone *z)
+{
+ struct free_area *area;
+ struct page *page;
+ struct list_head list;
+ unsigned long flags;
+ int order;
+ /* Hand every order-0 free page of z back to z->parent, one page at a time. */
+ INIT_LIST_HEAD(&list);
+
+ spin_lock_irqsave(&z->lock, flags);
+ area = &z->free_area[0];
+ while (!list_empty(&area->free_list)) {
+ page = list_entry(area->free_list.next, struct page, lru);
+ list_del(&page->lru);
+ area->nr_free--;
+ z->free_pages--;
+ z->present_pages--;
+ spin_unlock_irqrestore(&z->lock, flags);
+ pzone_restore_page_flags(z->parent, page);
+ pzone_bad_range(z->parent, page);
+ list_add(&page->lru, &list);
+ free_pages_bulk(z->parent, 1, &list, 0);
+ /* z->lock was dropped around the free into the parent; retake it. */
+ spin_lock_irqsave(&z->lock, flags);
+ }
+
+ BUG_ON(area->nr_free != 0);
+ spin_unlock_irqrestore(&z->lock, flags);
+
+ /* pzones currently support order-0 pages only; sanity-check higher orders. */
+ spin_lock_irqsave(&z->lock, flags);
+ for (order = 1; order < MAX_ORDER; order++) {
+ area = &z->free_area[order];
+ BUG_ON(area->nr_free != 0);
+ }
+ spin_unlock_irqrestore(&z->lock, flags);
+}
+
+static int pzone_is_empty(struct zone *z)
+{
+ unsigned long flags;
+ int ret = 0;
+ int i;
+ /* Sum every page counter of z; returns nonzero only if the total is zero. */
+ spin_lock_irqsave(&z->lock, flags);
+ ret += z->present_pages;
+ ret += z->free_pages;
+ ret += z->free_area[0].nr_free;
+
+ /* It would be better to use smp_call_function() for scanning the pcp lists. */
+ for (i = 0; i < NR_CPUS; i++) {
+#ifdef CONFIG_NUMA
+ if (!zone_pcp(z, i) || (zone_pcp(z, i) == &boot_pageset[i]))
+ continue;
+#endif
+ ret += zone_pcp(z, i)->pcp[0].count;
+ ret += zone_pcp(z, i)->pcp[1].count;
+ }
+ spin_unlock_irqrestore(&z->lock, flags);
+
+ spin_lock_irqsave(&z->lru_lock, flags);
+ ret += z->nr_active;
+ ret += z->nr_inactive;
+ spin_unlock_irqrestore(&z->lru_lock, flags);
+
+ return ret == 0;
+}
+
struct zone *pzone_create(struct zone *parent, char *name, int npages)
{
struct zonelist zonelist;
@@ -2953,10 +3165,85 @@ bad1:
return NULL;
}
+#define PZONE_FLUSH_LOOP_COUNT 8
+
+/*
+ * Destroy a pseudo zone and return its pages to z->parent.  The caller must
+ * make sure that no one references this pseudo zone any more.
+ */
+void pzone_destroy(struct zone *z)
+{
+ struct zone *parent;
+ unsigned long flags;
+ unsigned long present;
+ int freed;
+ int retrycnt = 0;
+ /* Remember the page count now; it is credited to parent at the end. */
+ parent = z->parent;
+ present = z->present_pages;
+ pzone_parent_unregister(z);
+retry:
+ /* Drain per-cpu pagesets: via smp_call_function(), then directly on this CPU. */
+ smp_call_function(pzone_flush_percpu, z, 0, 1);
+ pzone_flush_percpu(z);
+
+ /* Drain the LRU lists; block on busy pages only on retries (retrycnt > 0). */
+ freed = pzone_flush_lru(z, parent, &z->active_list, &z->nr_active,
+ retrycnt > 0);
+ spin_lock_irqsave(&z->lock, flags);
+ z->present_pages -= freed;
+ spin_unlock_irqrestore(&z->lock, flags);
+
+ freed = pzone_flush_lru(z, parent, &z->inactive_list, &z->nr_inactive,
+ retrycnt > 0);
+ spin_lock_irqsave(&z->lock, flags);
+ z->present_pages -= freed;
+ spin_unlock_irqrestore(&z->lock, flags);
+
+ pzone_flush_free_area(z);
+ /* Not empty yet: retry up to PZONE_FLUSH_LOOP_COUNT times, then BUG(). */
+ if (!pzone_is_empty(z)) {
+ retrycnt++;
+ if (retrycnt > PZONE_FLUSH_LOOP_COUNT) {
+ BUG();
+ } else {
+ flush_workqueue(pzone_drain_wq);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(HZ);
+ goto retry;
+ }
+ }
+ /* Credit the pzone's original page count back to the parent zone. */
+ spin_lock_irqsave(&parent->lock, flags);
+ parent->present_pages += present;
+ spin_unlock_irqrestore(&parent->lock, flags);
+
+ flush_workqueue(pzone_drain_wq);
+ pzone_table_unregister(z);
+ pzone_free_pagesets(z);
+ kfree(z->name);
+ kfree(z);
+
+ setup_per_zone_pages_min();
+ setup_per_zone_lowmem_reserve();
+}
+
static int pzone_init(void)
{
+ struct work_struct *wp;
int i;
+ pzone_drain_wq = create_workqueue("pzone");
+ if (!pzone_drain_wq) {
+ printk(KERN_ERR "pzone: create_workqueue failed.\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < NR_CPUS; i++) {
+ wp = &per_cpu(pzone_drain_work, i);
+ INIT_WORK(wp, pzone_drain, NULL);
+ }
+
for (i = 0; i < MAX_NR_PZONES; i++)
list_add_tail(&pzone_table[i].list, &pzone_freelist);
diff -urNp a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c 2006-01-30 14:33:44.000000000 +0900
+++ b/mm/vmscan.c 2006-01-30 14:34:39.000000000 +0900
@@ -591,8 +591,8 @@ keep:
*
* returns how many pages were moved onto *@dst.
*/
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
- struct list_head *dst, int *scanned)
+int isolate_lru_pages(int nr_to_scan, struct list_head *src,
+ struct list_head *dst, int *scanned)
{
int nr_taken = 0;
struct page *page;
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2006-01-31 2:30 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-01-19 8:04 [PATCH 0/2] Pzone based CKRM memory resource controller KUROSAWA Takahiro
2006-01-19 8:04 ` [PATCH 1/2] Add the pzone KUROSAWA Takahiro
2006-01-19 18:04 ` Andy Whitcroft
2006-01-19 23:42 ` KUROSAWA Takahiro
2006-01-20 9:17 ` Andy Whitcroft
2006-01-20 7:08 ` KAMEZAWA Hiroyuki
2006-01-20 8:22 ` KUROSAWA Takahiro
2006-01-20 8:30 ` KAMEZAWA Hiroyuki
2006-01-19 8:04 ` [PATCH 2/2] Add CKRM memory resource controller using pzones KUROSAWA Takahiro
2006-01-31 2:30 ` [PATCH 0/8] Pzone based CKRM memory resource controller KUROSAWA Takahiro
2006-01-31 2:30 ` [PATCH 1/8] Add the __GFP_NOLRU flag KUROSAWA Takahiro
2006-01-31 18:18 ` [ckrm-tech] " Dave Hansen
2006-02-01 5:06 ` KUROSAWA Takahiro
2006-01-31 2:30 ` [PATCH 2/8] Keep the number of zones while zone iterator loop KUROSAWA Takahiro
2006-01-31 2:30 ` [PATCH 3/8] Add for_each_zone_in_node macro KUROSAWA Takahiro
2006-01-31 2:30 ` [PATCH 4/8] Extract zone specific routines as functions KUROSAWA Takahiro
2006-01-31 2:30 ` [PATCH 5/8] Add the pzone_create() function KUROSAWA Takahiro
2006-01-31 2:30 ` KUROSAWA Takahiro [this message]
2006-01-31 2:30 ` [PATCH 7/8] Make the number of pages in pzones resizable KUROSAWA Takahiro
2006-01-31 2:30 ` [PATCH 8/8] Add a CKRM memory resource controller using pzones KUROSAWA Takahiro
2006-02-01 2:58 ` [ckrm-tech] [PATCH 0/8] Pzone based CKRM memory resource controller chandra seetharaman
2006-02-01 5:39 ` KUROSAWA Takahiro
2006-02-01 6:16 ` Hirokazu Takahashi
2006-02-02 1:26 ` chandra seetharaman
2006-02-02 3:54 ` KUROSAWA Takahiro
2006-02-03 0:37 ` chandra seetharaman
2006-02-03 0:51 ` KUROSAWA Takahiro
2006-02-03 1:01 ` chandra seetharaman
2006-02-01 3:07 ` chandra seetharaman
2006-02-01 5:54 ` KUROSAWA Takahiro
2006-02-03 1:33 ` KUROSAWA Takahiro
2006-02-03 9:37 ` KUROSAWA Takahiro
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20060131023030.7915.57560.sendpatchset@debian \
--to=kurosawa@valinux.co.jp \
--cc=ckrm-tech@lists.sourceforge.net \
--cc=linux-mm@kvack.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.