* [RFC] [PATCH] memory controller background reclamation
@ 2007-10-22 9:43 YAMAMOTO Takashi
[not found] ` <20071022094349.60B351C1015-Pcsii4f/SVk@public.gmane.org>
0 siblings, 1 reply; 16+ messages in thread
From: YAMAMOTO Takashi @ 2007-10-22 9:43 UTC (permalink / raw)
To: balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
svaidy-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg
hi,
i implemented background reclamation for your memory controller and
did a few benchmarks with and without it. any comments?
YAMAMOTO Takashi
-----
"time make -j4 bzImage" in a cgroup with 64MB limit:
without patch:
real 22m22.389s
user 13m35.163s
sys 6m12.283s
memory.failcnt after a run: 106647
with patch:
real 19m25.860s
user 14m10.549s
sys 6m13.831s
memory.failcnt after a run: 35366
"for x in 1 2 3;do time dd if=/dev/zero bs=1M count=300|tail;done"
in a cgroup with 16MB limit:
without patch:
300+0 records in
300+0 records out
314572800 bytes (315 MB) copied, 19.0058 seconds, 16.6 MB/s
u 0.00s s 0.67s e 19.01s maj 90 min 2021 in 0 out 0
u 0.48s s 7.22s e 93.13s maj 20475 min 210903 in 0 out 0
300+0 records in
300+0 records out
314572800 bytes (315 MB) copied, 14.8394 seconds, 21.2 MB/s
u 0.00s s 0.63s e 14.84s maj 53 min 1392 in 0 out 0
u 0.48s s 7.00s e 94.39s maj 20125 min 211145 in 0 out 0
300+0 records in
300+0 records out
314572800 bytes (315 MB) copied, 14.0635 seconds, 22.4 MB/s
u 0.01s s 0.59s e 14.07s maj 50 min 1385 in 0 out 0
u 0.57s s 6.93s e 91.54s maj 20359 min 210911 in 0 out 0
memory.failcnt after a run: 8804
with patch:
300+0 records in
300+0 records out
314572800 bytes (315 MB) copied, 5.94875 seconds, 52.9 MB/s
u 0.00s s 0.67s e 5.95s maj 206 min 5393 in 0 out 0
u 0.45s s 6.63s e 46.07s maj 21350 min 210252 in 0 out 0
300+0 records in
300+0 records out
314572800 bytes (315 MB) copied, 8.16441 seconds, 38.5 MB/s
u 0.00s s 0.60s e 8.17s maj 240 min 4614 in 0 out 0
u 0.45s s 6.56s e 54.56s maj 22298 min 209255 in 0 out 0
300+0 records in
300+0 records out
314572800 bytes (315 MB) copied, 4.60967 seconds, 68.2 MB/s
u 0.00s s 0.60s e 4.64s maj 196 min 4733 in 0 out 0
u 0.46s s 6.53s e 49.28s maj 22223 min 209384 in 0 out 0
memory.failcnt after a run: 1589
-----
--- linux-2.6.23-rc8-mm2-cgkswapd/mm/memcontrol.c.BACKUP 2007-10-01 17:19:57.000000000 +0900
+++ linux-2.6.23-rc8-mm2-cgkswapd/mm/memcontrol.c 2007-10-22 13:46:01.000000000 +0900
@@ -27,6 +27,7 @@
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
+#include <linux/workqueue.h>
#include <linux/fs.h>
#include <asm/uaccess.h>
@@ -63,6 +64,8 @@ struct mem_cgroup {
*/
spinlock_t lru_lock;
unsigned long control_type; /* control RSS or RSS+Pagecache */
+
+ struct work_struct reclaim_work;
};
/*
@@ -95,6 +98,9 @@ enum {
static struct mem_cgroup init_mem_cgroup;
+static DEFINE_MUTEX(mem_cgroup_workqueue_init_lock);
+static struct workqueue_struct *mem_cgroup_workqueue;
+
static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
@@ -250,6 +256,69 @@ unsigned long mem_cgroup_isolate_pages(u
return nr_taken;
}
+static int
+mem_cgroup_need_reclaim(struct mem_cgroup *mem)
+{
+ struct res_counter * const cnt = &mem->res;
+ int doreclaim;
+ unsigned long flags;
+
+ /* XXX should be in res_counter */
+ /* XXX should not hardcode a watermark */
+ spin_lock_irqsave(&cnt->lock, flags);
+ doreclaim = cnt->usage > cnt->limit / 4 * 3;
+ spin_unlock_irqrestore(&cnt->lock, flags);
+
+ return doreclaim;
+}
+
+static void
+mem_cgroup_schedule_reclaim_if_necessary(struct mem_cgroup *mem)
+{
+
+ if (mem_cgroup_workqueue == NULL) {
+ BUG_ON(mem->css.cgroup->parent != NULL);
+ return;
+ }
+
+ if (work_pending(&mem->reclaim_work))
+ return;
+
+ if (!mem_cgroup_need_reclaim(mem))
+ return;
+
+ css_get(&mem->css); /* XXX need some thoughts wrt cgroup removal. */
+ /*
+ * XXX workqueue is not an ideal mechanism for our purpose.
+ * revisit later.
+ */
+ if (!queue_work(mem_cgroup_workqueue, &mem->reclaim_work))
+ css_put(&mem->css);
+}
+
+static void
+mem_cgroup_reclaim(struct work_struct *work)
+{
+ struct mem_cgroup * const mem =
+ container_of(work, struct mem_cgroup, reclaim_work);
+ int batch_count = 128; /* XXX arbitrary */
+
+ for (; batch_count > 0; batch_count--) {
+ if (!mem_cgroup_need_reclaim(mem))
+ break;
+ /*
+ * XXX try_to_free_foo is not a correct mechanism to
+ * use here. eg. ALLOCSTALL counter
+ * revisit later.
+ */
+ if (!try_to_free_mem_cgroup_pages(mem, GFP_KERNEL))
+ break;
+ }
+ if (batch_count == 0)
+ mem_cgroup_schedule_reclaim_if_necessary(mem);
+ css_put(&mem->css);
+}
+
/*
* Charge the memory controller for page usage.
* Return
@@ -311,6 +380,9 @@ int mem_cgroup_charge(struct page *page,
*/
while (res_counter_charge(&mem->res, PAGE_SIZE)) {
bool is_atomic = gfp_mask & GFP_ATOMIC;
+
+ mem_cgroup_schedule_reclaim_if_necessary(mem);
+
/*
* We cannot reclaim under GFP_ATOMIC, fail the charge
*/
@@ -551,8 +623,18 @@ mem_cgroup_create(struct cgroup_subsys *
if (unlikely((cont->parent) == NULL)) {
mem = &init_mem_cgroup;
init_mm.mem_cgroup = mem;
- } else
+ } else {
+ /* XXX too late for the top-level cgroup */
+ if (mem_cgroup_workqueue == NULL) {
+ mutex_lock(&mem_cgroup_workqueue_init_lock);
+ if (mem_cgroup_workqueue == NULL) {
+ mem_cgroup_workqueue =
+ create_workqueue("mem_cgroup");
+ }
+ mutex_unlock(&mem_cgroup_workqueue_init_lock);
+ }
mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
+ }
if (mem == NULL)
return NULL;
@@ -562,6 +644,7 @@ mem_cgroup_create(struct cgroup_subsys *
INIT_LIST_HEAD(&mem->inactive_list);
spin_lock_init(&mem->lru_lock);
mem->control_type = MEM_CGROUP_TYPE_ALL;
+ INIT_WORK(&mem->reclaim_work, mem_cgroup_reclaim);
return &mem->css;
}
^ permalink raw reply [flat|nested] 16+ messages in thread[parent not found: <20071022094349.60B351C1015-Pcsii4f/SVk@public.gmane.org>]
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <20071022094349.60B351C1015-Pcsii4f/SVk@public.gmane.org> @ 2007-10-22 16:10 ` Balbir Singh [not found] ` <471CCB85.3040905-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> 2007-10-22 17:10 ` Vaidyanathan Srinivasan 1 sibling, 1 reply; 16+ messages in thread From: Balbir Singh @ 2007-10-22 16:10 UTC (permalink / raw) To: YAMAMOTO Takashi Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg YAMAMOTO Takashi wrote: > hi, > > i implemented background reclamation for your memory controller and > did a few benchmarks with and without it. any comments? > > YAMAMOTO Takashi > > > ----- > "time make -j4 bzImage" in a cgroup with 64MB limit: > > without patch: > real 22m22.389s > user 13m35.163s > sys 6m12.283s > > memory.failcnt after a run: 106647 > > with patch: > real 19m25.860s > user 14m10.549s > sys 6m13.831s > > memory.failcnt after a run: 35366 > > "for x in 1 2 3;do time dd if=/dev/zero bs=1M count=300|tail;done" > in a cgroup with 16MB limit: > > without patch: > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 19.0058 seconds, 16.6 MB/s > u 0.00s s 0.67s e 19.01s maj 90 min 2021 in 0 out 0 > u 0.48s s 7.22s e 93.13s maj 20475 min 210903 in 0 out 0 > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 14.8394 seconds, 21.2 MB/s > u 0.00s s 0.63s e 14.84s maj 53 min 1392 in 0 out 0 > u 0.48s s 7.00s e 94.39s maj 20125 min 211145 in 0 out 0 > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 14.0635 seconds, 22.4 MB/s > u 0.01s s 0.59s e 14.07s maj 50 min 1385 in 0 out 0 > u 0.57s s 6.93s e 91.54s maj 20359 min 210911 in 0 out 0 > > memory.failcnt after a run: 8804 > > with patch: > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 5.94875 seconds, 52.9 MB/s > u 0.00s s 0.67s e 5.95s maj 206 min 5393 in 0 out 0 > u 0.45s s 6.63s e 46.07s maj 21350 min 210252 in 0 out 0 > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 8.16441 seconds, 38.5 MB/s > u 0.00s s 0.60s e 8.17s maj 240 min 4614 in 0 out 0 > u 0.45s s 6.56s e 54.56s maj 22298 min 209255 in 0 out 0 > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 4.60967 seconds, 68.2 MB/s > u 0.00s s 0.60s e 4.64s maj 196 min 4733 in 0 out 0 > u 0.46s s 6.53s e 49.28s maj 22223 min 209384 in 0 out 0 > > memory.failcnt after a run: 1589 Looks quite nice! > ----- > > > --- linux-2.6.23-rc8-mm2-cgkswapd/mm/memcontrol.c.BACKUP 2007-10-01 17:19:57.000000000 +0900 > +++ linux-2.6.23-rc8-mm2-cgkswapd/mm/memcontrol.c 2007-10-22 13:46:01.000000000 +0900 > @@ -27,6 +27,7 @@ > #include <linux/rcupdate.h> > #include <linux/swap.h> > #include <linux/spinlock.h> > +#include <linux/workqueue.h> > #include <linux/fs.h> > > #include <asm/uaccess.h> > @@ -63,6 +64,8 @@ struct mem_cgroup { > */ > spinlock_t lru_lock; > unsigned long control_type; /* control RSS or RSS+Pagecache */ > + > + struct work_struct reclaim_work; > }; > > /* > @@ -95,6 +98,9 @@ enum { > > static struct mem_cgroup init_mem_cgroup; > > +static DEFINE_MUTEX(mem_cgroup_workqueue_init_lock); > +static struct workqueue_struct *mem_cgroup_workqueue; > + > static inline > struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) > { > @@ -250,6 +256,69 @@ unsigned long mem_cgroup_isolate_pages(u > return nr_taken; > } > > +static int > +mem_cgroup_need_reclaim(struct mem_cgroup *mem) > +{ > + struct res_counter * const cnt = &mem->res; > + int doreclaim; > + unsigned long flags; > + > + /* XXX should be in res_counter */ > + /* XXX should not hardcode a watermark */ We could add the following API to resource counters res_counter_set_low_watermark res_counter_set_high_watermark res_counter_below_low_watermark res_counter_above_high_watermark and add low_watermark high_watermark members to the resource group. We could push out data upto the low watermark from the cgroup. > + spin_lock_irqsave(&cnt->lock, flags); > + doreclaim = cnt->usage > cnt->limit / 4 * 3; > + spin_unlock_irqrestore(&cnt->lock, flags); > + > + return doreclaim; > +} > + > +static void > +mem_cgroup_schedule_reclaim_if_necessary(struct mem_cgroup *mem) > +{ > + > + if (mem_cgroup_workqueue == NULL) { > + BUG_ON(mem->css.cgroup->parent != NULL); > + return; > + } > + > + if (work_pending(&mem->reclaim_work)) > + return; > + > + if (!mem_cgroup_need_reclaim(mem)) > + return; > + > + css_get(&mem->css); /* XXX need some thoughts wrt cgroup removal. */ > + /* > + * XXX workqueue is not an ideal mechanism for our purpose. > + * revisit later. > + */ > + if (!queue_work(mem_cgroup_workqueue, &mem->reclaim_work)) > + css_put(&mem->css); > +} > + > +static void > +mem_cgroup_reclaim(struct work_struct *work) > +{ > + struct mem_cgroup * const mem = > + container_of(work, struct mem_cgroup, reclaim_work); > + int batch_count = 128; /* XXX arbitrary */ > + > + for (; batch_count > 0; batch_count--) { > + if (!mem_cgroup_need_reclaim(mem)) > + break; > + /* > + * XXX try_to_free_foo is not a correct mechanism to > + * use here. eg. ALLOCSTALL counter > + * revisit later. > + */ > + if (!try_to_free_mem_cgroup_pages(mem, GFP_KERNEL)) We could make try_to_free_mem_cgroup_pages, batch aware and pass that in scan_control. > + break; > + } > + if (batch_count == 0) > + mem_cgroup_schedule_reclaim_if_necessary(mem); > + css_put(&mem->css); > +} > + > /* > * Charge the memory controller for page usage. > * Return > @@ -311,6 +380,9 @@ int mem_cgroup_charge(struct page *page, > */ > while (res_counter_charge(&mem->res, PAGE_SIZE)) { > bool is_atomic = gfp_mask & GFP_ATOMIC; > + > + mem_cgroup_schedule_reclaim_if_necessary(mem); > + > /* > * We cannot reclaim under GFP_ATOMIC, fail the charge > */ > @@ -551,8 +623,18 @@ mem_cgroup_create(struct cgroup_subsys * > if (unlikely((cont->parent) == NULL)) { > mem = &init_mem_cgroup; > init_mm.mem_cgroup = mem; > - } else > + } else { > + /* XXX too late for the top-level cgroup */ > + if (mem_cgroup_workqueue == NULL) { > + mutex_lock(&mem_cgroup_workqueue_init_lock); > + if (mem_cgroup_workqueue == NULL) { > + mem_cgroup_workqueue = > + create_workqueue("mem_cgroup"); > + } > + mutex_unlock(&mem_cgroup_workqueue_init_lock); > + } > mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL); > + } > > if (mem == NULL) > return NULL; > @@ -562,6 +644,7 @@ mem_cgroup_create(struct cgroup_subsys * > INIT_LIST_HEAD(&mem->inactive_list); > spin_lock_init(&mem->lru_lock); > mem->control_type = MEM_CGROUP_TYPE_ALL; > + INIT_WORK(&mem->reclaim_work, mem_cgroup_reclaim); > return &mem->css; > } > -- Warm Regards, Balbir Singh Linux Technology Center IBM, ISTL ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <471CCB85.3040905-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>]
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <471CCB85.3040905-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> @ 2007-10-22 23:44 ` YAMAMOTO Takashi [not found] ` <20071022234430.EE6B11C1231-Pcsii4f/SVk@public.gmane.org> 0 siblings, 1 reply; 16+ messages in thread From: YAMAMOTO Takashi @ 2007-10-22 23:44 UTC (permalink / raw) To: balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg hi, > > @@ -250,6 +256,69 @@ unsigned long mem_cgroup_isolate_pages(u > > return nr_taken; > > } > > > > +static int > > +mem_cgroup_need_reclaim(struct mem_cgroup *mem) > > +{ > > + struct res_counter * const cnt = &mem->res; > > + int doreclaim; > > + unsigned long flags; > > + > > + /* XXX should be in res_counter */ > > + /* XXX should not hardcode a watermark */ > > We could add the following API to resource counters > > res_counter_set_low_watermark > res_counter_set_high_watermark > res_counter_below_low_watermark > res_counter_above_high_watermark > > and add > > low_watermark > high_watermark > > members to the resource group. We could push out data > upto the low watermark from the cgroup. it sounds fine to me. > > +static void > > +mem_cgroup_reclaim(struct work_struct *work) > > +{ > > + struct mem_cgroup * const mem = > > + container_of(work, struct mem_cgroup, reclaim_work); > > + int batch_count = 128; /* XXX arbitrary */ > > + > > + for (; batch_count > 0; batch_count--) { > > + if (!mem_cgroup_need_reclaim(mem)) > > + break; > > + /* > > + * XXX try_to_free_foo is not a correct mechanism to > > + * use here. eg. ALLOCSTALL counter > > + * revisit later. > > + */ > > + if (!try_to_free_mem_cgroup_pages(mem, GFP_KERNEL)) > > We could make try_to_free_mem_cgroup_pages, batch aware and pass that > in scan_control. in the comment above, i meant that it might be better to introduce something like balance_pgdat rather than using try_to_free_mem_cgroup_pages. with the current design of cgroup lru lists, probably it doesn't matter much except statistics, tho. YAMAMOTO Takashi ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <20071022234430.EE6B11C1231-Pcsii4f/SVk@public.gmane.org>]
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <20071022234430.EE6B11C1231-Pcsii4f/SVk@public.gmane.org> @ 2007-11-15 6:16 ` YAMAMOTO Takashi [not found] ` <20071115061602.856171CD9BF-Pcsii4f/SVk@public.gmane.org> [not found] ` <20071122085733.758AD1CEE9F-Pcsii4f/SVk@public.gmane.org> 0 siblings, 2 replies; 16+ messages in thread From: YAMAMOTO Takashi @ 2007-11-15 6:16 UTC (permalink / raw) To: balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg > > We could add the following API to resource counters > > > > res_counter_set_low_watermark > > res_counter_set_high_watermark > > res_counter_below_low_watermark > > res_counter_above_high_watermark > > > > and add > > > > low_watermark > > high_watermark > > > > members to the resource group. We could push out data > > upto the low watermark from the cgroup. i implemented something like that. (and rebased to 2.6.24-rc2-mm1.) what's the best way to expose watermarks to userland is an open question. i took the simplest way for now. do you have any suggestions? YAMAMOTO Takashi --- ./include/linux/res_counter.h.orig 2007-11-14 15:57:31.000000000 +0900 +++ ./include/linux/res_counter.h 2007-11-14 16:03:24.000000000 +0900 @@ -32,6 +32,13 @@ struct res_counter { * the number of unsuccessful attempts to consume the resource */ unsigned long long failcnt; + + /* + * watermarks + */ + unsigned long long high_watermark; + unsigned long long low_watermark; + /* * the lock to protect all of the above. * the routines below consider this to be IRQ-safe @@ -66,6 +73,8 @@ enum { RES_USAGE, RES_LIMIT, RES_FAILCNT, + RES_HIGH_WATERMARK, + RES_LOW_WATERMARK, }; /* @@ -124,4 +133,26 @@ static inline bool res_counter_check_und return ret; } +static inline bool res_counter_below_low_watermark(struct res_counter *cnt) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->usage < cnt->low_watermark; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + +static inline bool res_counter_above_high_watermark(struct res_counter *cnt) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->usage > cnt->high_watermark; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + #endif --- ./kernel/res_counter.c.orig 2007-11-14 15:57:31.000000000 +0900 +++ ./kernel/res_counter.c 2007-11-14 16:03:24.000000000 +0900 @@ -17,6 +17,8 @@ void res_counter_init(struct res_counter { spin_lock_init(&counter->lock); counter->limit = (unsigned long long)LLONG_MAX; + counter->high_watermark = (unsigned long long)LLONG_MAX; + counter->low_watermark = (unsigned long long)LLONG_MAX; } int res_counter_charge_locked(struct res_counter *counter, unsigned long val) @@ -69,6 +71,10 @@ res_counter_member(struct res_counter *c return &counter->limit; case RES_FAILCNT: return &counter->failcnt; + case RES_HIGH_WATERMARK: + return &counter->high_watermark; + case RES_LOW_WATERMARK: + return &counter->low_watermark; }; BUG(); @@ -99,6 +105,7 @@ ssize_t res_counter_write(struct res_cou int ret; char *buf, *end; unsigned long long tmp, *val; + unsigned long flags; buf = kmalloc(nbytes + 1, GFP_KERNEL); ret = -ENOMEM; @@ -122,9 +129,29 @@ ssize_t res_counter_write(struct res_cou goto out_free; } + spin_lock_irqsave(&counter->lock, flags); val = res_counter_member(counter, member); + /* ensure low_watermark <= high_watermark <= limit */ + switch (member) { + case RES_LIMIT: + if (tmp < counter->high_watermark) + goto out_locked; + break; + case RES_HIGH_WATERMARK: + if (tmp > counter->limit || tmp < counter->low_watermark) + goto out_locked; + break; + case RES_LOW_WATERMARK: + if (tmp > counter->high_watermark) + goto out_locked; + break; + } *val = tmp; + BUG_ON(counter->high_watermark > counter->limit); + BUG_ON(counter->low_watermark > counter->high_watermark); ret = nbytes; +out_locked: + spin_unlock_irqrestore(&counter->lock, flags); out_free: kfree(buf); out: --- ./mm/memcontrol.c.orig 2007-11-14 15:57:46.000000000 +0900 +++ ./mm/memcontrol.c 2007-11-14 16:03:53.000000000 +0900 @@ -28,6 +28,7 @@ #include <linux/rcupdate.h> #include <linux/swap.h> #include <linux/spinlock.h> +#include <linux/workqueue.h> #include <linux/fs.h> #include <linux/seq_file.h> @@ -110,6 +111,10 @@ struct mem_cgroup { * statistics. */ struct mem_cgroup_stat stat; + /* + * background reclamation. + */ + struct work_struct reclaim_work; }; /* @@ -168,6 +173,9 @@ static void mem_cgroup_charge_statistics static struct mem_cgroup init_mem_cgroup; +static DEFINE_MUTEX(mem_cgroup_workqueue_init_lock); +static struct workqueue_struct *mem_cgroup_workqueue; + static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { @@ -375,6 +383,50 @@ unsigned long mem_cgroup_isolate_pages(u return nr_taken; } +static void +mem_cgroup_schedule_reclaim(struct mem_cgroup *mem) +{ + + if (mem_cgroup_workqueue == NULL) { + BUG_ON(mem->css.cgroup->parent != NULL); + return; + } + + if (work_pending(&mem->reclaim_work)) + return; + + css_get(&mem->css); /* XXX need some thoughts wrt cgroup removal. */ + /* + * XXX workqueue is not an ideal mechanism for our purpose. + * revisit later. + */ + if (!queue_work(mem_cgroup_workqueue, &mem->reclaim_work)) + css_put(&mem->css); +} + +static void +mem_cgroup_reclaim(struct work_struct *work) +{ + struct mem_cgroup * const mem = + container_of(work, struct mem_cgroup, reclaim_work); + int batch_count = 128; /* XXX arbitrary */ + + for (; batch_count > 0; batch_count--) { + if (res_counter_below_low_watermark(&mem->res)) + break; + /* + * XXX try_to_free_foo is not a correct mechanism to + * use here. eg. ALLOCSTALL counter + * revisit later. + */ + if (!try_to_free_mem_cgroup_pages(mem, GFP_KERNEL)) + break; + } + if (batch_count == 0) + mem_cgroup_schedule_reclaim(mem); + css_put(&mem->css); +} + /* * Charge the memory controller for page usage. * Return @@ -439,11 +491,18 @@ retry: rcu_read_unlock(); /* + * schedule background reclaim if we are above the high watermark. + */ + if (res_counter_above_high_watermark(&mem->res)) + mem_cgroup_schedule_reclaim(mem); + + /* * If we created the page_cgroup, we should free it on exceeding * the cgroup limit. */ while (res_counter_charge(&mem->res, PAGE_SIZE)) { bool is_atomic = gfp_mask & GFP_ATOMIC; + /* * We cannot reclaim under GFP_ATOMIC, fail the charge */ @@ -861,6 +920,18 @@ static struct cftype mem_cgroup_files[] .read = mem_cgroup_read, }, { + .name = "high_watermark_in_bytes", + .private = RES_HIGH_WATERMARK, + .write = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "low_watermark_in_bytes", + .private = RES_LOW_WATERMARK, + .write = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { .name = "control_type", .write = mem_control_type_write, .read = mem_control_type_read, @@ -886,8 +957,18 @@ mem_cgroup_create(struct cgroup_subsys * if (unlikely((cont->parent) == NULL)) { mem = &init_mem_cgroup; init_mm.mem_cgroup = mem; - } else + } else { + /* XXX too late for the top-level cgroup */ + if (mem_cgroup_workqueue == NULL) { + mutex_lock(&mem_cgroup_workqueue_init_lock); + if (mem_cgroup_workqueue == NULL) { + mem_cgroup_workqueue = + create_workqueue("mem_cgroup"); + } + mutex_unlock(&mem_cgroup_workqueue_init_lock); + } mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL); + } if (mem == NULL) return NULL; @@ -897,6 +978,7 @@ mem_cgroup_create(struct cgroup_subsys * INIT_LIST_HEAD(&mem->inactive_list); spin_lock_init(&mem->lru_lock); mem->control_type = MEM_CGROUP_TYPE_ALL; + INIT_WORK(&mem->reclaim_work, mem_cgroup_reclaim); return &mem->css; } ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <20071115061602.856171CD9BF-Pcsii4f/SVk@public.gmane.org>]
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <20071115061602.856171CD9BF-Pcsii4f/SVk@public.gmane.org> @ 2007-11-22 8:57 ` YAMAMOTO Takashi 0 siblings, 0 replies; 16+ messages in thread From: YAMAMOTO Takashi @ 2007-11-22 8:57 UTC (permalink / raw) To: balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg > > > We could add the following API to resource counters > > > > > > res_counter_set_low_watermark > > > res_counter_set_high_watermark > > > res_counter_below_low_watermark > > > res_counter_above_high_watermark > > > > > > and add > > > > > > low_watermark > > > high_watermark > > > > > > members to the resource group. We could push out data > > > upto the low watermark from the cgroup. > > i implemented something like that. (and rebased to 2.6.24-rc2-mm1.) > > what's the best way to expose watermarks to userland is an open question. > i took the simplest way for now. do you have any suggestions? > > YAMAMOTO Takashi here's a new version. changes from the previous: - rebase to linux-2.6.24-rc2-mm1 + kamezawa patchset. - create workqueue when setting high watermark, rather than cgroup creation which is earlier on boot. YAMAMOTO Takashi Signed-off-by: YAMAMOTO Takashi <yamamoto-jCdQPDEk3idL9jVzuh4AOg@public.gmane.org> --- --- linux-2.6.24-rc2-mm1-kame-pd/include/linux/res_counter.h.BACKUP 2007-11-14 16:05:48.000000000 +0900 +++ linux-2.6.24-rc2-mm1-kame-pd/include/linux/res_counter.h 2007-11-22 15:14:32.000000000 +0900 @@ -32,6 +32,13 @@ struct res_counter { * the number of unsuccessful attempts to consume the resource */ unsigned long long failcnt; + + /* + * watermarks + */ + unsigned long long high_watermark; + unsigned long long low_watermark; + /* * the lock to protect all of the above. * the routines below consider this to be IRQ-safe @@ -66,6 +73,8 @@ enum { RES_USAGE, RES_LIMIT, RES_FAILCNT, + RES_HIGH_WATERMARK, + RES_LOW_WATERMARK, }; /* @@ -124,4 +133,26 @@ static inline bool res_counter_check_und return ret; } +static inline bool res_counter_below_low_watermark(struct res_counter *cnt) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->usage < cnt->low_watermark; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + +static inline bool res_counter_above_high_watermark(struct res_counter *cnt) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->usage > cnt->high_watermark; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + #endif --- linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c.BACKUP 2007-11-14 16:05:52.000000000 +0900 +++ linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c 2007-11-22 15:14:32.000000000 +0900 @@ -17,6 +17,8 @@ void res_counter_init(struct res_counter { spin_lock_init(&counter->lock); counter->limit = (unsigned long long)LLONG_MAX; + counter->high_watermark = (unsigned long long)LLONG_MAX; + counter->low_watermark = (unsigned long long)LLONG_MAX; } int res_counter_charge_locked(struct res_counter *counter, unsigned long val) @@ -69,6 +71,10 @@ res_counter_member(struct res_counter *c return &counter->limit; case RES_FAILCNT: return &counter->failcnt; + case RES_HIGH_WATERMARK: + return &counter->high_watermark; + case RES_LOW_WATERMARK: + return &counter->low_watermark; }; BUG(); @@ -99,6 +105,7 @@ ssize_t res_counter_write(struct res_cou int ret; char *buf, *end; unsigned long long tmp, *val; + unsigned long flags; buf = kmalloc(nbytes + 1, GFP_KERNEL); ret = -ENOMEM; @@ -122,9 +129,29 @@ ssize_t res_counter_write(struct res_cou goto out_free; } + spin_lock_irqsave(&counter->lock, flags); val = res_counter_member(counter, member); + /* ensure low_watermark <= high_watermark <= limit */ + switch (member) { + case RES_LIMIT: + if (tmp < counter->high_watermark) + goto out_locked; + break; + case RES_HIGH_WATERMARK: + if (tmp > counter->limit || tmp < counter->low_watermark) + goto out_locked; + break; + case RES_LOW_WATERMARK: + if (tmp > counter->high_watermark) + goto out_locked; + break; + } *val = tmp; + BUG_ON(counter->high_watermark > counter->limit); + BUG_ON(counter->low_watermark > counter->high_watermark); ret = nbytes; +out_locked: + spin_unlock_irqrestore(&counter->lock, flags); out_free: kfree(buf); out: --- linux-2.6.24-rc2-mm1-kame-pd/mm/memcontrol.c.BACKUP 2007-11-20 13:11:09.000000000 +0900 +++ linux-2.6.24-rc2-mm1-kame-pd/mm/memcontrol.c 2007-11-22 16:29:26.000000000 +0900 @@ -28,6 +28,7 @@ #include <linux/rcupdate.h> #include <linux/swap.h> #include <linux/spinlock.h> +#include <linux/workqueue.h> #include <linux/fs.h> #include <linux/seq_file.h> @@ -138,6 +139,10 @@ struct mem_cgroup { * statistics. */ struct mem_cgroup_stat stat; + /* + * background reclamation. + */ + struct work_struct reclaim_work; }; /* @@ -240,6 +245,21 @@ static unsigned long mem_cgroup_get_all_ static struct mem_cgroup init_mem_cgroup; +static DEFINE_MUTEX(mem_cgroup_workqueue_init_lock); +static struct workqueue_struct *mem_cgroup_workqueue; + +static void mem_cgroup_create_workqueue(void) +{ + + if (mem_cgroup_workqueue != NULL) + return; + + mutex_lock(&mem_cgroup_workqueue_init_lock); + if (mem_cgroup_workqueue == NULL) + mem_cgroup_workqueue = create_workqueue("mem_cgroup"); + mutex_unlock(&mem_cgroup_workqueue_init_lock); +} + static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { @@ -566,6 +586,50 @@ unsigned long mem_cgroup_isolate_pages(u return nr_taken; } +static void +mem_cgroup_schedule_reclaim(struct mem_cgroup *mem) +{ + + if (mem_cgroup_workqueue == NULL) { + BUG_ON(mem->css.cgroup->parent != NULL); + return; + } + + if (work_pending(&mem->reclaim_work)) + return; + + css_get(&mem->css); /* XXX need some thoughts wrt cgroup removal. */ + /* + * XXX workqueue is not an ideal mechanism for our purpose. + * revisit later. + */ + if (!queue_work(mem_cgroup_workqueue, &mem->reclaim_work)) + css_put(&mem->css); +} + +static void +mem_cgroup_reclaim(struct work_struct *work) +{ + struct mem_cgroup * const mem = + container_of(work, struct mem_cgroup, reclaim_work); + int batch_count = 128; /* XXX arbitrary */ + + for (; batch_count > 0; batch_count--) { + if (res_counter_below_low_watermark(&mem->res)) + break; + /* + * XXX try_to_free_foo is not a correct mechanism to + * use here. eg. ALLOCSTALL counter + * revisit later. + */ + if (!try_to_free_mem_cgroup_pages(mem, GFP_KERNEL)) + break; + } + if (batch_count == 0) + mem_cgroup_schedule_reclaim(mem); + css_put(&mem->css); +} + /* * Charge the memory controller for page usage. * Return @@ -631,6 +695,12 @@ retry: rcu_read_unlock(); /* + * schedule background reclaim if we are above the high watermark. + */ + if (res_counter_above_high_watermark(&mem->res)) + mem_cgroup_schedule_reclaim(mem); + + /* * If we created the page_cgroup, we should free it on exceeding * the cgroup limit. */ @@ -939,9 +1009,16 @@ static ssize_t mem_cgroup_write(struct c struct file *file, const char __user *userbuf, size_t nbytes, loff_t *ppos) { - return res_counter_write(&mem_cgroup_from_cont(cont)->res, + ssize_t ret; + + ret = res_counter_write(&mem_cgroup_from_cont(cont)->res, cft->private, userbuf, nbytes, ppos, mem_cgroup_write_strategy); + + if (ret >= 0 && cft->private == RES_HIGH_WATERMARK) + mem_cgroup_create_workqueue(); + + return ret; } static ssize_t mem_control_type_write(struct cgroup *cont, @@ -1097,6 +1174,18 @@ static struct cftype mem_cgroup_files[] .read = mem_cgroup_read, }, { + .name = "high_watermark_in_bytes", + .private = RES_HIGH_WATERMARK, + .write = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "low_watermark_in_bytes", + .private = RES_LOW_WATERMARK, + .write = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { .name = "control_type", .write = mem_control_type_write, .read = mem_control_type_read, @@ -1161,6 +1250,8 @@ mem_cgroup_create(struct cgroup_subsys * if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; + INIT_WORK(&mem->reclaim_work, mem_cgroup_reclaim); + return &mem->css; free_out: for_each_node_state(node, N_POSSIBLE) ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <20071122085733.758AD1CEE9F-Pcsii4f/SVk@public.gmane.org>]
* Re: Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <20071122085733.758AD1CEE9F-Pcsii4f/SVk@public.gmane.org> @ 2007-11-22 13:33 ` kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A 2007-11-23 3:46 ` Balbir Singh 1 sibling, 0 replies; 16+ messages in thread From: kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A @ 2007-11-22 13:33 UTC (permalink / raw) To: YAMAMOTO Takashi Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg, balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 >here's a new version. > Thank you. I'll review and try to merge yours in the next week, Regards, -Kame ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <20071122085733.758AD1CEE9F-Pcsii4f/SVk@public.gmane.org> 2007-11-22 13:33 ` kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A @ 2007-11-23 3:46 ` Balbir Singh [not found] ` <47464D04.2030906-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> 1 sibling, 1 reply; 16+ messages in thread From: Balbir Singh @ 2007-11-23 3:46 UTC (permalink / raw) To: YAMAMOTO Takashi Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg YAMAMOTO Takashi wrote: > changes from the previous: > - rebase to linux-2.6.24-rc2-mm1 + kamezawa patchset. > - create workqueue when setting high watermark, rather than cgroup creation > which is earlier on boot. > > YAMAMOTO Takashi > > > Signed-off-by: YAMAMOTO Takashi <yamamoto-jCdQPDEk3idL9jVzuh4AOg@public.gmane.org> > --- > > --- linux-2.6.24-rc2-mm1-kame-pd/include/linux/res_counter.h.BACKUP 2007-11-14 16:05:48.000000000 +0900 > +++ linux-2.6.24-rc2-mm1-kame-pd/include/linux/res_counter.h 2007-11-22 15:14:32.000000000 +0900 > @@ -32,6 +32,13 @@ struct res_counter { > * the number of unsuccessful attempts to consume the resource > */ > unsigned long long failcnt; > + > + /* > + * watermarks > + */ > + unsigned long long high_watermark; > + unsigned long long low_watermark; > + > /* > * the lock to protect all of the above. > * the routines below consider this to be IRQ-safe > @@ -66,6 +73,8 @@ enum { > RES_USAGE, > RES_LIMIT, > RES_FAILCNT, > + RES_HIGH_WATERMARK, > + RES_LOW_WATERMARK, > }; > > /* > @@ -124,4 +133,26 @@ static inline bool res_counter_check_und > return ret; > } > > +static inline bool res_counter_below_low_watermark(struct res_counter *cnt) > +{ > + bool ret; > + unsigned long flags; > + > + spin_lock_irqsave(&cnt->lock, flags); > + ret = cnt->usage < cnt->low_watermark; > + spin_unlock_irqrestore(&cnt->lock, flags); > + return ret; > +} > + > +static inline bool res_counter_above_high_watermark(struct res_counter *cnt) > +{ > + bool ret; > + unsigned long flags; > + > + spin_lock_irqsave(&cnt->lock, flags); > + ret = cnt->usage > cnt->high_watermark; > + spin_unlock_irqrestore(&cnt->lock, flags); > + return ret; > +} > + > #endif > --- linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c.BACKUP 2007-11-14 16:05:52.000000000 +0900 > +++ linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c 2007-11-22 15:14:32.000000000 +0900 > @@ -17,6 +17,8 @@ void res_counter_init(struct res_counter > { > spin_lock_init(&counter->lock); > counter->limit = (unsigned long long)LLONG_MAX; > + counter->high_watermark = (unsigned long long)LLONG_MAX; > + counter->low_watermark = (unsigned long long)LLONG_MAX; Should low watermark also be LLONG_MAX? > } > > int res_counter_charge_locked(struct res_counter *counter, unsigned long val) > @@ -69,6 +71,10 @@ res_counter_member(struct res_counter *c > return &counter->limit; > case RES_FAILCNT: > return &counter->failcnt; > + case RES_HIGH_WATERMARK: > + return &counter->high_watermark; > + case RES_LOW_WATERMARK: > + return &counter->low_watermark; > }; > > BUG(); > @@ -99,6 +105,7 @@ ssize_t res_counter_write(struct res_cou > int ret; > char *buf, *end; > unsigned long long tmp, *val; > + unsigned long flags; > > buf = kmalloc(nbytes + 1, GFP_KERNEL); > ret = -ENOMEM; > @@ -122,9 +129,29 @@ ssize_t res_counter_write(struct res_cou > goto out_free; > } > > + spin_lock_irqsave(&counter->lock, flags); > val = res_counter_member(counter, member); > + /* ensure low_watermark <= high_watermark <= limit */ > + switch (member) { > + case RES_LIMIT: > + if (tmp < counter->high_watermark) > + goto out_locked; > + break; > + case RES_HIGH_WATERMARK: > + if (tmp > counter->limit || tmp < counter->low_watermark) > + goto out_locked; > + break; > + case RES_LOW_WATERMARK: > + if (tmp > counter->high_watermark) > + goto out_locked; > + break; > + } > *val = tmp; > + BUG_ON(counter->high_watermark > counter->limit); > + BUG_ON(counter->low_watermark > counter->high_watermark); > ret = nbytes; > +out_locked: > + spin_unlock_irqrestore(&counter->lock, flags); > out_free: > kfree(buf); > out: > --- linux-2.6.24-rc2-mm1-kame-pd/mm/memcontrol.c.BACKUP 2007-11-20 13:11:09.000000000 +0900 > +++ linux-2.6.24-rc2-mm1-kame-pd/mm/memcontrol.c 2007-11-22 16:29:26.000000000 +0900 > @@ -28,6 +28,7 @@ > #include <linux/rcupdate.h> > #include <linux/swap.h> > #include <linux/spinlock.h> > +#include <linux/workqueue.h> > #include <linux/fs.h> > #include <linux/seq_file.h> > > @@ -138,6 +139,10 @@ struct mem_cgroup { > * statistics. > */ > struct mem_cgroup_stat stat; > + /* > + * background reclamation. > + */ > + struct work_struct reclaim_work; > }; > > /* > @@ -240,6 +245,21 @@ static unsigned long mem_cgroup_get_all_ > > static struct mem_cgroup init_mem_cgroup; > > +static DEFINE_MUTEX(mem_cgroup_workqueue_init_lock); > +static struct workqueue_struct *mem_cgroup_workqueue; > + > +static void mem_cgroup_create_workqueue(void) > +{ > + > + if (mem_cgroup_workqueue != NULL) > + return; > + > + mutex_lock(&mem_cgroup_workqueue_init_lock); > + if (mem_cgroup_workqueue == NULL) > + mem_cgroup_workqueue = create_workqueue("mem_cgroup"); > + mutex_unlock(&mem_cgroup_workqueue_init_lock); > +} > + > static inline > struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) > { > @@ -566,6 +586,50 @@ unsigned long mem_cgroup_isolate_pages(u > return nr_taken; > } > > +static void > +mem_cgroup_schedule_reclaim(struct mem_cgroup *mem) > +{ > + > + if (mem_cgroup_workqueue == NULL) { > + BUG_ON(mem->css.cgroup->parent != NULL); > + return; > + } > + > + if (work_pending(&mem->reclaim_work)) > + return; > + > + css_get(&mem->css); /* XXX need some thoughts wrt cgroup removal. */ > + /* > + * XXX workqueue is not an ideal mechanism for our purpose. > + * revisit later. > + */ > + if (!queue_work(mem_cgroup_workqueue, &mem->reclaim_work)) > + css_put(&mem->css); > +} > + > +static void > +mem_cgroup_reclaim(struct work_struct *work) > +{ > + struct mem_cgroup * const mem = > + container_of(work, struct mem_cgroup, reclaim_work); > + int batch_count = 128; /* XXX arbitrary */ Could we define and use something like MEM_CGROUP_BATCH_COUNT for now? Later we could consider and see if it needs to be tunable. numbers are hard to read in code. > + > + for (; batch_count > 0; batch_count--) { > + if (res_counter_below_low_watermark(&mem->res)) > + break; Shouldn't we also check to see that we start reclaim in background only when we are above the high watermark? > + /* > + * XXX try_to_free_foo is not a correct mechanism to > + * use here. eg. ALLOCSTALL counter > + * revisit later. > + */ > + if (!try_to_free_mem_cgroup_pages(mem, GFP_KERNEL)) > + break; > + } > + if (batch_count == 0) > + mem_cgroup_schedule_reclaim(mem); > + css_put(&mem->css); > +} > + > /* > * Charge the memory controller for page usage. > * Return > @@ -631,6 +695,12 @@ retry: > rcu_read_unlock(); > > /* > + * schedule background reclaim if we are above the high watermark. > + */ > + if (res_counter_above_high_watermark(&mem->res)) > + mem_cgroup_schedule_reclaim(mem); > + > + /* > * If we created the page_cgroup, we should free it on exceeding > * the cgroup limit. > */ > @@ -939,9 +1009,16 @@ static ssize_t mem_cgroup_write(struct c > struct file *file, const char __user *userbuf, > size_t nbytes, loff_t *ppos) > { > - return res_counter_write(&mem_cgroup_from_cont(cont)->res, > + ssize_t ret; > + > + ret = res_counter_write(&mem_cgroup_from_cont(cont)->res, > cft->private, userbuf, nbytes, ppos, > mem_cgroup_write_strategy); > + > + if (ret >= 0 && cft->private == RES_HIGH_WATERMARK) > + mem_cgroup_create_workqueue(); > + > + return ret; > } > > static ssize_t mem_control_type_write(struct cgroup *cont, > @@ -1097,6 +1174,18 @@ static struct cftype mem_cgroup_files[] > .read = mem_cgroup_read, > }, > { > + .name = "high_watermark_in_bytes", > + .private = RES_HIGH_WATERMARK, > + .write = mem_cgroup_write, > + .read = mem_cgroup_read, > + }, > + { > + .name = "low_watermark_in_bytes", > + .private = RES_LOW_WATERMARK, > + .write = mem_cgroup_write, > + .read = mem_cgroup_read, > + }, > + { > .name = "control_type", > .write = mem_control_type_write, > .read = mem_control_type_read, > @@ -1161,6 +1250,8 @@ mem_cgroup_create(struct cgroup_subsys * > if (alloc_mem_cgroup_per_zone_info(mem, node)) > goto free_out; > > + INIT_WORK(&mem->reclaim_work, mem_cgroup_reclaim); > + > return &mem->css; > free_out: > for_each_node_state(node, N_POSSIBLE) I'll start some tests on these patches. -- Warm Regards, Balbir Singh Linux Technology Center IBM, ISTL ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <47464D04.2030906-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>]
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <47464D04.2030906-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> @ 2007-11-26 2:47 ` YAMAMOTO Takashi [not found] ` <20071126024710.8F4ED1CF411-Pcsii4f/SVk@public.gmane.org> 0 siblings, 1 reply; 16+ messages in thread From: YAMAMOTO Takashi @ 2007-11-26 2:47 UTC (permalink / raw) To: balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg hi, > > --- linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c.BACKUP 2007-11-14 16:05:52.000000000 +0900 > > +++ linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c 2007-11-22 15:14:32.000000000 +0900 > > @@ -17,6 +17,8 @@ void res_counter_init(struct res_counter > > { > > spin_lock_init(&counter->lock); > > counter->limit = (unsigned long long)LLONG_MAX; > > + counter->high_watermark = (unsigned long long)LLONG_MAX; > > + counter->low_watermark = (unsigned long long)LLONG_MAX; > > Should low watermark also be LLONG_MAX? what else do you suggest? 0? currently it doesn't matter much because low_watermark is not used at all as far as high_watermark is LLONG_MAX. > > +static void > > +mem_cgroup_reclaim(struct work_struct *work) > > +{ > > + struct mem_cgroup * const mem = > > + container_of(work, struct mem_cgroup, reclaim_work); > > + int batch_count = 128; /* XXX arbitrary */ > > Could we define and use something like MEM_CGROUP_BATCH_COUNT for now? > Later we could consider and see if it needs to be tunable. numbers are > hard to read in code. although i don't think it makes sense, i can do so if you prefer. > > + > > + for (; batch_count > 0; batch_count--) { > > + if (res_counter_below_low_watermark(&mem->res)) > > + break; > > Shouldn't we also check to see that we start reclaim in background only > when we are above the high watermark? i don't understand what you mean. can you explain? highwatermark is checked by mem_cgroup_charge_common before waking these threads. > I'll start some tests on these patches. thanks. YAMAMOTO Takashi ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <20071126024710.8F4ED1CF411-Pcsii4f/SVk@public.gmane.org>]
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <20071126024710.8F4ED1CF411-Pcsii4f/SVk@public.gmane.org> @ 2007-11-26 3:00 ` Balbir Singh [not found] ` <474A36C3.80509-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> 2007-11-26 4:09 ` Balbir Singh 1 sibling, 1 reply; 16+ messages in thread From: Balbir Singh @ 2007-11-26 3:00 UTC (permalink / raw) To: YAMAMOTO Takashi Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg YAMAMOTO Takashi wrote: > hi, > >>> --- linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c.BACKUP 2007-11-14 16:05:52.000000000 +0900 >>> +++ linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c 2007-11-22 15:14:32.000000000 +0900 >>> @@ -17,6 +17,8 @@ void res_counter_init(struct res_counter >>> { >>> spin_lock_init(&counter->lock); >>> counter->limit = (unsigned long long)LLONG_MAX; >>> + counter->high_watermark = (unsigned long long)LLONG_MAX; >>> + counter->low_watermark = (unsigned long long)LLONG_MAX; >> Should low watermark also be LLONG_MAX? > > what else do you suggest? 0? Something invalid or a good default value. I think LLONG_MAX is good for now, that ensures that no background reclaim happens till the administrator sets it up. > currently it doesn't matter much because low_watermark is not used at all > as far as high_watermark is LLONG_MAX. > Don't we use by checking res_counter_below_low_watermark()? >>> +static void >>> +mem_cgroup_reclaim(struct work_struct *work) >>> +{ >>> + struct mem_cgroup * const mem = >>> + container_of(work, struct mem_cgroup, reclaim_work); >>> + int batch_count = 128; /* XXX arbitrary */ >> Could we define and use something like MEM_CGROUP_BATCH_COUNT for now? >> Later we could consider and see if it needs to be tunable. numbers are >> hard to read in code. > > although i don't think it makes sense, i can do so if you prefer. > Using numbers like 128 make the code unreadable. I prefer something like MEM_CGROUP_BATCH_COUNT since its more readable than 128. If we ever propagate batch_count to other dependent functions, I'd much rather do it with a well defined name. >>> + >>> + for (; batch_count > 0; batch_count--) { >>> + if (res_counter_below_low_watermark(&mem->res)) >>> + break; >> Shouldn't we also check to see that we start reclaim in background only >> when we are above the high watermark? > > i don't understand what you mean. can you explain? > highwatermark is checked by mem_cgroup_charge_common before waking > these threads. > OK, that clarifies >> I'll start some tests on these patches. > > thanks. > > YAMAMOTO Takashi -- Warm Regards, Balbir Singh Linux Technology Center IBM, ISTL ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <474A36C3.80509-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>]
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <474A36C3.80509-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> @ 2007-11-26 3:12 ` YAMAMOTO Takashi 2007-11-26 3:56 ` Balbir Singh 2007-11-26 23:46 ` YAMAMOTO Takashi 2 siblings, 0 replies; 16+ messages in thread From: YAMAMOTO Takashi @ 2007-11-26 3:12 UTC (permalink / raw) To: balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg > > currently it doesn't matter much because low_watermark is not used at all > > as far as high_watermark is LLONG_MAX. > > > > Don't we use by checking res_counter_below_low_watermark()? yes, but only when we get above highwatermark. YAMAMOTO Takashi ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <474A36C3.80509-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> 2007-11-26 3:12 ` YAMAMOTO Takashi @ 2007-11-26 3:56 ` Balbir Singh [not found] ` <474A43D5.8020600-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> 2007-11-26 23:46 ` YAMAMOTO Takashi 2 siblings, 1 reply; 16+ messages in thread From: Balbir Singh @ 2007-11-26 3:56 UTC (permalink / raw) To: YAMAMOTO Takashi Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg Balbir Singh wrote: > YAMAMOTO Takashi wrote: >>>> + int batch_count = 128; /* XXX arbitrary */ >>> Could we define and use something like MEM_CGROUP_BATCH_COUNT for now? >>> Later we could consider and see if it needs to be tunable. numbers are >>> hard to read in code. >> although i don't think it makes sense, i can do so if you prefer. >> > > Using numbers like 128 make the code unreadable. I prefer something > like MEM_CGROUP_BATCH_COUNT since its more readable than 128. If we ever > propagate batch_count to other dependent functions, I'd much rather do > it with a well defined name. > I just checked we already have FORCE_UNCHARGE_BATCH, we could just rename and re-use it. -- Warm Regards, Balbir Singh Linux Technology Center IBM, ISTL ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <474A43D5.8020600-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>]
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <474A43D5.8020600-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> @ 2007-11-26 22:43 ` YAMAMOTO Takashi 0 siblings, 0 replies; 16+ messages in thread From: YAMAMOTO Takashi @ 2007-11-26 22:43 UTC (permalink / raw) To: balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg > Balbir Singh wrote: > > YAMAMOTO Takashi wrote: > >>>> + int batch_count = 128; /* XXX arbitrary */ > >>> Could we define and use something like MEM_CGROUP_BATCH_COUNT for now? > >>> Later we could consider and see if it needs to be tunable. numbers are > >>> hard to read in code. > >> although i don't think it makes sense, i can do so if you prefer. > >> > > > > Using numbers like 128 make the code unreadable. I prefer something > > like MEM_CGROUP_BATCH_COUNT since its more readable than 128. If we ever > > propagate batch_count to other dependent functions, I'd much rather do > > it with a well defined name. > > > > I just checked we already have FORCE_UNCHARGE_BATCH, we could just > rename and re-use it. i don't think it's a good idea to use a single constant for completely different things. YAMAMOTO Takashi ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <474A36C3.80509-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> 2007-11-26 3:12 ` YAMAMOTO Takashi 2007-11-26 3:56 ` Balbir Singh @ 2007-11-26 23:46 ` YAMAMOTO Takashi 2 siblings, 0 replies; 16+ messages in thread From: YAMAMOTO Takashi @ 2007-11-26 23:46 UTC (permalink / raw) To: balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg > Using numbers like 128 make the code unreadable. I prefer something > like MEM_CGROUP_BATCH_COUNT since its more readable than 128. If we ever > propagate batch_count to other dependent functions, I'd much rather do > it with a well defined name. here's a new version. changes from the previous: - define a constant. (MEM_CGROUP_BG_RECLAIM_BATCH_COUNT) no functional changes. - don't increment ALLOCSTALL vm event for mem cgroup because it isn't appropriate esp. for background reclamation. introduce MEM_CGROUP_STAT_ALLOCSTALL instead. (its value is same as memory.failcnt unless GFP_ATOMIC.) YAMAMOTO Takashi Signed-off-by: YAMAMOTO Takashi <yamamoto-jCdQPDEk3idL9jVzuh4AOg@public.gmane.org> --- --- linux-2.6.24-rc2-mm1-kame-pd/include/linux/res_counter.h.BACKUP 2007-11-14 16:05:48.000000000 +0900 +++ linux-2.6.24-rc2-mm1-kame-pd/include/linux/res_counter.h 2007-11-22 15:14:32.000000000 +0900 @@ -32,6 +32,13 @@ struct res_counter { * the number of unsuccessful attempts to consume the resource */ unsigned long long failcnt; + + /* + * watermarks + */ + unsigned long long high_watermark; + unsigned long long low_watermark; + /* * the lock to protect all of the above. * the routines below consider this to be IRQ-safe @@ -66,6 +73,8 @@ enum { RES_USAGE, RES_LIMIT, RES_FAILCNT, + RES_HIGH_WATERMARK, + RES_LOW_WATERMARK, }; /* @@ -124,4 +133,26 @@ static inline bool res_counter_check_und return ret; } +static inline bool res_counter_below_low_watermark(struct res_counter *cnt) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->usage < cnt->low_watermark; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + +static inline bool res_counter_above_high_watermark(struct res_counter *cnt) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->usage > cnt->high_watermark; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + #endif --- linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c.BACKUP 2007-11-14 16:05:52.000000000 +0900 +++ linux-2.6.24-rc2-mm1-kame-pd/kernel/res_counter.c 2007-11-22 15:14:32.000000000 +0900 @@ -17,6 +17,8 @@ void res_counter_init(struct res_counter { spin_lock_init(&counter->lock); counter->limit = (unsigned long long)LLONG_MAX; + counter->high_watermark = (unsigned long long)LLONG_MAX; + counter->low_watermark = (unsigned long long)LLONG_MAX; } int res_counter_charge_locked(struct res_counter *counter, unsigned long val) @@ -69,6 +71,10 @@ res_counter_member(struct res_counter *c return &counter->limit; case RES_FAILCNT: return &counter->failcnt; + case RES_HIGH_WATERMARK: + return &counter->high_watermark; + case RES_LOW_WATERMARK: + return &counter->low_watermark; }; BUG(); @@ -99,6 +105,7 @@ ssize_t res_counter_write(struct res_cou int ret; char *buf, *end; unsigned long long tmp, *val; + unsigned long flags; buf = kmalloc(nbytes + 1, GFP_KERNEL); ret = -ENOMEM; @@ -122,9 +129,29 @@ ssize_t res_counter_write(struct res_cou goto out_free; } + spin_lock_irqsave(&counter->lock, flags); val = res_counter_member(counter, member); + /* ensure low_watermark <= high_watermark <= limit */ + switch (member) { + case RES_LIMIT: + if (tmp < counter->high_watermark) + goto out_locked; + break; + case RES_HIGH_WATERMARK: + if (tmp > counter->limit || tmp < counter->low_watermark) + goto out_locked; + break; + case RES_LOW_WATERMARK: + if (tmp > counter->high_watermark) + goto out_locked; + break; + } *val = tmp; + BUG_ON(counter->high_watermark > counter->limit); + BUG_ON(counter->low_watermark > counter->high_watermark); ret = nbytes; +out_locked: + spin_unlock_irqrestore(&counter->lock, flags); out_free: kfree(buf); out: --- linux-2.6.24-rc2-mm1-kame-pd/mm/vmscan.c.BACKUP 2007-11-20 13:11:09.000000000 +0900 +++ linux-2.6.24-rc2-mm1-kame-pd/mm/vmscan.c 2007-11-27 08:09:51.000000000 +0900 @@ -1333,7 +1333,6 @@ static unsigned long do_try_to_free_page unsigned long lru_pages = 0; int i; - count_vm_event(ALLOCSTALL); /* * mem_cgroup will not do shrink_slab. */ @@ -1432,6 +1431,7 @@ unsigned long try_to_free_pages(struct z .isolate_pages = isolate_pages_global, }; + count_vm_event(ALLOCSTALL); return do_try_to_free_pages(zones, gfp_mask, &sc); } --- linux-2.6.24-rc2-mm1-kame-pd/mm/memcontrol.c.BACKUP 2007-11-20 13:11:09.000000000 +0900 +++ linux-2.6.24-rc2-mm1-kame-pd/mm/memcontrol.c 2007-11-27 08:27:10.000000000 +0900 @@ -28,6 +28,7 @@ #include <linux/rcupdate.h> #include <linux/swap.h> #include <linux/spinlock.h> +#include <linux/workqueue.h> #include <linux/fs.h> #include <linux/seq_file.h> @@ -35,6 +36,7 @@ struct cgroup_subsys mem_cgroup_subsys; static const int MEM_CGROUP_RECLAIM_RETRIES = 5; +static const int MEM_CGROUP_BG_RECLAIM_BATCH_COUNT = 128; /* XXX arbitrary */ /* * Statistics for memory cgroup. @@ -45,6 +47,7 @@ enum mem_cgroup_stat_index { */ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ + MEM_CGROUP_STAT_ALLOCSTALL,/* allocation stalled due to memory.limit */ MEM_CGROUP_STAT_NSTATS, }; @@ -138,6 +141,10 @@ struct mem_cgroup { * statistics. */ struct mem_cgroup_stat stat; + /* + * background reclamation. + */ + struct work_struct reclaim_work; }; /* @@ -240,6 +247,21 @@ static unsigned long mem_cgroup_get_all_ static struct mem_cgroup init_mem_cgroup; +static DEFINE_MUTEX(mem_cgroup_workqueue_init_lock); +static struct workqueue_struct *mem_cgroup_workqueue; + +static void mem_cgroup_create_workqueue(void) +{ + + if (mem_cgroup_workqueue != NULL) + return; + + mutex_lock(&mem_cgroup_workqueue_init_lock); + if (mem_cgroup_workqueue == NULL) + mem_cgroup_workqueue = create_workqueue("mem_cgroup"); + mutex_unlock(&mem_cgroup_workqueue_init_lock); +} + static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { @@ -566,6 +588,45 @@ unsigned long mem_cgroup_isolate_pages(u return nr_taken; } +static void +mem_cgroup_schedule_reclaim(struct mem_cgroup *mem) +{ + + if (mem_cgroup_workqueue == NULL) { + BUG_ON(mem->css.cgroup->parent != NULL); + return; + } + + if (work_pending(&mem->reclaim_work)) + return; + + css_get(&mem->css); /* XXX need some thoughts wrt cgroup removal. */ + /* + * XXX workqueue is not an ideal mechanism for our purpose. + * revisit later. + */ + if (!queue_work(mem_cgroup_workqueue, &mem->reclaim_work)) + css_put(&mem->css); +} + +static void +mem_cgroup_reclaim(struct work_struct *work) +{ + struct mem_cgroup * const mem = + container_of(work, struct mem_cgroup, reclaim_work); + int batch_count = MEM_CGROUP_BG_RECLAIM_BATCH_COUNT; + + for (; batch_count > 0; batch_count--) { + if (res_counter_below_low_watermark(&mem->res)) + break; + if (!try_to_free_mem_cgroup_pages(mem, GFP_KERNEL)) + break; + } + if (batch_count == 0) + mem_cgroup_schedule_reclaim(mem); + css_put(&mem->css); +} + /* * Charge the memory controller for page usage. * Return @@ -631,6 +692,12 @@ retry: rcu_read_unlock(); /* + * schedule background reclaim if we are above the high watermark. + */ + if (res_counter_above_high_watermark(&mem->res)) + mem_cgroup_schedule_reclaim(mem); + + /* * If we created the page_cgroup, we should free it on exceeding * the cgroup limit. */ @@ -642,6 +709,10 @@ retry: if (is_atomic) goto noreclaim; + preempt_disable(); + __mem_cgroup_stat_add_safe(&mem->stat, + MEM_CGROUP_STAT_ALLOCSTALL, 1); + preempt_enable(); if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) continue; @@ -939,9 +1010,16 @@ static ssize_t mem_cgroup_write(struct c struct file *file, const char __user *userbuf, size_t nbytes, loff_t *ppos) { - return res_counter_write(&mem_cgroup_from_cont(cont)->res, + ssize_t ret; + + ret = res_counter_write(&mem_cgroup_from_cont(cont)->res, cft->private, userbuf, nbytes, ppos, mem_cgroup_write_strategy); + + if (ret >= 0 && cft->private == RES_HIGH_WATERMARK) + mem_cgroup_create_workqueue(); + + return ret; } static ssize_t mem_control_type_write(struct cgroup *cont, @@ -1031,6 +1109,7 @@ static const struct mem_cgroup_stat_desc } mem_cgroup_stat_desc[] = { [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, + [MEM_CGROUP_STAT_ALLOCSTALL] = { "allocstall", 1, }, }; static int mem_control_stat_show(struct seq_file *m, void *arg) @@ -1097,6 +1176,18 @@ static struct cftype mem_cgroup_files[] .read = mem_cgroup_read, }, { + .name = "high_watermark_in_bytes", + .private = RES_HIGH_WATERMARK, + .write = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "low_watermark_in_bytes", + .private = RES_LOW_WATERMARK, + .write = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { .name = "control_type", .write = mem_control_type_write, .read = mem_control_type_read, @@ -1161,6 +1252,8 @@ mem_cgroup_create(struct cgroup_subsys * if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; + INIT_WORK(&mem->reclaim_work, mem_cgroup_reclaim); + return &mem->css; free_out: for_each_node_state(node, N_POSSIBLE) ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <20071126024710.8F4ED1CF411-Pcsii4f/SVk@public.gmane.org> 2007-11-26 3:00 ` Balbir Singh @ 2007-11-26 4:09 ` Balbir Singh 1 sibling, 0 replies; 16+ messages in thread From: Balbir Singh @ 2007-11-26 4:09 UTC (permalink / raw) To: YAMAMOTO Takashi Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg YAMAMOTO Takashi wrote: >> I'll start some tests on these patches. > > thanks. > I tried testing, but the patch failed to apply. I think thats because it was ported on top of the NUMA patches by KAMEZAWA-San. I'll rebase and test. -- Warm Regards, Balbir Singh Linux Technology Center IBM, ISTL ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <20071022094349.60B351C1015-Pcsii4f/SVk@public.gmane.org> 2007-10-22 16:10 ` Balbir Singh @ 2007-10-22 17:10 ` Vaidyanathan Srinivasan [not found] ` <471CD987.5030800-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> 1 sibling, 1 reply; 16+ messages in thread From: Vaidyanathan Srinivasan @ 2007-10-22 17:10 UTC (permalink / raw) To: YAMAMOTO Takashi Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg, balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 YAMAMOTO Takashi wrote: > hi, > > i implemented background reclamation for your memory controller and > did a few benchmarks with and without it. any comments? > > YAMAMOTO Takashi > > > ----- > "time make -j4 bzImage" in a cgroup with 64MB limit: > > without patch: > real 22m22.389s > user 13m35.163s > sys 6m12.283s > > memory.failcnt after a run: 106647 > > with patch: > real 19m25.860s > user 14m10.549s > sys 6m13.831s > > memory.failcnt after a run: 35366 > > "for x in 1 2 3;do time dd if=/dev/zero bs=1M count=300|tail;done" > in a cgroup with 16MB limit: > > without patch: > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 19.0058 seconds, 16.6 MB/s > u 0.00s s 0.67s e 19.01s maj 90 min 2021 in 0 out 0 > u 0.48s s 7.22s e 93.13s maj 20475 min 210903 in 0 out 0 > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 14.8394 seconds, 21.2 MB/s > u 0.00s s 0.63s e 14.84s maj 53 min 1392 in 0 out 0 > u 0.48s s 7.00s e 94.39s maj 20125 min 211145 in 0 out 0 > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 14.0635 seconds, 22.4 MB/s > u 0.01s s 0.59s e 14.07s maj 50 min 1385 in 0 out 0 > u 0.57s s 6.93s e 91.54s maj 20359 min 210911 in 0 out 0 > > memory.failcnt after a run: 8804 > > with patch: > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 5.94875 seconds, 52.9 MB/s > u 0.00s s 0.67s e 5.95s maj 206 min 5393 in 0 out 0 > u 0.45s s 6.63s e 46.07s maj 21350 min 210252 in 0 out 0 > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 8.16441 seconds, 38.5 MB/s > u 0.00s s 0.60s e 8.17s maj 240 min 4614 in 0 out 0 > u 0.45s s 6.56s e 54.56s maj 22298 min 209255 in 0 out 0 > 300+0 records in > 300+0 records out > 314572800 bytes (315 MB) copied, 4.60967 seconds, 68.2 MB/s > u 0.00s s 0.60s e 4.64s maj 196 min 4733 in 0 out 0 > u 0.46s s 6.53s e 49.28s maj 22223 min 209384 in 0 out 0 > > memory.failcnt after a run: 1589 > ----- Results looks good. Background reclaim will definitely help performance. I am sure once we get the thresholds correct, the performance will improve further. > > --- linux-2.6.23-rc8-mm2-cgkswapd/mm/memcontrol.c.BACKUP 2007-10-01 17:19:57.000000000 +0900 > +++ linux-2.6.23-rc8-mm2-cgkswapd/mm/memcontrol.c 2007-10-22 13:46:01.000000000 +0900 > @@ -27,6 +27,7 @@ > #include <linux/rcupdate.h> > #include <linux/swap.h> > #include <linux/spinlock.h> > +#include <linux/workqueue.h> > #include <linux/fs.h> > > #include <asm/uaccess.h> > @@ -63,6 +64,8 @@ struct mem_cgroup { > */ > spinlock_t lru_lock; > unsigned long control_type; /* control RSS or RSS+Pagecache */ > + > + struct work_struct reclaim_work; > }; > > /* > @@ -95,6 +98,9 @@ enum { > > static struct mem_cgroup init_mem_cgroup; > > +static DEFINE_MUTEX(mem_cgroup_workqueue_init_lock); > +static struct workqueue_struct *mem_cgroup_workqueue; > + > static inline > struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) > { > @@ -250,6 +256,69 @@ unsigned long mem_cgroup_isolate_pages(u > return nr_taken; > } > > +static int > +mem_cgroup_need_reclaim(struct mem_cgroup *mem) > +{ > + struct res_counter * const cnt = &mem->res; > + int doreclaim; > + unsigned long flags; > + > + /* XXX should be in res_counter */ > + /* XXX should not hardcode a watermark */ > + spin_lock_irqsave(&cnt->lock, flags); > + doreclaim = cnt->usage > cnt->limit / 4 * 3; > + spin_unlock_irqrestore(&cnt->lock, flags); > + > + return doreclaim; > +} > + > +static void > +mem_cgroup_schedule_reclaim_if_necessary(struct mem_cgroup *mem) > +{ > + > + if (mem_cgroup_workqueue == NULL) { > + BUG_ON(mem->css.cgroup->parent != NULL); > + return; > + } > + > + if (work_pending(&mem->reclaim_work)) > + return; > + > + if (!mem_cgroup_need_reclaim(mem)) > + return; > + > + css_get(&mem->css); /* XXX need some thoughts wrt cgroup removal. */ > + /* > + * XXX workqueue is not an ideal mechanism for our purpose. > + * revisit later. > + */ > + if (!queue_work(mem_cgroup_workqueue, &mem->reclaim_work)) > + css_put(&mem->css); > +} > + > +static void > +mem_cgroup_reclaim(struct work_struct *work) > +{ > + struct mem_cgroup * const mem = > + container_of(work, struct mem_cgroup, reclaim_work); > + int batch_count = 128; /* XXX arbitrary */ > + > + for (; batch_count > 0; batch_count--) { > + if (!mem_cgroup_need_reclaim(mem)) > + break; > + /* > + * XXX try_to_free_foo is not a correct mechanism to > + * use here. eg. ALLOCSTALL counter > + * revisit later. > + */ > + if (!try_to_free_mem_cgroup_pages(mem, GFP_KERNEL)) > + break; > + } > + if (batch_count == 0) > + mem_cgroup_schedule_reclaim_if_necessary(mem); > + css_put(&mem->css); > +} > + > /* > * Charge the memory controller for page usage. > * Return > @@ -311,6 +380,9 @@ int mem_cgroup_charge(struct page *page, > */ > while (res_counter_charge(&mem->res, PAGE_SIZE)) { > bool is_atomic = gfp_mask & GFP_ATOMIC; > + > + mem_cgroup_schedule_reclaim_if_necessary(mem); > + > /* > * We cannot reclaim under GFP_ATOMIC, fail the charge > */ > @@ -551,8 +623,18 @@ mem_cgroup_create(struct cgroup_subsys * > if (unlikely((cont->parent) == NULL)) { > mem = &init_mem_cgroup; > init_mm.mem_cgroup = mem; > - } else > + } else { > + /* XXX too late for the top-level cgroup */ > + if (mem_cgroup_workqueue == NULL) { > + mutex_lock(&mem_cgroup_workqueue_init_lock); > + if (mem_cgroup_workqueue == NULL) { > + mem_cgroup_workqueue = > + create_workqueue("mem_cgroup"); > + } > + mutex_unlock(&mem_cgroup_workqueue_init_lock); > + } > mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL); > + } > > if (mem == NULL) > return NULL; > @@ -562,6 +644,7 @@ mem_cgroup_create(struct cgroup_subsys * > INIT_LIST_HEAD(&mem->inactive_list); > spin_lock_init(&mem->lru_lock); > mem->control_type = MEM_CGROUP_TYPE_ALL; > + INIT_WORK(&mem->reclaim_work, mem_cgroup_reclaim); > return &mem->css; > } > Can we potentially avoid direct reclaim and sleep if the charge is exceeded and let the background reclaim do the job and wake us up? --Vaidy ^ permalink raw reply [flat|nested] 16+ messages in thread
[parent not found: <471CD987.5030800-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>]
* Re: [RFC] [PATCH] memory controller background reclamation [not found] ` <471CD987.5030800-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> @ 2007-10-22 23:45 ` YAMAMOTO Takashi 0 siblings, 0 replies; 16+ messages in thread From: YAMAMOTO Takashi @ 2007-10-22 23:45 UTC (permalink / raw) To: svaidy-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 Cc: containers-qjLDD68F18O7TbgM5vRIOg, minoura-jCdQPDEk3idL9jVzuh4AOg, balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8 hi, > Can we potentially avoid direct reclaim and sleep if the charge is exceeded > and let the background reclaim do the job and wake us up? yes, but i left it for now because direct reclaim has its own benefits. YAMAMOTO Takashi ^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2007-11-26 23:46 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-10-22 9:43 [RFC] [PATCH] memory controller background reclamation YAMAMOTO Takashi
[not found] ` <20071022094349.60B351C1015-Pcsii4f/SVk@public.gmane.org>
2007-10-22 16:10 ` Balbir Singh
[not found] ` <471CCB85.3040905-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2007-10-22 23:44 ` YAMAMOTO Takashi
[not found] ` <20071022234430.EE6B11C1231-Pcsii4f/SVk@public.gmane.org>
2007-11-15 6:16 ` YAMAMOTO Takashi
[not found] ` <20071115061602.856171CD9BF-Pcsii4f/SVk@public.gmane.org>
2007-11-22 8:57 ` YAMAMOTO Takashi
[not found] ` <20071122085733.758AD1CEE9F-Pcsii4f/SVk@public.gmane.org>
2007-11-22 13:33 ` kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A
2007-11-23 3:46 ` Balbir Singh
[not found] ` <47464D04.2030906-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2007-11-26 2:47 ` YAMAMOTO Takashi
[not found] ` <20071126024710.8F4ED1CF411-Pcsii4f/SVk@public.gmane.org>
2007-11-26 3:00 ` Balbir Singh
[not found] ` <474A36C3.80509-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2007-11-26 3:12 ` YAMAMOTO Takashi
2007-11-26 3:56 ` Balbir Singh
[not found] ` <474A43D5.8020600-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2007-11-26 22:43 ` YAMAMOTO Takashi
2007-11-26 23:46 ` YAMAMOTO Takashi
2007-11-26 4:09 ` Balbir Singh
2007-10-22 17:10 ` Vaidyanathan Srinivasan
[not found] ` <471CD987.5030800-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2007-10-22 23:45 ` YAMAMOTO Takashi
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.