All of lore.kernel.org
 help / color / mirror / Atom feed
From: Pavel Emelyanov <xemul@openvz.org>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	LKML <linux-kernel@vger.kernel.org>,
	"menage@google.com" <menage@google.com>,
	"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
	"yamamoto@valinux.co.jp" <yamamoto@valinux.co.jp>,
	"nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>,
	"lizf@cn.fujitsu.com" <lizf@cn.fujitsu.com>
Subject: Re: [PATCH 1/6] res_counter:  handle limit change
Date: Mon, 16 Jun 2008 10:38:52 +0400	[thread overview]
Message-ID: <48560A7C.9050501@openvz.org> (raw)
In-Reply-To: <20080613182924.c73fe9eb.kamezawa.hiroyu@jp.fujitsu.com>

KAMEZAWA Hiroyuki wrote:
> Add a support to shrink_usage_at_limit_change feature to res_counter.
> memcg will use this to drop pages.
> 
> Change log: xxx -> v4 (new file.)
>  - cut out the limit-change part from hierarchy patch set.
>  - add "retry_count" arguments to shrink_usage(). This allows that we don't
>    have to set the default retry loop count.
>  - res_counter_check_under_val() is added to support subsystem.
>  - res_counter_init() is res_counter_init_ops(cnt, NULL)
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> ---
>  Documentation/controllers/resource_counter.txt |   19 +++++-
>  include/linux/res_counter.h                    |   33 ++++++++++-
>  kernel/res_counter.c                           |   74 ++++++++++++++++++++++++-
>  3 files changed, 121 insertions(+), 5 deletions(-)
> 
> Index: linux-2.6.26-rc5-mm3/include/linux/res_counter.h
> ===================================================================
> --- linux-2.6.26-rc5-mm3.orig/include/linux/res_counter.h
> +++ linux-2.6.26-rc5-mm3/include/linux/res_counter.h
> @@ -21,6 +21,13 @@
>   * the helpers described beyond
>   */
>  
> +struct res_counter;
> +struct res_counter_ops {
> +	/* called when the subsystem has to reduce the usage. */
> +	int (*shrink_usage)(struct res_counter *cnt, unsigned long long val,
> +			    int retry_count);
> +};
> +
>  struct res_counter {
>  	/*
>  	 * the current resource consumption level
> @@ -39,6 +46,10 @@ struct res_counter {
>  	 */
>  	unsigned long long failcnt;
>  	/*
> +	 * registered callbacks etc...for res_counter.
> +	 */
> +	struct res_counter_ops ops;
> +	/*

Why would we need such? All res_counter.limit update comes via the appropiate
cgroup's files, so it can do whatever it needs w/o any callbacks?

And (if we definitely need one) isn't it better to make it a
	struct res_counter_ops *ops;
pointer?

>  	 * the lock to protect all of the above.
>  	 * the routines below consider this to be IRQ-safe
>  	 */
> @@ -82,7 +93,13 @@ enum {
>   * helpers for accounting
>   */
>  
> -void res_counter_init(struct res_counter *counter);
> +void res_counter_init_ops(struct res_counter *counter,
> +				struct res_counter_ops *ops);
> +
> +static inline void res_counter_init(struct res_counter *counter)
> +{
> +	res_counter_init_ops(counter, NULL);
> +}
>  
>  /*
>   * charge - try to consume more resource.
> @@ -136,6 +153,20 @@ static inline bool res_counter_check_und
>  	return ret;
>  }
>  
> +static inline bool res_counter_check_under_val(struct res_counter *cnt,
> +					unsigned long long val)
> +{
> +	bool ret = false;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cnt->lock, flags);
> +	if (cnt->usage <= val)
> +		ret = true;
> +	spin_unlock_irqrestore(&cnt->lock, flags);
> +
> +	return ret;
> +}
> +
>  static inline void res_counter_reset_max(struct res_counter *cnt)
>  {
>  	unsigned long flags;
> Index: linux-2.6.26-rc5-mm3/kernel/res_counter.c
> ===================================================================
> --- linux-2.6.26-rc5-mm3.orig/kernel/res_counter.c
> +++ linux-2.6.26-rc5-mm3/kernel/res_counter.c
> @@ -14,10 +14,22 @@
>  #include <linux/res_counter.h>
>  #include <linux/uaccess.h>
>  
> -void res_counter_init(struct res_counter *counter)
> +/**
> + * res_counter_init_ops -- initialize res_counter.
> + * @counter: the res_counter to be initialized
> + * @ops: the res_counter_ops for this res_counter. This argument can be NULL
> + *        and is copied.
> + *
> + * init spinlock and set limit to be very very big value.
> + */
> +
> +void res_counter_init_ops(struct res_counter *counter,
> +				struct res_counter_ops *ops)
>  {
>  	spin_lock_init(&counter->lock);
>  	counter->limit = (unsigned long long)LLONG_MAX;
> +	if (ops)
> +		counter->ops = *ops;
>  }
>  
>  int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
> @@ -102,6 +114,46 @@ u64 res_counter_read_u64(struct res_coun
>  	return *res_counter_member(counter, member);
>  }
>  
> +/*
> + * Called when the limit changes if res_counter has ops->shrink_usage.
> + * This function uses shrink usage to below new limit. returns 0 at success.
> + */
> +
> +static int res_counter_resize_limit(struct res_counter *cnt,
> +			unsigned long long val)
> +{
> +	int retry_count = 0;
> +	int ret = -EBUSY;
> +	unsigned long flags;
> +
> +	BUG_ON(!cnt->ops.shrink_usage);
> +	while (1) {
> +		spin_lock_irqsave(&cnt->lock, flags);
> +		if (cnt->usage <= val) {
> +			cnt->limit = val;
> +			ret = 0;
> +			spin_unlock_irqrestore(&cnt->lock, flags);
> +			break;
> +		}
> +		BUG_ON(val > cnt->limit);
> +		spin_unlock_irqrestore(&cnt->lock, flags);
> +
> +		/*
> +		 * Rest before calling callback().... rest after callback
> +		 * tends to add difference between the result of callback and
> +		 * the check in next loop.
> +		 */
> +		cond_resched();
> +
> +		ret = cnt->ops.shrink_usage(cnt, val, retry_count);
> +		if (!ret)
> +			break;
> +		retry_count++;
> +	}
> +	return ret;
> +}
> +
> +
>  ssize_t res_counter_write(struct res_counter *counter, int member,
>  		const char __user *userbuf, size_t nbytes, loff_t *pos,
>  		int (*write_strategy)(char *st_buf, unsigned long long *val))
> @@ -133,11 +185,29 @@ ssize_t res_counter_write(struct res_cou
>  		if (*end != '\0')
>  			goto out_free;
>  	}
> +	switch (member) {
> +	case RES_LIMIT:
> +		if (counter->ops.shrink_usage) {
> +			ret = res_counter_resize_limit(counter, tmp);
> +			goto done;
> +		}
> +		break;
> +	default:
> +		/*
> +		 * Considering future implementation, we'll have to handle
> +		 * other members and "fallback" will not work well. So, we
> +		 * avoid to make use of "default" here.
> +		 */
> +		break;
> +	}
>  	spin_lock_irqsave(&counter->lock, flags);
>  	val = res_counter_member(counter, member);
>  	*val = tmp;
>  	spin_unlock_irqrestore(&counter->lock, flags);
> -	ret = nbytes;
> +	ret = 0;
> +done:
> +	if (!ret)
> +		ret = nbytes;
>  out_free:
>  	kfree(buf);
>  out:
> Index: linux-2.6.26-rc5-mm3/Documentation/controllers/resource_counter.txt
> ===================================================================
> --- linux-2.6.26-rc5-mm3.orig/Documentation/controllers/resource_counter.txt
> +++ linux-2.6.26-rc5-mm3/Documentation/controllers/resource_counter.txt
> @@ -39,7 +39,11 @@ to work with it.
>   	The failcnt stands for "failures counter". This is the number of
>  	resource allocation attempts that failed.
>  
> - c. spinlock_t lock
> + e. res_counter_ops.
> +	Callbacks for helping resource_counter per each subsystem.
> +	- shrink_usage() .... called at limit change (decrease).
> +
> + f. spinlock_t lock
>  
>   	Protects changes of the above values.
>  
> @@ -141,8 +145,19 @@ counter fields. They are recommended to 
>  	failcnt		reset to zero
>  
>  
> +5. res_counter_ops (Callbacks)
>  
> -5. Usage example
> +   res_counter_ops is for implementing feedback control from res_counter
> +   to subsystem. Each one has each own purpose and the subsystem doesn't
> +   necessary to provide all callbacks. Just implement necessary ones.
> +
> +   - shrink_usage(res_counter, newlimit, retry)
> +     Called for reducing usage to newlimit, retry is incremented per
> +     loop. (See memory resource controller as example.)
> +     Returns 0 at success. Any error code is acceptable but -EBUSY will be
> +     suitable to show "the kernel can't shrink usage."
> +
> +6. Usage example
>  
>   a. Declare a task group (take a look at cgroups subsystem for this) and
>      fold a res_counter into it
> 
> 


WARNING: multiple messages have this Message-ID (diff)
From: Pavel Emelyanov <xemul@openvz.org>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	LKML <linux-kernel@vger.kernel.org>,
	"menage@google.com" <menage@google.com>,
	"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
	"yamamoto@valinux.co.jp" <yamamoto@valinux.co.jp>,
	"nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>,
	"lizf@cn.fujitsu.com" <lizf@cn.fujitsu.com>
Subject: Re: [PATCH 1/6] res_counter:  handle limit change
Date: Mon, 16 Jun 2008 10:38:52 +0400	[thread overview]
Message-ID: <48560A7C.9050501@openvz.org> (raw)
In-Reply-To: <20080613182924.c73fe9eb.kamezawa.hiroyu@jp.fujitsu.com>

KAMEZAWA Hiroyuki wrote:
> Add a support to shrink_usage_at_limit_change feature to res_counter.
> memcg will use this to drop pages.
> 
> Change log: xxx -> v4 (new file.)
>  - cut out the limit-change part from hierarchy patch set.
>  - add "retry_count" arguments to shrink_usage(). This allows that we don't
>    have to set the default retry loop count.
>  - res_counter_check_under_val() is added to support subsystem.
>  - res_counter_init() is res_counter_init_ops(cnt, NULL)
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> ---
>  Documentation/controllers/resource_counter.txt |   19 +++++-
>  include/linux/res_counter.h                    |   33 ++++++++++-
>  kernel/res_counter.c                           |   74 ++++++++++++++++++++++++-
>  3 files changed, 121 insertions(+), 5 deletions(-)
> 
> Index: linux-2.6.26-rc5-mm3/include/linux/res_counter.h
> ===================================================================
> --- linux-2.6.26-rc5-mm3.orig/include/linux/res_counter.h
> +++ linux-2.6.26-rc5-mm3/include/linux/res_counter.h
> @@ -21,6 +21,13 @@
>   * the helpers described beyond
>   */
>  
> +struct res_counter;
> +struct res_counter_ops {
> +	/* called when the subsystem has to reduce the usage. */
> +	int (*shrink_usage)(struct res_counter *cnt, unsigned long long val,
> +			    int retry_count);
> +};
> +
>  struct res_counter {
>  	/*
>  	 * the current resource consumption level
> @@ -39,6 +46,10 @@ struct res_counter {
>  	 */
>  	unsigned long long failcnt;
>  	/*
> +	 * registered callbacks etc...for res_counter.
> +	 */
> +	struct res_counter_ops ops;
> +	/*

Why would we need such? All res_counter.limit update comes via the appropiate
cgroup's files, so it can do whatever it needs w/o any callbacks?

And (if we definitely need one) isn't it better to make it a
	struct res_counter_ops *ops;
pointer?

>  	 * the lock to protect all of the above.
>  	 * the routines below consider this to be IRQ-safe
>  	 */
> @@ -82,7 +93,13 @@ enum {
>   * helpers for accounting
>   */
>  
> -void res_counter_init(struct res_counter *counter);
> +void res_counter_init_ops(struct res_counter *counter,
> +				struct res_counter_ops *ops);
> +
> +static inline void res_counter_init(struct res_counter *counter)
> +{
> +	res_counter_init_ops(counter, NULL);
> +}
>  
>  /*
>   * charge - try to consume more resource.
> @@ -136,6 +153,20 @@ static inline bool res_counter_check_und
>  	return ret;
>  }
>  
> +static inline bool res_counter_check_under_val(struct res_counter *cnt,
> +					unsigned long long val)
> +{
> +	bool ret = false;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cnt->lock, flags);
> +	if (cnt->usage <= val)
> +		ret = true;
> +	spin_unlock_irqrestore(&cnt->lock, flags);
> +
> +	return ret;
> +}
> +
>  static inline void res_counter_reset_max(struct res_counter *cnt)
>  {
>  	unsigned long flags;
> Index: linux-2.6.26-rc5-mm3/kernel/res_counter.c
> ===================================================================
> --- linux-2.6.26-rc5-mm3.orig/kernel/res_counter.c
> +++ linux-2.6.26-rc5-mm3/kernel/res_counter.c
> @@ -14,10 +14,22 @@
>  #include <linux/res_counter.h>
>  #include <linux/uaccess.h>
>  
> -void res_counter_init(struct res_counter *counter)
> +/**
> + * res_counter_init_ops -- initialize res_counter.
> + * @counter: the res_counter to be initialized
> + * @ops: the res_counter_ops for this res_counter. This argument can be NULL
> + *        and is copied.
> + *
> + * init spinlock and set limit to be very very big value.
> + */
> +
> +void res_counter_init_ops(struct res_counter *counter,
> +				struct res_counter_ops *ops)
>  {
>  	spin_lock_init(&counter->lock);
>  	counter->limit = (unsigned long long)LLONG_MAX;
> +	if (ops)
> +		counter->ops = *ops;
>  }
>  
>  int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
> @@ -102,6 +114,46 @@ u64 res_counter_read_u64(struct res_coun
>  	return *res_counter_member(counter, member);
>  }
>  
> +/*
> + * Called when the limit changes if res_counter has ops->shrink_usage.
> + * This function uses shrink usage to below new limit. returns 0 at success.
> + */
> +
> +static int res_counter_resize_limit(struct res_counter *cnt,
> +			unsigned long long val)
> +{
> +	int retry_count = 0;
> +	int ret = -EBUSY;
> +	unsigned long flags;
> +
> +	BUG_ON(!cnt->ops.shrink_usage);
> +	while (1) {
> +		spin_lock_irqsave(&cnt->lock, flags);
> +		if (cnt->usage <= val) {
> +			cnt->limit = val;
> +			ret = 0;
> +			spin_unlock_irqrestore(&cnt->lock, flags);
> +			break;
> +		}
> +		BUG_ON(val > cnt->limit);
> +		spin_unlock_irqrestore(&cnt->lock, flags);
> +
> +		/*
> +		 * Rest before calling callback().... rest after callback
> +		 * tends to add difference between the result of callback and
> +		 * the check in next loop.
> +		 */
> +		cond_resched();
> +
> +		ret = cnt->ops.shrink_usage(cnt, val, retry_count);
> +		if (!ret)
> +			break;
> +		retry_count++;
> +	}
> +	return ret;
> +}
> +
> +
>  ssize_t res_counter_write(struct res_counter *counter, int member,
>  		const char __user *userbuf, size_t nbytes, loff_t *pos,
>  		int (*write_strategy)(char *st_buf, unsigned long long *val))
> @@ -133,11 +185,29 @@ ssize_t res_counter_write(struct res_cou
>  		if (*end != '\0')
>  			goto out_free;
>  	}
> +	switch (member) {
> +	case RES_LIMIT:
> +		if (counter->ops.shrink_usage) {
> +			ret = res_counter_resize_limit(counter, tmp);
> +			goto done;
> +		}
> +		break;
> +	default:
> +		/*
> +		 * Considering future implementation, we'll have to handle
> +		 * other members and "fallback" will not work well. So, we
> +		 * avoid to make use of "default" here.
> +		 */
> +		break;
> +	}
>  	spin_lock_irqsave(&counter->lock, flags);
>  	val = res_counter_member(counter, member);
>  	*val = tmp;
>  	spin_unlock_irqrestore(&counter->lock, flags);
> -	ret = nbytes;
> +	ret = 0;
> +done:
> +	if (!ret)
> +		ret = nbytes;
>  out_free:
>  	kfree(buf);
>  out:
> Index: linux-2.6.26-rc5-mm3/Documentation/controllers/resource_counter.txt
> ===================================================================
> --- linux-2.6.26-rc5-mm3.orig/Documentation/controllers/resource_counter.txt
> +++ linux-2.6.26-rc5-mm3/Documentation/controllers/resource_counter.txt
> @@ -39,7 +39,11 @@ to work with it.
>   	The failcnt stands for "failures counter". This is the number of
>  	resource allocation attempts that failed.
>  
> - c. spinlock_t lock
> + e. res_counter_ops.
> +	Callbacks for helping resource_counter per each subsystem.
> +	- shrink_usage() .... called at limit change (decrease).
> +
> + f. spinlock_t lock
>  
>   	Protects changes of the above values.
>  
> @@ -141,8 +145,19 @@ counter fields. They are recommended to 
>  	failcnt		reset to zero
>  
>  
> +5. res_counter_ops (Callbacks)
>  
> -5. Usage example
> +   res_counter_ops is for implementing feedback control from res_counter
> +   to subsystem. Each one has each own purpose and the subsystem doesn't
> +   necessary to provide all callbacks. Just implement necessary ones.
> +
> +   - shrink_usage(res_counter, newlimit, retry)
> +     Called for reducing usage to newlimit, retry is incremented per
> +     loop. (See memory resource controller as example.)
> +     Returns 0 at success. Any error code is acceptable but -EBUSY will be
> +     suitable to show "the kernel can't shrink usage."
> +
> +6. Usage example
>  
>   a. Declare a task group (take a look at cgroups subsystem for this) and
>      fold a res_counter into it
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2008-06-16  6:44 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-06-13  9:27 [PATCH 0/6] memcg: hierarchy updates (v4) KAMEZAWA Hiroyuki
2008-06-13  9:27 ` KAMEZAWA Hiroyuki
2008-06-13  9:29 ` [PATCH 1/6] res_counter: handle limit change KAMEZAWA Hiroyuki
2008-06-13  9:29   ` KAMEZAWA Hiroyuki
2008-06-16  6:38   ` Pavel Emelyanov [this message]
2008-06-16  6:38     ` Pavel Emelyanov
2008-06-16  7:39     ` kamezawa.hiroyu
2008-06-16  7:39       ` kamezawa.hiroyu
2008-06-16  7:51       ` Pavel Emelyanov
2008-06-16  7:51         ` Pavel Emelyanov
2008-06-16  8:17         ` kamezawa.hiroyu
2008-06-16  8:17           ` kamezawa.hiroyu
2008-06-16  8:23           ` Pavel Emelyanov
2008-06-16  8:23             ` Pavel Emelyanov
2008-06-16  8:32             ` kamezawa.hiroyu
2008-06-16  8:32               ` kamezawa.hiroyu
2008-06-16  8:47               ` Pavel Emelyanov
2008-06-16  8:47                 ` Pavel Emelyanov
2008-06-16  9:01                 ` kamezawa.hiroyu
2008-06-16  9:01                   ` kamezawa.hiroyu
2008-06-16  8:53           ` kamezawa.hiroyu
2008-06-16  8:53             ` kamezawa.hiroyu
2008-06-16  9:00             ` Pavel Emelyanov
2008-06-16  9:00               ` Pavel Emelyanov
2008-06-16  8:57   ` Balbir Singh
2008-06-16  8:57     ` Balbir Singh
2008-06-16  8:59     ` Pavel Emelyanov
2008-06-16  8:59       ` Pavel Emelyanov
2008-06-16  9:04     ` kamezawa.hiroyu
2008-06-16  9:04       ` kamezawa.hiroyu
2008-06-16 12:29       ` Balbir Singh
2008-06-16 12:29         ` Balbir Singh
2008-06-16 13:26         ` kamezawa.hiroyu
2008-06-16 13:26           ` kamezawa.hiroyu
2008-06-20  5:09   ` Paul Menage
2008-06-20  5:09     ` Paul Menage
2008-06-23 22:40   ` Randy Dunlap
2008-06-23 22:40     ` Randy Dunlap
2008-06-13  9:30 ` [PATCH 2/6] memcg: " KAMEZAWA Hiroyuki
2008-06-13  9:30   ` KAMEZAWA Hiroyuki
2008-06-13  9:31 ` [PATCH 3/6] memcg: reset limit at rmdir KAMEZAWA Hiroyuki
2008-06-13  9:31   ` KAMEZAWA Hiroyuki
2008-06-13  9:34 ` [PATCH 4/6] res_counter: basic hierarchy support KAMEZAWA Hiroyuki
2008-06-13  9:34   ` KAMEZAWA Hiroyuki
2008-06-23 22:37   ` Randy Dunlap
2008-06-23 22:37     ` Randy Dunlap
2008-06-13  9:36 ` [PATCH 5/6] res_counter: HARDWALL hierarchy KAMEZAWA Hiroyuki
2008-06-13  9:36   ` KAMEZAWA Hiroyuki
2008-06-13  9:37 ` [PATCH 6/6] memcg: " KAMEZAWA Hiroyuki
2008-06-13  9:37   ` KAMEZAWA Hiroyuki
2008-06-23 22:29   ` Randy Dunlap
2008-06-23 22:29     ` Randy Dunlap
2008-06-24  3:37     ` KAMEZAWA Hiroyuki
2008-06-24  3:37       ` KAMEZAWA Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=48560A7C.9050501@openvz.org \
    --to=xemul@openvz.org \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizf@cn.fujitsu.com \
    --cc=menage@google.com \
    --cc=nishimura@mxp.nes.nec.co.jp \
    --cc=yamamoto@valinux.co.jp \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.